1
0
mirror of synced 2026-01-12 00:42:56 +00:00

On-demand UNICODE with READBOM, new .TEDIT, and small 2-way run-time mapping tables (#1620)

Replaces PR rmk105--UNICODE-on-demand
This commit is contained in:
rmkaplan 2024-04-01 15:03:45 -07:00 committed by GitHub
parent 5437fac7aa
commit 8df2418f97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 3180 additions and 97 deletions

View File

@ -1,12 +1,14 @@
(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)
(FILECREATED " 5-Feb-2024 13:49:35" {WMEDLEY}<library>UNICODE.;57 88440
(FILECREATED "27-Mar-2024 23:07:42" {WMEDLEY}<library>UNICODE.;73 100984
:EDIT-BY rmk
:CHANGES-TO (FNS INVERT-ALL-UNICODE-MAPPINGS UNICODE-EXTEND-TRANSLATION?)
:CHANGES-TO (FNS UNICODE-EXTEND-TRANSLATION? INVERT-ALL-UNICODE-MAPPINGS ALL-UNICODE-MAPPINGS
MERGE-UNICODE-TRANSLATION-TABLES)
(VARS UNICODECOMS)
:PREVIOUS-DATE " 4-Feb-2024 12:42:00" {WMEDLEY}<library>UNICODE.;56)
:PREVIOUS-DATE "27-Mar-2024 14:50:54" {WMEDLEY}<library>UNICODE.;72)
(PRETTYCOMPRINT UNICODECOMS)
@ -15,6 +17,8 @@
((COMS (* ; "External formats")
(FNS UTF8.OUTCHARFN UTF8.INCCODEFN UTF8.PEEKCCODEFN \UTF8.BACKCCODEFN)
(FNS UTF16BE.OUTCHARFN UTF16BE.INCCODEFN UTF16BE.PEEKCCODEFN \UTF16BE.BACKCCODEFN)
(FNS UTF16LE.OUTCHARFN UTF16LE.INCCODEFN UTF16LE.PEEKCCODEFN \UTF16LE.BACKCCODEFN)
(FNS READBOM WRITEBOM)
(INITVARS (EXTERNALEOL 'LF))
(FNS MAKE-UNICODE-FORMATS)
(P (MAKE-UNICODE-FORMATS EXTERNALEOL))
@ -35,7 +39,7 @@
 "Make translation tables for UTF external formats")
(FNS MAKE-UNICODE-TRANSLATION-TABLES MERGE-UNICODE-TRANSLATION-TABLES
MERGE-UNICODE-TRANSLATION-TABLES1)
(FNS INVERT-ALL-UNICODE-MAPPINGS)
(FNS INVERT-ALL-UNICODE-MAPPINGS ALL-UNICODE-MAPPINGS)
(INITVARS (*XCCSTOUNICODE*)
(*UNICODETOXCCS*)
(*INVERTED-UNICODE-MAPPINGS*))
@ -66,6 +70,7 @@
16]
(VARS UNICODE-MAPPING-HEADER))
(FNS UTF8HEXSTRING XTOUSTRING XCCSSTRING)
(FNS UNHEXSTRING)
(FNS SHOWCHARS)
(DECLARE%: EVAL@COMPILE DONTCOPY (FILES (FROM LOADUPS)
EXPORTS.ALL))
@ -336,27 +341,29 @@
(UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*)) DO (\WOUT STREAM C])
(UTF16BE.INCCODEFN
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 6-Aug-2021 16:05 by rmk:")
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 10-Mar-2024 12:00 by rmk")
(* ; "Edited 6-Aug-2021 16:05 by rmk:")
(* ;;
 "Do not do UNICODE to XCCS translation if RAW. Test for SMALLPin case of funky EOF behavior")
 "Do not do UNICODE to XCCS translation if RAW. Test for SMALLPin case of funky EOF behavior")
(DECLARE (USEDFREE *BYTECOUNTER*))
(LET (CODE BYTE1 BYTE2 COUNT)
(IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
(SMALLP (SETQ BYTE2 (\BIN STREAM]
(SMALLP (SETQ BYTE2 (\BIN STREAM]
THEN (SETQ COUNT 2)
(SETQ CODE (LOGOR (LLSH (\BIN STREAM)
8)
(\BIN STREAM)))
(CL:UNLESS RAW
(SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
(CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
CODE
(SETQ CODE (create WORD
HIBYTE _ (\BIN STREAM)
LOBYTE _ (\BIN STREAM)))
(CL:UNLESS RAW
(SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
(CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
CODE
ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])
(UTF16BE.PEEKCCODEFN
[LAMBDA (STREAM NOERROR RAW) (* ; "Edited 14-Jun-2021 22:58 by rmk:")
[LAMBDA (STREAM NOERROR RAW) (* ; "Edited 10-Mar-2024 12:01 by rmk")
(* ; "Edited 14-Jun-2021 22:58 by rmk:")
(* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")
@ -366,35 +373,36 @@
(SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
(IF BYTE1
THEN (\BIN STREAM)
(SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
(\BACKFILEPTR STREAM)
(IF BYTE2
THEN (SETQ CODE (LOGOR (LLSH BYTE1 8)
BYTE2))
(CL:IF RAW
CODE
(UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
ELSEIF NOERROR
THEN NIL)
(SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
(\BACKFILEPTR STREAM)
(IF BYTE2
THEN (SETQ CODE (create WORD
HIBYTE _ BYTE1
LOBYTE _ BYTE2))
(CL:IF RAW
CODE
(UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
ELSEIF NOERROR
THEN NIL)
ELSEIF NOERROR
THEN NIL
ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])
(\UTF16BE.BACKCCODEFN
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 19-Jul-2022 15:14 by rmk")
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 10-Mar-2024 12:02 by rmk")
(* ; "Edited 19-Jul-2022 15:14 by rmk")
(* ; "Edited 6-Aug-2021 16:07 by rmk:")
(* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")
(* ;; "Common for big-ending and little-ending")
(DECLARE (USEDFREE *BYTECOUNTER*))
(CL:WHEN (\BACKFILEPTR STREAM)
(LET (CODE (BYTE2 (\PEEKBIN STREAM)))
(IF (\BACKFILEPTR STREAM)
THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
(SETQ CODE (LOGOR (LLSH BYTE2 8)
(\PEEKBIN STREAM)))
(SETQ CODE (create WORD
HIBYTE _ (\PEEKBIN STREAM)
LOBYTE _ BYTE2))
(CL:IF RAW
CODE
(UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
@ -402,12 +410,164 @@
THEN (SETQ *BYTECOUNTER* -1)
NIL)))])
)
(DEFINEQ
(UTF16LE.OUTCHARFN
[LAMBDA (STREAM CHARCODE RAW) (* ; "Edited 10-Mar-2024 11:58 by rmk")
(* ; "Edited 8-Aug-2021 13:09 by rmk:")
(* ; "Edited 30-Jan-2020 23:08 by rmk:")
(* ;; "PRINT UTF16 sequence for CHARCODE. Do not do XCCS to UNICODE translation if RAW.")
(* ;; "Not sure about EOL conversion if truly %"raw%"")
(IF (EQ CHARCODE (CHARCODE EOL))
THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
(IPLUS16 1 DATUM)))
(FOR C INSIDE (CL:IF RAW
CHARCODE
(UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*))
DO (BOUT STREAM (fetch LOBYTE of CHARCODE))
(BOUT STREAM (fetch HIBYTE of CHARCODE])
(UTF16LE.INCCODEFN
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 10-Mar-2024 12:03 by rmk")
(* ; "Edited 6-Aug-2021 16:05 by rmk:")
(* ;;
 "Do not do UNICODE to XCCS translation if RAW. Test for SMALLPin case of funky EOF behavior")
(DECLARE (USEDFREE *BYTECOUNTER*))
(LET (CODE BYTE1 BYTE2 COUNT)
(IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
(SMALLP (SETQ BYTE2 (\BIN STREAM]
THEN (SETQ COUNT 2)
(SETQ CODE (create WORD
LOBYTE _ (\BIN STREAM)
HIBYTE _ (\BIN STREAM)))
(CL:UNLESS RAW
(SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
(CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
CODE
ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])
(UTF16LE.PEEKCCODEFN
[LAMBDA (STREAM NOERROR RAW) (* ; "Edited 10-Mar-2024 11:43 by rmk")
(* ; "Edited 14-Jun-2021 22:58 by rmk:")
(* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")
(* ;; "Do not do UNICODE to XCCS translation if RAW")
(LET (BYTE1 BYTE2 CODE)
(SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
(IF BYTE1
THEN (\BIN STREAM)
(SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
(\BACKFILEPTR STREAM)
(IF BYTE2
THEN (SETQ CODE (LOGOR (LLSH BYTE2 8)
BYTE1))
(CL:IF RAW
CODE
(UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
ELSEIF NOERROR
THEN NIL)
ELSEIF NOERROR
THEN NIL
ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])
(\UTF16LE.BACKCCODEFN
[LAMBDA (STREAM COUNTP RAW) (* ; "Edited 10-Mar-2024 12:04 by rmk")
(* ; "Edited 19-Jul-2022 15:14 by rmk")
(* ; "Edited 6-Aug-2021 16:07 by rmk:")
(* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")
(DECLARE (USEDFREE *BYTECOUNTER*))
(CL:WHEN (\BACKFILEPTR STREAM)
(LET (CODE (BYTE2 (\PEEKBIN STREAM)))
(IF (\BACKFILEPTR STREAM)
THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
(SETQ CODE (create WORD
HIBYTE _ BYTE2
LOBYTE _ (\PEEKBIN STREAM)))
(CL:IF RAW
CODE
(UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
ELSEIF COUNTP
THEN (SETQ *BYTECOUNTER* -1)
NIL)))])
)
(DEFINEQ
(READBOM
[LAMBDA (STREAM COUNTP) (* ; "Edited 11-Mar-2024 23:53 by rmk")
(* ; "Edited 10-Mar-2024 13:01 by rmk")
(* ;; "If COUNTP, this must be under a generic \INCCODE that binds *BYTECOUNTER*")
(* ;; "Reads and decodes the BOM bytes. If BOM ispresent, the stream is left at the first following byte, otherwise the stream is reset to its position on entry (presumably 0).")
(* ;; "I used the UNHEXTRING constants so that the hex bytes are visible in the code, maybe there's another function that does that?")
(DECLARE (USEDFREE *BYTECOUNTER*))
(SELECTC (\PEEKBIN STREAM T)
((UNHEXSTRING "EF")
(BIN STREAM)
(if (EQ (CONSTANT (UNHEXSTRING "BB"))
(\PEEKBIN STREAM T))
then (BIN STREAM)
(if (EQ (CONSTANT (UNHEXSTRING "BF"))
(\PEEKBIN STREAM T))
then (BIN STREAM)
(CL:WHEN COUNTP (add *BYTECOUNTER* 3))
:UTF-8
else (\BACKFILEPTR STREAM))
else (\BACKFILEPTR STREAM)))
((UNHEXSTRING "FE")
(BIN STREAM)
(if (EQ (CONSTANT (UNHEXSTRING "FF"))
(\PEEKBIN STREAM T))
then (BIN STREAM)
(CL:WHEN COUNTP (add *BYTECOUNTER* 2))
:UTF-16BE
else (\BACKFILEPTR STREAM)))
((UNHEXSTRING "FF")
(BIN STREAM)
(if (EQ (CONSTANT (UNHEXSTRING "FE"))
(\PEEKBIN STREAM T))
then (BIN STREAM)
(CL:WHEN COUNTP (add *BYTECOUNTER* 2))
:UTF-16LE
else (\BACKFILEPTR STREAM)))
NIL])
(WRITEBOM
[LAMBDA (STREAM FORMAT) (* ; "Edited 16-Mar-2024 20:53 by rmk")
(* ; "Edited 11-Mar-2024 23:53 by rmk")
(* ; "Edited 10-Mar-2024 13:01 by rmk")
(* ;; "Writes a BOM that represents FORMAT (:UTF-8, :UTF16-BE, :UTF16-LE")
(SELECTQ FORMAT
(:UTF-8 (BOUT STREAM (CONSTANT (UNHEXSTRING "EF")))
(BOUT STREAM (CONSTANT (UNHEXSTRING "BB")))
(BOUT STREAM (CONSTANT (UNHEXSTRING "BF"))))
(:UTF-16BE (BOUT STREAM (CONSTANT (UNHEXSTRING "FE")))
(BOUT STREAM (CONSTANT (UNHEXSTRING "FF"))))
(:UTF-16LE (BOUT STREAM (CONSTANT (UNHEXSTRING "FF")))
(BOUT STREAM (UNHEXSTRING "FE")))
NIL])
)
(RPAQ? EXTERNALEOL 'LF)
(DEFINEQ
(MAKE-UNICODE-FORMATS
[LAMBDA (EXTERNALEOL) (* ; "Edited 8-Dec-2023 15:19 by rmk")
[LAMBDA (EXTERNALEOL) (* ; "Edited 10-Mar-2024 11:55 by rmk")
(* ; "Edited 8-Dec-2023 15:19 by rmk")
(* ; "Edited 19-Jul-2022 15:36 by rmk")
(* ; "Edited 6-Aug-2021 16:08 by rmk:")
@ -442,6 +602,20 @@
(\UTF16BE.BACKCCODEFN STREAM COUNTP T]
[FUNCTION (LAMBDA (STREAM CHARCODE)
(UTF16BE.OUTCHARFN STREAM CHARCODE T]
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
(MAKE-EXTERNALFORMAT :UTF-16LE (FUNCTION UTF16LE.INCCODEFN)
(FUNCTION UTF16LE.PEEKCCODEFN)
(FUNCTION \UTF16LE.BACKCCODEFN)
(FUNCTION UTF16LE.OUTCHARFN)
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
(MAKE-EXTERNALFORMAT :UTF-16LE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
(UTF16LE.INCCODEFN STREAM COUNTP T]
[FUNCTION (LAMBDA (STREAM NOERROR)
(UTF16LE.PEEKCCODEFN STREAM NOERROR T]
[FUNCTION (LAMBDA (STREAM COUNTP)
(\UTF16LE.BACKCCODEFN STREAM COUNTP T]
[FUNCTION (LAMBDA (STREAM CHARCODE)
(UTF16LE.OUTCHARFN STREAM CHARCODE T]
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL])
)
@ -487,37 +661,36 @@
NEXTCODE])
(UNICODE-EXTEND-TRANSLATION?
[LAMBDA (CODE TRANSLATION-TABLE) (* ; "Edited 5-Feb-2024 13:48 by rmk")
[LAMBDA (CODE TRANSLATION-TABLE) (* ; "Edited 27-Mar-2024 23:02 by rmk")
(* ; "Edited 5-Feb-2024 13:48 by rmk")
(* ; "Edited 3-Feb-2024 12:40 by rmk")
(* ;; "There is currently no mapping for CODE in TRANSLATION-TABLE, hopefully just because the relevant character-set mapping as not be installed. We infer from TRANSLATION-TABLE whether CODE is an XCCS or UNICODE code.")
(* ;; "There is currently no mapping for CODE in TRANSLATION-TABLE, hopefully just because the relevant character-set mapping has not been installed. We infer from TRANSLATION-TABLE whether CODE is an XCCS or UNICODE code and look for the proper mapping table (forward or inverted) for its character set. ")
(* ;; "The relevant mapping file, if any, can be determined directly from an XCCS code, since the mapping files are indexed by XCCS charset.")
(LET (MAPPING FILE (INVERTED (EQ TRANSLATION-TABLE *UNICODETOXCCS*)))
(SETQ FILE (FINDFILE (CL:IF INVERTED
'INVERTED-UNICODE-MAPPINGS.TXT
'UNICODE-MAPPINGS.TXT)
T UNICODEDIRECTORIES))
(CL:WHEN FILE
(SETQ MAPPING (CL:WITH-OPEN-FILE (STREAM FILE :INPUT)
(FFILEPOS (CONCAT "[" (LRSH CODE 8)
" ")
STREAM NIL NIL NIL T)
(READ STREAM)))
(CL:WHEN MAPPING
(* ;; "To find the file for a Unicode code by first running it through the precomputed inverted Unicode-to-XCCS inverted index, and then find the charset file for the corresponding XCCS code. Presumably that has the inverted mapping for future fast lookups.")
(* ;;
 "Merge MAPPING into both tables, respecting the direction indicated by TRANSLATION-TABLE.")
(LET (XCCSCODE NEWMAPPING INVERTEDFILE)
[SETQ XCCSCODE (if (EQ TRANSLATION-TABLE *XCCSTOUNICODE*)
then CODE
elseif (SETQ INVERTEDFILE (FINDFILE 'INVERTED-UNICODE-MAPPINGS.TXT T
UNICODEDIRECTORIES))
then
(* ;; "Note that we open/scan the inverted file for each unknown character, read the relevant character set (if any), get the XCCS code, and throw away what we just read. We will have installed all of the characters in the XCCS charset corresponding to CODE, that will catch a lot of what would otherwise be future unknowns (e.g. all Greeks are in). We may hit the same one repeatedly for Unicode JIS, since they appear to be scattered across XCCS.")
(if INVERTED
then (MERGE-UNICODE-TRANSLATION-TABLES MAPPING *UNICODETOXCCS* *XCCSTOUNICODE*)
else (MERGE-UNICODE-TRANSLATION-TABLES MAPPING *XCCSTOUNICODE* *UNICODETOXCCS*))
(CL:WITH-OPEN-FILE (STREAM INVERTEDFILE :INPUT)
(FFILEPOS (CONCAT "[" (LRSH CODE 8)
" ")
STREAM NIL NIL NIL T)
(CADR (ASSOC CODE (READ STREAM]
(CL:WHEN (AND XCCSCODE (SETQ NEWMAPPING (READ-UNICODE-MAPPING XCCSCODE T T)))
(* ;;
 "Hopefully we have now installed and can retrieve the mapping for CODE in its translation table.")
(* ;; "Whatever we find, we merge it in both directions--the tables bound to these variables are the only game in town.")
(MERGE-UNICODE-TRANSLATION-TABLES NEWMAPPING)
(* ;; "CODE's charset may not have a mapping for idiosyncratic CODE. ")
(UNICODE.TRANSLATE CODE TRANSLATION-TABLE T))])
(UNICODE.TRANSLATE CODE TRANSLATION-TABLE T)))])
(UTF8.BINCODE
[LAMBDA (STREAM RAW) (* ; "Edited 4-Feb-2024 01:06 by rmk")
@ -1019,30 +1192,26 @@
(LIST LTORARRAY RTOLARRAY])
(MERGE-UNICODE-TRANSLATION-TABLES
[LAMBDA (ADDITION TARGET) (* ; "Edited 3-Feb-2024 12:46 by rmk")
[LAMBDA (ADDITION TABLE INVERSETABLE) (* ; "Edited 27-Mar-2024 12:10 by rmk")
(* ; "Edited 3-Feb-2024 12:46 by rmk")
(* ; "Edited 31-Jan-2024 10:06 by rmk")
(* ;; "ADDITION is a pair containing an LTOR array and an inverse RTOL array. TARGET is either NIL or an array pair.. If NIL, the current values of *XCCSTOUNICODE* and *UNICODETOXCCS* are used.")
(* ;; "ADDITION is a pair containing a mapping array and its inverse, or a list that maps codes in the forward direction. ")
(* ;; "The ADDTION mappings are merged destructively into the TARGET mappings. This assumes that there are as yet no uncoded elements in the ADDITION hash arrays.")
(* ;; "The forward ADDITION mappings are merged destructively into TABLE and its inverses are merged into INVERSETABLE. ")
(LET (TLTORARRAY TRTOLARRAY)
(CL:UNLESS (AND (LISTP ADDITION)
(CL:ARRAYP (CAR ADDITION))
(CL:ARRAYP (CADR ADDITION)))
(SETQ ADDITION (MAKE-UNICODE-TRANSLATION-TABLES ADDITION)))
(if (NULL TARGET)
then (SETQ TLTORARRAY *XCCSTOUNICODE*)
(SETQ TRTOLARRAY *UNICODETOXCCS*)
elseif (LISTP TARGET)
then (SETQ TLTORARRAY (CAR TARGET))
(SETQ TRTOLARRAY (CADR TARGET))
else (\ILLEGAL.ARG TARGET))
(MERGE-UNICODE-TRANSLATION-TABLES1 (CAR ADDITION)
TLTORARRAY)
(MERGE-UNICODE-TRANSLATION-TABLES1 (CADR ADDITION)
TRTOLARRAY)
(LIST TLTORARRAY TRTOLARRAY])
(CL:UNLESS (AND (LISTP ADDITION)
(CL:ARRAYP (CAR ADDITION))
(CL:ARRAYP (CADR ADDITION)))
(* ;; "Make temporary mapping arrays when ADDTION is a list of corresponding code-pairs.")
(SETQ ADDITION (MAKE-UNICODE-TRANSLATION-TABLES ADDITION)))
(MERGE-UNICODE-TRANSLATION-TABLES1 (CAR ADDITION)
TABLE)
(MERGE-UNICODE-TRANSLATION-TABLES1 (CADR ADDITION)
INVERSETABLE)
(LIST TABLE INVERSETABLE])
(MERGE-UNICODE-TRANSLATION-TABLES1
[LAMBDA (ADDARRAY TARGETARRAY) (* ; "Edited 2-Feb-2024 13:18 by rmk")
@ -1104,10 +1273,11 @@
(DEFINEQ
(INVERT-ALL-UNICODE-MAPPINGS
[LAMBDA (MAKEFILE) (* ; "Edited 5-Feb-2024 13:14 by rmk")
[LAMBDA (FILE) (* ; "Edited 27-Mar-2024 14:50 by rmk")
(* ; "Edited 5-Feb-2024 13:14 by rmk")
(* ; "Edited 3-Feb-2024 09:16 by rmk")
(* ;; "Reads all the XCCS-to-UNICODE mapping files that we know about, and iproduces a 2-level index that maps each UNICODE code back to the one or more XCCS corresponding XCCS codes.")
(* ;; "Reads all the XCCS-to-UNICODE mapping files that we know about, and produces a 2-level index that maps each UNICODE code back to the one or more XCCS corresponding XCCS codes.")
(* ;; "The first index level groups all the unicode codes that have the same high-ordere byte. The index is sorted by the high-order bytes, the pairs within each group are sorted by their unicode. If a given unicode maps to multiple XCCS codes, the pair with the lowest XCCS code comes first.")
@ -1115,7 +1285,7 @@
(* ;; " (CADR (ASSOC UCODE (CADR (ASSOC (LRSH UCODE 8) INDEX)))).")
(* ;; "If IMAKEFILE is given, the resulting is written to that file.")
(* ;; "If FILE is given, the resulting is written to that file.")
(LET (INDEX)
[for M in (READ-UNICODE-MAPPING (for N in XCCS-CHARSETS collect (CAR N))
@ -1139,11 +1309,70 @@
(ILESSP (CADR M1)
(CADR M2]
(SETQ INDEX (SORT INDEX T)) (* ; "Sort groups")
(if MAKEFILE
then (CL:WITH-OPEN-FILE (STREAM (PACKFILENAME 'DIRECTORY (CAR (MKLIST UNICODEDIRECTORIES
(if FILE
then (CL:WITH-OPEN-FILE (STREAM [PACKFILENAME 'DIRECTORY (CAR (MKLIST UNICODEDIRECTORIES
))
'BODY
'INVERTED-UNICODE-MAPPINGS.TXT)
(CL:IF (EQ FILE T)
'INVERTED-UNICODE-MAPPINGS.TXT
(PACKFILENAME 'BODY FILE 'EXTENSION
'TXT))]
:DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION)
(* ;; "We can FFILEPOS for %"[nnn %" then READ. Or just READFILE")
(for I in INDEX do (PRINTOUT STREAM "[" (CAR I)
" "
(CADR I)
"]" T))
(FULLNAME STREAM))
else INDEX])
(ALL-UNICODE-MAPPINGS
[LAMBDA (FILE) (* ; "Edited 27-Mar-2024 14:48 by rmk")
(* ; "Edited 5-Feb-2024 13:14 by rmk")
(* ; "Edited 3-Feb-2024 09:16 by rmk")
(* ;; "Reads all the XCCS-to-UNICODE mapping files that we know about, and iproduces a 2-level index that maps each XCCS code to the corresponding UNICODE.")
(* ;; "The first index level groups all the XCCS codes in the same character set. The index is sorted by the high-order bytes, the pairs within each group are sorted by their XCCS code. ")
(* ;; "GIven a XCCS code, the lookup for the corresonding Unicode is")
(* ;; " (CADR (ASSOC XCCSCODE (CADR (ASSOC (LRSH XCCSCODE 8) INDEX)))).")
(* ;; "If FILE is given, the resulting is written to that file. If FILE is T, the file is UNICODE-MAPPINGS.TXT")
(LET (INDEX)
[for M in (READ-UNICODE-MAPPING (for N in XCCS-CHARSETS collect (CAR N))
T)
do (push [CDR (OR (ASSOC (LRSH (CAR M)
8)
INDEX)
(CAR (push INDEX (CONS (LRSH (CAR M)
8]
(LIST (CAR M)
(CADR M]
(* ;; "Push the sublists down an extra CONS, so that a subsequent READ will get them all.")
[for I in INDEX do (change (CDR I)
(CONS (SORT DATUM (FUNCTION (LAMBDA (M1 M2)
(OR (ILESSP (CAR M1)
(CAR M2))
(AND (EQ (CAR M1)
(CAR M2))
(ILESSP (CADR M1)
(CADR M2]
(SETQ INDEX (SORT INDEX T)) (* ; "Sort groups")
(if FILE
then (CL:WITH-OPEN-FILE (STREAM [PACKFILENAME 'DIRECTORY (CAR (MKLIST UNICODEDIRECTORIES
))
'BODY
(CL:IF (EQ FILE T)
'UNICODE-MAPPINGS.TXT
(PACKFILENAME 'BODY FILE 'EXTENSION
'TXT))]
:DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION)
(* ;; "We can FFILEPOS for %"[nnn %" then READ. Or just READFILE")
@ -1580,6 +1809,25 @@
)
(DEFINEQ
(UNHEXSTRING
[LAMBDA (HSTRING) (* ; "Edited 10-Mar-2024 12:56 by rmk")
(* ;; "Converts a hexstring to its number.")
(for I B (N _ 0) from 1 while (SETQ B (NTHCHARCODE HSTRING I))
do [SETQ N (IPLUS (LLSH N 4)
(if (AND (IGEQ B (CHARCODE 0))
(ILEQ B (CHARCODE 9)))
then (IDIFFERENCE B (CHARCODE 0))
elseif (AND (IGEQ (SETQ B (UCASECODE B))
(CHARCODE A))
(ILEQ B (CHARCODE F)))
then (IPLUS 10 (IDIFFERENCE B (CHARCODE A)))
else (ERROR "INVALID HEX CHARACTER" (NTHCHARCODE HSTRING I]
finally (RETURN N])
)
(DEFINEQ
(SHOWCHARS
[LAMBDA (FROMCHAR TOCHAR FONT) (* ; "Edited 26-Jan-2024 14:18 by mth")
(* ; "Edited 1-Aug-2020 09:27 by rmk:")
@ -1618,20 +1866,23 @@
(PUTPROPS UNICODE FILETYPE :TCOMPL)
(DECLARE%: DONTCOPY
(FILEMAP (NIL (3915 18061 (UTF8.OUTCHARFN 3925 . 6723) (UTF8.INCCODEFN 6725 . 12337) (UTF8.PEEKCCODEFN
12339 . 17079) (\UTF8.BACKCCODEFN 17081 . 18059)) (18062 21929 (UTF16BE.OUTCHARFN 18072 . 18982) (
UTF16BE.INCCODEFN 18984 . 19883) (UTF16BE.PEEKCCODEFN 19885 . 20956) (\UTF16BE.BACKCCODEFN 20958 .
21927)) (21959 24240 (MAKE-UNICODE-FORMATS 21969 . 24238)) (24337 33379 (UNICODE.UNMAPPED 24347 .
26421) (UNICODE-EXTEND-TRANSLATION? 26423 . 29002) (UTF8.BINCODE 29004 . 31583) (\UTF8.FETCHCODE 31585
. 33377)) (33380 38901 (UTF8.VALIDATE 33390 . 35987) (UTF8-SIZE-FROM-BYTE1 35989 . 36421) (
NUTF8-BYTE1-BYTES 36423 . 37160) (NUTF8-CODE-BYTES 37162 . 38219) (NUTF8-STRING-BYTES 38221 . 38899))
(40332 40681 (XTOUCODE 40342 . 40510) (UTOXCODE 40512 . 40679)) (41624 47670 (
READ-UNICODE-MAPPING-FILENAMES 41634 . 44581) (READ-UNICODE-MAPPING 44583 . 47668)) (47737 61373 (
MAKE-UNICODE-TRANSLATION-TABLES 47747 . 56819) (MERGE-UNICODE-TRANSLATION-TABLES 56821 . 58261) (
MERGE-UNICODE-TRANSLATION-TABLES1 58263 . 61371)) (61374 64671 (INVERT-ALL-UNICODE-MAPPINGS 61384 .
64669)) (65639 78070 (WRITE-UNICODE-MAPPING 65649 . 69399) (WRITE-UNICODE-INCLUDED 69401 . 74123) (
WRITE-UNICODE-MAPPING-HEADER 74125 . 75373) (WRITE-UNICODE-MAPPING-FILENAME 75375 . 76905) (HEXSTRING
76907 . 78068)) (78071 78747 (XCCS-UTF8-AFTER-OPEN 78081 . 78745)) (81272 86774 (UTF8HEXSTRING 81282
. 83487) (XTOUSTRING 83489 . 86409) (XCCSSTRING 86411 . 86772)) (86775 88285 (SHOWCHARS 86785 . 88283
)))))
(FILEMAP (NIL (4211 18357 (UTF8.OUTCHARFN 4221 . 7019) (UTF8.INCCODEFN 7021 . 12633) (UTF8.PEEKCCODEFN
12635 . 17375) (\UTF8.BACKCCODEFN 17377 . 18355)) (18358 22612 (UTF16BE.OUTCHARFN 18368 . 19278) (
UTF16BE.INCCODEFN 19280 . 20296) (UTF16BE.PEEKCCODEFN 20298 . 21529) (\UTF16BE.BACKCCODEFN 21531 .
22610)) (22613 26900 (UTF16LE.OUTCHARFN 22623 . 23630) (UTF16LE.INCCODEFN 23632 . 24648) (
UTF16LE.PEEKCCODEFN 24650 . 25817) (\UTF16LE.BACKCCODEFN 25819 . 26898)) (26901 29830 (READBOM 26911
. 28915) (WRITEBOM 28917 . 29828)) (29860 33050 (MAKE-UNICODE-FORMATS 29870 . 33048)) (33147 41529 (
UNICODE.UNMAPPED 33157 . 35231) (UNICODE-EXTEND-TRANSLATION? 35233 . 37152) (UTF8.BINCODE 37154 .
39733) (\UTF8.FETCHCODE 39735 . 41527)) (41530 47051 (UTF8.VALIDATE 41540 . 44137) (
UTF8-SIZE-FROM-BYTE1 44139 . 44571) (NUTF8-BYTE1-BYTES 44573 . 45310) (NUTF8-CODE-BYTES 45312 . 46369)
(NUTF8-STRING-BYTES 46371 . 47049)) (48482 48831 (XTOUCODE 48492 . 48660) (UTOXCODE 48662 . 48829)) (
49774 55820 (READ-UNICODE-MAPPING-FILENAMES 49784 . 52731) (READ-UNICODE-MAPPING 52733 . 55818)) (
55887 69217 (MAKE-UNICODE-TRANSLATION-TABLES 55897 . 64969) (MERGE-UNICODE-TRANSLATION-TABLES 64971 .
66105) (MERGE-UNICODE-TRANSLATION-TABLES1 66107 . 69215)) (69218 76326 (INVERT-ALL-UNICODE-MAPPINGS
69228 . 72849) (ALL-UNICODE-MAPPINGS 72851 . 76324)) (77294 89725 (WRITE-UNICODE-MAPPING 77304 . 81054
) (WRITE-UNICODE-INCLUDED 81056 . 85778) (WRITE-UNICODE-MAPPING-HEADER 85780 . 87028) (
WRITE-UNICODE-MAPPING-FILENAME 87030 . 88560) (HEXSTRING 88562 . 89723)) (89726 90402 (
XCCS-UTF8-AFTER-OPEN 89736 . 90400)) (92927 98429 (UTF8HEXSTRING 92937 . 95142) (XTOUSTRING 95144 .
98064) (XCCSSTRING 98066 . 98427)) (98430 99318 (UNHEXSTRING 98440 . 99316)) (99319 100829 (SHOWCHARS
99329 . 100827)))))
STOP

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff