Rmk100 unicode utf 8 update (#1489)
* UNICODE: a few additional Tedit helpers, revised documentation * New JIS files (courtesy of Peter) * Updated mapping files (courtesy of Peter Craven) * UNICODE: changed SHOULDNT to ERROR
This commit is contained in:
437
library/UNICODE
437
library/UNICODE
@@ -1,12 +1,12 @@
|
||||
(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)
|
||||
|
||||
(FILECREATED "19-Jul-2023 09:26:13" {WMEDLEY}<library>UNICODE.;199 65282
|
||||
(FILECREATED " 8-Jan-2024 10:58:06" {WMEDLEY}<library>UNICODE.;212 72240
|
||||
|
||||
:EDIT-BY rmk
|
||||
|
||||
:CHANGES-TO (VARS UNICODECOMS)
|
||||
:CHANGES-TO (FNS NUTF8CODEBYTES)
|
||||
|
||||
:PREVIOUS-DATE "19-Jul-2022 15:36:40" {WMEDLEY}<library>UNICODE.;198)
|
||||
:PREVIOUS-DATE " 5-Jan-2024 17:25:29" {WMEDLEY}<library>UNICODE.;211)
|
||||
|
||||
|
||||
(PRETTYCOMPRINT UNICODECOMS)
|
||||
@@ -23,7 +23,7 @@
|
||||
(ADDVARS (*DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8)))
|
||||
(FNS UNICODE.UNMAPPED)
|
||||
(FNS XCCS-UTF8-AFTER-OPEN)
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE))
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE \UTF8.GETBASEBYTE))
|
||||
(FNS XTOUCODE UTOXCODE))
|
||||
(COMS
|
||||
(* ;; "Unicode mapping files")
|
||||
@@ -45,8 +45,10 @@
|
||||
(* ;; "Set up translation tables for UTF8 and UTFBE external formats")
|
||||
|
||||
(FNS MAKE-UNICODE-TRANSLATION-TABLES)
|
||||
[INITVARS (DEFAULT-XCCS-CHARSETS '(LATIN SYMBOLS1 SYMBOLS2 EXTENDED-LATIN FORMS
|
||||
SYMBOLS3 SYMBOLS4 ACCENTED-LATIN GREEK]
|
||||
[INITVARS (DEFAULT-XCCS-CHARSETS '(LATIN JAPANESE-SYMBOLS1 JAPANESE-SYMBOLS2
|
||||
EXTENDED-LATIN FORMS SYMBOLS1 SYMBOLS2
|
||||
ACCENTED-LATIN1 GREEK))
|
||||
(DEFAULT-XCCS-JAPANESE-CHARSETS '(HIRAGANA KATAKANA JIS]
|
||||
[DECLARE%: DONTEVAL@LOAD DOCOPY (P (MAKE-UNICODE-TRANSLATION-TABLES (
|
||||
READ-UNICODE-MAPPING
|
||||
|
||||
@@ -55,7 +57,9 @@
|
||||
'*XCCSTOUNICODE*
|
||||
'*UNICODETOXCCS*]
|
||||
(GLOBALVARS *XCCSTOUNICODE* *UNICODETOXCCS*))
|
||||
(FNS HEXSTRING UTF8HEXSTRING NUTF8CODEBYTES NUTF8STRINGBYTES XTOUSTRING XCCSSTRING)
|
||||
(FNS UTF-8.VALIDATE HEXSTRING UTF8HEXSTRING NUTF8CODEBYTES NUTF8STRINGBYTES XTOUSTRING
|
||||
XCCSSTRING)
|
||||
(FNS \UTF8.FETCHCODE)
|
||||
(FNS SHOWCHARS)
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY (FILES (FROM LOADUPS)
|
||||
EXPORTS.ALL)
|
||||
@@ -402,7 +406,8 @@
|
||||
(DEFINEQ
|
||||
|
||||
(MAKE-UNICODE-FORMATS
|
||||
[LAMBDA (EXTERNALEOL) (* ; "Edited 19-Jul-2022 15:36 by rmk")
|
||||
[LAMBDA (EXTERNALEOL) (* ; "Edited 8-Dec-2023 15:19 by rmk")
|
||||
(* ; "Edited 19-Jul-2022 15:36 by rmk")
|
||||
(* ; "Edited 6-Aug-2021 16:08 by rmk:")
|
||||
|
||||
(* ;; "RAW formats do not do XCCS/Unicode translation, just deal with the byte encoding.")
|
||||
@@ -413,7 +418,7 @@
|
||||
(FUNCTION UTF8.PEEKCCODEFN)
|
||||
(FUNCTION \UTF8.BACKCCODEFN)
|
||||
(FUNCTION UTF8.OUTCHARFN)
|
||||
NIL EXTERNALEOL)
|
||||
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
|
||||
(MAKE-EXTERNALFORMAT :UTF-8-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
|
||||
(UTF8.INCCODEFN STREAM COUNTP T]
|
||||
[FUNCTION (LAMBDA (STREAM NOERROR)
|
||||
@@ -422,12 +427,12 @@
|
||||
(\UTF8.BACKCCODEFN STREAM COUNTP T]
|
||||
[FUNCTION (LAMBDA (STREAM CHARCODE)
|
||||
(UTF8.OUTCHARFN STREAM CHARCODE T]
|
||||
NIL EXTERNALEOL)
|
||||
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
|
||||
(MAKE-EXTERNALFORMAT :UTF-16BE (FUNCTION UTF16BE.INCCODEFN)
|
||||
(FUNCTION UTF16BE.PEEKCCODEFN)
|
||||
(FUNCTION \UTF16BE.BACKCCODEFN)
|
||||
(FUNCTION UTF16BE.OUTCHARFN)
|
||||
NIL EXTERNALEOL)
|
||||
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
|
||||
(MAKE-EXTERNALFORMAT :UTF-16BE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
|
||||
(UTF16BE.INCCODEFN STREAM COUNTP T]
|
||||
[FUNCTION (LAMBDA (STREAM NOERROR)
|
||||
@@ -436,7 +441,7 @@
|
||||
(\UTF16BE.BACKCCODEFN STREAM COUNTP T]
|
||||
[FUNCTION (LAMBDA (STREAM CHARCODE)
|
||||
(UTF16BE.OUTCHARFN STREAM CHARCODE T]
|
||||
NIL EXTERNALEOL])
|
||||
NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL])
|
||||
)
|
||||
|
||||
(MAKE-UNICODE-FORMATS EXTERNALEOL)
|
||||
@@ -469,15 +474,17 @@
|
||||
(DEFINEQ
|
||||
|
||||
(XCCS-UTF8-AFTER-OPEN
|
||||
[LAMBDA (STREAM ACCESS PARAMETERS) (* ; "Edited 13-Aug-2020 11:54 by rmk:")
|
||||
[LAMBDA (STREAM ACCESS PARAMETERS) (* ; "Edited 3-Jan-2024 10:27 by rmk")
|
||||
(* ; "Edited 13-Aug-2020 11:54 by rmk:")
|
||||
|
||||
(* ;; "If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF8.")
|
||||
(* ;;
|
||||
"If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF-8. For development")
|
||||
|
||||
(CL:WHEN (AND (STRPOS "XCCS-" (U-CASE (FULLNAME STREAM)))
|
||||
[EQ 'TXT (U-CASE (FILENAMEFIELD (FULLNAME STREAM)
|
||||
'EXTENSION]
|
||||
(NOT (ASSOC 'EXTERNALFORMAT PARAMETERS)))
|
||||
(STREAMPROP STREAM 'EXTERNALFORMAT :UTF8))])
|
||||
(STREAMPROP STREAM 'EXTERNALFORMAT :UTF-8))])
|
||||
)
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY
|
||||
(DECLARE%: EVAL@COMPILE
|
||||
@@ -492,6 +499,15 @@
|
||||
CODE))
|
||||
[(AND X (CL:SVREF X (LOGAND CODE TRANSLATION-MASK]
|
||||
(T (UNICODE.UNMAPPED CODE TRANSLATION-TABLE])
|
||||
|
||||
(PUTPROPS \UTF8.GETBASEBYTE MACRO ((BASE OFFSET ERROR?) (* ;
|
||||
"Fetches the OFFSET'th byte from BASE, checking for UTF-8 validity if ERROR?")
|
||||
(IF ERROR?
|
||||
THEN (LET ((BYTE (\GETBASEBYTE BASE OFFSET)))
|
||||
(CL:WHEN (ILESSP BYTE 128)
|
||||
(ERROR "INVALID UTF8 BYTE" BYTE))
|
||||
BYTE)
|
||||
ELSE (\GETBASEBYTE BASE OFFSET))))
|
||||
)
|
||||
)
|
||||
(DEFINEQ
|
||||
@@ -512,24 +528,40 @@
|
||||
(DEFINEQ
|
||||
|
||||
(READ-UNICODE-MAPPING-FILENAMES
|
||||
[LAMBDA (FILESPEC) (* ; "Edited 5-Aug-2020 15:59 by kaplan")
|
||||
[LAMBDA (FILESPEC DIRS) (* ; "Edited 5-Jan-2024 17:24 by rmk")
|
||||
(* ; "Edited 5-Aug-2020 15:59 by kaplan")
|
||||
(* ; "Edited 4-Aug-2020 17:31 by rmk:")
|
||||
(FOR F X CSI INSIDE FILESPEC
|
||||
COLLECT (IF (FINDFILE (PACKFILENAME 'BODY F 'EXTENSION 'TXT)
|
||||
T UNICODEDIRECTORIES)
|
||||
ELSEIF [SETQ CSI (OR (SASSOC F XCCS-SET-NAMES)
|
||||
(FIND N IN XCCS-SET-NAMES
|
||||
SUCHTHAT (EQ F (CADR N]
|
||||
THEN (FINDFILE (PACKFILENAME 'BODY (CONCAT 'XCCS- (CAR CSI)
|
||||
'=
|
||||
(CADR CSI))
|
||||
'EXTENSION
|
||||
'TXT)
|
||||
T UNICODEDIRECTORIES)
|
||||
ELSE F])
|
||||
(CL:UNLESS DIRS (SETQ DIRS UNICODEDIRECTORIES))
|
||||
(FOR F X CSI INSIDE FILESPEC JOIN
|
||||
(* ;;
|
||||
"Last case hopes to pick up tables that are gruped together in a subdirectory (e.g. JIS)")
|
||||
|
||||
(OR (FINDFILE (PACKFILENAME 'BODY F 'EXTENSION 'TXT)
|
||||
T DIRS)
|
||||
(for D inside DIRS
|
||||
when (SETQ D (FILDIR (PACKFILENAME 'NAME
|
||||
(CONCAT "XCCS-*=" F)
|
||||
'EXTENSION
|
||||
'TXT
|
||||
'BODY D)))
|
||||
do (RETURN D))
|
||||
(AND [SETQ CSI (OR (SASSOC F XCCS-SET-NAMES)
|
||||
(FIND N IN XCCS-SET-NAMES
|
||||
SUCHTHAT (EQ F (CADR N]
|
||||
(MKLIST (FINDFILE (PACKFILENAME 'BODY
|
||||
(CONCAT 'XCCS- (CAR CSI)
|
||||
'=
|
||||
(CADR CSI))
|
||||
'EXTENSION
|
||||
'TXT)
|
||||
T DIRS)))
|
||||
(for D inside DIRS
|
||||
when (DIRECTORYNAMEP (SETQ D (CONCAT D ">" F ">")))
|
||||
join (FILDIR (CONCAT D ">*.TXT;*"])
|
||||
|
||||
(READ-UNICODE-MAPPING
|
||||
[LAMBDA (FILESPEC NOPRINT NOERROR) (* ; "Edited 3-Jul-2021 13:37 by rmk:")
|
||||
[LAMBDA (FILESPEC NOPRINT NOERROR) (* ; "Edited 5-Jan-2024 12:26 by rmk")
|
||||
(* ; "Edited 3-Jul-2021 13:37 by rmk:")
|
||||
|
||||
(* ;; "Combines the char-mapping tables from FILES coded in the Uncode-CDROM format. Comments prefixed by # and")
|
||||
|
||||
@@ -539,8 +571,7 @@
|
||||
|
||||
(* ;; " 0xXXXX ... 0xYYYY")
|
||||
|
||||
(* ;;
|
||||
" Column 3: (after #) Character name in some mapping files, utf-8 character")
|
||||
(* ;; " Column 3: (after #) Character name in some mapping files, utf-8 character")
|
||||
|
||||
(* ;; " for XCCS mapping files")
|
||||
|
||||
@@ -548,37 +579,34 @@
|
||||
|
||||
(* ;; "Result is a list of (fromcode tocode1 ... tocoden) integer lists (almost always with only a single tocode")
|
||||
|
||||
(FOR FILE [SEPBITTABLE _ (MAKEBITTABLE (CHARCODE (TAB SPACE] IN (
|
||||
READ-UNICODE-MAPPING-FILENAMES
|
||||
FILESPEC)
|
||||
(FOR FILE [SEPBITTABLE _ (MAKEBITTABLE (CHARCODE (TAB SPACE] IN (READ-UNICODE-MAPPING-FILENAMES
|
||||
FILESPEC)
|
||||
JOIN (CL:WITH-OPEN-FILE (STREAM FILE :DIRECTION :INPUT :EXTERNAL-FORMAT :UTF-8-RAW)
|
||||
(BIND LINE START FIRST (CL:UNLESS
|
||||
(FILEPOS "Name:" STREAM NIL NIL NIL T)
|
||||
(ERROR "NOT A UNICODE MAPPING FILE"
|
||||
(FULLNAME STREAM)))
|
||||
(SETQ LINE (CL:READ-LINE STREAM NIL NIL))
|
||||
(CL:UNLESS NOPRINT
|
||||
(PRINTOUT T T "Unicode mapping: "
|
||||
(CL:STRING-TRIM " " LINE)
|
||||
T)) WHILE (SETQ LINE
|
||||
(CL:READ-LINE STREAM NIL
|
||||
NIL))
|
||||
WHEN (SETQ START (STRPOSL SEPBITTABLE LINE 1 T))
|
||||
UNLESS (EQ (CHARCODE %#)
|
||||
(NTHCHARCODE LINE START))
|
||||
COLLECT (BIND END WHILE [SETQ END (OR (STRPOSL SEPBITTABLE LINE
|
||||
START)
|
||||
(ADD1 (NCHARS LINE]
|
||||
COLLECT [CHARCODE.DECODE (SUBSTRING LINE START
|
||||
(SUB1 END)
|
||||
(CONSTANT (CONCAT]
|
||||
REPEATWHILE (AND (SETQ START (STRPOSL SEPBITTABLE LINE
|
||||
END T))
|
||||
(NEQ (CHARCODE %#)
|
||||
(NTHCHARCODE LINE START])
|
||||
(BIND LINE NAME CHARSET START
|
||||
FIRST (CL:UNLESS (FILEPOS "Name:" STREAM NIL NIL NIL T)
|
||||
(ERROR "NOT A UNICODE MAPPING FILE" (FULLNAME STREAM)))
|
||||
(SETQ NAME (CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL)))
|
||||
(SETQ CHARSET (CL:IF (FILEPOS "XCCS charset:" STREAM NIL NIL NIL T)
|
||||
(CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL))
|
||||
""))
|
||||
(CL:UNLESS NOPRINT (* ; "Strip off XCCS in front of name")
|
||||
(PRINTOUT T T CHARSET " " [SUBSTRING NAME (CONSTANT
|
||||
(ADD1 (NCHARS "XCCS"]
|
||||
T)) WHILE (SETQ LINE (CL:READ-LINE STREAM NIL NIL))
|
||||
WHEN (SETQ START (STRPOSL SEPBITTABLE LINE 1 T))
|
||||
UNLESS (EQ (CHARCODE %#)
|
||||
(NTHCHARCODE LINE START))
|
||||
COLLECT (BIND END WHILE [SETQ END (OR (STRPOSL SEPBITTABLE LINE START)
|
||||
(ADD1 (NCHARS LINE]
|
||||
COLLECT [CHARCODE.DECODE (SUBSTRING LINE START (SUB1 END)
|
||||
(CONSTANT (CONCAT]
|
||||
REPEATWHILE (AND (SETQ START (STRPOSL SEPBITTABLE LINE END T))
|
||||
(NEQ (CHARCODE %#)
|
||||
(NTHCHARCODE LINE START])
|
||||
|
||||
(WRITE-UNICODE-MAPPING
|
||||
[LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK) (* ; "Edited 16-Aug-2020 16:56 by rmk:")
|
||||
[LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK) (* ; "Edited 4-Jan-2024 22:44 by rmk")
|
||||
(* ; "Edited 16-Aug-2020 16:56 by rmk:")
|
||||
|
||||
(* ;; "Writes a symbol unicode mapping file. Mapping is a list of (XCCS-code Unicode) pairs, which may contain codes in multiple character sets.")
|
||||
|
||||
@@ -587,21 +615,18 @@
|
||||
(* ;; "The output lines are of the form x0XXX<tab>x0UUUU<tab># Unicode-char")
|
||||
|
||||
(* ;;
|
||||
"If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.")
|
||||
"If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.")
|
||||
|
||||
(* ;; "Otherwise, all and only mappings included in thos charsets are included in a single output file--an implicit subset.")
|
||||
|
||||
(IF (AND (EQ INCLUDECHARSETS T)
|
||||
(NULL FILE))
|
||||
(NULL FILE))
|
||||
THEN (IF MAPPING
|
||||
THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F
|
||||
(WRITE-UNICODE-MAPPING
|
||||
MAPPING
|
||||
(CAR CSI)
|
||||
NIL T)) COLLECT
|
||||
F)
|
||||
ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T)
|
||||
NIL)
|
||||
THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F (WRITE-UNICODE-MAPPING MAPPING
|
||||
(CAR CSI)
|
||||
NIL T)) COLLECT F)
|
||||
ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T)
|
||||
NIL)
|
||||
ELSE
|
||||
(LET
|
||||
(IMAPPING CSETINFO RANGES)
|
||||
@@ -609,47 +634,45 @@
|
||||
(WRITE-UNICODE-INCLUDED MAPPING INCLUDECHARSETS))
|
||||
(IF IMAPPING
|
||||
THEN (CL:WITH-OPEN-FILE
|
||||
(STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES)
|
||||
:DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF8-RAW)
|
||||
(WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES)
|
||||
(SORT IMAPPING T)
|
||||
(FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING
|
||||
DO (SETQ LEFTC (CAR M))
|
||||
(SETQ FIRSTRIGHTC (CADR M))
|
||||
(CL:UNLESS (EQ CSET (LRSH LEFTC 8))
|
||||
(SETQ CSET (LRSH LEFTC 8))
|
||||
(SETQ CSI (ASSOC CSET CSETINFO))
|
||||
(PRINTOUT STREAM T "# " .P2 (CADR CSI)
|
||||
" "
|
||||
(CADDR CSI)
|
||||
T))
|
||||
(PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4)
|
||||
%#
|
||||
(FOR RIGHTC IN (CDR M)
|
||||
DO (PRINTOUT NIL " " "0x" (HEXSTRING RIGHTC 4)))
|
||||
" # "
|
||||
(SELECTC FIRSTRIGHTC
|
||||
(UNDEFINEDCODE
|
||||
(* ;; "FFFF")
|
||||
(STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES)
|
||||
:DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF-8-RAW)
|
||||
(WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES)
|
||||
(SORT IMAPPING T)
|
||||
(FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING
|
||||
DO (SETQ LEFTC (CAR M))
|
||||
(SETQ FIRSTRIGHTC (CADR M))
|
||||
(CL:UNLESS (EQ CSET (LRSH LEFTC 8))
|
||||
(SETQ CSET (LRSH LEFTC 8))
|
||||
(SETQ CSI (ASSOC CSET CSETINFO))
|
||||
(PRINTOUT STREAM T "# " .P2 (CADR CSI)
|
||||
" "
|
||||
(CADDR CSI)
|
||||
T))
|
||||
(PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4)
|
||||
%#
|
||||
(FOR RIGHTC IN (CDR M) DO (PRINTOUT NIL " " "0x" (HEXSTRING RIGHTC 4)))
|
||||
" # "
|
||||
(SELECTC FIRSTRIGHTC
|
||||
(UNDEFINEDCODE
|
||||
(* ;; "FFFF")
|
||||
|
||||
"UNDEFINED")
|
||||
(MISSINGCODE
|
||||
(* ;; "FFFE")
|
||||
"UNDEFINED")
|
||||
(MISSINGCODE
|
||||
(* ;; "FFFE")
|
||||
|
||||
"MISSING")
|
||||
(IF (ILESSP FIRSTRIGHTC 32)
|
||||
THEN (* ; "Control chars")
|
||||
[CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC
|
||||
(CHARCODE @]
|
||||
ELSE (CHARACTER FIRSTRIGHTC)))
|
||||
T))
|
||||
(FULLNAME STREAM))
|
||||
"MISSING")
|
||||
(IF (ILESSP FIRSTRIGHTC 32)
|
||||
THEN (* ; "Control chars")
|
||||
[CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC (CHARCODE @]
|
||||
ELSE (CHARACTER FIRSTRIGHTC)))
|
||||
T))
|
||||
(FULLNAME STREAM))
|
||||
ELSEIF (NOT EMPTYOK)
|
||||
THEN (PRINTOUT T "THERE ARE NO MAPPINGS")
|
||||
(CL:WHEN INCLUDECHARSETS
|
||||
(PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS)
|
||||
T))
|
||||
NIL])
|
||||
(CL:WHEN INCLUDECHARSETS
|
||||
(PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS)
|
||||
T))
|
||||
NIL])
|
||||
|
||||
(WRITE-UNICODE-INCLUDED
|
||||
[LAMBDA (MAPPING INCLUDECHARSETS) (* ; "Edited 4-Aug-2020 17:47 by rmk:")
|
||||
@@ -724,28 +747,28 @@
|
||||
(CL:VALUES IMAPPING CSETINFO RANGES])
|
||||
|
||||
(WRITE-UNICODE-MAPPING-HEADER
|
||||
[LAMBDA (STREAM CSETINFO RANGES) (* ; "Edited 4-Aug-2020 17:38 by rmk:")
|
||||
[LAMBDA (STREAM CSETINFO RANGES) (* ; "Edited 5-Jan-2024 13:24 by rmk")
|
||||
(* ; "Edited 4-Aug-2020 17:38 by rmk:")
|
||||
|
||||
(* ;; "Writes the standard per-file header information")
|
||||
|
||||
(FOR LINE IN UNICODE-MAPPING-HEADER
|
||||
DO (PRINTOUT STREAM "#" 2)
|
||||
(SELECTQ LINE
|
||||
(XCCSCHARACTERSETS
|
||||
(PRINTOUT STREAM " XCCS charset")
|
||||
(IF (CDR CSETINFO)
|
||||
THEN (PRINTOUT STREAM "s:" -4)
|
||||
(FOR R IN RANGES DO (PRINTOUT STREAM R " "))
|
||||
(TERPRI STREAM)
|
||||
ELSE (* ; "Singleton")
|
||||
(PRINTOUT STREAM ": " -4 (CADAR CSETINFO)
|
||||
" "
|
||||
(CADDAR CSETINFO)))
|
||||
(TERPRI STREAM))
|
||||
(DATE (PRINTOUT STREAM " Date:" -13 (DATE (DATEFORMAT NO.TIME
|
||||
NO.LEADING.SPACES))
|
||||
T))
|
||||
(PRINTOUT STREAM LINE T)))
|
||||
(SELECTQ LINE
|
||||
(XCCSCHARACTERSETS
|
||||
(PRINTOUT STREAM " XCCS charset")
|
||||
(IF (CDR CSETINFO)
|
||||
THEN (PRINTOUT STREAM "s:" -4)
|
||||
(FOR R IN RANGES DO (PRINTOUT STREAM R " "))
|
||||
ELSE (* ; "Singleton")
|
||||
(PRINTOUT STREAM ": " -4 (CADAR CSETINFO)
|
||||
" "
|
||||
(CADDAR CSETINFO)))
|
||||
(TERPRI STREAM))
|
||||
(DATE (PRINTOUT STREAM " Date:" -13 (DATE (DATEFORMAT NO.TIME NO.LEADING.SPACES)
|
||||
)
|
||||
T))
|
||||
(PRINTOUT STREAM LINE T)))
|
||||
(TERPRI STREAM])
|
||||
|
||||
(WRITE-UNICODE-MAPPING-FILENAME
|
||||
@@ -774,24 +797,28 @@
|
||||
|
||||
(RPAQQ XCCS-SET-NAMES
|
||||
(("0" LATIN)
|
||||
("41" SYMBOLS1)
|
||||
("42" SYMBOLS2)
|
||||
("41" JAPANESE-SYMBOLS1)
|
||||
("42" JAPANESE-SYMBOLS2)
|
||||
("43" EXTENDED-LATIN)
|
||||
("44" HIRAGANA)
|
||||
("45" KATAKANA)
|
||||
("46" GREEK)
|
||||
("47" CYRILLIC)
|
||||
("50" FORMS)
|
||||
("60-172" JIS)
|
||||
("60-166" JIS)
|
||||
("340" ARABIC)
|
||||
("341" HEBREW)
|
||||
("342" IPA)
|
||||
("343" HANGUL)
|
||||
("344" GEORGIAN-ARMENIAN)
|
||||
("356" SYMBOLS3)
|
||||
("357" SYMBOLS4)
|
||||
("345" DEVANAGRI)
|
||||
("346" BENGALI)
|
||||
("347" GURMUKHI)
|
||||
("350" THAI-LAO)
|
||||
("356" SYMBOLS2)
|
||||
("357" SYMBOLS1)
|
||||
("360" LIGATURES)
|
||||
("361" ACCENTED-LATIN)
|
||||
("361" ACCENTED-LATIN1)
|
||||
("365" MORE-ARABIC)
|
||||
("375" GRAPHIC-VARIANTS)))
|
||||
|
||||
@@ -813,22 +840,21 @@
|
||||
)
|
||||
|
||||
(RPAQQ UNICODE-MAPPING-HEADER
|
||||
("" " Name: XCCS (XC-3-1-1-0) to Unicode" " Unicode version: 3.0"
|
||||
("" " Name: XCCS (Version 2.0) to Unicode" " Unicode version: 3.0"
|
||||
XCCSCHARACTERSETS " Table version: 0.1" " Table format: Format A"
|
||||
DATE " Author: Ron Kaplan <Ron.Kaplan@post.harvard.edu>" ""
|
||||
"This file contains mappings from the Xerox Character Code Standard (version"
|
||||
"XC1-3-3-0, 1987) into Unicode 3.0. standard codes. That is the version of"
|
||||
"XCCS corresponding to the fonts in the Medley system." ""
|
||||
"2.0, 1990) into Unicode 3.0. standard codes. That is an extension of the"
|
||||
"version of XCCS corresponding to the fonts in the Medley system." ""
|
||||
"The format of this file conforms to the format of the other Unicode-supplied"
|
||||
"mapping files:" " Three white-space (tab or spaces) separated columns:"
|
||||
" Column 1 is the XCCS code (as hex 0xXXXX)"
|
||||
" Column 2 is the corresponding Unicode (as hex 0xXXXX)"
|
||||
" Column 3 (after #) is a comment column. For convenience, it contains the"
|
||||
" Unicode character itself (since the Unicode character names"
|
||||
" are not available)"
|
||||
" Unicode character itself and the Unicode character names when available."
|
||||
"Unicode FFFF is used for undefined XCCS codes (Column 3 = UNDEFINED"
|
||||
"Unicode FFFE is used for XCCS codes that have not yet been filled in."
|
||||
"(Column 3 = MISSING)" "" "This file is encoded in UTF8, so that the Unicode characters"
|
||||
"(Column 3 = MISSING)" "" "This file is encoded in UTF-8, so that the Unicode characters"
|
||||
"are properly displayed in Column 3 and can be edited by standard"
|
||||
"Unicode-enabled editors (e.g. Mac Textedit)." ""
|
||||
"This file can also be read by the function"
|
||||
@@ -991,8 +1017,10 @@
|
||||
(LIST LTORARRAY RTOLARRAY])
|
||||
)
|
||||
|
||||
(RPAQ? DEFAULT-XCCS-CHARSETS '(LATIN SYMBOLS1 SYMBOLS2 EXTENDED-LATIN FORMS SYMBOLS3 SYMBOLS4
|
||||
ACCENTED-LATIN GREEK))
|
||||
(RPAQ? DEFAULT-XCCS-CHARSETS '(LATIN JAPANESE-SYMBOLS1 JAPANESE-SYMBOLS2 EXTENDED-LATIN FORMS
|
||||
SYMBOLS1 SYMBOLS2 ACCENTED-LATIN1 GREEK))
|
||||
|
||||
(RPAQ? DEFAULT-XCCS-JAPANESE-CHARSETS '(HIRAGANA KATAKANA JIS))
|
||||
(DECLARE%: DONTEVAL@LOAD DOCOPY
|
||||
|
||||
(MAKE-UNICODE-TRANSLATION-TABLES (READ-UNICODE-MAPPING DEFAULT-XCCS-CHARSETS T)
|
||||
@@ -1005,6 +1033,63 @@
|
||||
)
|
||||
(DEFINEQ
|
||||
|
||||
(UTF-8.VALIDATE
|
||||
[LAMBDA (STREAM BYTE1) (* ; "Edited 28-Dec-2023 11:57 by rmk")
|
||||
(* ; "Edited 6-Aug-2021 16:02 by rmk:")
|
||||
(* ; "Edited 6-Aug-2020 17:13 by rmk:")
|
||||
|
||||
(* ;; "Returns the codesize if the bytes starting at STREAM's current position form a valid UTF-8 sequence.")
|
||||
|
||||
(* ;; "If BYTE1 is provided, it is interpreted as the just-read header byte with the stream is positioned just after it.")
|
||||
|
||||
(* ;; "Test for smallp because the stream's End-of-file operation may suppress the error--otherwise an error will happen if the streams runs out of necessary bytes.")
|
||||
|
||||
(* ;; "For valid sequences, returns the same value as NUTF8CODEBYTES, but this reads/validates the rest of the bytes. On a non-NILreturn the stream is positioned before the header byte of the next putative code. The stream position is uncertain on a NIL return.")
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(CL:UNLESS BYTE1
|
||||
(SETQ BYTE1 (\BIN STREAM)))
|
||||
(PROG (BYTE2 BYTE3 BYTE4)
|
||||
|
||||
(* ;; "Distinguish on the header byte BYTE1.")
|
||||
|
||||
(CL:WHEN (SMALLP BYTE1)
|
||||
(IF (ILESSP BYTE1 128)
|
||||
THEN (RETURN 1)
|
||||
ELSEIF (IGEQ BYTE1 (LLSH 15 4))
|
||||
THEN (* ; "4 bytes")
|
||||
(SETQ BYTE2 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE2))
|
||||
(ILESSP BYTE2 128))
|
||||
(RETURN))
|
||||
(SETQ BYTE3 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE3))
|
||||
(ILESSP BYTE3 128))
|
||||
(RETURN))
|
||||
(SETQ BYTE4 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE4))
|
||||
(ILESSP BYTE4 128))
|
||||
(RETURN))
|
||||
(RETURN 4)
|
||||
ELSEIF (IGEQ BYTE1 (LLSH 7 5))
|
||||
THEN (* ; "3 bytes")
|
||||
(SETQ BYTE2 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE2))
|
||||
(ILESSP BYTE2 128))
|
||||
(RETURN))
|
||||
(SETQ BYTE3 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE3))
|
||||
(ILESSP BYTE3 128))
|
||||
(RETURN))
|
||||
(RETURN 3)
|
||||
ELSE (* ; " 2 bytes")
|
||||
(SETQ BYTE2 (\BIN STREAM))
|
||||
(CL:WHEN (OR (NOT (SMALLP BYTE2))
|
||||
(ILESSP BYTE2 128))
|
||||
(RETURN NIL))
|
||||
(RETURN 2)))])
|
||||
|
||||
(HEXSTRING
|
||||
[LAMBDA (N WIDTH) (* ; "Edited 23-Jul-2020 08:28 by rmk:")
|
||||
(* ; "Edited 20-Dec-93 17:51 by rmk:")
|
||||
@@ -1068,23 +1153,24 @@
|
||||
ELSE (ERROR "CHARCODE too big for UTF8" CHARCODE])
|
||||
|
||||
(NUTF8CODEBYTES
|
||||
[LAMBDA (N) (* ; "Edited 28-Jun-2022 00:02 by rmk")
|
||||
[LAMBDA (BYTE) (* ; "Edited 8-Jan-2024 10:57 by rmk")
|
||||
(* ; "Edited 28-Jun-2022 00:02 by rmk")
|
||||
(* ; "Edited 10-Aug-2020 12:35 by rmk:")
|
||||
|
||||
(* ;; "Returns the number of bytes needed to encode N in UTF8, ")
|
||||
(* ;; "Returns the number of bytes needed to encode in UTF8 a number headed by BYTE. ")
|
||||
|
||||
(IF (ILESSP N 128)
|
||||
(IF (ILESSP BYTE 128)
|
||||
THEN 1
|
||||
ELSEIF (ILESSP N 2048)
|
||||
ELSEIF (ILESSP BYTE 2048)
|
||||
THEN (* ; "x800")
|
||||
2
|
||||
ELSEIF (ILESSP N 65536)
|
||||
ELSEIF (ILESSP BYTE 65536)
|
||||
THEN (* ; "x10000")
|
||||
3
|
||||
ELSEIF (ILESSP N 2097152)
|
||||
ELSEIF (ILESSP BYTE 2097152)
|
||||
THEN (* ; "x200000")
|
||||
4
|
||||
ELSE (SHOULDNT])
|
||||
ELSE (ERROR "INVALID UTF-8 HEADER BYTE"])
|
||||
|
||||
(NUTF8STRINGBYTES
|
||||
[LAMBDA (STRING RAWFLG) (* ; "Edited 10-Aug-2020 09:06 by rmk:")
|
||||
@@ -1160,6 +1246,44 @@
|
||||
)
|
||||
(DEFINEQ
|
||||
|
||||
(\UTF8.FETCHCODE
|
||||
[LAMBDA (CODESIZE BUFFER BYTEOFFSET) (* ; "Edited 28-Dec-2023 13:32 by rmk")
|
||||
(* ; "Edited 6-Aug-2021 16:02 by rmk:")
|
||||
(* ; "Edited 6-Aug-2020 17:13 by rmk:")
|
||||
|
||||
(* ;; "Decodes a UTF8 byte sequence of size CODESIZE in BUFFER starting at BYTEOFFSET.")
|
||||
|
||||
(* ;; "The validity of the thesize, buffer, and offset are guaranteed by the caller.")
|
||||
|
||||
(LET ((BYTE1 (\GETBASEBYTE BUFFER BYTEOFFSET))
|
||||
BYTE2 BYTE3 BYTE4)
|
||||
(SELECTQ CODESIZE
|
||||
(2 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
|
||||
(LOGOR (LLSH (LOADBYTE BYTE1 0 5)
|
||||
6)
|
||||
(LOADBYTE BYTE2 0 6)))
|
||||
(3 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
|
||||
(SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
|
||||
(LOGOR (LLSH (LOADBYTE BYTE1 0 4)
|
||||
12)
|
||||
(LLSH (LOADBYTE BYTE2 0 6)
|
||||
6)
|
||||
(LOADBYTE BYTE3 0 6)))
|
||||
(4 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
|
||||
(SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
|
||||
(SETQ BYTE4 (\UTF8.GETBASEBYTE BUFFER (IPLUS 3 BYTEOFFSET)))
|
||||
(LOGOR (LLSH (LOADBYTE BYTE1 0 3)
|
||||
18)
|
||||
(LLSH (LOADBYTE BYTE2 0 6)
|
||||
12)
|
||||
(LLSH (LOADBYTE BYTE3 0 6)
|
||||
6)
|
||||
(LOADBYTE BYTE4 0 6)))
|
||||
(1 BYTE1)
|
||||
(SHOULDNT])
|
||||
)
|
||||
(DEFINEQ
|
||||
|
||||
(SHOWCHARS
|
||||
[LAMBDA (FROMCHAR TOCHAR FONT) (* ; "Edited 1-Aug-2020 09:27 by rmk:")
|
||||
(RESETFORM (DSPFONT (OR FONT '(CLASSIC 12))
|
||||
@@ -1215,15 +1339,16 @@
|
||||
)
|
||||
)
|
||||
(DECLARE%: DONTCOPY
|
||||
(FILEMAP (NIL (3717 17808 (UTF8.OUTCHARFN 3727 . 6558) (UTF8.INCCODEFN 6560 . 12050) (UTF8.PEEKCCODEFN
|
||||
12052 . 16826) (\UTF8.BACKCCODEFN 16828 . 17806)) (17809 21590 (UTF16BE.OUTCHARFN 17819 . 18643) (
|
||||
UTF16BE.INCCODEFN 18645 . 19544) (UTF16BE.PEEKCCODEFN 19546 . 20617) (\UTF16BE.BACKCCODEFN 20619 .
|
||||
21588)) (21620 23681 (MAKE-UNICODE-FORMATS 21630 . 23679)) (23778 25084 (UNICODE.UNMAPPED 23788 .
|
||||
25082)) (25085 25621 (XCCS-UTF8-AFTER-OPEN 25095 . 25619)) (26454 26803 (XTOUCODE 26464 . 26632) (
|
||||
UTOXCODE 26634 . 26801)) (26843 42965 (READ-UNICODE-MAPPING-FILENAMES 26853 . 27954) (
|
||||
READ-UNICODE-MAPPING 27956 . 31254) (WRITE-UNICODE-MAPPING 31256 . 35473) (WRITE-UNICODE-INCLUDED
|
||||
35475 . 40197) (WRITE-UNICODE-MAPPING-HEADER 40199 . 41431) (WRITE-UNICODE-MAPPING-FILENAME 41433 .
|
||||
42963)) (46178 54657 (MAKE-UNICODE-TRANSLATION-TABLES 46188 . 54655)) (55074 63100 (HEXSTRING 55084 .
|
||||
56245) (UTF8HEXSTRING 56247 . 58452) (NUTF8CODEBYTES 58454 . 59239) (NUTF8STRINGBYTES 59241 . 59722) (
|
||||
XTOUSTRING 59724 . 62735) (XCCSSTRING 62737 . 63098)) (63101 64570 (SHOWCHARS 63111 . 64568)))))
|
||||
(FILEMAP (NIL (3950 18041 (UTF8.OUTCHARFN 3960 . 6791) (UTF8.INCCODEFN 6793 . 12283) (UTF8.PEEKCCODEFN
|
||||
12285 . 17059) (\UTF8.BACKCCODEFN 17061 . 18039)) (18042 21823 (UTF16BE.OUTCHARFN 18052 . 18876) (
|
||||
UTF16BE.INCCODEFN 18878 . 19777) (UTF16BE.PEEKCCODEFN 19779 . 20850) (\UTF16BE.BACKCCODEFN 20852 .
|
||||
21821)) (21853 24134 (MAKE-UNICODE-FORMATS 21863 . 24132)) (24231 25537 (UNICODE.UNMAPPED 24241 .
|
||||
25535)) (25538 26214 (XCCS-UTF8-AFTER-OPEN 25548 . 26212)) (27670 28019 (XTOUCODE 27680 . 27848) (
|
||||
UTOXCODE 27850 . 28017)) (28059 44757 (READ-UNICODE-MAPPING-FILENAMES 28069 . 30519) (
|
||||
READ-UNICODE-MAPPING 30521 . 33497) (WRITE-UNICODE-MAPPING 33499 . 37249) (WRITE-UNICODE-INCLUDED
|
||||
37251 . 41973) (WRITE-UNICODE-MAPPING-HEADER 41975 . 43223) (WRITE-UNICODE-MAPPING-FILENAME 43225 .
|
||||
44755)) (48071 56550 (MAKE-UNICODE-TRANSLATION-TABLES 48081 . 56548)) (57055 68253 (UTF-8.VALIDATE
|
||||
57065 . 60067) (HEXSTRING 60069 . 61230) (UTF8HEXSTRING 61232 . 63437) (NUTF8CODEBYTES 63439 . 64392)
|
||||
(NUTF8STRINGBYTES 64394 . 64875) (XTOUSTRING 64877 . 67888) (XCCSSTRING 67890 . 68251)) (68254 70058 (
|
||||
\UTF8.FETCHCODE 68264 . 70056)) (70059 71528 (SHOWCHARS 70069 . 71526)))))
|
||||
STOP
|
||||
|
||||
Binary file not shown.
BIN
library/UNICODE.TEDIT
Normal file
BIN
library/UNICODE.TEDIT
Normal file
Binary file not shown.
Reference in New Issue
Block a user