(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)

(FILECREATED "26-Jan-2024 14:19:50" {LIB}UNICODE.;4 72688  

      :EDIT-BY "mth"

      :CHANGES-TO (FNS MAKE-UNICODE-FORMATS MAKE-UNICODE-TRANSLATION-TABLES SHOWCHARS 
                       READ-UNICODE-MAPPING-FILENAMES)
                  (VARS UNICODECOMS)

      :PREVIOUS-DATE " 8-Jan-2024 10:58:06" {LIB}UNICODE.;1)


(PRETTYCOMPRINT UNICODECOMS)

(RPAQQ UNICODECOMS
       ((COMS 
              (* ;; "External formats")

              (FNS UTF8.OUTCHARFN UTF8.INCCODEFN UTF8.PEEKCCODEFN \UTF8.BACKCCODEFN)
              (FNS UTF16BE.OUTCHARFN UTF16BE.INCCODEFN UTF16BE.PEEKCCODEFN \UTF16BE.BACKCCODEFN)
              (INITVARS (EXTERNALEOL 'LF))
              (FNS MAKE-UNICODE-FORMATS)
              (P (MAKE-UNICODE-FORMATS EXTERNALEOL))
              (ADDVARS (*DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8)))
              (FNS UNICODE.UNMAPPED)
              (FNS XCCS-UTF8-AFTER-OPEN)
              (DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE \UTF8.GETBASEBYTE))
              (FNS XTOUCODE UTOXCODE))
        (COMS 
              (* ;; "Unicode mapping files")

              (FNS READ-UNICODE-MAPPING-FILENAMES READ-UNICODE-MAPPING WRITE-UNICODE-MAPPING 
                   WRITE-UNICODE-INCLUDED WRITE-UNICODE-MAPPING-HEADER WRITE-UNICODE-MAPPING-FILENAME
                   )
              (VARS XCCS-SET-NAMES)
              
              (* ;; "Automate dumping of a documentation prefix")

              [DECLARE%: EVAL@COMPILE DONTCOPY (CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" 
                                                                              :RADIX 16))
                                                      (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX
                                                                            16]
              (VARS UNICODE-MAPPING-HEADER)
              (INITVARS (UNICODEDIRECTORIES NIL)))
        (COMS 
              (* ;; "Set up translation tables for UTF8 and UTFBE external formats")

              (FNS MAKE-UNICODE-TRANSLATION-TABLES)
              [INITVARS (DEFAULT-XCCS-CHARSETS '(LATIN JAPANESE-SYMBOLS1 JAPANESE-SYMBOLS2 
                                                       EXTENDED-LATIN FORMS SYMBOLS1 SYMBOLS2 
                                                       ACCENTED-LATIN1 GREEK))
                     (DEFAULT-XCCS-JAPANESE-CHARSETS '(HIRAGANA KATAKANA JIS]
              [DECLARE%: DONTEVAL@LOAD DOCOPY (P (MAKE-UNICODE-TRANSLATION-TABLES (
                                                                                 READ-UNICODE-MAPPING
                                                                                   
                                                                                DEFAULT-XCCS-CHARSETS
                                                                                   T)
                                                        '*XCCSTOUNICODE*
                                                        '*UNICODETOXCCS*]
              (GLOBALVARS *XCCSTOUNICODE* *UNICODETOXCCS*))
        (FNS UTF-8.VALIDATE HEXSTRING UTF8HEXSTRING NUTF8CODEBYTES NUTF8STRINGBYTES XTOUSTRING 
             XCCSSTRING)
        (FNS \UTF8.FETCHCODE)
        (FNS SHOWCHARS)
        [DECLARE%: EVAL@COMPILE DONTCOPY (FILES (FROM LOADUPS)
                                                EXPORTS.ALL)
               
               (* ;; "These control the layout of the translation tables.  Since many of the upper panels are sparse, doing it per-panel (128) seems more space-efficient, and residual alists can be shorter")

               (CONSTANTS (TRANSLATION-SEGMENT-SIZE 128)
                      (MAX-ALIST-LENGTH 10)
                      (N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE))
                      (TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE)))
                      (TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE]
        (PROP (FILETYPE)
              UNICODE)))



(* ;; "External formats")

(DEFINEQ

(UTF8.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                         (* ; "Edited  8-Aug-2021 13:02 by rmk:")
                                                            (* ; "Edited 17-Aug-2020 08:45 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "Perhaps the translation table should already do the mapping for EOL to LF, but that seems to be a separate property of the stream. Also, CRLF=2 bytes.")

    (* ;; "Print UTF8 sequence for CHARCODE.  Do not do XCCS to Unicode translation if RAW.")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
              (\BOUTEOL STREAM)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                      (IPLUS16 1 DATUM))
            (FOR C INSIDE (CL:IF RAW
                                      CHARCODE
                                      (UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*))
               DO (IF (ILESSP C 128)
                          THEN (\BOUT STREAM C)
                        ELSEIF (ILESSP C 2048)
                          THEN                           (* ; "x800")
                                (\BOUT STREAM (LOGOR (LLSH 3 6)
                                                     (LRSH C 6)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 0 6)))
                        ELSEIF (ILESSP C 65536)
                          THEN                           (* ; "x10000")
                                (\BOUT STREAM (LOGOR (LLSH 7 5)
                                                     (LRSH C 12)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 6 6)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 0 6)))
                        ELSEIF (ILESSP C 2097152)
                          THEN                           (* ; "x200000")
                                (\BOUT STREAM (LOGOR (LLSH 15 4)
                                                     (LRSH C 18)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 12 6)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 6 6)))
                                (\BOUT STREAM (LOGOR (LLSH 2 6)
                                                     (LOADBYTE C 0 6)))
                        ELSE (ERROR "CHARCODE too big for UTF8" C])

(UTF8.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                           (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Do not do UNICODE to XCSS translation if RAW.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (BYTE1 BYTE2 BYTE3 BYTE4 CODE (COUNT 1))
         (SETQ BYTE1 (\BIN STREAM))

         (* ;; "Distinguish on header bytes (modulo peculiar EOF behavior--the caller will get whatever ended up in BYTE1")

         (CL:WHEN (SMALLP BYTE1)
             [SETQ CODE (IF (ILESSP BYTE1 128)
                            THEN 

                                  (* ;; 
                         "Test first:  Ascii is the common case.  EOL requires its own translation")

                                  (SELCHARQ BYTE1
                                       (CR (SELECTC (FETCH (STREAM EOLCONVENTION) OF STREAM)
                                               (CR.EOLC      (* ; "Also eq BYTE1")
                                                        (CHARCODE EOL))
                                               (CRLF.EOLC (IF (EQ (CHARCODE LF)
                                                                      (\PEEKBIN STREAM T))
                                                              THEN (\BIN STREAM)
                                                                    (CL:WHEN COUNTP (SETQ COUNT 2))
                                                                    (CHARCODE EOL)
                                                            ELSE BYTE1))
                                               BYTE1))
                                       (LF (CL:IF (EQ LF.EOLC (FETCH (STREAM EOLCONVENTION)
                                                                 OF STREAM))
                                               (CHARCODE EOL)
                                               BYTE1))
                                       BYTE1)
                          ELSEIF (IGEQ BYTE1 (LLSH 15 4))
                            THEN                         (* ; "4 bytes")
                                  (SETQ BYTE2 (\BIN STREAM))
                                  (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                               (ILESSP BYTE2 128))
                                      (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                  (SETQ BYTE3 (\BIN STREAM))
                                  (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                               (ILESSP BYTE3 128))
                                      (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                                  (SETQ BYTE4 (\BIN STREAM))
                                  (CL:WHEN (OR (NOT (SMALLP BYTE4))
                                               (ILESSP BYTE4 128))
                                      (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4)))
                                  (SETQ COUNT 4)
                                  (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                               18)
                                         (LLSH (LOADBYTE BYTE2 0 6)
                                               12)
                                         (LLSH (LOADBYTE BYTE3 0 6)
                                               6)
                                         (LOADBYTE BYTE4 0 6))
                          ELSEIF (IGEQ BYTE1 (LLSH 7 5))
                            THEN                         (* ; "3 bytes")
                                  (SETQ BYTE2 (\BIN STREAM))
                                  (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                               (ILESSP BYTE2 128))
                                      (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                  (SETQ BYTE3 (\BIN STREAM))
                                  (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                               (ILESSP BYTE3 128))
                                      (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                                  (SETQ COUNT 3)
                                  (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                               12)
                                         (LLSH (LOADBYTE BYTE2 0 6)
                                               6)
                                         (LOADBYTE BYTE3 0 6))
                          ELSE                           (* ; "Must be 2 bytes")
                                (SETQ COUNT 2)
                                (SETQ BYTE2 (\BIN STREAM))
                                (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                             (ILESSP BYTE2 128))
                                    (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                             6)
                                       (LOADBYTE BYTE2 0 6])
         (CL:UNLESS (OR RAW (NOT (SMALLP CODE)))
             (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
         (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
         CODE])

(UTF8.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                          (* ; "Edited 14-Jun-2021 22:53 by rmk:")

    (* ;; "Modeled this after \EUCPEEK on LLREAD.  In the multi-byte (non-ASCII) case, backs the file pointer to the beginning by the proper number of \BACKFILEPTRs, and returns a count of 0.  Returns NIL if NOERROR and either invalid UTF8 or end of file.")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to XCCS translation if RAW")

    (PROG (BYTE1 BYTE2 BYTE3 BYTE4 CODE)
          (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))

     (* ;; "Distinguish on header bytex")

          (CL:UNLESS BYTE1 (RETURN NIL))
          [IF (ILESSP BYTE1 128)
              THEN 

                    (* ;; 
                  "Test first:  Ascii is the common case.  No need to back up, since we peeked.")

                    (SETQ CODE BYTE1)
            ELSEIF (IGEQ BYTE1 (LLSH 15 4))
              THEN                                       (* ; "4 bytes")
                    (\BIN STREAM)
                    (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                    (IGEQ BYTE2 128))
                        (\BACKFILEPTR STREAM)
                        (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                        (RETURN CODE))
                    (\BIN STREAM)
                    (CL:UNLESS (AND (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))
                                    (IGEQ BYTE3 128))
                        (\BACKFILEPTR STREAM)
                        (\BACKFILEPTR STREAM)
                        (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                        (RETURN CODE))
                    (\BIN STREAM)
                    (SETQ BYTE4 (\PEEKBIN STREAM NOERROR))   (* ; 
                                                           "PEEK the last, no need to back it up")
                    (\BACKFILEPTR STREAM)
                    (\BACKFILEPTR STREAM)
                    (\BACKFILEPTR STREAM)
                    (IF (AND BYTE4 (IGEQ BYTE4 128))
                        THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                                         18)
                                                   (LLSH (LOADBYTE BYTE2 0 6)
                                                         12)
                                                   (LLSH (LOADBYTE BYTE3 0 6)
                                                         6)
                                                   (LOADBYTE BYTE4 0 6)))
                      ELSEIF NOERROR
                      ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4)))
            ELSEIF (IGEQ BYTE1 (LLSH 7 5))
              THEN                                       (* ; "3 bytes")
                    (\BIN STREAM)
                    (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                    (IGEQ BYTE2 128))
                        (\BACKFILEPTR STREAM)
                        (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                        (RETURN CODE))
                    (\BIN STREAM)
                    (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))
                    (\BACKFILEPTR STREAM)
                    (\BACKFILEPTR STREAM)
                    (IF (AND BYTE3 (IGEQ BYTE3 128))
                        THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                                         12)
                                                   (LLSH (LOADBYTE BYTE2 0 6)
                                                         6)
                                                   (LOADBYTE BYTE3 0 6)))
                      ELSEIF NOERROR
                      ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
            ELSE                                         (* ; "Must be 2 bytes")
                  (\BIN STREAM)
                  (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                  (\BACKFILEPTR STREAM)
                  (IF (AND BYTE2 (IGEQ BYTE2 128))
                      THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                                       6)
                                                 (LOADBYTE BYTE2 0 6)))
                    ELSEIF NOERROR
                    ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2]
          (CL:WHEN (AND CODE (NOT RAW))
              (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
          (RETURN CODE])

(\UTF8.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 19-Jul-2022 15:30 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:04 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE.  Presumably a little bit more efficient if we decoded the UTF8 bytes backwards and didn't do the peek, but probably not worth the complexity. ")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (BIND (C _ 0) WHILE (IF (\BACKFILEPTR STREAM)
                            THEN (ADD C -1)
                                 (EQ 2 (LRSH (\PEEKBIN STREAM)
                                             6))
                          ELSE (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
                               (RETURN NIL)) REPEATUNTIL (EQ C -4)
       FINALLY (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
             (RETURN (UTF8.PEEKCCODEFN STREAM NIL RAW])
)
(DEFINEQ

(UTF16BE.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                         (* ; "Edited  8-Aug-2021 13:09 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "PRINT UTF16 sequence for CHARCODE.  Do not do XCCS to UNICODE translation if RAW.")

    (* ;; "Not sure about EOL conversion if truly %"raw%"")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                      (IPLUS16 1 DATUM)))
    (FOR C INSIDE (CL:IF RAW
                              CHARCODE
                              (UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*))
       DO (\WOUT STREAM C])

(UTF16BE.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                           (* ; "Edited  6-Aug-2021 16:05 by rmk:")

    (* ;; 
  "Do not do UNICODE to XCCS translation if RAW.  Test for SMALLPin case of funky EOF behavior")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (CODE BYTE1 BYTE2 COUNT)
         (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
                      (SMALLP (SETQ BYTE2 (\BIN STREAM]
             THEN (SETQ COUNT 2)
                   (SETQ CODE (LOGOR (LLSH (\BIN STREAM)
                                           8)
                                     (\BIN STREAM)))
                   (CL:UNLESS RAW
                       (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)))
                   (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
                   CODE
           ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])

(UTF16BE.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                          (* ; "Edited 14-Jun-2021 22:58 by rmk:")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to XCCS translation if RAW")

    (LET (BYTE1 BYTE2 CODE)
         (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
         (IF BYTE1
             THEN (\BIN STREAM)
                   (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                   (\BACKFILEPTR STREAM)
                   (IF BYTE2
                       THEN (SETQ CODE (LOGOR (LLSH BYTE1 8)
                                                  BYTE2))
                             (CL:IF RAW
                                 CODE
                                 (UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
                     ELSEIF NOERROR
                       THEN NIL)
           ELSEIF NOERROR
             THEN NIL
           ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])

(\UTF16BE.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 19-Jul-2022 15:14 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:07 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")

    (* ;; "Common for big-ending and little-ending")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (CL:WHEN (\BACKFILEPTR STREAM)
        (LET (CODE (BYTE2 (\PEEKBIN STREAM)))
             (IF (\BACKFILEPTR STREAM)
                 THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
                      (SETQ CODE (LOGOR (LLSH BYTE2 8)
                                        (\PEEKBIN STREAM)))
                      (CL:IF RAW
                          CODE
                          (UNICODE.TRANSLATE CODE *UNICODETOXCCS*))
               ELSEIF COUNTP
                 THEN (SETQ *BYTECOUNTER* -1)
                      NIL)))])
)

(RPAQ? EXTERNALEOL 'LF)
(DEFINEQ

(MAKE-UNICODE-FORMATS
  [LAMBDA (EXTERNALEOL)                                      (* ; "Edited  8-Dec-2023 15:19 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:36 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:08 by rmk:")

    (* ;; "RAW formats do not do XCCS/Unicode translation, just deal with the byte encoding.")

    (* ;; "The EXTERNALEOL specifies the EOLCONVENTION of the stream, particularly to produce output files with the desired convention.  On input the macro \CHECKEOLC (LLREAD) coerces only that coding to the internal EOL, which is a mistake.")

    (MAKE-EXTERNALFORMAT :UTF-8 (FUNCTION UTF8.INCCODEFN)
           (FUNCTION UTF8.PEEKCCODEFN)
           (FUNCTION \UTF8.BACKCCODEFN)
           (FUNCTION UTF8.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-8-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                (UTF8.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF8.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF8.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF8.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE (FUNCTION UTF16BE.INCCODEFN)
           (FUNCTION UTF16BE.PEEKCCODEFN)
           (FUNCTION \UTF16BE.BACKCCODEFN)
           (FUNCTION UTF16BE.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                   (UTF16BE.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF16BE.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF16BE.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF16BE.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL])
)

(MAKE-UNICODE-FORMATS EXTERNALEOL)

(ADDTOVAR *DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8))
(DEFINEQ

(UNICODE.UNMAPPED
  [LAMBDA (CODE TRANSLATION-TABLE)                      (* ; "Edited 11-Aug-2020 20:23 by rmk:")

    (* ;; "This is the slow fall-out when UNICODE.TRANSLATE determines that CODED has no fast mapping in TRANSLATION-TABLE.")

    (* ;; "We return an existing entry in the hash array of the table.  If CODE has not previously been seen, we allocate a new code in the forward unmapped hasharray and put the inverse in the backward array.")

    (LET ((FORWARD (CL:SVREF TRANSLATION-TABLE N-TRANSLATION-SEGMENTS))
          INVERSE NEXTCODE)
         (IF (GETHASH CODE (CAR FORWARD))
           ELSEIF (AND (ILEQ CODE (CADDR FORWARD))
                           (IGEQ CODE (CADDDR FORWARD)))
             THEN (ERROR "UNMAPPED CODE IS EITHER XCCS-UNUSED OR UNICODE-PRIVATE" CODE)
           ELSE (SETQ INVERSE (CL:SVREF TRANSLATION-TABLE (ADD1 N-TRANSLATION-SEGMENTS)))
                 (SETQ NEXTCODE (ADD (CADR INVERSE)
                                       1))
                 (CL:WHEN (IGREATERP NEXTCODE (CADDR INVERSE))
                        (ERROR "EXHAUSTED RANGE FOR UNMAPPED CODES" CODE))
                 (PUTHASH CODE NEXTCODE (CAR FORWARD))
                 (PUTHASH NEXTCODE CODE (CAR INVERSE))
                 NEXTCODE])
)
(DEFINEQ

(XCCS-UTF8-AFTER-OPEN
  [LAMBDA (STREAM ACCESS PARAMETERS)                         (* ; "Edited  3-Jan-2024 10:27 by rmk")
                                                            (* ; "Edited 13-Aug-2020 11:54 by rmk:")

    (* ;; 
    "If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF-8. For development")

    (CL:WHEN (AND (STRPOS "XCCS-" (U-CASE (FULLNAME STREAM)))
                  [EQ 'TXT (U-CASE (FILENAMEFIELD (FULLNAME STREAM)
                                          'EXTENSION]
                  (NOT (ASSOC 'EXTERNALFORMAT PARAMETERS)))
        (STREAMPROP STREAM 'EXTERNALFORMAT :UTF-8))])
)
(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(PUTPROPS UNICODE.TRANSLATE MACRO [OPENLAMBDA (CODE TRANSLATION-TABLE)
                                    (LET [(X (CL:SVREF TRANSLATION-TABLE (LRSH CODE TRANSLATION-SHIFT
                                                                               ]
                                         (COND
                                            ((LISTP X)
                                             (OR (CDR (FASSOC (LOGAND CODE TRANSLATION-SHIFT)
                                                             X))
                                                 CODE))
                                            [(AND X (CL:SVREF X (LOGAND CODE TRANSLATION-MASK]
                                            (T (UNICODE.UNMAPPED CODE TRANSLATION-TABLE])

(PUTPROPS \UTF8.GETBASEBYTE MACRO ((BASE OFFSET ERROR?)      (* ; 
                        "Fetches the OFFSET'th byte from BASE, checking for UTF-8 validity if ERROR?")
                                   (IF ERROR?
                                       THEN (LET ((BYTE (\GETBASEBYTE BASE OFFSET)))
                                                 (CL:WHEN (ILESSP BYTE 128)
                                                        (ERROR "INVALID UTF8 BYTE" BYTE))
                                                 BYTE)
                                     ELSE (\GETBASEBYTE BASE OFFSET))))
)
)
(DEFINEQ

(XTOUCODE
  [LAMBDA (XCCSCODE)                                    (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE XCCSCODE *XCCSTOUNICODE*])

(UTOXCODE
  [LAMBDA (UNICODE)                                     (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE UNICODE *UNICODETOXCCS*])
)



(* ;; "Unicode mapping files")

(DEFINEQ

(READ-UNICODE-MAPPING-FILENAMES
  [LAMBDA (FILESPEC DIRS)                                    (* ; "Edited 26-Jan-2024 14:02 by mth")
                                                             (* ; "Edited  5-Jan-2024 17:24 by rmk")
                                                          (* ; "Edited  5-Aug-2020 15:59 by kaplan")
                                                            (* ; "Edited  4-Aug-2020 17:31 by rmk:")
    (DECLARE (USEDFREE UNICODEDIRECTORIES XCCS-SET-NAMES))
    (CL:UNLESS DIRS (SETQ DIRS UNICODEDIRECTORIES))
    (FOR F X CSI INSIDE FILESPEC JOIN 
                                      (* ;; 
            "Last case hopes to pick up tables that are gruped together in a subdirectory (e.g. JIS)")

                                      (OR (MKLIST (FINDFILE (PACKFILENAME 'BODY F 'EXTENSION
                                                                   'TXT)
                                                         T DIRS))
                                          (for D inside DIRS
                                             when (SETQ D (FILDIR (PACKFILENAME 'NAME
                                                                         (CONCAT "XCCS-*=" F)
                                                                         'EXTENSION
                                                                         'TXT
                                                                         'BODY D)))
                                             do (RETURN D))
                                          (AND [SETQ CSI (OR (SASSOC F XCCS-SET-NAMES)
                                                             (FIND N IN XCCS-SET-NAMES
                                                                SUCHTHAT (EQ F (CADR N]
                                               (MKLIST (FINDFILE (PACKFILENAME 'BODY
                                                                        (CONCAT 'XCCS- (CAR CSI)
                                                                               '=
                                                                               (CADR CSI))
                                                                        'EXTENSION
                                                                        'TXT)
                                                              T DIRS)))
                                          (for D inside DIRS
                                             when (DIRECTORYNAMEP (SETQ D (CONCAT D ">" F ">")))
                                             join (FILDIR (CONCAT D ">*.TXT;*"])

(READ-UNICODE-MAPPING
  [LAMBDA (FILESPEC NOPRINT NOERROR)                         (* ; "Edited  5-Jan-2024 12:26 by rmk")
                                                            (* ; "Edited  3-Jul-2021 13:37 by rmk:")

    (* ;; "Combines the char-mapping tables from FILES coded in the Uncode-CDROM format.  Comments prefixed by # and")

    (* ;; "               Column 1:  Input hex code in the format 0xXXXX")

    (* ;; "               Column 2:  Corresponding Unicode code-sequence in the format")

    (* ;; "                                            0xXXXX  ... 0xYYYY")

    (* ;; "               Column 3:  (after #) Character name in some mapping files, utf-8 character")

    (* ;; "                                     for XCCS mapping files")

    (* ;; "")

    (* ;; "Result is a list of (fromcode tocode1 ... tocoden) integer lists (almost always with only a single tocode")

    (FOR FILE [SEPBITTABLE _ (MAKEBITTABLE (CHARCODE (TAB SPACE] IN (READ-UNICODE-MAPPING-FILENAMES
                                                                     FILESPEC)
       JOIN (CL:WITH-OPEN-FILE (STREAM FILE :DIRECTION :INPUT :EXTERNAL-FORMAT :UTF-8-RAW)
                   (BIND LINE NAME CHARSET START
                      FIRST (CL:UNLESS (FILEPOS "Name:" STREAM NIL NIL NIL T)
                                (ERROR "NOT A UNICODE MAPPING FILE" (FULLNAME STREAM)))
                            (SETQ NAME (CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL)))
                            (SETQ CHARSET (CL:IF (FILEPOS "XCCS charset:" STREAM NIL NIL NIL T)
                                              (CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL))
                                              ""))
                            (CL:UNLESS NOPRINT               (* ; "Strip off XCCS in front of name")
                                (PRINTOUT T T CHARSET " " [SUBSTRING NAME (CONSTANT
                                                                           (ADD1 (NCHARS "XCCS"]
                                       T)) WHILE (SETQ LINE (CL:READ-LINE STREAM NIL NIL))
                      WHEN (SETQ START (STRPOSL SEPBITTABLE LINE 1 T))
                      UNLESS (EQ (CHARCODE %#)
                                 (NTHCHARCODE LINE START))
                      COLLECT (BIND END WHILE [SETQ END (OR (STRPOSL SEPBITTABLE LINE START)
                                                            (ADD1 (NCHARS LINE]
                                 COLLECT [CHARCODE.DECODE (SUBSTRING LINE START (SUB1 END)
                                                                 (CONSTANT (CONCAT]
                                 REPEATWHILE (AND (SETQ START (STRPOSL SEPBITTABLE LINE END T))
                                                  (NEQ (CHARCODE %#)
                                                       (NTHCHARCODE LINE START])

(WRITE-UNICODE-MAPPING
  [LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK)             (* ; "Edited  4-Jan-2024 22:44 by rmk")
                                                            (* ; "Edited 16-Aug-2020 16:56 by rmk:")

    (* ;; "Writes a symbol unicode mapping file.  Mapping is a list of (XCCS-code Unicode) pairs, which may contain codes in multiple character sets.")

    (* ;; "If FILE is NIL, it defaults to a name XCCS- followed by the octal character sets in the mapping, in the unicode/XEROX directory.")

    (* ;; "The output lines are of the form x0XXX<tab>x0UUUU<tab>#  Unicode-char")

    (* ;; 
    "If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.")

    (* ;; "Otherwise, all and only mappings included in thos charsets are included in a single output file--an implicit subset.")

    (IF (AND (EQ INCLUDECHARSETS T)
             (NULL FILE))
        THEN (IF MAPPING
                 THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F (WRITE-UNICODE-MAPPING MAPPING
                                                                       (CAR CSI)
                                                                       NIL T)) COLLECT F)
               ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T)
                    NIL)
      ELSE
      (LET
       (IMAPPING CSETINFO RANGES)
       (CL:MULTIPLE-VALUE-SETQ (IMAPPING CSETINFO RANGES)
              (WRITE-UNICODE-INCLUDED MAPPING INCLUDECHARSETS))
       (IF IMAPPING
           THEN (CL:WITH-OPEN-FILE
                 (STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES)
                        :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF-8-RAW)
                 (WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES)
                 (SORT IMAPPING T)
                 (FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING
                    DO (SETQ LEFTC (CAR M))
                       (SETQ FIRSTRIGHTC (CADR M))
                       (CL:UNLESS (EQ CSET (LRSH LEFTC 8))
                           (SETQ CSET (LRSH LEFTC 8))
                           (SETQ CSI (ASSOC CSET CSETINFO))
                           (PRINTOUT STREAM T "#  " .P2 (CADR CSI)
                                  " "
                                  (CADDR CSI)
                                  T))
                       (PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4)
                              %#
                              (FOR RIGHTC IN (CDR M) DO (PRINTOUT NIL "	" "0x" (HEXSTRING RIGHTC 4)))
                              "	#  "
                              (SELECTC FIRSTRIGHTC
                                  (UNDEFINEDCODE 
                                                 (* ;; "FFFF")

                                                 "UNDEFINED")
                                  (MISSINGCODE 
                                               (* ;; "FFFE")

                                               "MISSING")
                                  (IF (ILESSP FIRSTRIGHTC 32)
                                      THEN                   (* ; "Control chars")
                                           [CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC (CHARCODE @]
                                    ELSE (CHARACTER FIRSTRIGHTC)))
                              T))
                 (FULLNAME STREAM))
         ELSEIF (NOT EMPTYOK)
           THEN (PRINTOUT T "THERE ARE NO MAPPINGS")
                (CL:WHEN INCLUDECHARSETS
                    (PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS)
                           T))
                NIL])

(WRITE-UNICODE-INCLUDED
  [LAMBDA (MAPPING INCLUDECHARSETS)                     (* ; "Edited  4-Aug-2020 17:47 by rmk:")

    (* ;; "CSETINFO is a list of (num string name) for each included character set.")

    (LET (CHARSETS CSETINFO RANGES ICSETS IMAPPING)

         (* ;; "Normalize the INCLUDECHARSETS, then reduce MAPPING to the included mappings")

         [SETQ ICSETS (FOR C POS KNOWN INSIDE (OR INCLUDECHARSETS (FOR CSI IN 
                                                                                       XCCS-SET-NAMES
                                                                             COLLECT (CAR CSI)))
                         JOIN [SETQ KNOWN (OR (SASSOC C XCCS-SET-NAMES)
                                                  (FIND N IN XCCS-SET-NAMES
                                                     SUCHTHAT (EQ C (CADR N)))
                                                  (HELP "UNKNOWN CHARACTER SET" (OCTALSTRING C]
                               (IF (SETQ POS (STRPOS "-" (CAR KNOWN)))
                                   THEN (FOR I FROM (CL:PARSE-INTEGER (SUBSTRING
                                                                                   (CAR KNOWN)
                                                                                   1
                                                                                   (SUB1 POS))
                                                                       :RADIX 8)
                                               TO (CL:PARSE-INTEGER (SUBSTRING (CAR KNOWN)
                                                                               (ADD1 POS))
                                                             :RADIX 8)
                                               COLLECT (LIST I (OCTALSTRING I)
                                                                 (CADR KNOWN)))
                                 ELSE (CONS (CONS (CL:PARSE-INTEGER (CAR KNOWN)
                                                             :RADIX 8)
                                                      KNOWN]
         (SETQ IMAPPING (FOR M CSI IN MAPPING WHEN (SETQ CSI (ASSOC (LRSH (CAR M)
                                                                                      8)
                                                                                ICSETS))
                           COLLECT 

                                 (* ;; "The attested subset of INCLUDED")

                                 (CL:UNLESS (MEMB CSI CSETINFO)
                                        (PUSH CSETINFO CSI))
                                 M))

         (* ;; "Sort as numbers, not octal strings, then group into consecutive ranges")

         (SETQ CSETINFO (SORT CSETINFO T))
         [SETQ RANGES (FOR CTAIL C START END ON (FOR CSI IN CSETINFO
                                                           COLLECT (CAR CSI)) WHILE CTAIL
                         COLLECT (SETQ START (CAR CTAIL))
                               (SETQ END START)
                               (CONS START (WHILE [AND (CDR CTAIL)
                                                           (EQ END (SUB1 (CADR CTAIL]
                                              COLLECT (SETQ CTAIL (CDR CTAIL))
                                                    (SETQ END (CAR CTAIL]

         (* ;; "Split out groups of less than 3.  But if a range exhaustively covers a known subset (like JIS), replace by the name")

         [SETQ RANGES (FOR R STR KNOWN LAST IN RANGES
                         JOIN (SETQ LAST (CAR (LAST R)))
                               (IF (EQ (CAR R)
                                           LAST)
                                   THEN (CONS (OCTALSTRING (CAR R)))
                                 ELSEIF (SETQ KNOWN (SASSOC (SETQ STR (CONCAT (OCTALSTRING
                                                                                   (CAR R))
                                                                                 "-"
                                                                                 (OCTALSTRING LAST)))
                                                               XCCS-SET-NAMES))
                                   THEN (CONS (CADR KNOWN))
                                 ELSEIF (CDDR R)
                                   THEN (CONS STR)
                                 ELSE (LIST (OCTALSTRING (CAR R))
                                                (OCTALSTRING LAST]
         (CL:VALUES IMAPPING CSETINFO RANGES])

(WRITE-UNICODE-MAPPING-HEADER
  [LAMBDA (STREAM CSETINFO RANGES)                           (* ; "Edited  5-Jan-2024 13:24 by rmk")
                                                            (* ; "Edited  4-Aug-2020 17:38 by rmk:")

    (* ;; "Writes the standard per-file header information")

    (FOR LINE IN UNICODE-MAPPING-HEADER
       DO (PRINTOUT STREAM "#" 2)
          (SELECTQ LINE
              (XCCSCHARACTERSETS 
                   (PRINTOUT STREAM "        XCCS charset")
                   (IF (CDR CSETINFO)
                       THEN (PRINTOUT STREAM "s:" -4)
                            (FOR R IN RANGES DO (PRINTOUT STREAM R " "))
                     ELSE                                    (* ; "Singleton")
                          (PRINTOUT STREAM ": " -4 (CADAR CSETINFO)
                                 " "
                                 (CADDAR CSETINFO)))
                   (TERPRI STREAM))
              (DATE (PRINTOUT STREAM "        Date:" -13 (DATE (DATEFORMAT NO.TIME NO.LEADING.SPACES)
                                                               )
                           T))
              (PRINTOUT STREAM LINE T)))
    (TERPRI STREAM])

(WRITE-UNICODE-MAPPING-FILENAME
  [LAMBDA (FILE CSETINFO RANGES)                        (* ; "Edited  4-Aug-2020 19:34 by rmk:")
    (PACKFILENAME 'BODY [OR FILE (CONCATLIST
                                  (CONS 'XCCS- (IF (CDR CSETINFO)
                                                   THEN (FOR RTAIL R ON RANGES
                                                               JOIN (SETQ R (CAR RTAIL))
                                                                     (SETQ R
                                                                      (CL:IF (LISTP R)
                                                                          (LIST (CAR R)
                                                                                "-"
                                                                                (CDR R))
                                                                          (CONS R)))
                                                                     (CL:IF (CDR RTAIL)
                                                                            (NCONC1 R ","))
                                                                     R)
                                                 ELSE (LIST (CADAR CSETINFO)
                                                                "="
                                                                (CADDAR CSETINFO]
           'DIRECTORY
           (CAR UNICODEDIRECTORIES)
           'EXTENSION
           'TXT])
)

(RPAQQ XCCS-SET-NAMES
       (("0" LATIN)
        ("41" JAPANESE-SYMBOLS1)
        ("42" JAPANESE-SYMBOLS2)
        ("43" EXTENDED-LATIN)
        ("44" HIRAGANA)
        ("45" KATAKANA)
        ("46" GREEK)
        ("47" CYRILLIC)
        ("50" FORMS)
        ("60-166" JIS)
        ("340" ARABIC)
        ("341" HEBREW)
        ("342" IPA)
        ("343" HANGUL)
        ("344" GEORGIAN-ARMENIAN)
        ("345" DEVANAGRI)
        ("346" BENGALI)
        ("347" GURMUKHI)
        ("350" THAI-LAO)
        ("356" SYMBOLS2)
        ("357" SYMBOLS1)
        ("360" LIGATURES)
        ("361" ACCENTED-LATIN1)
        ("365" MORE-ARABIC)
        ("375" GRAPHIC-VARIANTS)))



(* ;; "Automate dumping of a documentation prefix")

(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(RPAQ MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))

(RPAQ UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16))


(CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))
       (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16)))
)
)

(RPAQQ UNICODE-MAPPING-HEADER
       ("" "        Name:             XCCS (Version 2.0) to Unicode" "        Unicode version:  3.0"
           XCCSCHARACTERSETS "        Table version:    0.1" "        Table format:     Format A" 
           DATE "        Author:           Ron Kaplan <Ron.Kaplan@post.harvard.edu>" "" 
           "This file contains mappings from the Xerox Character Code Standard (version" 
           "2.0, 1990) into Unicode 3.0. standard codes.  That is an extension of the" 
           "version of XCCS corresponding to the fonts in the Medley system." "" 
           "The format of this file conforms to the format of the other Unicode-supplied" 
           "mapping files:" "   Three white-space (tab or spaces) separated columns:" 
           "     Column 1 is the XCCS code (as hex 0xXXXX)" 
           "     Column 2 is the corresponding Unicode (as hex 0xXXXX)" 
           "     Column 3 (after #) is a comment column. For convenience, it contains the" 
           "        Unicode character itself and the Unicode character names when available." 
           "Unicode FFFF is used for undefined XCCS codes (Column 3 = UNDEFINED" 
           "Unicode FFFE is used for XCCS codes that have not yet been filled in." 
           "(Column 3 = MISSING)" "" "This file is encoded in UTF-8, so that the Unicode characters"
           "are properly displayed in Column 3 and can be edited by standard" 
           "Unicode-enabled editors (e.g. Mac Textedit)." "" 
           "This file can also be read by the function" 
           "READ-UNICODE-MAPPING in the UNICODE Medley library package." "" 
           "The entries are in XCCS order and grouped by character sets.  In front of" 
           "the mappings, for convenience, there is a line with the octal XCCS" 
           "character set, after #." "" 
           "Note that a given XCCS code might map to codes in several different Unicode" 
           "positions, since there are repetitions in the Unicode standard." "" 
           "For more details, see the associated README.TXT file." "" 
           "Any comments or problems, contact <ron.kaplan@post.harvard.edu>"))

(RPAQ? UNICODEDIRECTORIES NIL)



(* ;; "Set up translation tables for UTF8 and UTFBE external formats")

(DEFINEQ

(MAKE-UNICODE-TRANSLATION-TABLES
  [LAMBDA (MAPPING LTORVAR RTOLVAR)                         (* ; "Edited 21-Aug-2021 13:12 by rmk:")
                                                            (* ; "Edited 17-Aug-2020 08:46 by rmk:")

    (* ;; "MAPPING is the list of numeric code correspondence pairs constructed by applying READ-UNICODE-MAPPING to a Unicode mapping file.")

    (* ;; "This produces two recoding arrays, one maps left-side codes into right-side codes (e.g. XCCS or ISO8859-1 to Unicode), for printing, the other maps right-side (Unicode) codes to corresponding right-side codes (e.g. XCCS).")

    (* ;; "")

    (* ;; "We assume that the left-to-right mapping into Unicode is functional, so that each left code maps to a unique right (Unicode) code, because Unicode is presumably the most refined coding scheme.  But several Unicode codes may map to the same left code, for logically different codes that happen to have the same glyphs. In that case the heuristic is to map each %"from%" code to the lowest of the possible %"to%" codes. This means that round-trip reading/writing or writing/reading from one or both starting points may not always be lossless.")

    (* ;; " ")

    (* ;; " Each recoding array has 256 elements, one for each possible high-order byte of a character code.  An array entry is either NIL, a 256-array of codes indexed by low-order bytes, or an alist of (lower-order-bytes . codes).  The latter is used to save space for sparsely populated character sets.")

    (* ;; "")

    (* ;; "The element 256 of each array contains a hash table for characters that might be encountered in XCCS memory or Unicode files for which there is no mapping.  Element 257 contains the corresponding inverse unmapped hash-array, so that UNICODE.TRANSLATE can update them consistently.")

    (* ;; "")

    (* ;; "UNICODE.TRANSLATE assigns an unmapped Unicode character to a %"not used%" XCCS code position (from 5,0 to 40,FF, leaving other low not-used sets for other internal uses (TEDIT?).")

    (* ;; "")

    (* ;; 
    "An unmapped XCCS character is assigned a code in the %"private use%" code blocks 0xE000-F8FF")

    (* ;; "")

    (* ;; "For the convenience of not having to deal with the multiple values, if LTORVAR or RTOLVAR are given, they are set to the constructed arrays before return.")

    (* ;; "")

    (LET ((LTORARRAY (CL:MAKE-ARRAY (IPLUS 2 N-TRANSLATION-SEGMENTS)
                            :INITIAL-ELEMENT NIL))
          (RTOLARRAY (CL:MAKE-ARRAY (IPLUS 2 N-TRANSLATION-SEGMENTS)
                            :INITIAL-ELEMENT NIL)))

         (* ;; "The left-to-right direction (into Unicode).    We start by distributing the mappings into alists in arrays indexed by the higher-order (charaset set byte).  The second loop converts long alists into arrays.")

         [FOR M LEFTC RBASE RCODES IN MAPPING EACHTIME (SETQ RCODES (CDR M))
                                                    (SETQ RBASE (CAR RCODES))
            UNLESS (IGEQ RBASE MISSINGCODE) DO (SETQ LEFTC (CAR M)) 

                                               (* ;; "(CDR RCODES) contains combiners on the base")

                                               (CL:PUSH (CONS (LOGAND LEFTC TRANSLATION-MASK)
                                                              (CL:IF (CDR RCODES)
                                                                  RCODES
                                                                  RBASE))
                                                      (CL:SVREF LTORARRAY (LRSH LEFTC 
                                                                                TRANSLATION-SHIFT]
         (FOR I CSA FROM 0 TO (SUB1 N-TRANSLATION-SEGMENTS)
            WHEN (IGREATERP (LENGTH (CL:SVREF LTORARRAY I))
                        MAX-ALIST-LENGTH) DO 
                                             (* ;; "Leave it alone if the alist is short")

                                             (SETQ CSA (CL:MAKE-ARRAY TRANSLATION-SEGMENT-SIZE 
                                                              :INITIAL-ELEMENT NIL))
                                             (FOR P IN (CL:SVREF LTORARRAY I)
                                                DO (CL:SETF (CL:SVREF CSA (LOGAND (CAR P)
                                                                                 TRANSLATION-MASK))
                                                          (CDR P)))
                                             (CL:SETF (CL:SVREF LTORARRAY I)
                                                    CSA))

         (* ;; "")

         (* ;; "Now the right-to-left direction (from Unicode). Here we have to detect and compensate for ambiguity.")

         (FOR M LEFTC RBASE RCOMBINERS PREV IN MAPPING EACHTIME (SETQ RBASE (CADR M))
                                                             (SETQ RCOMBINERS (CDDR M))
            UNLESS (OR (IGEQ RBASE MISSINGCODE)
                       RCOMBINERS) DO 
                                      (* ;; 
                                      "Have we already seen an explicit mapping from right to left?")

                                      (SETQ LEFTC (CAR M))
                                      [SETQ PREV (ASSOC (LOGAND RBASE TRANSLATION-MASK)
                                                        (CL:SVREF RTOLARRAY (LRSH RBASE 
                                                                                  TRANSLATION-SHIFT]
                                      (IF (NULL PREV)
                                          THEN (CL:PUSH (CONS (LOGAND RBASE TRANSLATION-MASK)
                                                              LEFTC)
                                                      (CL:SVREF RTOLARRAY (LRSH RBASE 
                                                                                TRANSLATION-SHIFT)))
                                        ELSEIF (IGREATERP (CDR PREV)
                                                      LEFTC)
                                          THEN (RPLACD PREV LEFTC)))
         (FOR I CSA FROM 0 TO (SUB1 N-TRANSLATION-SEGMENTS)
            WHEN (IGREATERP (LENGTH (CL:SVREF RTOLARRAY I))
                        MAX-ALIST-LENGTH) DO 
                                             (* ;; "Long list, make an array")

                                             (SETQ CSA (CL:MAKE-ARRAY TRANSLATION-SEGMENT-SIZE 
                                                              :INITIAL-ELEMENT NIL))
                                             (FOR P IN (CL:SVREF RTOLARRAY I)
                                                DO (CL:SETF (CL:SVREF CSA (LOGAND (CAR P)
                                                                                 TRANSLATION-MASK))
                                                          (CDR P)))
                                             (CL:SETF (CL:SVREF RTOLARRAY I)
                                                    CSA))

         (* ;; "")

         (* ;; "Allocate the hash arrays for future out-of-map codes.  We we have to keep track of the next available and last possible codes, as well as the first available, for error checking.")

         (CL:SETF (CL:SVREF LTORARRAY N-TRANSLATION-SEGMENTS)
                (LIST (HASHARRAY 10)
                      (CHARCODE.DECODE "5,0")
                      (CHARCODE.DECODE "40,0")
                      (CHARCODE.DECODE "5,0")))
         (CL:SETF (CL:SVREF RTOLARRAY N-TRANSLATION-SEGMENTS)
                (LIST (HASHARRAY 10)
                      (CHARCODE.DECODE "U+E000")
                      (CHARCODE.DECODE "U+F8FF")
                      (CHARCODE.DECODE "U+E000")))

         (* ;; "Now put in the inverse unmapped hash arrays")

         (CL:SETF (CL:SVREF LTORARRAY (ADD1 N-TRANSLATION-SEGMENTS))
                (CL:SVREF RTOLARRAY N-TRANSLATION-SEGMENTS))
         (CL:SETF (CL:SVREF RTOLARRAY (ADD1 N-TRANSLATION-SEGMENTS))
                (CL:SVREF LTORARRAY N-TRANSLATION-SEGMENTS))

         (* ;; "")

         (CL:WHEN LTORVAR (SETATOMVAL LTORVAR LTORARRAY))
         (CL:WHEN RTOLVAR (SETATOMVAL RTOLVAR RTOLARRAY))
         (LIST LTORARRAY RTOLARRAY])
)

(RPAQ? DEFAULT-XCCS-CHARSETS '(LATIN JAPANESE-SYMBOLS1 JAPANESE-SYMBOLS2 EXTENDED-LATIN FORMS 
                                     SYMBOLS1 SYMBOLS2 ACCENTED-LATIN1 GREEK))

(RPAQ? DEFAULT-XCCS-JAPANESE-CHARSETS '(HIRAGANA KATAKANA JIS))
(DECLARE%: DONTEVAL@LOAD DOCOPY 

(MAKE-UNICODE-TRANSLATION-TABLES (READ-UNICODE-MAPPING DEFAULT-XCCS-CHARSETS T)
       '*XCCSTOUNICODE*
       '*UNICODETOXCCS*)
)
(DECLARE%: DOEVAL@COMPILE DONTCOPY

(GLOBALVARS *XCCSTOUNICODE* *UNICODETOXCCS*)
)
(DEFINEQ

(UTF-8.VALIDATE
  [LAMBDA (STREAM BYTE1)                                     (* ; "Edited 28-Dec-2023 11:57 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Returns the codesize if the bytes starting at STREAM's current position form a valid UTF-8 sequence.")

    (* ;; "If BYTE1 is provided, it is interpreted as the just-read header byte with the stream is positioned just after it.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error--otherwise an error will happen if the streams runs out of necessary bytes.")

    (* ;; "For valid sequences, returns the same value as NUTF8CODEBYTES, but this reads/validates the rest of the bytes. On a non-NILreturn the  stream is positioned before the header byte of the next putative code. The stream position is uncertain on a NIL return.")

    (* ;; "")

    (CL:UNLESS BYTE1
        (SETQ BYTE1 (\BIN STREAM)))
    (PROG (BYTE2 BYTE3 BYTE4)

     (* ;; "Distinguish on the header byte BYTE1.")

          (CL:WHEN (SMALLP BYTE1)
              (IF (ILESSP BYTE1 128)
                  THEN (RETURN 1)
                ELSEIF (IGEQ BYTE1 (LLSH 15 4))
                  THEN                                       (* ; "4 bytes")
                       (SETQ BYTE2 (\BIN STREAM))
                       (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                    (ILESSP BYTE2 128))
                              (RETURN))
                       (SETQ BYTE3 (\BIN STREAM))
                       (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                    (ILESSP BYTE3 128))
                              (RETURN))
                       (SETQ BYTE4 (\BIN STREAM))
                       (CL:WHEN (OR (NOT (SMALLP BYTE4))
                                    (ILESSP BYTE4 128))
                              (RETURN))
                       (RETURN 4)
                ELSEIF (IGEQ BYTE1 (LLSH 7 5))
                  THEN                                       (* ; "3 bytes")
                       (SETQ BYTE2 (\BIN STREAM))
                       (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                    (ILESSP BYTE2 128))
                              (RETURN))
                       (SETQ BYTE3 (\BIN STREAM))
                       (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                    (ILESSP BYTE3 128))
                              (RETURN))
                       (RETURN 3)
                ELSE                                         (* ; " 2 bytes")
                     (SETQ BYTE2 (\BIN STREAM))
                     (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                  (ILESSP BYTE2 128))
                            (RETURN NIL))
                     (RETURN 2)))])

(HEXSTRING
  [LAMBDA (N WIDTH)                                     (* ; "Edited 23-Jul-2020 08:28 by rmk:")
                                                             (* ; "Edited 20-Dec-93 17:51 by rmk:")

    (* ;; 
  "Converts positive numbers to Hex strings, padding on the right with 0 up to WIDTH if given.")

    (CL:UNLESS (FIXP N)
        (SETQ N (CHARCODE.DECODE N)))
    (LET [CHAR (STR (ALLOCSTRING [IMAX (OR WIDTH 0)
                                       (FOR I (LEFT _ N) FROM 0 UNTIL (EQ LEFT 0)
                                          DO (SETQ LEFT (LRSH LEFT 4))
                                          FINALLY (RETURN (MAX I 1]
                           (CHARCODE 0]
         (FOR I FROM -1 BY -1 UNTIL (EQ N 0)
            DO (SETQ CHAR (LOGAND N 15))
                  [RPLCHARCODE STR I (IF (ILESSP CHAR 10)
                                         THEN (+ CHAR (CHARCODE 0))
                                       ELSE (+ (- CHAR 10)
                                                   (CHARCODE A]
                  (SETQ N (LRSH N 4)))
         STR])

(UTF8HEXSTRING
  [LAMBDA (CHARCODE)                                    (* ; "Edited 10-Aug-2020 08:33 by rmk:")

    (* ;; "Utility to produces the UTF8 hexstring representing CODE")

    (HEXSTRING (IF (ILESSP CHARCODE 128)
                       THEN CHARCODE
                     ELSEIF (ILESSP CHARCODE 2048)
                       THEN                              (* ; "x800")
                             (LOGOR (LLSH (LOGOR (LLSH 3 6)
                                                 (LRSH CHARCODE 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 65536)
                       THEN                              (* ; "x10000")
                             (LOGOR (LLSH (LOGOR (LLSH 7 5)
                                                 (LRSH CHARCODE 12))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 2097152)
                       THEN                              (* ; "x200000")
                             (LOGOR (LLSH (LOGOR (LLSH 15 4)
                                                 (LRSH CHARCODE 18))
                                          24)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 12 6))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSE (ERROR "CHARCODE too big for UTF8" CHARCODE])

(NUTF8CODEBYTES
  [LAMBDA (BYTE)                                             (* ; "Edited  8-Jan-2024 10:57 by rmk")
                                                             (* ; "Edited 28-Jun-2022 00:02 by rmk")
                                                            (* ; "Edited 10-Aug-2020 12:35 by rmk:")

    (* ;; "Returns the number of bytes needed to encode in UTF8 a number headed by BYTE. ")

    (IF (ILESSP BYTE 128)
        THEN 1
      ELSEIF (ILESSP BYTE 2048)
        THEN                                                 (* ; "x800")
             2
      ELSEIF (ILESSP BYTE 65536)
        THEN                                                 (* ; "x10000")
             3
      ELSEIF (ILESSP BYTE 2097152)
        THEN                                                 (* ; "x200000")
             4
      ELSE (ERROR "INVALID UTF-8 HEADER BYTE"])

(NUTF8STRINGBYTES
  [LAMBDA (STRING RAWFLG)                               (* ; "Edited 10-Aug-2020 09:06 by rmk:")

    (* ;; "Returns the number of bytes it would take to represent STRING in UTF8, assuming it is an XCCS string unless RAWFLG. ")

    (FOR I C FROM 1 WHILE (SETQ C (NTHCHARCODE STRING I))
       SUM (NUTF8CODEBYTES (CL:IF RAWFLG
                                       C
                                       (XTOUCODE C))])

(XTOUSTRING
  [LAMBDA (XCCSSTRING RAWFLG)                       (* ; "Edited 10-Aug-2020 21:42 by rmk:")

    (* ;; "Produces a string that contains the UTF8 bytes that represent the characters in XCCSSTRING.  Applies the XCCSTOUNICODE translation unless RAWFLG.  ")

    (* ;; "The resulting string will not be readable inside Medley.")

    (LET [(USTR (ALLOCSTRING (NUTF8STRINGBYTES XCCSSTRING RAWFLG]
         (FOR I CHARCODE (SINDEX _ 0) FROM 1 WHILE (SETQ CHARCODE (NTHCHARCODE XCCSSTRING
                                                                                     I))
            DO (CL:UNLESS RAWFLG
                       (SETQ CHARCODE (XTOUCODE CHARCODE)))
                  (IF (ILESSP CHARCODE 128)
                      THEN (RPLCHARCODE USTR (ADD SINDEX 1)
                                      CHARCODE)
                    ELSEIF (ILESSP CHARCODE 2048)
                      THEN                               (* ; "x800")
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 3 6)
                                          (LRSH CHARCODE 6)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 0 6)))
                    ELSEIF (ILESSP CHARCODE 65536)
                      THEN                               (* ; "x10000")
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 7 5)
                                          (LRSH CHARCODE 12)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 6 6)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 0 6)))
                    ELSEIF (ILESSP CHARCODE 2097152)
                      THEN                               (* ; "x200000")
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 15 4)
                                          (LRSH CHARCODE 18)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 12 6)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 6 6)))
                            (RPLCHARCODE USTR (ADD SINDEX 1)
                                   (LOGOR (LLSH 2 6)
                                          (LOADBYTE CHARCODE 0 6)))
                    ELSE (SHOULDNT)))
         USTR])

(XCCSSTRING
  [LAMBDA (CODE)                                        (* ; "Edited 13-Aug-2020 12:16 by rmk:")

    (* ;; "Returns XCCS character representation of string %"cset,char%"")

    (CL:UNLESS (FIXP CODE)
        (SETQ CODE (CHCON1 CODE)))
    (CONCAT (OCTALSTRING (LRSH CODE 8))
           ","
           (OCTALSTRING (LOGAND CODE 255])
)
(DEFINEQ

(\UTF8.FETCHCODE
  [LAMBDA (CODESIZE BUFFER BYTEOFFSET)                       (* ; "Edited 28-Dec-2023 13:32 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Decodes a UTF8 byte sequence of size CODESIZE in BUFFER starting at BYTEOFFSET.")

    (* ;; "The validity of the thesize, buffer, and offset are guaranteed by the caller.")

    (LET ((BYTE1 (\GETBASEBYTE BUFFER BYTEOFFSET))
          BYTE2 BYTE3 BYTE4)
         (SELECTQ CODESIZE
             (2 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                             6)
                       (LOADBYTE BYTE2 0 6)))
             (3 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                             12)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             6)
                       (LOADBYTE BYTE3 0 6)))
             (4 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (SETQ BYTE4 (\UTF8.GETBASEBYTE BUFFER (IPLUS 3 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                             18)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             12)
                       (LLSH (LOADBYTE BYTE3 0 6)
                             6)
                       (LOADBYTE BYTE4 0 6)))
             (1 BYTE1)
             (SHOULDNT])
)
(DEFINEQ

(SHOWCHARS
  [LAMBDA (FROMCHAR TOCHAR FONT)                             (* ; "Edited 26-Jan-2024 14:18 by mth")
                                                            (* ; "Edited  1-Aug-2020 09:27 by rmk:")
    (RESETFORM (DSPFONT (OR FONT '(CLASSIC 12))
                      T)
           (CL:WHEN (AND (SMALLP FROMCHAR)
                         (NOT TOCHAR))

               (* ;; 
      "If a small number, assume it's an octal (in decimal) character set, no need for string quotes")

               (SETQ TOCHAR (CONCAT FROMCHAR "," 376))
               (SETQ FROMCHAR (CONCAT FROMCHAR "," 41)))
           (CL:UNLESS (SMALLP FROMCHAR)
               (SETQ FROMCHAR (CHARCODE.DECODE FROMCHAR)))
           (CL:UNLESS (SMALLP TOCHAR)
               (SETQ TOCHAR (CL:IF TOCHAR
                                (CHARCODE.DECODE TOCHAR)
                                FROMCHAR)))
           (for C from FROMCHAR to TOCHAR unless (AND (IGEQ (LOGAND C 255)
                                                            127)
                                                      (ILEQ (LOGAND C 255)
                                                            (PLUS 128 33)))
              do (PRINTOUT T .P2 (CONCAT (OCTALSTRING (LRSH C 8))
                                        ","
                                        (OCTALSTRING (LOGAND C 255)))
                        10
                        (CHARACTER C)
                        T])
)
(DECLARE%: EVAL@COMPILE DONTCOPY 

(FILESLOAD (FROM LOADUPS)
       EXPORTS.ALL)

(DECLARE%: EVAL@COMPILE 

(RPAQQ TRANSLATION-SEGMENT-SIZE 128)

(RPAQQ MAX-ALIST-LENGTH 10)

(RPAQ N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE))

(RPAQ TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE)))

(RPAQ TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE))


(CONSTANTS (TRANSLATION-SEGMENT-SIZE 128)
       (MAX-ALIST-LENGTH 10)
       (N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE))
       (TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE)))
       (TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE)))
)
)

(PUTPROPS UNICODE FILETYPE :TCOMPL)
(DECLARE%: DONTCOPY
  (FILEMAP (NIL (4111 18202 (UTF8.OUTCHARFN 4121 . 6952) (UTF8.INCCODEFN 6954 . 12444) (UTF8.PEEKCCODEFN
 12446 . 17220) (\UTF8.BACKCCODEFN 17222 . 18200)) (18203 21984 (UTF16BE.OUTCHARFN 18213 . 19037) (
UTF16BE.INCCODEFN 19039 . 19938) (UTF16BE.PEEKCCODEFN 19940 . 21011) (\UTF16BE.BACKCCODEFN 21013 . 
21982)) (22014 24295 (MAKE-UNICODE-FORMATS 22024 . 24293)) (24392 25698 (UNICODE.UNMAPPED 24402 . 
25696)) (25699 26375 (XCCS-UTF8-AFTER-OPEN 25709 . 26373)) (27831 28180 (XTOUCODE 27841 . 28009) (
UTOXCODE 28011 . 28178)) (28220 45174 (READ-UNICODE-MAPPING-FILENAMES 28230 . 30936) (
READ-UNICODE-MAPPING 30938 . 33914) (WRITE-UNICODE-MAPPING 33916 . 37666) (WRITE-UNICODE-INCLUDED 
37668 . 42390) (WRITE-UNICODE-MAPPING-HEADER 42392 . 43640) (WRITE-UNICODE-MAPPING-FILENAME 43642 . 
45172)) (48488 56912 (MAKE-UNICODE-TRANSLATION-TABLES 48498 . 56910)) (57417 68615 (UTF-8.VALIDATE 
57427 . 60429) (HEXSTRING 60431 . 61592) (UTF8HEXSTRING 61594 . 63799) (NUTF8CODEBYTES 63801 . 64754) 
(NUTF8STRINGBYTES 64756 . 65237) (XTOUSTRING 65239 . 68250) (XCCSSTRING 68252 . 68613)) (68616 70420 (
\UTF8.FETCHCODE 68626 . 70418)) (70421 71931 (SHOWCHARS 70431 . 71929)))))
STOP
