(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)

(FILECREATED " 9-Sep-2025 08:59:44" {WMEDLEY}<library>UNICODE.;171 111736 

      :EDIT-BY rmk

      :CHANGES-TO (FNS MTOUTF8STRING N-MCHARS UTF8TOMSTRING)
                  (VARS UNICODECOMS)

      :PREVIOUS-DATE " 7-Sep-2025 20:30:24" {WMEDLEY}<library>UNICODE.;170)


(PRETTYCOMPRINT UNICODECOMS)

(RPAQQ UNICODECOMS
       ((COMS                                                (* ; "External formats")
              (FNS UTF8.OUTCHARFN UTF8.SLUG.OUTCHARFN UTF8.INCCODEFN UTF8.PEEKCCODEFN 
                   \UTF8.BACKCCODEFN)
              (FNS UTF16BE.OUTCHARFN UTF16BE.INCCODEFN UTF16BE.PEEKCCODEFN \UTF16BE.BACKCCODEFN)
              (FNS UTF16LE.OUTCHARFN UTF16LE.INCCODEFN UTF16LE.PEEKCCODEFN \UTF16LE.BACKCCODEFN)
              (FNS READBOM WRITEBOM)
              (INITVARS (EXTERNALEOL 'LF))
              (FNS MAKE-UNICODE-FORMATS)
              (P (MAKE-UNICODE-FORMATS EXTERNALEOL))
              (ADDVARS (*DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8)))
              (FNS UTF8.BINCODE \UTF8.FETCHCODE)
              (FNS UTF8.VALIDATE NUTF8-BYTE1-BYTES NUTF8-CODE-BYTES NUTF8-STRING-BYTES N-MCHARS)
              (DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE \UTF8.GETBASEBYTE))
              (FNS MTOUCODE UTOMCODE MTOUCODE? UTOMCODE? MTOUSTRING UTOMSTRING MTOUTF8STRING 
                   UTF8TOMSTRING)
              (FNS XTOUCODE UTOXCODE XTOUCODE? UTOXCODE? XTOUSTRING UTOXSTRING XTOUTF8STRING))
        
        (* ;; "")

        (COMS                                                (* ; "Read Unicode mapping files")
              (INITVARS (UNICODEDIRECTORIES NIL))
              (VARS XCCS-CHARSETS)
              (FNS READ-UNICODE-MAPPING-FILENAMES READ-UNICODE-MAPPING))
        [COMS                                                (* ; 
                                                  "Make translation tables for  UTF external formats")
              (FNS MAKE-UNICODE-TRANSLATION-TABLES MERGE-UNICODE-TRANSLATION-TABLES UNICODE.UNMAPPED
                   UNICODE-EXTEND-TRANSLATION?)
              (FNS ALL-UNICODE-MAPPINGS XCCSJAPANESECHARSETS)
              (INITVARS (*MCCSTOUNICODE*)
                     (*UNICODETOMCCS*)
                     (*MCCS-LOADED-CHARSETS*)
                     (*UNICODE-LOADED-CHARSETS*))
              (GLOBALVARS *MCCSTOUNICODE* *UNICODETOMCCS* *NEXT-PRIVATE-UNICODE* 
                     *NEXT-PRIVATE-MCCSCODE* *MCCS-LOADED-CHARSETS* *UNICODE-LOADED-CHARSETS*)
              (DECLARE%: EVAL@COMPILE DONTCOPY 

                     (* ;; "There are 6400 private Unicodes in 25 256-code charsets. For XCCS we map to a contiguous region of unused/reserved--private isn't big enough.")

                     (CONSTANTS (FIRST-PRIVATE-UNICODE (HEXNUM? "E000"))
                            (LAST-PRIVATE-UNICODE (HEXNUM? "F8FF"))
                            (FIRST-PRIVATE-MCCSCODE (CHARCODE "200,0"))
                            (LAST-PRIVATE-MCCSCODE (CHARCODE "230,377")))
                     (MACROS TRUECODEP))
              (DECLARE%: DONTEVAL@LOAD DOCOPY (P (MAKE-UNICODE-TRANSLATION-TABLES 'ALL]
        
        (* ;; "")

        (COMS                                                (* ; "Write Unicode mapping files")
              (FNS WRITE-UNICODE-MAPPING WRITE-UNICODE-INCLUDED WRITE-UNICODE-MAPPING-HEADER 
                   WRITE-UNICODE-MAPPING-FILENAME)
              (FNS XCCS-UTF8-AFTER-OPEN)
              
              (* ;; "Automate dumping of a documentation prefix")

              [DECLARE%: EVAL@COMPILE DONTCOPY (CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" 
                                                                              :RADIX 16))
                                                      (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX
                                                                            16]
              (VARS UNICODE-MAPPING-HEADER))
        (FNS UTF8HEXSTRING)
        (COMS                                                (* ; "debugging")
              (FNS SHOWCHARS)
              (DECLARE%: DOEVAL@LOAD DONTCOPY (MACROS HEXCHAR OCTALCHAR)))
        (DECLARE%: EVAL@COMPILE DONTCOPY (FILES (FROM LOADUPS)
                                                EXPORTS.ALL))
        (PROP (FILETYPE)
              UNICODE)))



(* ; "External formats")

(DEFINEQ

(UTF8.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:43 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:45 by rmk")
                                                             (* ; "Edited 31-Jan-2024 00:32 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:02 by rmk:")
                                                            (* ; "Edited 17-Aug-2020 08:45 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "Perhaps the translation table should already do the mapping for EOL to LF, but that seems to be a separate property of the stream. Also, CRLF=2 bytes.")

    (* ;; "Print UTF8 sequence for CHARCODE.  Do not do MCCS to Unicode translation if RAW.")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
             (\BOUTEOL STREAM)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM))
           (FOR C INSIDE (CL:IF RAW
                             CHARCODE
                             (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*))
              DO (IF (ILESSP C 128)
                     THEN (\BOUT STREAM C)
                   ELSEIF (ILESSP C 2048)
                     THEN                                    (* ; "x800")
                          (\BOUT STREAM (LOGOR (LLSH 3 6)
                                               (LRSH C 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSEIF (ILESSP C 65536)
                     THEN                                    (* ; "x10000")
                          (\BOUT STREAM (LOGOR (LLSH 7 5)
                                               (LRSH C 12)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 6 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSEIF (ILESSP C 2097152)
                     THEN                                    (* ; "x200000")
                          (\BOUT STREAM (LOGOR (LLSH 15 4)
                                               (LRSH C 18)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 12 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 6 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSE (ERROR "CHARCODE too big for UTF8" C])

(UTF8.SLUG.OUTCHARFN
  [LAMBDA (STREAM CODE RAW)                                  (* ; "Edited 24-Apr-2025 15:43 by rmk")
                                                             (* ; "Edited 21-Jan-2025 18:37 by rmk")
                                                             (* ; "Edited 14-Jan-2025 12:39 by rmk")

    (* ;; "Produces Unicode Representative FFFD as a slug for MCCS unmapped characters")

    (UTF8.OUTCHARFN STREAM (OR (CL:IF RAW
                                   CODE
                                   (XTOUCODE? CODE))
                               (CONSTANT (HEXNUM? "FFFD")))
           T])

(UTF8.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited  2-Feb-2024 11:44 by rmk")
                                                             (* ; "Edited 30-Jan-2024 22:56 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Do not do UNICODE to MCSS translation if RAW.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (BYTE1 BYTE2 BYTE3 BYTE4 CODE (COUNT 1))
         (SETQ BYTE1 (\BIN STREAM))

         (* ;; "Distinguish on header bytes (modulo peculiar EOF behavior--the caller will get whatever ended up in BYTE1")

         (CL:WHEN (SMALLP BYTE1)
             [SETQ CODE (if (ILEQ BYTE1 127)
                            then 
                                 (* ;; 
                           "Test first:  Ascii is the common case.  EOL requires its own translation")

                                 (SELCHARQ BYTE1
                                      (CR (SELECTC (fetch (STREAM EOLCONVENTION) of STREAM)
                                              (CR.EOLC       (* ; "Also eq BYTE1")
                                                       (CHARCODE EOL))
                                              (CRLF.EOLC (if (EQ (CHARCODE LF)
                                                                 (\PEEKBIN STREAM T))
                                                             then (\BIN STREAM)
                                                                  (CL:WHEN COUNTP (SETQ COUNT 2))
                                                                  (CHARCODE EOL)
                                                           else BYTE1))
                                              BYTE1))
                                      (LF (CL:IF (EQ LF.EOLC (fetch (STREAM EOLCONVENTION)
                                                                of STREAM))
                                              (CHARCODE EOL)
                                              BYTE1))
                                      BYTE1)
                          elseif (ILEQ BYTE1 223)
                            then                             (* ; "2 bytes")
                                 (SETQ COUNT 2)
                                 (SETQ BYTE2 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                              (ILESSP BYTE2 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                 (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                              6)
                                        (LOADBYTE BYTE2 0 6))
                          elseif (ILEQ BYTE1 239)
                            then                             (* ; "3 bytes")
                                 (SETQ BYTE2 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                              (ILESSP BYTE2 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                 (SETQ BYTE3 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                              (ILESSP BYTE3 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                                 (SETQ COUNT 3)
                                 (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                              12)
                                        (LLSH (LOADBYTE BYTE2 0 6)
                                              6)
                                        (LOADBYTE BYTE3 0 6))
                          else                               (* ; "4 bytes")
                               (SETQ BYTE2 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                            (ILESSP BYTE2 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                               (SETQ BYTE3 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                            (ILESSP BYTE3 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                               (SETQ BYTE4 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE4))
                                            (ILESSP BYTE4 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4)))
                               (SETQ COUNT 4)
                               (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                            18)
                                      (LLSH (LOADBYTE BYTE2 0 6)
                                            12)
                                      (LLSH (LOADBYTE BYTE3 0 6)
                                            6)
                                      (LOADBYTE BYTE4 0 6])
         (CL:UNLESS (OR RAW (NOT (SMALLP CODE)))
             (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
         (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
         CODE])

(UTF8.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited  2-Feb-2024 11:48 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:53 by rmk:")

    (* ;; "Modeled this after \EUCPEEK on LLREAD.  In the multi-byte (non-ASCII) case, backs the file pointer to the beginning by the proper number of \BACKFILEPTRs, and returns a count of 0.  Returns NIL if NOERROR and either invalid UTF8 or end of file.")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (PROG (BYTE1 BYTE2 BYTE3 BYTE4 CODE)
          (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))

     (* ;; "Distinguish on header bytex")

          (CL:UNLESS BYTE1 (RETURN NIL))
          [if (ILEQ BYTE1 127)
              then 
                   (* ;; 
                   "Test first:  Ascii is the common case.  No need to back up, since we peeked.")

                   (SETQ CODE BYTE1)
            elseif [ILEQ BYTE1 223                           (* ; "2 bytes")
                         (BIN STREAM)
                         (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                         (\BACKFILEPTR STREAM)
                         (if (AND BYTE2 (IGEQ BYTE2 128))
                             then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                                          6)
                                                    (LOADBYTE BYTE2 0 6)))
                           elseif NOERROR
                           else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2]
            elseif (ILEQ BYTE1 239)
              then                                           (* ; "3 bytes")
                   (BIN STREAM)
                   (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                   (IGEQ BYTE2 128))
                       (\BACKFILEPTR STREAM)
                       (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                       (RETURN CODE))
                   (BIN STREAM)
                   (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))    (* ; 
                                                             "PEEK the last, no need to back it up")
                   (\BACKFILEPTR STREAM)
                   (\BACKFILEPTR STREAM)
                   (if (AND BYTE3 (IGEQ BYTE3 128))
                       then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                                    12)
                                              (LLSH (LOADBYTE BYTE2 0 6)
                                                    6)
                                              (LOADBYTE BYTE3 0 6)))
                     elseif NOERROR
                     else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
            else                                             (* ; "4 bytes")
                 (BIN STREAM)
                 (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                 (IGEQ BYTE2 128))
                     (\BACKFILEPTR STREAM)
                     (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                     (RETURN CODE))
                 (BIN STREAM)
                 (CL:UNLESS (AND (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))
                                 (IGEQ BYTE3 128))
                     (\BACKFILEPTR STREAM)
                     (\BACKFILEPTR STREAM)
                     (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                     (RETURN CODE))
                 (BIN STREAM)
                 (SETQ BYTE4 (\PEEKBIN STREAM NOERROR))
                 (\BACKFILEPTR STREAM)
                 (\BACKFILEPTR STREAM)
                 (\BACKFILEPTR STREAM)
                 (if (AND BYTE4 (IGEQ BYTE4 128))
                     then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                                  18)
                                            (LLSH (LOADBYTE BYTE2 0 6)
                                                  12)
                                            (LLSH (LOADBYTE BYTE3 0 6)
                                                  6)
                                            (LOADBYTE BYTE4 0 6)))
                   elseif NOERROR
                   else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4]
          (CL:WHEN (AND CODE (NOT RAW))
              (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
          (RETURN CODE])

(\UTF8.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 19-Jul-2022 15:30 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:04 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE.  Presumably a little bit more efficient if we decoded the UTF8 bytes backwards and didn't do the peek, but probably not worth the complexity. ")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (BIND (C _ 0) WHILE (IF (\BACKFILEPTR STREAM)
                            THEN (ADD C -1)
                                 (EQ 2 (LRSH (\PEEKBIN STREAM)
                                             6))
                          ELSE (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
                               (RETURN NIL)) REPEATUNTIL (EQ C -4)
       FINALLY (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
             (RETURN (UTF8.PEEKCCODEFN STREAM NIL RAW])
)
(DEFINEQ

(UTF16BE.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited 31-Jan-2024 00:32 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:09 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "PRINT UTF16 sequence for CHARCODE.  Do not do MCCS to UNICODE translation if RAW.")

    (* ;; "Not sure about EOL conversion if truly %"raw%"")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM)))
    (FOR C INSIDE (CL:IF RAW
                      CHARCODE
                      (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*)) DO (\WOUT STREAM C])

(UTF16BE.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:00 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:05 by rmk:")

    (* ;; 
    "Do not do UNICODE to MCCS translation if RAW.  Test for SMALLPin case of funky EOF behavior")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (CODE BYTE1 BYTE2 COUNT)
         (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
                  (SMALLP (SETQ BYTE2 (\BIN STREAM]
             THEN (SETQ COUNT 2)
                  (SETQ CODE (create WORD
                                    HIBYTE _ (\BIN STREAM)
                                    LOBYTE _ (\BIN STREAM)))
                  (CL:UNLESS RAW
                      (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
                  (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
                  CODE
           ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])

(UTF16BE.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:01 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:58 by rmk:")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (LET (BYTE1 BYTE2 CODE)
         (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
         (IF BYTE1
             THEN (\BIN STREAM)
                  (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                  (\BACKFILEPTR STREAM)
                  (IF BYTE2
                      THEN (SETQ CODE (create WORD
                                             HIBYTE _ BYTE1
                                             LOBYTE _ BYTE2))
                           (CL:IF RAW
                               CODE
                               (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
                    ELSEIF NOERROR
                      THEN NIL)
           ELSEIF NOERROR
             THEN NIL
           ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])

(\UTF16BE.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:02 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:14 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:07 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (CL:WHEN (\BACKFILEPTR STREAM)
        (LET (CODE (BYTE2 (\PEEKBIN STREAM)))
             (IF (\BACKFILEPTR STREAM)
                 THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
                      (SETQ CODE (create WORD
                                        HIBYTE _ (\PEEKBIN STREAM)
                                        LOBYTE _ BYTE2))
                      (CL:IF RAW
                          CODE
                          (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
               ELSEIF COUNTP
                 THEN (SETQ *BYTECOUNTER* -1)
                      NIL)))])
)
(DEFINEQ

(UTF16LE.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:58 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:09 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "PRINT UTF16 sequence for CHARCODE.  Do not do MCCS to UNICODE translation if RAW.")

    (* ;; "Not sure about EOL conversion if truly %"raw%"")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM)))
    (FOR C INSIDE (CL:IF RAW
                      CHARCODE
                      (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*))
       DO (BOUT STREAM (fetch LOBYTE of CHARCODE))
          (BOUT STREAM (fetch HIBYTE of CHARCODE])

(UTF16LE.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:03 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:05 by rmk:")

    (* ;; 
    "Do not do UNICODE to MCCS translation if RAW.  Test for SMALLPin case of funky EOF behavior")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (CODE BYTE1 BYTE2 COUNT)
         (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
                  (SMALLP (SETQ BYTE2 (\BIN STREAM]
             THEN (SETQ COUNT 2)
                  (SETQ CODE (create WORD
                                    LOBYTE _ (\BIN STREAM)
                                    HIBYTE _ (\BIN STREAM)))
                  (CL:UNLESS RAW
                      (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
                  (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
                  CODE
           ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])

(UTF16LE.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 24-Apr-2025 15:46 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:43 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:58 by rmk:")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (LET (BYTE1 BYTE2 CODE)
         (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
         (IF BYTE1
             THEN (\BIN STREAM)
                  (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                  (\BACKFILEPTR STREAM)
                  (IF BYTE2
                      THEN (SETQ CODE (LOGOR (LLSH BYTE2 8)
                                             BYTE1))
                           (CL:IF RAW
                               CODE
                               (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
                    ELSEIF NOERROR
                      THEN NIL)
           ELSEIF NOERROR
             THEN NIL
           ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])

(\UTF16LE.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:04 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:14 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:07 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (CL:WHEN (\BACKFILEPTR STREAM)
        (LET (CODE (BYTE2 (\PEEKBIN STREAM)))
             (IF (\BACKFILEPTR STREAM)
                 THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
                      (SETQ CODE (create WORD
                                        HIBYTE _ BYTE2
                                        LOBYTE _ (\PEEKBIN STREAM)))
                      (CL:IF RAW
                          CODE
                          (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
               ELSEIF COUNTP
                 THEN (SETQ *BYTECOUNTER* -1)
                      NIL)))])
)
(DEFINEQ

(READBOM
  [LAMBDA (STREAM COUNTP)                                    (* ; "Edited 17-Jan-2025 11:29 by rmk")
                                                             (* ; "Edited 11-Mar-2024 23:53 by rmk")
                                                             (* ; "Edited 10-Mar-2024 13:01 by rmk")

    (* ;; "If COUNTP, this must be under a generic \INCCODE that binds *BYTECOUNTER*")

    (* ;; "Reads and decodes the BOM bytes.  If BOM ispresent, the stream is left at the first following byte, otherwise the stream is reset to its position on entry (presumably 0).")

    (* ;; "I used the UNHEXTRING constants so that the hex bytes are visible in the code, maybe there's another function that does that?")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (SELECTC (\PEEKBIN STREAM T)
        ((HEXNUM? "EF") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "BB"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (if (EQ (CONSTANT (HEXNUM? "BF"))
                              (\PEEKBIN STREAM T))
                          then (BIN STREAM)
                               (CL:WHEN COUNTP (add *BYTECOUNTER* 3))
                               :UTF-8
                        else (\BACKFILEPTR STREAM))
               else (\BACKFILEPTR STREAM)))
        ((HEXNUM? "FE") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "FF"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (CL:WHEN COUNTP (add *BYTECOUNTER* 2))
                      :UTF-16BE
               else (\BACKFILEPTR STREAM)))
        ((HEXNUM? "FF") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "FE"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (CL:WHEN COUNTP (add *BYTECOUNTER* 2))
                      :UTF-16LE
               else (\BACKFILEPTR STREAM)))
        NIL])

(WRITEBOM
  [LAMBDA (STREAM FORMAT)                                    (* ; "Edited 17-Jan-2025 11:29 by rmk")
                                                             (* ; "Edited 16-Mar-2024 20:53 by rmk")
                                                             (* ; "Edited 11-Mar-2024 23:53 by rmk")
                                                             (* ; "Edited 10-Mar-2024 13:01 by rmk")

    (* ;; "Writes a BOM that represents FORMAT (:UTF-8, :UTF16-BE, :UTF16-LE")

    (SELECTQ FORMAT
        (:UTF-8 (BOUT STREAM (CONSTANT (HEXNUM? "EF")))
                (BOUT STREAM (CONSTANT (HEXNUM? "BB")))
                (BOUT STREAM (CONSTANT (HEXNUM? "BF"))))
        (:UTF-16BE (BOUT STREAM (CONSTANT (HEXNUM? "FE")))
                   (BOUT STREAM (CONSTANT (HEXNUM? "FF"))))
        (:UTF-16LE (BOUT STREAM (CONSTANT (HEXNUM? "FF")))
                   (BOUT STREAM (HEXNUM? "FE")))
        NIL])
)

(RPAQ? EXTERNALEOL 'LF)
(DEFINEQ

(MAKE-UNICODE-FORMATS
  [LAMBDA (EXTERNALEOL)                                      (* ; "Edited 17-Jan-2025 18:38 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:55 by rmk")
                                                             (* ; "Edited  8-Dec-2023 15:19 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:36 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:08 by rmk:")

    (* ;; "RAW formats do not do XCCS/Unicode translation, just deal with the byte encoding.")

    (* ;; "The EXTERNALEOL specifies the EOLCONVENTION of the stream, particularly to produce output files with the desired convention.  On input the macro \CHECKEOLC (LLREAD) coerces only that coding to the internal EOL, which is a mistake.")

    (MAKE-EXTERNALFORMAT :UTF-8 (FUNCTION UTF8.INCCODEFN)
           (FUNCTION UTF8.PEEKCCODEFN)
           (FUNCTION \UTF8.BACKCCODEFN)
           (FUNCTION UTF8.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-8-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                (UTF8.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF8.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF8.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF8.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE (FUNCTION UTF16BE.INCCODEFN)
           (FUNCTION UTF16BE.PEEKCCODEFN)
           (FUNCTION \UTF16BE.BACKCCODEFN)
           (FUNCTION UTF16BE.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                   (UTF16BE.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF16BE.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF16BE.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF16BE.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16LE (FUNCTION UTF16LE.INCCODEFN)
           (FUNCTION UTF16LE.PEEKCCODEFN)
           (FUNCTION \UTF16LE.BACKCCODEFN)
           (FUNCTION UTF16LE.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16LE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                   (UTF16LE.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF16LE.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF16LE.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF16LE.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (\INSTALL.EXTERNALFORMAT (create EXTERNALFORMAT using (FIND-FORMAT :UTF-8)
                                                          NAME _ :UTF-8-SLUG OUTCHARFN _
                                                          (FUNCTION UTF8.SLUG.OUTCHARFN])
)

(MAKE-UNICODE-FORMATS EXTERNALEOL)

(ADDTOVAR *DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8))
(DEFINEQ

(UTF8.BINCODE
  [LAMBDA (STREAM RAW)                                       (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited  4-Feb-2024 01:06 by rmk")
                                                             (* ; "Edited  1-Feb-2024 11:21 by rmk")
                                                             (* ; "Edited 28-Dec-2023 13:32 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Decodes a UTF8 character code by binning  from STREAM ")

    (* ;; "The validity of STREAM is guaranteed by the caller (presumably TEDIT), we aren't testing here for the validity of the trailing bytes.")

    (* ;; "This doesn't do EOL conversion or translation, unlike UTF8.INCCODEFN.")

    (LET ((BYTE1 (BIN STREAM))
          CODE)
         [SETQ CODE (if (ILEQ BYTE1 127)
                        then BYTE1
                      elseif (ILEQ BYTE1 223)
                        then                                 (* ; "2 bytes")
                             (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                          6)
                                    (LOADBYTE (BIN STREAM)
                                           0 6))
                      elseif (ILEQ BYTE1 239)
                        then                                 (* ; "3 bytes")
                             (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                          12)
                                    (LLSH (LOADBYTE (BIN STREAM)
                                                 0 6)
                                          6)
                                    (LOADBYTE (BIN STREAM)
                                           0 6))
                      else                                   (* ; "4 bytes")
                           (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                        18)
                                  (LLSH (LOADBYTE (BIN STREAM)
                                               0 6)
                                        12)
                                  (LLSH (LOADBYTE (BIN STREAM)
                                               0 6)
                                        6)
                                  (LOADBYTE (BIN STREAM)
                                         0 6]
         (CL:IF RAW
             CODE
             (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))])

(\UTF8.FETCHCODE
  [LAMBDA (CODESIZE BUFFER BYTEOFFSET)                       (* ; "Edited 28-Dec-2023 13:32 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Decodes a UTF8 byte sequence of size CODESIZE in BUFFER starting at BYTEOFFSET.")

    (* ;; "The validity of the thesize, buffer, and offset are guaranteed by the caller.")

    (LET ((BYTE1 (\GETBASEBYTE BUFFER BYTEOFFSET))
          BYTE2 BYTE3 BYTE4)
         (SELECTQ CODESIZE
             (2 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                             6)
                       (LOADBYTE BYTE2 0 6)))
             (3 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                             12)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             6)
                       (LOADBYTE BYTE3 0 6)))
             (4 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (SETQ BYTE4 (\UTF8.GETBASEBYTE BUFFER (IPLUS 3 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                             18)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             12)
                       (LLSH (LOADBYTE BYTE3 0 6)
                             6)
                       (LOADBYTE BYTE4 0 6)))
             (1 BYTE1)
             (SHOULDNT])
)
(DEFINEQ

(UTF8.VALIDATE
  [LAMBDA (STREAM BYTE)                                      (* ; "Edited  2-Feb-2024 12:03 by rmk")
                                                             (* ; "Edited 28-Dec-2023 11:57 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Returns the codesize if the bytes starting at STREAM's current position form a valid UTF-8 sequence.")

    (* ;; "If BYTE is provided, it is interpreted as the just-read header byte with the stream is positioned just after it.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error--otherwise an error will happen if the streams runs out of necessary bytes.")

    (* ;; "For valid sequences, returns the same value as UTF8-SIZE-FROM-BYTE1, but this reads/validates the rest of the bytes. On a non-NILreturn the  stream is positioned before the header byte of the next putative code. The stream position is uncertain on a NIL return.")

    (* ;; "")

    (* ;; "Distinguish on the header byte BYTE.  Not SMALLP presumably if ENDOFSTREAMOP did something unusual.")

    (CL:UNLESS BYTE
        (SETQ BYTE (BIN STREAM)))
    (CL:WHEN (SMALLP BYTE)
        (if (ILEQ BYTE 127)
            then 1
          elseif (ILEQ BYTE 223)
            then                                             (* ; " 2 bytes")
                 (CL:UNLESS (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                (ILESSP BYTE 128))
                        2)
          elseif (ILEQ BYTE 239)
            then                                             (* ; "3 bytes")
                 (CL:UNLESS (OR (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                    (ILESSP BYTE 128))
                                (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                    (ILESSP BYTE 128)))
                        3)
          else                                               (* ; "4 bytes")
               (CL:UNLESS (OR (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128))
                              (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128))
                              (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128)))
                      4)))])

(NUTF8-BYTE1-BYTES
  [LAMBDA (BYTE1)                                            (* ; "Edited  3-Feb-2024 15:00 by rmk")
                                                             (* ; "Edited  8-Jan-2024 10:57 by rmk")
                                                             (* ; "Edited 28-Jun-2022 00:02 by rmk")
                                                            (* ; "Edited 10-Aug-2020 12:35 by rmk:")

    (* ;; "Returns the number of bytes in a UTF8 code representation whose first byte is BYTEE1. ")

    (IF (ILEQ BYTE1 127)
        THEN 1
      ELSEIF (ILEQ BYTE1 223)
        THEN 2
      ELSEIF (ILEQ BYTE1 239)
        THEN 3
      ELSE 4])

(NUTF8-CODE-BYTES
  [LAMBDA (CODE)                                             (* ; "Edited  3-Feb-2024 14:42 by rmk")
                                                             (* ; "Edited  8-Jan-2024 10:57 by rmk")
                                                             (* ; "Edited 28-Jun-2022 00:02 by rmk")
                                                            (* ; "Edited 10-Aug-2020 12:35 by rmk:")

    (* ;; "Returns the number of bytes needed to encode in UTF8 a number headed by BYTE. ")

    (IF (ILESSP CODE 128)
        THEN 1
      ELSEIF (ILESSP CODE 2048)
        THEN                                                 (* ; "x800")
             2
      ELSEIF (ILESSP CODE 65536)
        THEN                                                 (* ; "x10000")
             3
      ELSEIF (ILESSP CODE 2097152)
        THEN                                                 (* ; "x200000")
             4
      ELSE (ERROR "INVALID UTF-8 CODE"])

(NUTF8-STRING-BYTES
  [LAMBDA (STRING RAW)                                       (* ; "Edited  2-Sep-2025 10:40 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:37 by rmk")
                                                             (* ; "Edited  3-Feb-2024 21:32 by rmk")
                                                            (* ; "Edited 10-Aug-2020 09:06 by rmk:")

    (* ;; "Returns the number of bytes it would take to represent STRING in UTF8, assuming it is an MCCS string unless RAWFLG. ")

    (for I C from 1 while (SETQ C (NTHCHARCODE STRING I)) sum (NUTF8-CODE-BYTES (CL:IF RAW
                                                                                    C
                                                                                    (MTOUCODE C))])

(N-MCHARS
  [LAMBDA (UTF8STRING)                                       (* ; "Edited  9-Sep-2025 08:35 by rmk")

    (* ;; "Returns the number of MCCS characters coded in UTF8STRING")

    (for I B from 1 while (SETQ B (NTHCHARCODE UTF8STRING I)) by (NUTF8-BYTE1-BYTES B) count T])
)
(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(PUTPROPS UNICODE.TRANSLATE MACRO [OPENLAMBDA (CODE TRANSLATION-TABLE DONTFAKE RETURNALL)

                                   (* ;; "If RETURNALL and there are alternatives in the RANG, the list is returned. Othewise just the first one if the fake flag allows ")

                                    (LET [(RANGE (OR (GETHASH CODE TRANSLATION-TABLE)
                                                     (UNICODE.UNMAPPED CODE TRANSLATION-TABLE 
                                                            DONTFAKE]
                                         (CL:WHEN RANGE
                                             (if (AND RETURNALL (CDR RANGE))
                                                 then RANGE
                                               else (SETQ RANGE (CAR RANGE))
                                                    (CL:IF DONTFAKE
                                                        (TRUECODEP RANGE TRANSLATION-TABLE)
                                                        RANGE)))])

(PUTPROPS \UTF8.GETBASEBYTE MACRO ((BASE OFFSET ERROR?)      (* ; 
                        "Fetches the OFFSET'th byte from BASE, checking for UTF-8 validity if ERROR?")
                                   (IF ERROR?
                                       THEN (LET ((BYTE (\GETBASEBYTE BASE OFFSET)))
                                                 (CL:WHEN (ILESSP BYTE 128)
                                                        (ERROR "INVALID UTF8 BYTE" BYTE))
                                                 BYTE)
                                     ELSE (\GETBASEBYTE BASE OFFSET))))
)
)
(DEFINEQ

(MTOUCODE
  [LAMBDA (MCODE)                                            (* ; "Edited  4-Sep-2025 15:10 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:19 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE MCODE *MCCSTOUNICODE*])

(UTOMCODE
  [LAMBDA (UNNICODE)                                         (* ; "Edited 24-Apr-2025 10:17 by rmk")
                                                             (* ; "Edited 16-Jan-2025 23:46 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE UNNICODE *UNICODETOMCCS*])

(MTOUCODE?
  [LAMBDA (MCODE)                                            (* ; "Edited  4-Sep-2025 15:09 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:18 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:38 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:44 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the Unix range-code(s) corresponding to MCODE if there are true mapppings, otherwise NIL.  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (UNICODE.TRANSLATE MCODE *MCCSTOUNICODE* T T])

(UTOMCODE?
  [LAMBDA (UNICODE)                                          (* ; "Edited 24-Apr-2025 10:18 by rmk")
                                                             (* ; "Edited 19-Jan-2025 21:14 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:46 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the MCCS range-code(s) corresponding to UNICODE if there are true mapppings, otherwise NIL.  ")

    (* ;; 
    " NOTE:  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (UNICODE.TRANSLATE UNICODE *UNICODETOMCCS* T T])

(MTOUSTRING
  [LAMBDA (MSTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:19 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:01 by rmk")

    (* ;; "Converts MCCS codes in MSTRING to Unicodes.")

    (for I MCODE (USTRING _ (CL:IF DESTRUCTIVE
                                MSTRING
                                (CONCAT MSTRING))) from 1 while (SETQ MCODE (NTHCHARCODE MSTRING I))
       do (RPLCHARCODE USTRING I (MTOUCODE MCODE)) finally (RETURN USTRING])

(UTOMSTRING
  [LAMBDA (USTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:18 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:00 by rmk")

    (* ;; "Converts Unicodes to MCCS codes in USTRING.")

    (for I UCODE (MSTRING _ (CL:IF DESTRUCTIVE
                                USTRING
                                (CONCAT USTRING))) from 1 while (SETQ UCODE (NTHCHARCODE USTRING I))
       do (RPLCHARCODE MSTRING I (UTOMCODE UCODE)) finally (RETURN MSTRING])

(MTOUTF8STRING
  [LAMBDA (MSTRING)                                          (* ; "Edited  9-Sep-2025 07:51 by rmk")
                                                             (* ; "Edited  4-Sep-2025 15:13 by rmk")
                                                             (* ; "Edited  2-Sep-2025 11:12 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:37 by rmk")
                                                             (* ; "Edited  3-Feb-2024 14:55 by rmk")
                                                            (* ; "Edited 10-Aug-2020 21:42 by rmk:")

    (* ;; 
    "Produces a string that contains the UTF8 bytes that represent the characters in MSTRING.  ")

    (* ;; "The resulting string will not be directly interpretable inside Medley.")

    (if (if (STRINGP MSTRING)
            then (OR (ffetch (STRINGP FATSTRINGP) of MSTRING)
                     (thereis C instring MSTRING suchthat (IGEQ C 128)))
          elseif (LITATOM MSTRING)
            then (OR (ffetch (LITATOM FATPNAMEP) of MSTRING)
                     (thereis C inatom MSTRING suchthat (IGEQ C 128)))
          else T)
        then (LET [(USTR (ALLOCSTRING (NUTF8-STRING-BYTES MSTRING]
                  (for I UCODE MCODE (SINDEX _ 0) from 1 while (SETQ MCODE (NTHCHARCODE MSTRING I))
                     do (SETQ UCODE (MTOUCODE MCODE))
                        (if (ILESSP UCODE 128)
                            then (RPLCHARCODE USTR (ADD SINDEX 1)
                                        UCODE)
                          elseif (ILESSP UCODE 2048)
                            then                             (* ; "x800")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 3 6)
                                               (LRSH UCODE 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          elseif (ILESSP UCODE 65536)
                            then                             (* ; "x10000")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 7 5)
                                               (LRSH UCODE 12)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 6 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          elseif (ILESSP UCODE 2097152)
                            then                             (* ; "x200000")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 15 4)
                                               (LRSH UCODE 18)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 12 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 6 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          else (SHOULDNT)))
                  USTR)
      else MSTRING])

(UTF8TOMSTRING
  [LAMBDA (UTF8STRING)                                       (* ; "Edited  9-Sep-2025 08:59 by rmk")
    (CL:UNLESS (OR (STRINGP UTF8STRING)
                   (LITATOM UTF8STRING))
        (SETQ UTF8STRING (MKSTRING UTF8STRING)))
    (CL:WHEN (ffetch (STRINGP FATSTRINGP) of UTF8STRING)
           (\ILLEGAL.ARG UTF8STRING))
    (LET* ((NMCHARS (N-MCHARS UTF8STRING))
           (MSTRING (ALLOCSTRING NMCHARS)))
          [for M NBYTES BYTE1 (BASE _ (ffetch (STRINGP BASE) of UTF8STRING)) from 1 to NMCHARS
             as OFFSET from (fetch (STRINGP OFFST) of MSTRING) by NBYTES
             do (SETQ BYTE1 (\GETBASEBYTE BASE OFFSET))
                (SETQ NBYTES (NUTF8-BYTE1-BYTES BYTE1))
                (RPLCHARCODE MSTRING M (UTOMCODE (\UTF8.FETCHCODE NBYTES BASE OFFSET]
          MSTRING])
)
(DEFINEQ

(XTOUCODE
  [LAMBDA (XCODE)                                            (* ; "Edited  4-Sep-2025 15:09 by rmk")
                                                             (* ; "Edited 24-May-2025 23:16 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:27 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE (XTOMCODE XCODE)
           *MCCSTOUNICODE*])

(UTOXCODE
  [LAMBDA (UNICODE)                                          (* ; "Edited 24-May-2025 23:17 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 16-Jan-2025 23:46 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (MTOXCODE (UNICODE.TRANSLATE UNICODE *UNICODETOMCCS*])

(XTOUCODE?
  [LAMBDA (XCCSCODE)                                         (* ; "Edited 24-May-2025 23:18 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:27 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:38 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:44 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the Unix range-code(s) corresponding to XCCSCODE if there are true mapppings, otherwise NIL.  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (UNICODE.TRANSLATE (XTOMCODE XCCSCODE)
           *MCCSTOUNICODE* T T])

(UTOXCODE?
  [LAMBDA (UNICODE)                                          (* ; "Edited 24-May-2025 23:19 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 19-Jan-2025 21:14 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:46 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the XCCS range-code(s) corresponding to UNICODE if there are true mapppings, otherwise NIL.  ")

    (* ;; 
    " NOTE:  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (MTOXCODE (UNICODE.TRANSLATE UNICODE *UNICODETOMCCS* T T])

(XTOUSTRING
  [LAMBDA (XSTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:00 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:01 by rmk")

    (* ;; "Converts XCCS codes in XSTRING to Unicodes.")

    (for I UCODE XCODE (USTRING _ (CL:IF DESTRUCTIVE
                                      XSTRING
                                      (CONCAT XSTRING))) from 1 while (SETQ XCODE (NTHCHARCODE 
                                                                                         XSTRING I))
       do (RPLCHARCODE USTRING I (XTOUCODE XCODE)) finally (RETURN USTRING])

(UTOXSTRING
  [LAMBDA (USTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 11:54 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:00 by rmk")

    (* ;; "Converts Unicodes in USTRING to XCCS codes.")

    (for I XCODE UCODE (XSTRING _ (CL:IF DESTRUCTIVE
                                      USTRING
                                      (CONCAT USTRING))) from 1 while (SETQ UCODE (NTHCHARCODE 
                                                                                         USTRING I))
       unless (EQ UCODE (SETQ XCODE (UTOXCODE UCODE))) do (RPLCHARCODE XSTRING I XCODE)
       finally (RETURN XSTRING])

(XTOUTF8STRING
  [LAMBDA (XSTRING)                                          (* ; "Edited  4-Sep-2025 18:37 by rmk")
                                                             (* ; "Edited  2-Sep-2025 11:37 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:53 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:42 by rmk")
                                                             (* ; "Edited  3-Feb-2024 14:55 by rmk")
                                                            (* ; "Edited 10-Aug-2020 21:42 by rmk:")

    (* ;; "Produces a string that contains the UTF8 bytes that represent the characters in XSTRING.  Applies the ")

    (* ;; "The resulting string will not be interpretable inside Medley.")

    (for I C (MSTRING _ (CONCAT XSTRING)) from 1 while (SETQ C (NTHCHARCODE XSTRING I))
       do (RPLCHARCODE MSTRING I (XTOMCODE C)) finally (RETURN (MTOUTF8STRING MSTRING])
)



(* ;; "")




(* ; "Read Unicode mapping files")


(RPAQ? UNICODEDIRECTORIES NIL)

(RPAQQ XCCS-CHARSETS
       ((LATIN "0")
        (JAPANESE-SYMBOLS1 "41")
        (JAPANESE-SYMBOLS2 "42")
        (EXTENDED-LATIN "43")
        (HIRAGANA "44")
        (KATAKANA "45")
        (GREEK "46")
        (CYRILLIC "47")
        (FORMS "50")
        (RUNIC-GOTHIC "51")
        (MORE-CYRILLIC "52")
        (UNKNOWN1 "56")
        (UNKNOWN2 "57")
        (JIS "60-166")
        (ARABIC "340")
        (HEBREW "341")
        (IPA "342")
        (HANGUL "343")
        (GEORGIAN-ARMENIAN "344")
        (DEVANAGRI "345")
        (BENGALI "346")
        (GURMUKHI "347")
        (THAI-LAO "350")
        (SYMBOLS3 "353")
        (EXTENDED-ITC-DINGBATS "354")
        (ITC-DINGBATS1 "355")
        (SYMBOLS2 "356")
        (SYMBOLS1 "357")
        (LIGATURES "360")
        (ACCENTED-LATIN1 "361")
        (ACCENTED-LATIN2 "362")
        (ACCENTED-GREEK1 "363")
        (ACCENTED-GREEK2 "364")
        (MORE-ARABIC "365")
        (GRAPHIC-VARIANTS "375")
        (DEFAULT LATIN ACCENTED-LATIN1 EXTENDED-LATIN SYMBOLS1 SYMBOLS2 FORMS JAPANESE-SYMBOLS1 
               JAPANESE-SYMBOLS2)
        (JAPANESE HIRAGANA KATAKANA JIS)))
(DEFINEQ

(READ-UNICODE-MAPPING-FILENAMES
  [LAMBDA (FILESPEC)                                         (* ; "Edited  4-Sep-2025 00:11 by rmk")
                                                             (* ; "Edited 27-Jan-2025 16:46 by rmk")
                                                             (* ; "Edited 21-Jan-2025 22:51 by rmk")
                                                             (* ; "Edited 19-Jan-2025 12:21 by rmk")
                                                             (* ; "Edited  3-Feb-2024 11:00 by rmk")
                                                             (* ; "Edited 30-Jan-2024 08:45 by rmk")
                                                             (* ; "Edited 26-Jan-2024 14:02 by mth")
                                                          (* ; "Edited  5-Aug-2020 15:59 by kaplan")
                                                            (* ; "Edited  4-Aug-2020 17:31 by rmk:")

    (* ;; "FILESPEC can be a file name, character-set name, the name of a collection of character sets, an XCCS character code, or a list of those. Maps those into the names of files that contain the indicated Unicode mappings.")

    (CL:REMOVE-DUPLICATES
     [if (EQ FILESPEC 'ALL)
         then 
              (* ;; 
          "Perhaps should figure out which files in the directories and subdirectories are relevant?")

              (READ-UNICODE-MAPPING-FILENAMES (for N in XCCS-CHARSETS collect (CAR N)))
       else (FOR F X CSI INSIDE FILESPEC
               JOIN 
                    (* ;; "Last case hopes to pick up all the tables that are grouped together in a subdirectory (e.g. if F is JIS)")

                    (OR (CL:WHEN (CHARCODEP F)               (* ; 
                                                        "An XCCS code can retrieve its character set")
                            (for D FN (FOCTAL _ (OCTALSTRING (LRSH F 8))) inside UNICODEDIRECTORIES
                               when (SETQ FN (FILDIR (PACKFILENAME 'DIRECTORY D 'BODY
                                                            (CONCAT 'XCCS- FOCTAL '=*)
                                                            'EXTENSION
                                                            'TXT
                                                            'VERSION ""))) do (RETURN FN)))
                        (MKLIST (FINDFILE (PACKFILENAME 'BODY F 'EXTENSION 'TXT 'VERSION "")
                                       T UNICODEDIRECTORIES))
                        (for D inside UNICODEDIRECTORIES
                           when [SETQ $$VAL (OR (FILDIR (PACKFILENAME 'NAME (CONCAT "XCCS-*=" F)
                                                               'EXTENSION
                                                               'TXT
                                                               'VERSION "" 'BODY D))
                                                (FILDIR (PACKFILENAME 'NAME (CONCAT "XCCS-" F "=*")
                                                               'EXTENSION
                                                               'TXT
                                                               'VERSION "" 'BODY D]
                           do (RETURN $$VAL))
                        (AND (SETQ CSI (ASSOC F XCCS-CHARSETS))
                             (READ-UNICODE-MAPPING-FILENAMES (CDR CSI)))
                        (for D inside UNICODEDIRECTORIES
                           when (DIRECTORYNAMEP (SETQ D (CONCAT D ">" F ">")))
                           join (FILDIR (CONCAT D ">*.TXT;"]
     :TEST
     (FUNCTION STRING.EQUAL])

(READ-UNICODE-MAPPING
  [LAMBDA (FILESPEC PRINT NOERROR)                           (* ; "Edited  4-Sep-2025 00:17 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:32 by rmk")
                                                             (* ; "Edited 31-Jan-2025 17:43 by rmk")
                                                             (* ; "Edited 17-Jan-2025 16:41 by rmk")
                                                             (* ; "Edited  3-Feb-2024 00:21 by rmk")
                                                             (* ; "Edited  5-Jan-2024 12:26 by rmk")
                                                            (* ; "Edited  3-Jul-2021 13:37 by rmk:")

    (* ;; "Combines the char-mapping tables from FILES coded in the Uncode-CDROM format.  Comments prefixed by # and")

    (* ;; "               Column 1:  XCCS input hex code in the format 0xXXXX")

    (* ;; "               Column 2:  Corresponding Unicode code-sequence in the format")

    (* ;; "                                            0xXXXX  ... 0xYYYY")

    (* ;; "               Column 3:  (after #) Character name in some mapping files, utf-8 character")

    (* ;; "                                     for XCCS mapping files")

    (* ;; "")

    (* ;; "Result is a list of (fromcode tocode1 ... tocoden) integer lists (almost always with only a single tocode, where fromcode is an MCCS code and the tocodes are corresponding Unicodes.")

    (for FILE [SEPBITTABLE _ (MAKEBITTABLE (CHARCODE (TAB SPACE] in (READ-UNICODE-MAPPING-FILENAMES
                                                                     FILESPEC)
       join 
            (* ;; "External format  :THROUGH means read as bytes, so the Unicode UTF-8 comments cannot cause reading problems.")

            (CL:WITH-OPEN-FILE (STREAM FILE :DIRECTION :INPUT :EXTERNAL-FORMAT `(:THROUGH LF))
                   (bind LINE NAME CHARSET START MAP
                      first (CL:UNLESS (FILEPOS "Name:" STREAM NIL NIL NIL T)
                                (ERROR "NOT A UNICODE MAPPING FILE" (FULLNAME STREAM)))
                            (SETQ NAME (CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL)))
                            (SETQ CHARSET (CL:IF (FILEPOS "XCCS charset:" STREAM NIL NIL NIL T)
                                              (CL:STRING-TRIM " " (CL:READ-LINE STREAM NIL NIL))
                                              ""))
                            (CL:WHEN PRINT                   (* ; "Strip off XCCS in front of name")
                                (PRINTOUT T T CHARSET " " [SUBSTRING NAME (CONSTANT
                                                                           (ADD1 (NCHARS "XCCS"]
                                       T)) while (SETQ LINE (CL:READ-LINE STREAM NIL NIL))
                      when (SETQ START (STRPOSL SEPBITTABLE LINE 1 T))
                      unless (EQ (CHARCODE %#)
                                 (NTHCHARCODE LINE START))
                      collect [SETQ MAP (bind END CODES while [SETQ END (OR (STRPOSL SEPBITTABLE LINE
                                                                                   START)
                                                                            (ADD1 (NCHARS LINE]
                                           collect [CHARCODE.DECODE (SUBSTRING LINE START
                                                                           (SUB1 END)
                                                                           (CONSTANT (CONCAT]
                                           repeatwhile (AND (SETQ START (STRPOSL SEPBITTABLE LINE END
                                                                               T))
                                                            (NEQ (CHARCODE %#)
                                                                 (NTHCHARCODE LINE START)))
                                           finally (CL:WHEN (CDDR $$VAL)
                                                             (* ; "Combiners go into a CADR list")
                                                       (RPLACD $$VAL (CONS (CDR $$VAL))))]
                            (change (CAR MAP)
                                   (XTOMCODE DATUM))
                            MAP])
)



(* ; "Make translation tables for  UTF external formats")

(DEFINEQ

(MAKE-UNICODE-TRANSLATION-TABLES
  [LAMBDA (MAPPING REINSTALL)                                (* ; "Edited  4-Sep-2025 00:30 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:47 by rmk")
                                                             (* ; "Edited 31-Jan-2025 17:46 by rmk")
                                                             (* ; "Edited 26-Jan-2025 19:36 by rmk")
                                                             (* ; "Edited 22-Jan-2025 14:22 by rmk")
                                                             (* ; "Edited 19-Jan-2025 15:08 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:52 by rmk")
                                                             (* ; "Edited  3-Feb-2024 00:24 by rmk")
                                                             (* ; "Edited 30-Jan-2024 09:54 by rmk")
                                                            (* ; "Edited 21-Aug-2021 13:12 by rmk:")
                                                            (* ; "Edited 17-Aug-2020 08:46 by rmk:")
    (CL:UNLESS [AND (LISTP MAPPING)
                    (FOR PAIR R IN MAPPING AS I TO 10
                       ALWAYS (AND (LISTP PAIR)
                                   (CHARCODEP (CAR PAIR))
                                   [FIXP (SETQ R (CAR (MKLIST (CADR PAIR]
                                   (CHARCODEP (IABS R]

        (* ;; "Seems like the argument is not already a list of mapping pairs (perhaps with a combiner), presumably a list of charsets to be read.")

        (SETQ MAPPING (READ-UNICODE-MAPPING MAPPING)))

    (* ;; "MAPPING is the list of numeric code correspondence pairs constructed by applying READ-UNICODE-MAPPING to Unicode mapping files.")

    (* ;; "This updates or produces two recoding arrays, one maps left-side codes into right-side codes (e.g. XCCS or ISO8859-1 to Unicode), for printing, the other maps right-side (Unicode) codes to corresponding right-side codes (e.g. XCCS).")

    (* ;; "")

    (* ;; "If REINSTALL is T, the new mapping vectors replace the current maps in the *XCCSTOUNICODE* and *UNICODETOXCCS* global variables.  Otherwise we create new tables (mostly for comparison and debugging).")

    (* ;; "")

    (if REINSTALL
        then (SETQ *MCCS-LOADED-CHARSETS* (SETQ *UNICODE-LOADED-CHARSETS* NIL))
             (SETQ *NEXT-PRIVATE-MCCSCODE* FIRST-PRIVATE-MCCSCODE)
             (SETQ *NEXT-PRIVATE-UNICODE* FIRST-PRIVATE-UNICODE)
             (LET [(TABLE (HASHARRAY (LENGTH MAPPING)))
                   (INVERSETABLE (HASHARRAY (LENGTH MAPPING]
                  (MERGE-UNICODE-TRANSLATION-TABLES NIL MAPPING TABLE INVERSETABLE)
                  (SETQ *MCCSTOUNICODE* TABLE)
                  (SETQ *UNICODETOMCCS* INVERSETABLE)
                  (LIST *MCCSTOUNICODE* *UNICODETOMCCS*))
      else (CL:UNLESS (BOUNDP '*NEXT-PRIVATE-MCCSCODE*)
               (SETQ *NEXT-PRIVATE-MCCSCODE* FIRST-PRIVATE-MCCSCODE)
               (SETQ *NEXT-PRIVATE-UNICODE* FIRST-PRIVATE-UNICODE))
           (MERGE-UNICODE-TRANSLATION-TABLES NIL MAPPING])

(MERGE-UNICODE-TRANSLATION-TABLES
  [LAMBDA (INVERSE MAPPING TABLE INVERSETABLE)               (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited  1-Feb-2025 21:42 by rmk")
                                                             (* ; "Edited 26-Jan-2025 12:58 by rmk")
                                                             (* ; "Edited 22-Jan-2025 08:20 by rmk")
                                                             (* ; "Edited 19-Jan-2025 15:58 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:49 by rmk")
                                                             (* ; "Edited 27-Mar-2024 12:10 by rmk")
                                                             (* ; "Edited  3-Feb-2024 12:46 by rmk")
                                                             (* ; "Edited 31-Jan-2024 10:06 by rmk")

    (* ;; "MAPPINGS is a list of pairs that map domain codes to range codes.  TABLE and INVERSETABLE default to *XCCSTOUNICODE* *UNICODETOXCCS* respectively.   ")

    (CL:UNLESS TABLE
        [SETQ TABLE (OR *MCCSTOUNICODE* (SETQ *MCCSTOUNICODE* (HASHARRAY (LENGTH MAPPING])
    (CL:UNLESS INVERSETABLE
        [SETQ INVERSETABLE (OR *UNICODETOMCCS* (SETQ *UNICODETOMCCS* (HASHARRAY (LENGTH MAPPING])
    (for M D R OLDR in MAPPING first (CL:IF INVERSE (swap TABLE INVERSETABLE))
       eachtime (SETQ D (CAR M))
             (SETQ R (CADR M)) 

             (* ;; "We don't do combiners, but we are allowing non-SMALLP's")
 unless (OR (LISTP D)
            (LISTP R)) do 
                          (* ;; "The (CONS R OLDR) deals with alternatives:  (U X1) (U X2) => (U (X1 X2)), lowest code first.  Those are only possible in the U-to-X direction when the tables contain (X1 U) and (X2 U). There are no duplicates/alternative table entries in the X-to-U direction.")

                          (SETQ OLDR (GETHASH D TABLE))
                          (CL:UNLESS (MEMB R OLDR)
                              (PUTHASH D (SORT (CONS R OLDR))
                                     TABLE))
                          (swap D R)
                          (SETQ OLDR (GETHASH D INVERSETABLE))
                          (CL:UNLESS (MEMB R OLDR)
                              (PUTHASH D (SORT (CONS R OLDR))
                                     INVERSETABLE)))
    (LIST TABLE INVERSETABLE])

(UNICODE.UNMAPPED
  [LAMBDA (CODE TABLE DONTFAKE)                              (* ; "Edited 24-Apr-2025 15:48 by rmk")
                                                             (* ; "Edited 22-Jan-2025 08:19 by rmk")
                                                             (* ; "Edited 19-Jan-2025 22:02 by rmk")
                                                             (* ; "Edited 18-Jan-2025 12:02 by rmk")
                                                             (* ; "Edited  2-Feb-2024 23:52 by rmk")
                                                             (* ; "Edited 31-Jan-2024 10:07 by rmk")
                                                            (* ; "Edited 11-Aug-2020 20:23 by rmk:")

    (* ;; "This is the slow fall-out when UNICODE.TRANSLATE determines that CODE has no fast mapping in TRANSLATION-TABLE.")

    (* ;; "")

    (* ;; "If we have not already installed the mapping segment for that code, we try to retrieve it from the numberic file.  If that segment mapping doesn't exist or doesn't have an entry for CODE, we fake up a mapping with a negative range in both directions. One way or the other, there will be an entry for that segment in both mapping vectors.")

    (* ;; "")

    (PROG ((INVERSE (EQ TABLE *UNICODETOMCCS*))
           RANGE HASH)

     (* ;; "If we already looked up CODE's character set in a file, then we have already filled in its information in the translation table.  If it didn't have a code for a particular character, then we fake it here.  Faked codes are negative, so we can detect them easily, and interpret them with IABS.")

          (CL:WHEN (AND (UNICODE-EXTEND-TRANSLATION? CODE TABLE)
                        (SETQ RANGE (GETHASH CODE TABLE)))

              (* ;; "We might have gotten the segment that didn't have an entry for CODE.")

              (RETURN RANGE))

     (* ;; "")

          (CL:UNLESS DONTFAKE

              (* ;; "Our attempt at extending the known tables did not provide a mapping for CODE.  So we fake it up with the next unused private code in the code space.  ")

              (* ;; "The number of possible faked mappings is determined by the number of private-use Unicodes, since the MCCS character space is pretty sparse.  The codes don't have to come from the same part of the code space, and the NEXTCODEs are saved in global variables.  The last available codes are constants.")

              (CL:WHEN (IEQP *NEXT-PRIVATE-MCCSCODE* LAST-PRIVATE-MCCSCODE)
                                                             (* ; 
                                                           "Same number of available codes both ways")
                  (ERROR "EXHAUSTED RANGE FOR UNMAPPED CODES"))
              (if INVERSE
                  then (SETQ RANGE *NEXT-PRIVATE-MCCSCODE*)
                       (add *NEXT-PRIVATE-MCCSCODE* 1)
                else (SETQ RANGE *NEXT-PRIVATE-UNICODE*)
                     (add *NEXT-PRIVATE-UNICODE* 1))
              (MERGE-UNICODE-TRANSLATION-TABLES INVERSE (CONS (LIST CODE RANGE)))

              (* ;; "CONS because of LIST convention so we can eventually distinguish combiners.")

              (RETURN (CONS RANGE)))])

(UNICODE-EXTEND-TRANSLATION?
  [LAMBDA (CODE TABLE)                                       (* ; "Edited  4-Sep-2025 00:34 by rmk")
                                                             (* ; "Edited 29-Jun-2025 16:44 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:49 by rmk")
                                                             (* ; "Edited 26-Jan-2025 11:26 by rmk")
                                                             (* ; "Edited 21-Jan-2025 22:31 by rmk")
                                                             (* ; "Edited 18-Jan-2025 12:40 by rmk")
                                                             (* ; "Edited 13-Jan-2025 23:50 by rmk")
                                                             (* ; "Edited 26-Aug-2024 16:49 by rmk")
                                                             (* ; "Edited 27-Mar-2024 23:02 by rmk")
                                                             (* ; "Edited  5-Feb-2024 13:48 by rmk")
                                                             (* ; "Edited  3-Feb-2024 12:40 by rmk")

    (* ;; "There is currently no mapping for CODE in TABLE, hopefully just because the relevant character-set mapping has not been installed. We infer from TABLE whether CODE is an MCCS or UNICODE code and look for the proper mapping table (forward or inverted) for its character set. ")

    (* ;; "We record which character sets we have already expanded so we don't do them again.")

    (LET ((CHARSET (\CHARSET CODE))
          (INVERSE (EQ TABLE *UNICODETOMCCS*))
          MAPPING FILE)

         (* ;; "If we already looked for CHARSET in the file and found anything, it has already been merged. Otherwise, it would just fail again")

         (CL:UNLESS (MEMB CHARSET (CL:IF INVERSE
                                      *UNICODE-LOADED-CHARSETS*
                                      *MCCS-LOADED-CHARSETS*))

             (* ;; "Don't try this charset again.")

             (CL:IF INVERSE
                 (push *UNICODE-LOADED-CHARSETS* CHARSET)
                 (push *MCCS-LOADED-CHARSETS* CHARSET))
             (SETQ FILE (FINDFILE (CL:IF INVERSE
                                      'UNICODE-TO-MCCS-MAPPINGS
                                      MCCS-TO-UNICODE-MAPPINGS)
                               T UNICODEDIRECTORIES))

             (* ;; "The mappings files are indexed by CHARSET.")

             (CL:WHEN [AND FILE (SETQ MAPPING (CL:WITH-OPEN-FILE (STREAM FILE :INPUT)
                                                     (CL:WHEN (FILEPOS (CONCAT "[" CHARSET " ")
                                                                     STREAM NIL NIL NIL T)
                                                            (READ STREAM]

                 (* ;; 
                 "Merge MAPPING into both tables, respecting the direction indicated by TABLE. ")

                 (MERGE-UNICODE-TRANSLATION-TABLES INVERSE MAPPING)
                 T))])
)
(DEFINEQ

(ALL-UNICODE-MAPPINGS
  [LAMBDA (INVERTED FILE)                                    (* ; "Edited 24-Apr-2025 15:51 by rmk")
                                                             (* ; "Edited 31-Jan-2025 17:46 by rmk")
                                                             (* ; "Edited 26-Jan-2025 13:40 by rmk")
                                                             (* ; "Edited 22-Jan-2025 14:07 by rmk")
                                                             (* ; "Edited 19-Jan-2025 12:20 by rmk")
                                                             (* ; "Edited 17-Jan-2025 22:32 by rmk")
                                                             (* ; "Edited 15-Jan-2025 09:49 by rmk")
                                                             (* ; "Edited 27-Mar-2024 14:48 by rmk")
                                                             (* ; "Edited  5-Feb-2024 13:14 by rmk")
                                                             (* ; "Edited  3-Feb-2024 09:16 by rmk")

    (* ;; "Reads all the XCCS-to-UNICODE mapping files that we know about, and produces a 2-level index that maps between MCCS codes and UNICODE codes, depending on INVERTED.")

    (* ;; "The first index level segments all the domain codes according to their character sets.  The segments are sorted by character set, the pairs within each segment are sorted by their domain codes.  ")

    (* ;; 
    "E.g. if INVERTED=NIL and given a XCCS code, the lookup for the corresponding Unicode(s) is")

    (* ;; "    (CADR (ASSOC MCCSCODE (\CHARSET MCCSCODE) INDEX)))).")

    (* ;; "If FILE is not NIL, the result is written to a file. If FILE is T, the file is either MCCS-TO-UNICODE-MAPPINGS.TXT or UNICODE-TO-MCCS-MAPPINGS.TXT, depending on INVERTED.")

    (LET (INDEX)
         (for PAIR DOMAIN RANGE CHARSET in (READ-UNICODE-MAPPING 'ALL) eachtime (SETQ DOMAIN
                                                                                 (CAR PAIR))
                                                                             (SETQ RANGE (CADR PAIR))
                                                                             
                                                                             (* ;; 
                                                      "(LISTP RANGE) is a combiner, ignored for now.")
 unless (LISTP RANGE) do (CL:WHEN INVERTED (SWAP DOMAIN RANGE)) 

                         (* ;; 
       "One segment for each high-byte character set.   This aligns with UNICODE-EXTEND.TRANSLATION?")

                         [SETQ CHARSET (OR (ASSOC (\CHARSET DOMAIN)
                                                  INDEX)
                                           (CAR (push INDEX (CONS (\CHARSET DOMAIN] 

                         (* ;; "For alternative mappings (in the U-to-M direction) we end up with  (D R1 R2 ...).  (CADR is the first (and almost always) the only one.")

                         (pushnew [CDR (OR (ASSOC DOMAIN (CDR CHARSET))
                                           (CAR (push (CDR CHARSET)
                                                      (CONS DOMAIN]
                                RANGE))

         (* ;; "Push the charset mappings down an extra CONS, so that a subsequent READ will get them all after a FILEPOS search for super-paren [")

         [for CS in INDEX do (for M in (CDR CS) when (CDDR M) do 
                                                                 (* ;; 
                                                                "Sort the range alternatives, if any")

                                                                 (change (CDR M)
                                                                        (SORT DATUM))) 

                             (* ;; "Sort by domain codes and push down a level")

                             (change (CDR CS)
                                    (CONS (SORT DATUM T]
         (SETQ INDEX (SORT INDEX T))                         (* ; "Sort character sets")
         (if FILE
             then (SETQ FILE (PACKFILENAME 'BODY (if (NEQ FILE T)
                                                     then FILE
                                                   elseif INVERTED
                                                     then 'UNICODE-TO-MCCS-MAPPINGS
                                                   else 'MCCS-TO-UNICODE-MAPPINGS)
                                    'DIRECTORY
                                    (CAR (MKLIST UNICODEDIRECTORIES))
                                    'EXTENSION
                                    'TXT))
                  (CL:WITH-OPEN-FILE (STREAM FILE :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION)

                         (* ;; 
         "We can FILEPOS for %"[nnn %" then READ for each segment. Or just READFILE to get them all.")

                         (for I in INDEX do (PRINTOUT STREAM "[" (CAR I)
                                                   " "
                                                   (CADR I)
                                                   "]" T T))
                         (PRINTOUT STREAM "STOP" T)
                         (FULLNAME STREAM))
           else INDEX])

(XCCSJAPANESECHARSETS
  [LAMBDA (OCTAL FILE)                                       (* ; "Edited 11-Jun-2025 23:00 by rmk")

    (* ;; "Returns the list of numbers for the Japanese character sets.")

    (for F POS CS in (READ-UNICODE-MAPPING-FILENAMES "JIS")
       when (SETQ POS (STRPOS "XCCS-" F 1 NIL NIL T))
       collect [SETQ CS (SUBSTRING F POS (SUB1 (STRPOS '=JIS F POS]
             (CL:IF OCTAL
                 CS
                 (MKATOM (CONCAT CS "Q")))
       finally (SORT $$VAL)
             (CL:WHEN FILE
                 (RETURN (CL:WITH-OPEN-FILE (STREAM (PACKFILENAME 'BODY (CL:IF (EQ FILE T)
                                                                            "JAPANESECHARSETS"
                                                                            FILE)
                                                           'DIRECTORY
                                                           (CAR (MKLIST UNICODEDIRECTORIES))
                                                           'EXTENSION
                                                           'TXT)
                                                   :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION)
                                (PRINT $$VAL STREAM)
                                (FULLNAME STREAM))))])
)

(RPAQ? *MCCSTOUNICODE* )

(RPAQ? *UNICODETOMCCS* )

(RPAQ? *MCCS-LOADED-CHARSETS* )

(RPAQ? *UNICODE-LOADED-CHARSETS* )
(DECLARE%: DOEVAL@COMPILE DONTCOPY

(GLOBALVARS *MCCSTOUNICODE* *UNICODETOMCCS* *NEXT-PRIVATE-UNICODE* *NEXT-PRIVATE-MCCSCODE* 
       *MCCS-LOADED-CHARSETS* *UNICODE-LOADED-CHARSETS*)
)
(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(RPAQ FIRST-PRIVATE-UNICODE (HEXNUM? "E000"))

(RPAQ LAST-PRIVATE-UNICODE (HEXNUM? "F8FF"))

(RPAQ FIRST-PRIVATE-MCCSCODE (CHARCODE "200,0"))

(RPAQ LAST-PRIVATE-MCCSCODE (CHARCODE "230,377"))


(CONSTANTS (FIRST-PRIVATE-UNICODE (HEXNUM? "E000"))
       (LAST-PRIVATE-UNICODE (HEXNUM? "F8FF"))
       (FIRST-PRIVATE-MCCSCODE (CHARCODE "200,0"))
       (LAST-PRIVATE-MCCSCODE (CHARCODE "230,377")))
)

(DECLARE%: EVAL@COMPILE 

(PUTPROPS TRUECODEP MACRO (OPENLAMBDA (RANGE TABLE)

                            (* ;; "Return NIL if RANGE is a fake range in TABLE, otherwise RANGE.")

                            (CL:UNLESS (CL:IF (EQ TABLE *MCCSTOUNICODE*)
                                           (AND (IGEQ RANGE FIRST-PRIVATE-UNICODE)
                                                (ILEQ RANGE LAST-PRIVATE-UNICODE))
                                           (AND (IGEQ RANGE FIRST-PRIVATE-MCCSCODE)
                                                (ILEQ RANGE LAST-PRIVATE-MCCSCODE)))
                                   RANGE)))
)
)
(DECLARE%: DONTEVAL@LOAD DOCOPY 

(MAKE-UNICODE-TRANSLATION-TABLES 'ALL)
)



(* ;; "")




(* ; "Write Unicode mapping files")

(DEFINEQ

(WRITE-UNICODE-MAPPING
  [LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK)             (* ; "Edited  4-Jan-2024 22:44 by rmk")
                                                            (* ; "Edited 16-Aug-2020 16:56 by rmk:")

    (* ;; "Writes a symbol unicode mapping file.  Mapping is a list of (XCCS-code Unicode) pairs, which may contain codes in multiple character sets.")

    (* ;; "If FILE is NIL, it defaults to a name XCCS- followed by the octal character sets in the mapping, in the unicode/XEROX directory.")

    (* ;; "The output lines are of the form x0XXX<tab>x0UUUU<tab>#  Unicode-char")

    (* ;; 
    "If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.")

    (* ;; "Otherwise, all and only mappings included in thos charsets are included in a single output file--an implicit subset.")

    (IF (AND (EQ INCLUDECHARSETS T)
             (NULL FILE))
        THEN (IF MAPPING
                 THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F (WRITE-UNICODE-MAPPING MAPPING
                                                                       (CAR CSI)
                                                                       NIL T)) COLLECT F)
               ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T)
                    NIL)
      ELSE
      (LET
       (IMAPPING CSETINFO RANGES)
       (CL:MULTIPLE-VALUE-SETQ (IMAPPING CSETINFO RANGES)
              (WRITE-UNICODE-INCLUDED MAPPING INCLUDECHARSETS))
       (IF IMAPPING
           THEN (CL:WITH-OPEN-FILE
                 (STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES)
                        :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF-8-RAW)
                 (WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES)
                 (SORT IMAPPING T)
                 (FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING
                    DO (SETQ LEFTC (CAR M))
                       (SETQ FIRSTRIGHTC (CADR M))
                       (CL:UNLESS (EQ CSET (LRSH LEFTC 8))
                           (SETQ CSET (LRSH LEFTC 8))
                           (SETQ CSI (ASSOC CSET CSETINFO))
                           (PRINTOUT STREAM T "#  " .P2 (CADR CSI)
                                  " "
                                  (CADDR CSI)
                                  T))
                       (PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4)
                              %#
                              (FOR RIGHTC IN (CDR M) DO (PRINTOUT NIL "	" "0x" (HEXSTRING RIGHTC 4)))
                              "	#  "
                              (SELECTC FIRSTRIGHTC
                                  (UNDEFINEDCODE 
                                                 (* ;; "FFFF")

                                                 "UNDEFINED")
                                  (MISSINGCODE 
                                               (* ;; "FFFE")

                                               "MISSING")
                                  (IF (ILESSP FIRSTRIGHTC 32)
                                      THEN                   (* ; "Control chars")
                                           [CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC (CHARCODE @]
                                    ELSE (CHARACTER FIRSTRIGHTC)))
                              T))
                 (FULLNAME STREAM))
         ELSEIF (NOT EMPTYOK)
           THEN (PRINTOUT T "THERE ARE NO MAPPINGS")
                (CL:WHEN INCLUDECHARSETS
                    (PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS)
                           T))
                NIL])

(WRITE-UNICODE-INCLUDED
  [LAMBDA (MAPPING INCLUDECHARSETS)                     (* ; "Edited  4-Aug-2020 17:47 by rmk:")

    (* ;; "CSETINFO is a list of (num string name) for each included character set.")

    (LET (CHARSETS CSETINFO RANGES ICSETS IMAPPING)

         (* ;; "Normalize the INCLUDECHARSETS, then reduce MAPPING to the included mappings")

         [SETQ ICSETS (FOR C POS KNOWN INSIDE (OR INCLUDECHARSETS (FOR CSI IN 
                                                                                       XCCS-SET-NAMES
                                                                             COLLECT (CAR CSI)))
                         JOIN [SETQ KNOWN (OR (SASSOC C XCCS-SET-NAMES)
                                                  (FIND N IN XCCS-SET-NAMES
                                                     SUCHTHAT (EQ C (CADR N)))
                                                  (HELP "UNKNOWN CHARACTER SET" (OCTALSTRING C]
                               (IF (SETQ POS (STRPOS "-" (CAR KNOWN)))
                                   THEN (FOR I FROM (CL:PARSE-INTEGER (SUBSTRING
                                                                                   (CAR KNOWN)
                                                                                   1
                                                                                   (SUB1 POS))
                                                                       :RADIX 8)
                                               TO (CL:PARSE-INTEGER (SUBSTRING (CAR KNOWN)
                                                                               (ADD1 POS))
                                                             :RADIX 8)
                                               COLLECT (LIST I (OCTALSTRING I)
                                                                 (CADR KNOWN)))
                                 ELSE (CONS (CONS (CL:PARSE-INTEGER (CAR KNOWN)
                                                             :RADIX 8)
                                                      KNOWN]
         (SETQ IMAPPING (FOR M CSI IN MAPPING WHEN (SETQ CSI (ASSOC (LRSH (CAR M)
                                                                                      8)
                                                                                ICSETS))
                           COLLECT 

                                 (* ;; "The attested subset of INCLUDED")

                                 (CL:UNLESS (MEMB CSI CSETINFO)
                                        (PUSH CSETINFO CSI))
                                 M))

         (* ;; "Sort as numbers, not octal strings, then group into consecutive ranges")

         (SETQ CSETINFO (SORT CSETINFO T))
         [SETQ RANGES (FOR CTAIL C START END ON (FOR CSI IN CSETINFO
                                                           COLLECT (CAR CSI)) WHILE CTAIL
                         COLLECT (SETQ START (CAR CTAIL))
                               (SETQ END START)
                               (CONS START (WHILE [AND (CDR CTAIL)
                                                           (EQ END (SUB1 (CADR CTAIL]
                                              COLLECT (SETQ CTAIL (CDR CTAIL))
                                                    (SETQ END (CAR CTAIL]

         (* ;; "Split out groups of less than 3.  But if a range exhaustively covers a known subset (like JIS), replace by the name")

         [SETQ RANGES (FOR R STR KNOWN LAST IN RANGES
                         JOIN (SETQ LAST (CAR (LAST R)))
                               (IF (EQ (CAR R)
                                           LAST)
                                   THEN (CONS (OCTALSTRING (CAR R)))
                                 ELSEIF (SETQ KNOWN (SASSOC (SETQ STR (CONCAT (OCTALSTRING
                                                                                   (CAR R))
                                                                                 "-"
                                                                                 (OCTALSTRING LAST)))
                                                               XCCS-SET-NAMES))
                                   THEN (CONS (CADR KNOWN))
                                 ELSEIF (CDDR R)
                                   THEN (CONS STR)
                                 ELSE (LIST (OCTALSTRING (CAR R))
                                                (OCTALSTRING LAST]
         (CL:VALUES IMAPPING CSETINFO RANGES])

(WRITE-UNICODE-MAPPING-HEADER
  [LAMBDA (STREAM CSETINFO RANGES)                           (* ; "Edited  5-Jan-2024 13:24 by rmk")
                                                            (* ; "Edited  4-Aug-2020 17:38 by rmk:")

    (* ;; "Writes the standard per-file header information")

    (FOR LINE IN UNICODE-MAPPING-HEADER
       DO (PRINTOUT STREAM "#" 2)
          (SELECTQ LINE
              (XCCSCHARACTERSETS 
                   (PRINTOUT STREAM "        XCCS charset")
                   (IF (CDR CSETINFO)
                       THEN (PRINTOUT STREAM "s:" -4)
                            (FOR R IN RANGES DO (PRINTOUT STREAM R " "))
                     ELSE                                    (* ; "Singleton")
                          (PRINTOUT STREAM ": " -4 (CADAR CSETINFO)
                                 " "
                                 (CADDAR CSETINFO)))
                   (TERPRI STREAM))
              (DATE (PRINTOUT STREAM "        Date:" -13 (DATE (DATEFORMAT NO.TIME NO.LEADING.SPACES)
                                                               )
                           T))
              (PRINTOUT STREAM LINE T)))
    (TERPRI STREAM])

(WRITE-UNICODE-MAPPING-FILENAME
  [LAMBDA (FILE CSETINFO RANGES)                        (* ; "Edited  4-Aug-2020 19:34 by rmk:")
    (PACKFILENAME 'BODY [OR FILE (CONCATLIST
                                  (CONS 'XCCS- (IF (CDR CSETINFO)
                                                   THEN (FOR RTAIL R ON RANGES
                                                               JOIN (SETQ R (CAR RTAIL))
                                                                     (SETQ R
                                                                      (CL:IF (LISTP R)
                                                                          (LIST (CAR R)
                                                                                "-"
                                                                                (CDR R))
                                                                          (CONS R)))
                                                                     (CL:IF (CDR RTAIL)
                                                                            (NCONC1 R ","))
                                                                     R)
                                                 ELSE (LIST (CADAR CSETINFO)
                                                                "="
                                                                (CADDAR CSETINFO]
           'DIRECTORY
           (CAR UNICODEDIRECTORIES)
           'EXTENSION
           'TXT])
)
(DEFINEQ

(XCCS-UTF8-AFTER-OPEN
  [LAMBDA (STREAM ACCESS PARAMETERS)                         (* ; "Edited  3-Jan-2024 10:27 by rmk")
                                                            (* ; "Edited 13-Aug-2020 11:54 by rmk:")

    (* ;; 
    "If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF-8. For development")

    (CL:WHEN (AND (STRPOS "XCCS-" (U-CASE (FULLNAME STREAM)))
                  [EQ 'TXT (U-CASE (FILENAMEFIELD (FULLNAME STREAM)
                                          'EXTENSION]
                  (NOT (ASSOC 'EXTERNALFORMAT PARAMETERS)))
        (STREAMPROP STREAM 'EXTERNALFORMAT :UTF-8))])
)



(* ;; "Automate dumping of a documentation prefix")

(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(RPAQ MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))

(RPAQ UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16))


(CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))
       (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16)))
)
)

(RPAQQ UNICODE-MAPPING-HEADER
       ("" "        Name:             XCCS (Version 2.0) to Unicode" "        Unicode version:  3.0"
           XCCSCHARACTERSETS "        Table version:    0.1" "        Table format:     Format A" 
           DATE "        Author:           Ron Kaplan <Ron.Kaplan@post.harvard.edu>" "" 
           "This file contains mappings from the Xerox Character Code Standard (version" 
           "2.0, 1990) into Unicode 3.0. standard codes.  That is an extension of the" 
           "version of XCCS corresponding to the fonts in the Medley system." "" 
           "The format of this file conforms to the format of the other Unicode-supplied" 
           "mapping files:" "   Three white-space (tab or spaces) separated columns:" 
           "     Column 1 is the XCCS code (as hex 0xXXXX)" 
           "     Column 2 is the corresponding Unicode (as hex 0xXXXX)" 
           "     Column 3 (after #) is a comment column. For convenience, it contains the" 
           "        Unicode character itself and the Unicode character names when available." 
           "Unicode FFFF is used for undefined XCCS codes (Column 3 = UNDEFINED" 
           "Unicode FFFE is used for XCCS codes that have not yet been filled in." 
           "(Column 3 = MISSING)" "" "This file is encoded in UTF-8, so that the Unicode characters"
           "are properly displayed in Column 3 and can be edited by standard" 
           "Unicode-enabled editors (e.g. Mac Textedit)." "" 
           "This file can also be read by the function" 
           "READ-UNICODE-MAPPING in the UNICODE Medley library package." "" 
           "The entries are in XCCS order and grouped by character sets.  In front of" 
           "the mappings, for convenience, there is a line with the octal XCCS" 
           "character set, after #." "" 
           "Note that a given XCCS code might map to codes in several different Unicode" 
           "positions, since there are repetitions in the Unicode standard." "" 
           "For more details, see the associated README.TXT file." "" 
           "Any comments or problems, contact <ron.kaplan@post.harvard.edu>"))
(DEFINEQ

(UTF8HEXSTRING
  [LAMBDA (CHARCODE)                                    (* ; "Edited 10-Aug-2020 08:33 by rmk:")

    (* ;; "Utility to produces the UTF8 hexstring representing CODE")

    (HEXSTRING (IF (ILESSP CHARCODE 128)
                       THEN CHARCODE
                     ELSEIF (ILESSP CHARCODE 2048)
                       THEN                              (* ; "x800")
                             (LOGOR (LLSH (LOGOR (LLSH 3 6)
                                                 (LRSH CHARCODE 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 65536)
                       THEN                              (* ; "x10000")
                             (LOGOR (LLSH (LOGOR (LLSH 7 5)
                                                 (LRSH CHARCODE 12))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 2097152)
                       THEN                              (* ; "x200000")
                             (LOGOR (LLSH (LOGOR (LLSH 15 4)
                                                 (LRSH CHARCODE 18))
                                          24)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 12 6))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSE (ERROR "CHARCODE too big for UTF8" CHARCODE])
)



(* ; "debugging")

(DEFINEQ

(SHOWCHARS
  [LAMBDA (FONT FROMCHAR TOCHAR ONELINE)                     (* ; "Edited  7-Sep-2025 20:29 by rmk")
                                                             (* ; "Edited  2-Sep-2025 10:26 by rmk")
                                                             (* ; "Edited 24-Jul-2025 11:30 by rmk")
                                                             (* ; "Edited  8-Jun-2025 20:05 by rmk")
                                                             (* ; "Edited 26-Jan-2024 14:18 by mth")
                                                            (* ; "Edited  1-Aug-2020 09:27 by rmk:")
    [SETQ FONT (FONTCREATE (OR FONT '(CLASSIC 12]
    (RESETLST
        [LET ((OLDFONT (DSPFONT NIL T))
              CHARS)
             (CL:UNLESS (CHARCODEP FROMCHAR)
                 (SETQ FROMCHAR (OR (CHARCODE.DECODE FROMCHAR T)
                                    FROMCHAR)))
             (SETQ CHARS (if (CHARCODEP FROMCHAR)
                             then (CL:UNLESS (CHARCODEP TOCHAR)
                                      (SETQ TOCHAR (OR (CHARCODE.DECODE FROMCHAR)
                                                       FROMCHAR)))
                                  (for C from FROMCHAR to TOCHAR collect C)
                           else (CHCON FROMCHAR)))
             [RESETSAVE OLDFONT '(PROGN (DSPFONT OLDVALUE]
             (TERPRI)
             (for C in CHARS do (PRINTOUT T .FONT OLDFONT (CONCAT (OCTALSTRING (\CHARSET C))
                                                                 ","
                                                                 (OCTALSTRING (\CHAR8CODE C)))
                                       10 .FONT FONT (CHARACTER C))
                                (CL:UNLESS ONELINE (PRINTOUT T T])
    (TERPRI])
)
(DECLARE%: DOEVAL@LOAD DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(PUTPROPS HEXCHAR MACRO ((CODE)
                         (HEXSTRING CODE)))

(PUTPROPS OCTALCHAR MACRO [(CODE)
                           (CONCAT (OCTALSTRING (\CHARSET CODE))
                                  ","
                                  (OCTALSTRING (LOGAND CODE 255])
)
)
(DECLARE%: EVAL@COMPILE DONTCOPY 

(FILESLOAD (FROM LOADUPS)
       EXPORTS.ALL)
)

(PUTPROPS UNICODE FILETYPE :TCOMPL)
(DECLARE%: DONTCOPY
  (FILEMAP (NIL (4433 19681 (UTF8.OUTCHARFN 4443 . 7459) (UTF8.SLUG.OUTCHARFN 7461 . 8125) (
UTF8.INCCODEFN 8127 . 13848) (UTF8.PEEKCCODEFN 13850 . 18699) (\UTF8.BACKCCODEFN 18701 . 19679)) (
19682 24372 (UTF16BE.OUTCHARFN 19692 . 20711) (UTF16BE.INCCODEFN 20713 . 21838) (UTF16BE.PEEKCCODEFN 
21840 . 23180) (\UTF16BE.BACKCCODEFN 23182 . 24370)) (24373 29096 (UTF16LE.OUTCHARFN 24383 . 25499) (
UTF16LE.INCCODEFN 25501 . 26626) (UTF16LE.PEEKCCODEFN 26628 . 27904) (\UTF16LE.BACKCCODEFN 27906 . 
29094)) (29097 32144 (READBOM 29107 . 31176) (WRITEBOM 31178 . 32142)) (32174 35739 (
MAKE-UNICODE-FORMATS 32184 . 35737)) (35836 40330 (UTF8.BINCODE 35846 . 38534) (\UTF8.FETCHCODE 38536
 . 40328)) (40331 45958 (UTF8.VALIDATE 40341 . 42938) (NUTF8-BYTE1-BYTES 42940 . 43677) (
NUTF8-CODE-BYTES 43679 . 44736) (NUTF8-STRING-BYTES 44738 . 45634) (N-MCHARS 45636 . 45956)) (47686 
56555 (MTOUCODE 47696 . 48083) (UTOMCODE 48085 . 48475) (MTOUCODE? 48477 . 49510) (UTOMCODE? 49512 . 
50476) (MTOUSTRING 50478 . 51063) (UTOMSTRING 51065 . 51650) (MTOUTF8STRING 51652 . 55658) (
UTF8TOMSTRING 55660 . 56553)) (56556 62258 (XTOUCODE 56566 . 57084) (UTOXCODE 57086 . 57594) (
XTOUCODE? 57596 . 58657) (UTOXCODE? 58659 . 59742) (XTOUSTRING 59744 . 60437) (UTOXSTRING 60439 . 
61180) (XTOUTF8STRING 61182 . 62256)) (63495 71791 (READ-UNICODE-MAPPING-FILENAMES 63505 . 67302) (
READ-UNICODE-MAPPING 67304 . 71789)) (71858 84200 (MAKE-UNICODE-TRANSLATION-TABLES 71868 . 75178) (
MERGE-UNICODE-TRANSLATION-TABLES 75180 . 77724) (UNICODE.UNMAPPED 77726 . 81050) (
UNICODE-EXTEND-TRANSLATION? 81052 . 84198)) (84201 91037 (ALL-UNICODE-MAPPINGS 84211 . 89700) (
XCCSJAPANESECHARSETS 89702 . 91035)) (92628 103896 (WRITE-UNICODE-MAPPING 92638 . 96388) (
WRITE-UNICODE-INCLUDED 96390 . 101112) (WRITE-UNICODE-MAPPING-HEADER 101114 . 102362) (
WRITE-UNICODE-MAPPING-FILENAME 102364 . 103894)) (103897 104573 (XCCS-UTF8-AFTER-OPEN 103907 . 104571)
) (107098 109315 (UTF8HEXSTRING 107108 . 109313)) (109342 111222 (SHOWCHARS 109352 . 111220)))))
STOP
