(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)

(FILECREATED " 5-Feb-2026 11:07:12" {WMEDLEY}<library>UNICODE.;213 82607  

      :EDIT-BY rmk

      :CHANGES-TO (FNS MAKE-UNICODE-FORMATS)

      :PREVIOUS-DATE "31-Jan-2026 19:24:45" {WMEDLEY}<library>UNICODE.;212)


(PRETTYCOMPRINT UNICODECOMS)

(RPAQQ UNICODECOMS
       (
        (* ;; "Unicode external formats and MCCS-to-Unicode mapping functions.  Must be loaded after UNICODE-TABLES.")

        (COMS                                                (* ; "External formats")
              (FNS UTF8.OUTCHARFN UTF8.SLUG.OUTCHARFN UTF8.INCCODEFN UTF8.PEEKCCODEFN 
                   \UTF8.BACKCCODEFN)
              (FNS UTF16BE.OUTCHARFN UTF16BE.INCCODEFN UTF16BE.PEEKCCODEFN \UTF16BE.BACKCCODEFN)
              (FNS UTF16LE.OUTCHARFN UTF16LE.INCCODEFN UTF16LE.PEEKCCODEFN \UTF16LE.BACKCCODEFN)
              (FNS READBOM WRITEBOM)
              (INITVARS (EXTERNALEOL 'LF))
              (FNS MAKE-UNICODE-FORMATS)
              (P (MAKE-UNICODE-FORMATS EXTERNALEOL))
              (ADDVARS (*DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8)))
              (FNS UTF8.BINCODE \UTF8.FETCHCODE)
              (FNS UTF8.VALIDATE NUTF8-BYTE1-BYTES NUTF8-CODE-BYTES NUTF8-STRING-BYTES N-MCHARS)
              (DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE \UTF8.GETBASEBYTE 
                                                      UNICODE.SMALLP)))
        
        (* ;; "")

        
        (* ;; "These come after the translation tables have been loaded, since the Unix file names needed to read the mapping files depend on the UTF8 string coercions.  Those functions are defined as EVQ in UFS, cannot be used until the tables exist.  This assumes that previous files have only 7-bit MCCS characters in their names.")

        (FNS MTOUCODE UTOMCODE MTOUCODE? UTOMCODE? MTOUSTRING UTOMSTRING MTOUTF8STRING UTF8TOMSTRING)
        (FNS XTOUCODE UTOXCODE XTOUCODE? UTOXCODE? XTOUSTRING UTOXSTRING XTOUTF8STRING)
        
        (* ;; "")

        (COMS                                                (* ; "Write Unicode mapping files")
              (FNS WRITE-UNICODE-MAPPING WRITE-UNICODE-INCLUDED WRITE-UNICODE-MAPPING-HEADER 
                   WRITE-UNICODE-MAPPING-FILENAME)
              (FNS XCCS-UTF8-AFTER-OPEN)
              
              (* ;; "Automate dumping of a documentation prefix")

              [DECLARE%: EVAL@COMPILE DONTCOPY (CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" 
                                                                              :RADIX 16))
                                                      (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX
                                                                            16]
              (VARS UNICODE-MAPPING-HEADER))
        (FNS UTF8HEXSTRING)
        (COMS                                                (* ; "debugging")
              (FNS SHOWCHARS)
              (DECLARE%: DOEVAL@LOAD DONTCOPY (MACROS HEXCHAR OCTALCHAR)))
        (DECLARE%: EVAL@COMPILE DONTCOPY (FILES (LOADCOMP)
                                                UNICODE-EXPORTS))
        (PROP (FILETYPE)
              UNICODE)))



(* ;; 
"Unicode external formats and MCCS-to-Unicode mapping functions.  Must be loaded after UNICODE-TABLES."
)




(* ; "External formats")

(DEFINEQ

(UTF8.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:43 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:45 by rmk")
                                                             (* ; "Edited 31-Jan-2024 00:32 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:02 by rmk:")
                                                            (* ; "Edited 17-Aug-2020 08:45 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "Perhaps the translation table should already do the mapping for EOL to LF, but that seems to be a separate property of the stream. Also, CRLF=2 bytes.")

    (* ;; "Print UTF8 sequence for CHARCODE.  Do not do MCCS to Unicode translation if RAW.")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
             (\BOUTEOL STREAM)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM))
           (FOR C INSIDE (CL:IF RAW
                             CHARCODE
                             (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*))
              DO (IF (ILESSP C 128)
                     THEN (\BOUT STREAM C)
                   ELSEIF (ILESSP C 2048)
                     THEN                                    (* ; "x800")
                          (\BOUT STREAM (LOGOR (LLSH 3 6)
                                               (LRSH C 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSEIF (ILESSP C 65536)
                     THEN                                    (* ; "x10000")
                          (\BOUT STREAM (LOGOR (LLSH 7 5)
                                               (LRSH C 12)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 6 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSEIF (ILESSP C 2097152)
                     THEN                                    (* ; "x200000")
                          (\BOUT STREAM (LOGOR (LLSH 15 4)
                                               (LRSH C 18)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 12 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 6 6)))
                          (\BOUT STREAM (LOGOR (LLSH 2 6)
                                               (LOADBYTE C 0 6)))
                   ELSE (ERROR "CHARCODE too big for UTF8" C])

(UTF8.SLUG.OUTCHARFN
  [LAMBDA (STREAM CODE RAW)                                  (* ; "Edited 24-Apr-2025 15:43 by rmk")
                                                             (* ; "Edited 21-Jan-2025 18:37 by rmk")
                                                             (* ; "Edited 14-Jan-2025 12:39 by rmk")

    (* ;; "Produces Unicode Representative FFFD as a slug for MCCS unmapped characters")

    (UTF8.OUTCHARFN STREAM (OR (CL:IF RAW
                                   CODE
                                   (XTOUCODE? CODE))
                               (CONSTANT (HEXNUM? "FFFD")))
           T])

(UTF8.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 23-Oct-2025 08:31 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited  2-Feb-2024 11:44 by rmk")
                                                             (* ; "Edited 30-Jan-2024 22:56 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Do not do UNICODE to MCSS translation if RAW.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (BYTE1 BYTE2 BYTE3 BYTE4 CODE (COUNT 1))
         (SETQ BYTE1 (\BIN STREAM))

         (* ;; "Distinguish on header bytes (modulo peculiar EOF behavior--the caller will get whatever ended up in BYTE1")

         (CL:WHEN (SMALLP BYTE1)
             [SETQ CODE (if (ILEQ BYTE1 127)
                            then 
                                 (* ;; 
                           "Test first:  Ascii is the common case.  EOL requires its own translation")

                                 (SELCHARQ BYTE1
                                      (CR (SELECTC (fetch (STREAM EOLCONVENTION) of STREAM)
                                              (CR.EOLC       (* ; "Also eq BYTE1")
                                                       (CHARCODE EOL))
                                              (CRLF.EOLC (if (EQ (CHARCODE LF)
                                                                 (\PEEKBIN STREAM T))
                                                             then (\BIN STREAM)
                                                                  (CL:WHEN COUNTP (SETQ COUNT 2))
                                                                  (CHARCODE EOL)
                                                           else BYTE1))
                                              BYTE1))
                                      (LF (CL:IF (EQ LF.EOLC (fetch (STREAM EOLCONVENTION)
                                                                of STREAM))
                                              (CHARCODE EOL)
                                              BYTE1))
                                      BYTE1)
                          elseif (ILEQ BYTE1 223)
                            then                             (* ; "2 bytes")
                                 (SETQ COUNT 2)
                                 (SETQ BYTE2 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                              (ILESSP BYTE2 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                 (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                              6)
                                        (LOADBYTE BYTE2 0 6))
                          elseif (ILEQ BYTE1 239)
                            then                             (* ; "3 bytes")
                                 (SETQ BYTE2 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                              (ILESSP BYTE2 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                                 (SETQ BYTE3 (\BIN STREAM))
                                 (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                              (ILESSP BYTE3 128))
                                     (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                                 (SETQ COUNT 3)
                                 (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                              12)
                                        (LLSH (LOADBYTE BYTE2 0 6)
                                              6)
                                        (LOADBYTE BYTE3 0 6))
                          else                               (* ; "4 bytes")
                               (SETQ BYTE2 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE2))
                                            (ILESSP BYTE2 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                               (SETQ BYTE3 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE3))
                                            (ILESSP BYTE3 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                               (SETQ BYTE4 (\BIN STREAM))
                               (CL:WHEN (OR (NOT (SMALLP BYTE4))
                                            (ILESSP BYTE4 128))
                                   (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4)))
                               (SETQ COUNT 4)
                               (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                            18)
                                      (LLSH (LOADBYTE BYTE2 0 6)
                                            12)
                                      (LLSH (LOADBYTE BYTE3 0 6)
                                            6)
                                      (LOADBYTE BYTE4 0 6])
         (CL:UNLESS RAW
             (SETQ CODE (UNICODE.TRANSLATE (UNICODE.SMALLP CODE)
                               *UNICODETOMCCS*)))
         (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
         CODE])

(UTF8.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 23-Oct-2025 08:26 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited  2-Feb-2024 11:48 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:53 by rmk:")

    (* ;; "Modeled this after \EUCPEEK on LLREAD.  In the multi-byte (non-ASCII) case, backs the file pointer to the beginning by the proper number of \BACKFILEPTRs, and returns a count of 0.  Returns NIL if NOERROR and either invalid UTF8 or end of file.")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (PROG (BYTE1 BYTE2 BYTE3 BYTE4 CODE)
          (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))

     (* ;; "Distinguish on header bytex")

          (CL:UNLESS BYTE1 (RETURN NIL))
          [if (ILEQ BYTE1 127)
              then 
                   (* ;; 
                   "Test first:  Ascii is the common case.  No need to back up, since we peeked.")

                   (SETQ CODE BYTE1)
            elseif [ILEQ BYTE1 223                           (* ; "2 bytes")
                         (BIN STREAM)
                         (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                         (\BACKFILEPTR STREAM)
                         (if (AND BYTE2 (IGEQ BYTE2 128))
                             then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                                          6)
                                                    (LOADBYTE BYTE2 0 6)))
                           elseif NOERROR
                           else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2]
            elseif (ILEQ BYTE1 239)
              then                                           (* ; "3 bytes")
                   (BIN STREAM)
                   (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                   (IGEQ BYTE2 128))
                       (\BACKFILEPTR STREAM)
                       (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                       (RETURN CODE))
                   (BIN STREAM)
                   (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))    (* ; 
                                                             "PEEK the last, no need to back it up")
                   (\BACKFILEPTR STREAM)
                   (\BACKFILEPTR STREAM)
                   (if (AND BYTE3 (IGEQ BYTE3 128))
                       then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                                    12)
                                              (LLSH (LOADBYTE BYTE2 0 6)
                                                    6)
                                              (LOADBYTE BYTE3 0 6)))
                     elseif NOERROR
                     else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
            else                                             (* ; "4 bytes")
                 (BIN STREAM)
                 (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                                 (IGEQ BYTE2 128))
                     (\BACKFILEPTR STREAM)
                     (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2)))
                     (RETURN CODE))
                 (BIN STREAM)
                 (CL:UNLESS (AND (SETQ BYTE3 (\PEEKBIN STREAM NOERROR))
                                 (IGEQ BYTE3 128))
                     (\BACKFILEPTR STREAM)
                     (\BACKFILEPTR STREAM)
                     (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3)))
                     (RETURN CODE))
                 (BIN STREAM)
                 (SETQ BYTE4 (\PEEKBIN STREAM NOERROR))
                 (\BACKFILEPTR STREAM)
                 (\BACKFILEPTR STREAM)
                 (\BACKFILEPTR STREAM)
                 (if (AND BYTE4 (IGEQ BYTE4 128))
                     then (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                                  18)
                                            (LLSH (LOADBYTE BYTE2 0 6)
                                                  12)
                                            (LLSH (LOADBYTE BYTE3 0 6)
                                                  6)
                                            (LOADBYTE BYTE4 0 6)))
                   elseif NOERROR
                   else (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4]
          (CL:WHEN (AND CODE (NOT RAW))
              (SETQ CODE (UNICODE.TRANSLATE (UNICODE.SMALLP CODE)
                                *UNICODETOMCCS*)))
          (RETURN CODE])

(\UTF8.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 19-Jul-2022 15:30 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:04 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE.  Presumably a little bit more efficient if we decoded the UTF8 bytes backwards and didn't do the peek, but probably not worth the complexity. ")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (BIND (C _ 0) WHILE (IF (\BACKFILEPTR STREAM)
                            THEN (ADD C -1)
                                 (EQ 2 (LRSH (\PEEKBIN STREAM)
                                             6))
                          ELSE (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
                               (RETURN NIL)) REPEATUNTIL (EQ C -4)
       FINALLY (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C))
             (RETURN (UTF8.PEEKCCODEFN STREAM NIL RAW])
)
(DEFINEQ

(UTF16BE.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:44 by rmk")
                                                             (* ; "Edited 31-Jan-2024 00:32 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:09 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "PRINT UTF16 sequence for CHARCODE.  Do not do MCCS to UNICODE translation if RAW.")

    (* ;; "Not sure about EOL conversion if truly %"raw%"")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM)))
    (FOR C INSIDE (CL:IF RAW
                      CHARCODE
                      (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*)) DO (\WOUT STREAM C])

(UTF16BE.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:00 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:05 by rmk:")

    (* ;; 
    "Do not do UNICODE to MCCS translation if RAW.  Test for SMALLPin case of funky EOF behavior")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (CODE BYTE1 BYTE2 COUNT)
         (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
                  (SMALLP (SETQ BYTE2 (\BIN STREAM]
             THEN (SETQ COUNT 2)
                  (SETQ CODE (create WORD
                                    HIBYTE _ (\BIN STREAM)
                                    LOBYTE _ (\BIN STREAM)))
                  (CL:UNLESS RAW
                      (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
                  (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
                  CODE
           ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])

(UTF16BE.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:01 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:58 by rmk:")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (LET (BYTE1 BYTE2 CODE)
         (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
         (IF BYTE1
             THEN (\BIN STREAM)
                  (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                  (\BACKFILEPTR STREAM)
                  (IF BYTE2
                      THEN (SETQ CODE (create WORD
                                             HIBYTE _ BYTE1
                                             LOBYTE _ BYTE2))
                           (CL:IF RAW
                               CODE
                               (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
                    ELSEIF NOERROR
                      THEN NIL)
           ELSEIF NOERROR
             THEN NIL
           ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])

(\UTF16BE.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:02 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:14 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:07 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (CL:WHEN (\BACKFILEPTR STREAM)
        (LET (CODE (BYTE2 (\PEEKBIN STREAM)))
             (IF (\BACKFILEPTR STREAM)
                 THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
                      (SETQ CODE (create WORD
                                        HIBYTE _ (\PEEKBIN STREAM)
                                        LOBYTE _ BYTE2))
                      (CL:IF RAW
                          CODE
                          (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
               ELSEIF COUNTP
                 THEN (SETQ *BYTECOUNTER* -1)
                      NIL)))])
)
(DEFINEQ

(UTF16LE.OUTCHARFN
  [LAMBDA (STREAM CHARCODE RAW)                              (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:58 by rmk")
                                                            (* ; "Edited  8-Aug-2021 13:09 by rmk:")
                                                            (* ; "Edited 30-Jan-2020 23:08 by rmk:")

    (* ;; "PRINT UTF16 sequence for CHARCODE.  Do not do MCCS to UNICODE translation if RAW.")

    (* ;; "Not sure about EOL conversion if truly %"raw%"")

    (IF (EQ CHARCODE (CHARCODE EOL))
        THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0)
      ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM)
                  (IPLUS16 1 DATUM)))
    (FOR C INSIDE (CL:IF RAW
                      CHARCODE
                      (UNICODE.TRANSLATE CHARCODE *MCCSTOUNICODE*))
       DO (BOUT STREAM (fetch LOBYTE of CHARCODE))
          (BOUT STREAM (fetch HIBYTE of CHARCODE])

(UTF16LE.INCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:45 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:03 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:05 by rmk:")

    (* ;; 
    "Do not do UNICODE to MCCS translation if RAW.  Test for SMALLPin case of funky EOF behavior")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (LET (CODE BYTE1 BYTE2 COUNT)
         (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM)))
                  (SMALLP (SETQ BYTE2 (\BIN STREAM]
             THEN (SETQ COUNT 2)
                  (SETQ CODE (create WORD
                                    LOBYTE _ (\BIN STREAM)
                                    HIBYTE _ (\BIN STREAM)))
                  (CL:UNLESS RAW
                      (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOMCCS*)))
                  (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT))
                  CODE
           ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM])

(UTF16LE.PEEKCCODEFN
  [LAMBDA (STREAM NOERROR RAW)                               (* ; "Edited 24-Apr-2025 15:46 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:43 by rmk")
                                                            (* ; "Edited 14-Jun-2021 22:58 by rmk:")

    (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.")

    (* ;; "Do not do UNICODE to MCCS translation if RAW")

    (LET (BYTE1 BYTE2 CODE)
         (SETQ BYTE1 (\PEEKBIN STREAM NOERROR))
         (IF BYTE1
             THEN (\BIN STREAM)
                  (SETQ BYTE2 (\PEEKBIN STREAM NOERROR))
                  (\BACKFILEPTR STREAM)
                  (IF BYTE2
                      THEN (SETQ CODE (LOGOR (LLSH BYTE2 8)
                                             BYTE1))
                           (CL:IF RAW
                               CODE
                               (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
                    ELSEIF NOERROR
                      THEN NIL)
           ELSEIF NOERROR
             THEN NIL
           ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2])

(\UTF16LE.BACKCCODEFN
  [LAMBDA (STREAM COUNTP RAW)                                (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 10-Mar-2024 12:04 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:14 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:07 by rmk:")

    (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (CL:WHEN (\BACKFILEPTR STREAM)
        (LET (CODE (BYTE2 (\PEEKBIN STREAM)))
             (IF (\BACKFILEPTR STREAM)
                 THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2))
                      (SETQ CODE (create WORD
                                        HIBYTE _ BYTE2
                                        LOBYTE _ (\PEEKBIN STREAM)))
                      (CL:IF RAW
                          CODE
                          (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))
               ELSEIF COUNTP
                 THEN (SETQ *BYTECOUNTER* -1)
                      NIL)))])
)
(DEFINEQ

(READBOM
  [LAMBDA (STREAM COUNTP)                                    (* ; "Edited 17-Jan-2025 11:29 by rmk")
                                                             (* ; "Edited 11-Mar-2024 23:53 by rmk")
                                                             (* ; "Edited 10-Mar-2024 13:01 by rmk")

    (* ;; "If COUNTP, this must be under a generic \INCCODE that binds *BYTECOUNTER*")

    (* ;; "Reads and decodes the BOM bytes.  If BOM ispresent, the stream is left at the first following byte, otherwise the stream is reset to its position on entry (presumably 0).")

    (* ;; "I used the UNHEXTRING constants so that the hex bytes are visible in the code, maybe there's another function that does that?")

    (DECLARE (USEDFREE *BYTECOUNTER*))
    (SELECTC (\PEEKBIN STREAM T)
        ((HEXNUM? "EF") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "BB"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (if (EQ (CONSTANT (HEXNUM? "BF"))
                              (\PEEKBIN STREAM T))
                          then (BIN STREAM)
                               (CL:WHEN COUNTP (add *BYTECOUNTER* 3))
                               :UTF-8
                        else (\BACKFILEPTR STREAM))
               else (\BACKFILEPTR STREAM)))
        ((HEXNUM? "FE") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "FF"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (CL:WHEN COUNTP (add *BYTECOUNTER* 2))
                      :UTF-16BE
               else (\BACKFILEPTR STREAM)))
        ((HEXNUM? "FF") 
             (BIN STREAM)
             (if (EQ (CONSTANT (HEXNUM? "FE"))
                     (\PEEKBIN STREAM T))
                 then (BIN STREAM)
                      (CL:WHEN COUNTP (add *BYTECOUNTER* 2))
                      :UTF-16LE
               else (\BACKFILEPTR STREAM)))
        NIL])

(WRITEBOM
  [LAMBDA (STREAM FORMAT)                                    (* ; "Edited 17-Jan-2025 11:29 by rmk")
                                                             (* ; "Edited 16-Mar-2024 20:53 by rmk")
                                                             (* ; "Edited 11-Mar-2024 23:53 by rmk")
                                                             (* ; "Edited 10-Mar-2024 13:01 by rmk")

    (* ;; "Writes a BOM that represents FORMAT (:UTF-8, :UTF16-BE, :UTF16-LE")

    (SELECTQ FORMAT
        (:UTF-8 (BOUT STREAM (CONSTANT (HEXNUM? "EF")))
                (BOUT STREAM (CONSTANT (HEXNUM? "BB")))
                (BOUT STREAM (CONSTANT (HEXNUM? "BF"))))
        (:UTF-16BE (BOUT STREAM (CONSTANT (HEXNUM? "FE")))
                   (BOUT STREAM (CONSTANT (HEXNUM? "FF"))))
        (:UTF-16LE (BOUT STREAM (CONSTANT (HEXNUM? "FF")))
                   (BOUT STREAM (HEXNUM? "FE")))
        NIL])
)

(RPAQ? EXTERNALEOL 'LF)
(DEFINEQ

(MAKE-UNICODE-FORMATS
  [LAMBDA (EXTERNALEOL)                                      (* ; "Edited  5-Feb-2026 11:06 by rmk")
                                                             (* ; "Edited 17-Jan-2025 18:38 by rmk")
                                                             (* ; "Edited 10-Mar-2024 11:55 by rmk")
                                                             (* ; "Edited  8-Dec-2023 15:19 by rmk")
                                                             (* ; "Edited 19-Jul-2022 15:36 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:08 by rmk:")

    (* ;; "RAW formats do not do XCCS/Unicode translation, just deal with the byte encoding.")

    (* ;; "The EXTERNALEOL specifies the EOLCONVENTION of the stream, particularly to produce output files with the desired convention.  On input the macro \CHECKEOLC (LLREAD) coerces only that coding to the internal EOL, which is a mistake.")

    (MAKE-EXTERNALFORMAT :UTF-8 (FUNCTION UTF8.INCCODEFN)
           (FUNCTION UTF8.PEEKCCODEFN)
           (FUNCTION \UTF8.BACKCCODEFN)
           (FUNCTION UTF8.OUTCHARFN)
           NIL EXTERNALEOL NIL (FUNCTION MTOUTF8STRING)
           NIL
           (FUNCTION NILL)
           (FUNCTION UTF8TOMSTRING))
    (MAKE-EXTERNALFORMAT :UTF-8-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                (UTF8.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF8.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF8.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF8.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE (FUNCTION UTF16BE.INCCODEFN)
           (FUNCTION UTF16BE.PEEKCCODEFN)
           (FUNCTION \UTF16BE.BACKCCODEFN)
           (FUNCTION UTF16BE.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16BE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                   (UTF16BE.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF16BE.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF16BE.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF16BE.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16LE (FUNCTION UTF16LE.INCCODEFN)
           (FUNCTION UTF16LE.PEEKCCODEFN)
           (FUNCTION \UTF16LE.BACKCCODEFN)
           (FUNCTION UTF16LE.OUTCHARFN)
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (MAKE-EXTERNALFORMAT :UTF-16LE-RAW [FUNCTION (LAMBDA (STREAM COUNTP)
                                                   (UTF16LE.INCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM NOERROR)
                       (UTF16LE.PEEKCCODEFN STREAM NOERROR T]
           [FUNCTION (LAMBDA (STREAM COUNTP)
                       (\UTF16LE.BACKCCODEFN STREAM COUNTP T]
           [FUNCTION (LAMBDA (STREAM CHARCODE)
                       (UTF16LE.OUTCHARFN STREAM CHARCODE T]
           NIL EXTERNALEOL NIL NIL NIL (FUNCTION NILL))
    (\INSTALL.EXTERNALFORMAT (create EXTERNALFORMAT using (FIND-FORMAT :UTF-8)
                                                          NAME _ :UTF-8-SLUG OUTCHARFN _
                                                          (FUNCTION UTF8.SLUG.OUTCHARFN])
)

(MAKE-UNICODE-FORMATS EXTERNALEOL)

(ADDTOVAR *DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8))
(DEFINEQ

(UTF8.BINCODE
  [LAMBDA (STREAM RAW)                                       (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited  4-Feb-2024 01:06 by rmk")
                                                             (* ; "Edited  1-Feb-2024 11:21 by rmk")
                                                             (* ; "Edited 28-Dec-2023 13:32 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Decodes a UTF8 character code by binning  from STREAM ")

    (* ;; "The validity of STREAM is guaranteed by the caller (presumably TEDIT), we aren't testing here for the validity of the trailing bytes.")

    (* ;; "This doesn't do EOL conversion or translation, unlike UTF8.INCCODEFN.")

    (LET ((BYTE1 (BIN STREAM))
          CODE)
         [SETQ CODE (if (ILEQ BYTE1 127)
                        then BYTE1
                      elseif (ILEQ BYTE1 223)
                        then                                 (* ; "2 bytes")
                             (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                                          6)
                                    (LOADBYTE (BIN STREAM)
                                           0 6))
                      elseif (ILEQ BYTE1 239)
                        then                                 (* ; "3 bytes")
                             (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                                          12)
                                    (LLSH (LOADBYTE (BIN STREAM)
                                                 0 6)
                                          6)
                                    (LOADBYTE (BIN STREAM)
                                           0 6))
                      else                                   (* ; "4 bytes")
                           (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                                        18)
                                  (LLSH (LOADBYTE (BIN STREAM)
                                               0 6)
                                        12)
                                  (LLSH (LOADBYTE (BIN STREAM)
                                               0 6)
                                        6)
                                  (LOADBYTE (BIN STREAM)
                                         0 6]
         (CL:IF RAW
             CODE
             (UNICODE.TRANSLATE CODE *UNICODETOMCCS*))])

(\UTF8.FETCHCODE
  [LAMBDA (CODESIZE BUFFER BYTEOFFSET)                       (* ; "Edited 28-Dec-2023 13:32 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Decodes a UTF8 byte sequence of size CODESIZE in BUFFER starting at BYTEOFFSET.")

    (* ;; "The validity of the thesize, buffer, and offset are guaranteed by the caller.")

    (LET ((BYTE1 (\GETBASEBYTE BUFFER BYTEOFFSET))
          BYTE2 BYTE3 BYTE4)
         (SELECTQ CODESIZE
             (2 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 5)
                             6)
                       (LOADBYTE BYTE2 0 6)))
             (3 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 4)
                             12)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             6)
                       (LOADBYTE BYTE3 0 6)))
             (4 (SETQ BYTE2 (\UTF8.GETBASEBYTE BUFFER (IPLUS 1 BYTEOFFSET)))
                (SETQ BYTE3 (\UTF8.GETBASEBYTE BUFFER (IPLUS 2 BYTEOFFSET)))
                (SETQ BYTE4 (\UTF8.GETBASEBYTE BUFFER (IPLUS 3 BYTEOFFSET)))
                (LOGOR (LLSH (LOADBYTE BYTE1 0 3)
                             18)
                       (LLSH (LOADBYTE BYTE2 0 6)
                             12)
                       (LLSH (LOADBYTE BYTE3 0 6)
                             6)
                       (LOADBYTE BYTE4 0 6)))
             (1 BYTE1)
             (SHOULDNT])
)
(DEFINEQ

(UTF8.VALIDATE
  [LAMBDA (STREAM BYTE)                                      (* ; "Edited  2-Feb-2024 12:03 by rmk")
                                                             (* ; "Edited 28-Dec-2023 11:57 by rmk")
                                                            (* ; "Edited  6-Aug-2021 16:02 by rmk:")
                                                            (* ; "Edited  6-Aug-2020 17:13 by rmk:")

    (* ;; "Returns the codesize if the bytes starting at STREAM's current position form a valid UTF-8 sequence.")

    (* ;; "If BYTE is provided, it is interpreted as the just-read header byte with the stream is positioned just after it.")

    (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error--otherwise an error will happen if the streams runs out of necessary bytes.")

    (* ;; "For valid sequences, returns the same value as UTF8-SIZE-FROM-BYTE1, but this reads/validates the rest of the bytes. On a non-NILreturn the  stream is positioned before the header byte of the next putative code. The stream position is uncertain on a NIL return.")

    (* ;; "")

    (* ;; "Distinguish on the header byte BYTE.  Not SMALLP presumably if ENDOFSTREAMOP did something unusual.")

    (CL:UNLESS BYTE
        (SETQ BYTE (BIN STREAM)))
    (CL:WHEN (SMALLP BYTE)
        (if (ILEQ BYTE 127)
            then 1
          elseif (ILEQ BYTE 223)
            then                                             (* ; " 2 bytes")
                 (CL:UNLESS (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                (ILESSP BYTE 128))
                        2)
          elseif (ILEQ BYTE 239)
            then                                             (* ; "3 bytes")
                 (CL:UNLESS (OR (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                    (ILESSP BYTE 128))
                                (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                    (ILESSP BYTE 128)))
                        3)
          else                                               (* ; "4 bytes")
               (CL:UNLESS (OR (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128))
                              (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128))
                              (OR [NOT (SMALLP (SETQ BYTE (BIN STREAM]
                                  (ILESSP BYTE 128)))
                      4)))])

(NUTF8-BYTE1-BYTES
  [LAMBDA (BYTE1)                                            (* ; "Edited  3-Feb-2024 15:00 by rmk")
                                                             (* ; "Edited  8-Jan-2024 10:57 by rmk")
                                                             (* ; "Edited 28-Jun-2022 00:02 by rmk")
                                                            (* ; "Edited 10-Aug-2020 12:35 by rmk:")

    (* ;; "Returns the number of bytes in a UTF8 code representation whose first byte is BYTEE1. ")

    (IF (ILEQ BYTE1 127)
        THEN 1
      ELSEIF (ILEQ BYTE1 223)
        THEN 2
      ELSEIF (ILEQ BYTE1 239)
        THEN 3
      ELSE 4])

(NUTF8-CODE-BYTES
  [LAMBDA (CODE)                                             (* ; "Edited  3-Feb-2024 14:42 by rmk")
                                                             (* ; "Edited  8-Jan-2024 10:57 by rmk")
                                                             (* ; "Edited 28-Jun-2022 00:02 by rmk")
                                                            (* ; "Edited 10-Aug-2020 12:35 by rmk:")

    (* ;; "Returns the number of bytes needed to encode in UTF8 a number headed by BYTE. ")

    (IF (ILESSP CODE 128)
        THEN 1
      ELSEIF (ILESSP CODE 2048)
        THEN                                                 (* ; "x800")
             2
      ELSEIF (ILESSP CODE 65536)
        THEN                                                 (* ; "x10000")
             3
      ELSEIF (ILESSP CODE 2097152)
        THEN                                                 (* ; "x200000")
             4
      ELSE (ERROR "INVALID UTF-8 CODE"])

(NUTF8-STRING-BYTES
  [LAMBDA (STRING RAW)                                       (* ; "Edited  2-Sep-2025 10:40 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:37 by rmk")
                                                             (* ; "Edited  3-Feb-2024 21:32 by rmk")
                                                            (* ; "Edited 10-Aug-2020 09:06 by rmk:")

    (* ;; "Returns the number of bytes it would take to represent STRING in UTF8, assuming it is an MCCS string unless RAWFLG. ")

    (for I C from 1 while (SETQ C (NTHCHARCODE STRING I)) sum (NUTF8-CODE-BYTES (CL:IF RAW
                                                                                    C
                                                                                    (MTOUCODE C))])

(N-MCHARS
  [LAMBDA (UTF8STRING)                                       (* ; "Edited  9-Sep-2025 08:35 by rmk")

    (* ;; "Returns the number of MCCS characters coded in UTF8STRING")

    (for I B from 1 while (SETQ B (NTHCHARCODE UTF8STRING I)) by (NUTF8-BYTE1-BYTES B) count T])
)
(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(PUTPROPS UNICODE.TRANSLATE MACRO [OPENLAMBDA (CODE TRANSLATION-TABLE DONTFAKE RETURNALL)

                                   (* ;; "If RETURNALL and there are alternatives in the RANG, the list is returned. Othewise just the first one if the fake flag allows ")

                                    (LET [(RANGE (OR (GETHASH CODE TRANSLATION-TABLE)
                                                     (UNICODE.UNMAPPED CODE TRANSLATION-TABLE 
                                                            DONTFAKE]
                                         (CL:WHEN RANGE
                                             (if (AND RETURNALL (CDR RANGE))
                                                 then RANGE
                                               else (SETQ RANGE (CAR RANGE))
                                                    (CL:IF DONTFAKE
                                                        (TRUECODEP RANGE TRANSLATION-TABLE)
                                                        RANGE)))])

(PUTPROPS \UTF8.GETBASEBYTE MACRO ((BASE OFFSET ERROR?)      (* ; 
                        "Fetches the OFFSET'th byte from BASE, checking for UTF-8 validity if ERROR?")
                                   (IF ERROR?
                                       THEN (LET ((BYTE (\GETBASEBYTE BASE OFFSET)))
                                                 (CL:WHEN (ILESSP BYTE 128)
                                                        (ERROR "INVALID UTF8 BYTE" BYTE))
                                                 BYTE)
                                     ELSE (\GETBASEBYTE BASE OFFSET))))

(PUTPROPS UNICODE.SMALLP MACRO [OPENLAMBDA (UNICODE)         (* ; 
                                                "Cananonicalizes a large UNICODE for EQ hash-testing")
                                 (OR (SMALLP UNICODE)
                                     (CAR (OR (MEMBER UNICODE *LARGEUNICODES*)
                                              (PUSH *LARGEUNICODES* UNICODE])
)
)



(* ;; "")




(* ;; 
"These come after the translation tables have been loaded, since the Unix file names needed to read the mapping files depend on the UTF8 string coercions.  Those functions are defined as EVQ in UFS, cannot be used until the tables exist.  This assumes that previous files have only 7-bit MCCS characters in their names."
)

(DEFINEQ

(MTOUCODE
  [LAMBDA (MCODE)                                            (* ; "Edited  4-Sep-2025 15:10 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:19 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE MCODE *MCCSTOUNICODE*])

(UTOMCODE
  [LAMBDA (UNICODE)                                          (* ; "Edited 23-Oct-2025 08:23 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:17 by rmk")
                                                             (* ; "Edited 16-Jan-2025 23:46 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE (UNICODE.SMALLP UNICODE)
           *UNICODETOMCCS*])

(MTOUCODE?
  [LAMBDA (MCODE)                                            (* ; "Edited  4-Sep-2025 15:09 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:18 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:38 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:44 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the Unix range-code(s) corresponding to MCODE if there are true mapppings, otherwise NIL.  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (UNICODE.TRANSLATE MCODE *MCCSTOUNICODE* T T])

(UTOMCODE?
  [LAMBDA (UNICODE)                                          (* ; "Edited 23-Oct-2025 08:24 by rmk")
                                                             (* ; "Edited 24-Apr-2025 10:18 by rmk")
                                                             (* ; "Edited 19-Jan-2025 21:14 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:46 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the MCCS range-code(s) corresponding to UNICODE if there are true mapppings, otherwise NIL.  ")

    (* ;; 
    " NOTE:  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (* ;; "Canonicalize unicodes outside of the 16-bit plane")

    (UNICODE.TRANSLATE (UNICODE.SMALLP UNICODE)
           *UNICODETOMCCS* T T])

(MTOUSTRING
  [LAMBDA (MSTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:19 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:01 by rmk")

    (* ;; "Converts MCCS codes in MSTRING to Unicodes.")

    (for I MCODE (USTRING _ (CL:IF DESTRUCTIVE
                                MSTRING
                                (CONCAT MSTRING))) from 1 while (SETQ MCODE (NTHCHARCODE MSTRING I))
       do (RPLCHARCODE USTRING I (MTOUCODE MCODE)) finally (RETURN USTRING])

(UTOMSTRING
  [LAMBDA (USTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:18 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:00 by rmk")

    (* ;; "Converts Unicodes to MCCS codes in USTRING.")

    (for I UCODE (MSTRING _ (CL:IF DESTRUCTIVE
                                USTRING
                                (CONCAT USTRING))) from 1 while (SETQ UCODE (NTHCHARCODE USTRING I))
       do (RPLCHARCODE MSTRING I (UTOMCODE UCODE)) finally (RETURN MSTRING])

(MTOUTF8STRING
  [LAMBDA (MSTRING)                                          (* ; "Edited 31-Jan-2026 19:15 by rmk")
                                                             (* ; "Edited  9-Sep-2025 07:51 by rmk")
                                                             (* ; "Edited  4-Sep-2025 15:13 by rmk")
                                                             (* ; "Edited  2-Sep-2025 11:12 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:37 by rmk")
                                                             (* ; "Edited  3-Feb-2024 14:55 by rmk")
                                                            (* ; "Edited 10-Aug-2020 21:42 by rmk:")

    (* ;; 
    "Produces a string that contains the UTF8 bytes that represent the characters in MSTRING.  ")

    (* ;; "The resulting string will not be directly interpretable inside Medley.")

    (if (if (STRINGP MSTRING)
            then [OR (ffetch (STRINGP FATSTRINGP) of MSTRING)
                     (thereis C instring MSTRING suchthat (OR (IGEQ C 128)
                                                              (NEQ C (MTOUCODE C]
          elseif (LITATOM MSTRING)
            then [OR (ffetch (LITATOM FATPNAMEP) of MSTRING)
                     (thereis C inatom MSTRING suchthat (OR (IGEQ C 128)
                                                            (NEQ C (MTOUCODE C]
          else T)
        then (LET [(USTR (ALLOCSTRING (NUTF8-STRING-BYTES MSTRING]
                  (for I UCODE MCODE (SINDEX _ 0) from 1 while (SETQ MCODE (NTHCHARCODE MSTRING I))
                     do (SETQ UCODE (MTOUCODE MCODE))
                        (if (ILESSP UCODE 128)
                            then (RPLCHARCODE USTR (ADD SINDEX 1)
                                        UCODE)
                          elseif (ILESSP UCODE 2048)
                            then                             (* ; "x800")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 3 6)
                                               (LRSH UCODE 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          elseif (ILESSP UCODE 65536)
                            then                             (* ; "x10000")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 7 5)
                                               (LRSH UCODE 12)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 6 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          elseif (ILESSP UCODE 2097152)
                            then                             (* ; "x200000")
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 15 4)
                                               (LRSH UCODE 18)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 12 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 6 6)))
                                 (RPLCHARCODE USTR (ADD SINDEX 1)
                                        (LOGOR (LLSH 2 6)
                                               (LOADBYTE UCODE 0 6)))
                          else (SHOULDNT)))
                  USTR)
      else MSTRING])

(UTF8TOMSTRING
  [LAMBDA (UTF8STRING)                                       (* ; "Edited 22-Oct-2025 22:00 by rmk")
                                                             (* ; "Edited 16-Oct-2025 14:39 by rmk")
                                                             (* ; "Edited  9-Sep-2025 08:59 by rmk")
    (CL:UNLESS (OR (STRINGP UTF8STRING)
                   (LITATOM UTF8STRING))
        (SETQ UTF8STRING (MKSTRING UTF8STRING)))
    (CL:WHEN (ffetch (STRINGP FATSTRINGP) of UTF8STRING)
           (\ILLEGAL.ARG UTF8STRING))
    (LET* ((NMCHARS (N-MCHARS UTF8STRING))
           (MSTRING (ALLOCSTRING NMCHARS)))
          [for M NBYTES BYTE1 (BASE _ (ffetch (STRINGP BASE) of UTF8STRING)) from 1 to NMCHARS
             as OFFSET from (fetch (STRINGP OFFST) of MSTRING) by NBYTES
             do (SETQ BYTE1 (\GETBASEBYTE BASE OFFSET))
                (SETQ NBYTES (NUTF8-BYTE1-BYTES BYTE1))
                (RPLCHARCODE MSTRING M (UTOMCODE (\UTF8.FETCHCODE NBYTES BASE OFFSET]
          MSTRING])
)
(DEFINEQ

(XTOUCODE
  [LAMBDA (XCODE)                                            (* ; "Edited  4-Sep-2025 15:09 by rmk")
                                                             (* ; "Edited 24-May-2025 23:16 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:27 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (UNICODE.TRANSLATE (XTOMCODE XCODE)
           *MCCSTOUNICODE*])

(UTOXCODE
  [LAMBDA (UNICODE)                                          (* ; "Edited 24-May-2025 23:17 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 16-Jan-2025 23:46 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")
    (MTOXCODE (UNICODE.TRANSLATE UNICODE *UNICODETOMCCS*])

(XTOUCODE?
  [LAMBDA (XCCSCODE)                                         (* ; "Edited 24-May-2025 23:18 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:27 by rmk")
                                                             (* ; "Edited 20-Jan-2025 20:38 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:44 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the Unix range-code(s) corresponding to XCCSCODE if there are true mapppings, otherwise NIL.  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (UNICODE.TRANSLATE (XTOMCODE XCCSCODE)
           *MCCSTOUNICODE* T T])

(UTOXCODE?
  [LAMBDA (UNICODE)                                          (* ; "Edited 24-May-2025 23:19 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:28 by rmk")
                                                             (* ; "Edited 19-Jan-2025 21:14 by rmk")
                                                             (* ; "Edited 18-Jan-2025 11:46 by rmk")
                                                             (* ; "Edited 15-Jan-2025 19:51 by rmk")
                                                             (* ; "Edited 14-Jan-2025 13:14 by rmk")
                                                            (* ; "Edited  9-Aug-2020 09:04 by rmk:")

    (* ;; "Returns the XCCS range-code(s) corresponding to UNICODE if there are true mapppings, otherwise NIL.  ")

    (* ;; 
    " NOTE:  Alternative codes are returned in a list, the code itself is returned for a singleton.")

    (MTOXCODE (UNICODE.TRANSLATE UNICODE *UNICODETOMCCS* T T])

(XTOUSTRING
  [LAMBDA (XSTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 12:00 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:01 by rmk")

    (* ;; "Converts XCCS codes in XSTRING to Unicodes.")

    (for I UCODE XCODE (USTRING _ (CL:IF DESTRUCTIVE
                                      XSTRING
                                      (CONCAT XSTRING))) from 1 while (SETQ XCODE (NTHCHARCODE 
                                                                                         XSTRING I))
       do (RPLCHARCODE USTRING I (XTOUCODE XCODE)) finally (RETURN USTRING])

(UTOXSTRING
  [LAMBDA (USTRING DESTRUCTIVE)                              (* ; "Edited  2-Sep-2025 11:54 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:00 by rmk")

    (* ;; "Converts Unicodes in USTRING to XCCS codes.")

    (for I XCODE UCODE (XSTRING _ (CL:IF DESTRUCTIVE
                                      USTRING
                                      (CONCAT USTRING))) from 1 while (SETQ UCODE (NTHCHARCODE 
                                                                                         USTRING I))
       unless (EQ UCODE (SETQ XCODE (UTOXCODE UCODE))) do (RPLCHARCODE XSTRING I XCODE)
       finally (RETURN XSTRING])

(XTOUTF8STRING
  [LAMBDA (XSTRING)                                          (* ; "Edited  4-Sep-2025 18:37 by rmk")
                                                             (* ; "Edited  2-Sep-2025 11:37 by rmk")
                                                             (* ; "Edited 29-Apr-2025 12:53 by rmk")
                                                             (* ; "Edited 24-Apr-2025 15:42 by rmk")
                                                             (* ; "Edited  3-Feb-2024 14:55 by rmk")
                                                            (* ; "Edited 10-Aug-2020 21:42 by rmk:")

    (* ;; "Produces a string that contains the UTF8 bytes that represent the characters in XSTRING.  Applies the ")

    (* ;; "The resulting string will not be interpretable inside Medley.")

    (for I C (MSTRING _ (CONCAT XSTRING)) from 1 while (SETQ C (NTHCHARCODE XSTRING I))
       do (RPLCHARCODE MSTRING I (XTOMCODE C)) finally (RETURN (MTOUTF8STRING MSTRING])
)



(* ;; "")




(* ; "Write Unicode mapping files")

(DEFINEQ

(WRITE-UNICODE-MAPPING
  [LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK)             (* ; "Edited  4-Jan-2024 22:44 by rmk")
                                                            (* ; "Edited 16-Aug-2020 16:56 by rmk:")

    (* ;; "Writes a symbol unicode mapping file.  Mapping is a list of (XCCS-code Unicode) pairs, which may contain codes in multiple character sets.")

    (* ;; "If FILE is NIL, it defaults to a name XCCS- followed by the octal character sets in the mapping, in the unicode/XEROX directory.")

    (* ;; "The output lines are of the form x0XXX<tab>x0UUUU<tab>#  Unicode-char")

    (* ;; 
    "If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.")

    (* ;; "Otherwise, all and only mappings included in thos charsets are included in a single output file--an implicit subset.")

    (IF (AND (EQ INCLUDECHARSETS T)
             (NULL FILE))
        THEN (IF MAPPING
                 THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F (WRITE-UNICODE-MAPPING MAPPING
                                                                       (CAR CSI)
                                                                       NIL T)) COLLECT F)
               ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T)
                    NIL)
      ELSE
      (LET
       (IMAPPING CSETINFO RANGES)
       (CL:MULTIPLE-VALUE-SETQ (IMAPPING CSETINFO RANGES)
              (WRITE-UNICODE-INCLUDED MAPPING INCLUDECHARSETS))
       (IF IMAPPING
           THEN (CL:WITH-OPEN-FILE
                 (STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES)
                        :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF-8-RAW)
                 (WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES)
                 (SORT IMAPPING T)
                 (FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING
                    DO (SETQ LEFTC (CAR M))
                       (SETQ FIRSTRIGHTC (CADR M))
                       (CL:UNLESS (EQ CSET (LRSH LEFTC 8))
                           (SETQ CSET (LRSH LEFTC 8))
                           (SETQ CSI (ASSOC CSET CSETINFO))
                           (PRINTOUT STREAM T "#  " .P2 (CADR CSI)
                                  " "
                                  (CADDR CSI)
                                  T))
                       (PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4)
                              %#
                              (FOR RIGHTC IN (CDR M) DO (PRINTOUT NIL "	" "0x" (HEXSTRING RIGHTC 4)))
                              "	#  "
                              (SELECTC FIRSTRIGHTC
                                  (UNDEFINEDCODE 
                                                 (* ;; "FFFF")

                                                 "UNDEFINED")
                                  (MISSINGCODE 
                                               (* ;; "FFFE")

                                               "MISSING")
                                  (IF (ILESSP FIRSTRIGHTC 32)
                                      THEN                   (* ; "Control chars")
                                           [CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC (CHARCODE @]
                                    ELSE (CHARACTER FIRSTRIGHTC)))
                              T))
                 (FULLNAME STREAM))
         ELSEIF (NOT EMPTYOK)
           THEN (PRINTOUT T "THERE ARE NO MAPPINGS")
                (CL:WHEN INCLUDECHARSETS
                    (PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS)
                           T))
                NIL])

(WRITE-UNICODE-INCLUDED
  [LAMBDA (MAPPING INCLUDECHARSETS)                     (* ; "Edited  4-Aug-2020 17:47 by rmk:")

    (* ;; "CSETINFO is a list of (num string name) for each included character set.")

    (LET (CHARSETS CSETINFO RANGES ICSETS IMAPPING)

         (* ;; "Normalize the INCLUDECHARSETS, then reduce MAPPING to the included mappings")

         [SETQ ICSETS (FOR C POS KNOWN INSIDE (OR INCLUDECHARSETS (FOR CSI IN 
                                                                                       XCCS-SET-NAMES
                                                                             COLLECT (CAR CSI)))
                         JOIN [SETQ KNOWN (OR (SASSOC C XCCS-SET-NAMES)
                                                  (FIND N IN XCCS-SET-NAMES
                                                     SUCHTHAT (EQ C (CADR N)))
                                                  (HELP "UNKNOWN CHARACTER SET" (OCTALSTRING C]
                               (IF (SETQ POS (STRPOS "-" (CAR KNOWN)))
                                   THEN (FOR I FROM (CL:PARSE-INTEGER (SUBSTRING
                                                                                   (CAR KNOWN)
                                                                                   1
                                                                                   (SUB1 POS))
                                                                       :RADIX 8)
                                               TO (CL:PARSE-INTEGER (SUBSTRING (CAR KNOWN)
                                                                               (ADD1 POS))
                                                             :RADIX 8)
                                               COLLECT (LIST I (OCTALSTRING I)
                                                                 (CADR KNOWN)))
                                 ELSE (CONS (CONS (CL:PARSE-INTEGER (CAR KNOWN)
                                                             :RADIX 8)
                                                      KNOWN]
         (SETQ IMAPPING (FOR M CSI IN MAPPING WHEN (SETQ CSI (ASSOC (LRSH (CAR M)
                                                                                      8)
                                                                                ICSETS))
                           COLLECT 

                                 (* ;; "The attested subset of INCLUDED")

                                 (CL:UNLESS (MEMB CSI CSETINFO)
                                        (PUSH CSETINFO CSI))
                                 M))

         (* ;; "Sort as numbers, not octal strings, then group into consecutive ranges")

         (SETQ CSETINFO (SORT CSETINFO T))
         [SETQ RANGES (FOR CTAIL C START END ON (FOR CSI IN CSETINFO
                                                           COLLECT (CAR CSI)) WHILE CTAIL
                         COLLECT (SETQ START (CAR CTAIL))
                               (SETQ END START)
                               (CONS START (WHILE [AND (CDR CTAIL)
                                                           (EQ END (SUB1 (CADR CTAIL]
                                              COLLECT (SETQ CTAIL (CDR CTAIL))
                                                    (SETQ END (CAR CTAIL]

         (* ;; "Split out groups of less than 3.  But if a range exhaustively covers a known subset (like JIS), replace by the name")

         [SETQ RANGES (FOR R STR KNOWN LAST IN RANGES
                         JOIN (SETQ LAST (CAR (LAST R)))
                               (IF (EQ (CAR R)
                                           LAST)
                                   THEN (CONS (OCTALSTRING (CAR R)))
                                 ELSEIF (SETQ KNOWN (SASSOC (SETQ STR (CONCAT (OCTALSTRING
                                                                                   (CAR R))
                                                                                 "-"
                                                                                 (OCTALSTRING LAST)))
                                                               XCCS-SET-NAMES))
                                   THEN (CONS (CADR KNOWN))
                                 ELSEIF (CDDR R)
                                   THEN (CONS STR)
                                 ELSE (LIST (OCTALSTRING (CAR R))
                                                (OCTALSTRING LAST]
         (CL:VALUES IMAPPING CSETINFO RANGES])

(WRITE-UNICODE-MAPPING-HEADER
  [LAMBDA (STREAM CSETINFO RANGES)                           (* ; "Edited  5-Jan-2024 13:24 by rmk")
                                                            (* ; "Edited  4-Aug-2020 17:38 by rmk:")

    (* ;; "Writes the standard per-file header information")

    (FOR LINE IN UNICODE-MAPPING-HEADER
       DO (PRINTOUT STREAM "#" 2)
          (SELECTQ LINE
              (XCCSCHARACTERSETS 
                   (PRINTOUT STREAM "        XCCS charset")
                   (IF (CDR CSETINFO)
                       THEN (PRINTOUT STREAM "s:" -4)
                            (FOR R IN RANGES DO (PRINTOUT STREAM R " "))
                     ELSE                                    (* ; "Singleton")
                          (PRINTOUT STREAM ": " -4 (CADAR CSETINFO)
                                 " "
                                 (CADDAR CSETINFO)))
                   (TERPRI STREAM))
              (DATE (PRINTOUT STREAM "        Date:" -13 (DATE (DATEFORMAT NO.TIME NO.LEADING.SPACES)
                                                               )
                           T))
              (PRINTOUT STREAM LINE T)))
    (TERPRI STREAM])

(WRITE-UNICODE-MAPPING-FILENAME
  [LAMBDA (FILE CSETINFO RANGES)                        (* ; "Edited  4-Aug-2020 19:34 by rmk:")
    (PACKFILENAME 'BODY [OR FILE (CONCATLIST
                                  (CONS 'XCCS- (IF (CDR CSETINFO)
                                                   THEN (FOR RTAIL R ON RANGES
                                                               JOIN (SETQ R (CAR RTAIL))
                                                                     (SETQ R
                                                                      (CL:IF (LISTP R)
                                                                          (LIST (CAR R)
                                                                                "-"
                                                                                (CDR R))
                                                                          (CONS R)))
                                                                     (CL:IF (CDR RTAIL)
                                                                            (NCONC1 R ","))
                                                                     R)
                                                 ELSE (LIST (CADAR CSETINFO)
                                                                "="
                                                                (CADDAR CSETINFO]
           'DIRECTORY
           (CAR UNICODEDIRECTORIES)
           'EXTENSION
           'TXT])
)
(DEFINEQ

(XCCS-UTF8-AFTER-OPEN
  [LAMBDA (STREAM ACCESS PARAMETERS)                         (* ; "Edited  3-Jan-2024 10:27 by rmk")
                                                            (* ; "Edited 13-Aug-2020 11:54 by rmk:")

    (* ;; 
    "If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF-8. For development")

    (CL:WHEN (AND (STRPOS "XCCS-" (U-CASE (FULLNAME STREAM)))
                  [EQ 'TXT (U-CASE (FILENAMEFIELD (FULLNAME STREAM)
                                          'EXTENSION]
                  (NOT (ASSOC 'EXTERNALFORMAT PARAMETERS)))
        (STREAMPROP STREAM 'EXTERNALFORMAT :UTF-8))])
)



(* ;; "Automate dumping of a documentation prefix")

(DECLARE%: EVAL@COMPILE DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(RPAQ MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))

(RPAQ UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16))


(CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16))
       (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16)))
)
)

(RPAQQ UNICODE-MAPPING-HEADER
       ("" "        Name:             XCCS (Version 2.0) to Unicode" "        Unicode version:  3.0"
           XCCSCHARACTERSETS "        Table version:    0.1" "        Table format:     Format A" 
           DATE "        Author:           Ron Kaplan <Ron.Kaplan@post.harvard.edu>" "" 
           "This file contains mappings from the Xerox Character Code Standard (version" 
           "2.0, 1990) into Unicode 3.0. standard codes.  That is an extension of the" 
           "version of XCCS corresponding to the fonts in the Medley system." "" 
           "The format of this file conforms to the format of the other Unicode-supplied" 
           "mapping files:" "   Three white-space (tab or spaces) separated columns:" 
           "     Column 1 is the XCCS code (as hex 0xXXXX)" 
           "     Column 2 is the corresponding Unicode (as hex 0xXXXX)" 
           "     Column 3 (after #) is a comment column. For convenience, it contains the" 
           "        Unicode character itself and the Unicode character names when available." 
           "Unicode FFFF is used for undefined XCCS codes (Column 3 = UNDEFINED" 
           "Unicode FFFE is used for XCCS codes that have not yet been filled in." 
           "(Column 3 = MISSING)" "" "This file is encoded in UTF-8, so that the Unicode characters"
           "are properly displayed in Column 3 and can be edited by standard" 
           "Unicode-enabled editors (e.g. Mac Textedit)." "" 
           "This file can also be read by the function" 
           "READ-UNICODE-MAPPING in the UNICODE Medley library package." "" 
           "The entries are in XCCS order and grouped by character sets.  In front of" 
           "the mappings, for convenience, there is a line with the octal XCCS" 
           "character set, after #." "" 
           "Note that a given XCCS code might map to codes in several different Unicode" 
           "positions, since there are repetitions in the Unicode standard." "" 
           "For more details, see the associated README.TXT file." "" 
           "Any comments or problems, contact <ron.kaplan@post.harvard.edu>"))
(DEFINEQ

(UTF8HEXSTRING
  [LAMBDA (CHARCODE)                                    (* ; "Edited 10-Aug-2020 08:33 by rmk:")

    (* ;; "Utility to produces the UTF8 hexstring representing CODE")

    (HEXSTRING (IF (ILESSP CHARCODE 128)
                       THEN CHARCODE
                     ELSEIF (ILESSP CHARCODE 2048)
                       THEN                              (* ; "x800")
                             (LOGOR (LLSH (LOGOR (LLSH 3 6)
                                                 (LRSH CHARCODE 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 65536)
                       THEN                              (* ; "x10000")
                             (LOGOR (LLSH (LOGOR (LLSH 7 5)
                                                 (LRSH CHARCODE 12))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSEIF (ILESSP CHARCODE 2097152)
                       THEN                              (* ; "x200000")
                             (LOGOR (LLSH (LOGOR (LLSH 15 4)
                                                 (LRSH CHARCODE 18))
                                          24)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 12 6))
                                          16)
                                    (LLSH (LOGOR (LLSH 2 6)
                                                 (LOADBYTE CHARCODE 6 6))
                                          8)
                                    (LOGOR (LLSH 2 6)
                                           (LOADBYTE CHARCODE 0 6)))
                     ELSE (ERROR "CHARCODE too big for UTF8" CHARCODE])
)



(* ; "debugging")

(DEFINEQ

(SHOWCHARS
  [LAMBDA (FONT FROMCHAR TOCHAR ONELINE)                     (* ; "Edited  5-Oct-2025 17:41 by rmk")
                                                             (* ; "Edited  7-Sep-2025 20:29 by rmk")
                                                             (* ; "Edited  2-Sep-2025 10:26 by rmk")
                                                             (* ; "Edited 24-Jul-2025 11:30 by rmk")
                                                             (* ; "Edited  8-Jun-2025 20:05 by rmk")
                                                             (* ; "Edited 26-Jan-2024 14:18 by mth")
                                                            (* ; "Edited  1-Aug-2020 09:27 by rmk:")
    [SETQ FONT (FONTCREATE (OR FONT '(CLASSIC 12]
    (RESETLST
        [LET ((OLDFONT (DSPFONT NIL T))
              CHARS)
             (CL:UNLESS (CHARCODEP FROMCHAR)
                 (SETQ FROMCHAR (OR (CHARCODE.DECODE FROMCHAR T)
                                    FROMCHAR)))
             (SETQ CHARS (if (LISTP FROMCHAR)
                           elseif (CHARCODEP FROMCHAR)
                             then (CL:UNLESS (CHARCODEP TOCHAR)
                                      (SETQ TOCHAR (OR (CHARCODE.DECODE TOCHAR)
                                                       FROMCHAR)))
                                  (for C from FROMCHAR to TOCHAR collect C)
                           else (CHCON FROMCHAR)))
             [RESETSAVE OLDFONT '(PROGN (DSPFONT OLDVALUE]
             (TERPRI)
             (for C in CHARS do (PRINTOUT T .FONT OLDFONT (CONCAT (OCTALSTRING (\CHARSET C))
                                                                 ","
                                                                 (OCTALSTRING (\CHAR8CODE C)))
                                       10 .FONT FONT (CHARACTER C))
                                (CL:UNLESS ONELINE (PRINTOUT T T])
    (TERPRI])
)
(DECLARE%: DOEVAL@LOAD DONTCOPY 
(DECLARE%: EVAL@COMPILE 

(PUTPROPS HEXCHAR MACRO ((CODE)
                         (HEXSTRING CODE)))

(PUTPROPS OCTALCHAR MACRO [(CODE)
                           (CONCAT (OCTALSTRING (\CHARSET CODE))
                                  ","
                                  (OCTALSTRING (LOGAND CODE 255])
)
)
(DECLARE%: EVAL@COMPILE DONTCOPY 

(FILESLOAD (LOADCOMP)
       UNICODE-EXPORTS)
)

(PUTPROPS UNICODE FILETYPE :TCOMPL)
(DECLARE%: DONTCOPY
  (FILEMAP (NIL (3379 18917 (UTF8.OUTCHARFN 3389 . 6405) (UTF8.SLUG.OUTCHARFN 6407 . 7071) (
UTF8.INCCODEFN 7073 . 12926) (UTF8.PEEKCCODEFN 12928 . 17935) (\UTF8.BACKCCODEFN 17937 . 18915)) (
18918 23608 (UTF16BE.OUTCHARFN 18928 . 19947) (UTF16BE.INCCODEFN 19949 . 21074) (UTF16BE.PEEKCCODEFN 
21076 . 22416) (\UTF16BE.BACKCCODEFN 22418 . 23606)) (23609 28332 (UTF16LE.OUTCHARFN 23619 . 24735) (
UTF16LE.INCCODEFN 24737 . 25862) (UTF16LE.PEEKCCODEFN 25864 . 27140) (\UTF16LE.BACKCCODEFN 27142 . 
28330)) (28333 31380 (READBOM 28343 . 30412) (WRITEBOM 30414 . 31378)) (31410 35163 (
MAKE-UNICODE-FORMATS 31420 . 35161)) (35260 39754 (UTF8.BINCODE 35270 . 37958) (\UTF8.FETCHCODE 37960
 . 39752)) (39755 45382 (UTF8.VALIDATE 39765 . 42362) (NUTF8-BYTE1-BYTES 42364 . 43101) (
NUTF8-CODE-BYTES 43103 . 44160) (NUTF8-STRING-BYTES 44162 . 45058) (N-MCHARS 45060 . 45380)) (47864 
57575 (MTOUCODE 47874 . 48261) (UTOMCODE 48263 . 48789) (MTOUCODE? 48791 . 49824) (UTOMCODE? 49826 . 
50995) (MTOUSTRING 50997 . 51582) (UTOMSTRING 51584 . 52169) (MTOUTF8STRING 52171 . 56460) (
UTF8TOMSTRING 56462 . 57573)) (57576 63278 (XTOUCODE 57586 . 58104) (UTOXCODE 58106 . 58614) (
XTOUCODE? 58616 . 59677) (UTOXCODE? 59679 . 60762) (XTOUSTRING 60764 . 61457) (UTOXSTRING 61459 . 
62200) (XTOUTF8STRING 62202 . 63276)) (63341 74609 (WRITE-UNICODE-MAPPING 63351 . 67101) (
WRITE-UNICODE-INCLUDED 67103 . 71825) (WRITE-UNICODE-MAPPING-HEADER 71827 . 73075) (
WRITE-UNICODE-MAPPING-FILENAME 73077 . 74607)) (74610 75286 (XCCS-UTF8-AFTER-OPEN 74620 . 75284)) (
77811 80028 (UTF8HEXSTRING 77821 . 80026)) (80055 82097 (SHOWCHARS 80065 . 82095)))))
STOP
