* TESTUPF: Move from internal/ to internal/test/unpackfilename to be parallel with filepos * IOCHAR: FILEPOS respects external format (#3) * FILEPKG: EDITCALLERS speed up with new FILEPOS * internal/test/filepos: Testing jig and cases for new FILEPOS * EXTERNALFORMAT, IOCHAR: fix external-format glitches
This commit is contained in:
817
sources/IOCHAR
817
sources/IOCHAR
@@ -1,12 +1,12 @@
|
||||
(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10)
|
||||
|
||||
(FILECREATED "26-Oct-2021 10:07:31"
|
||||
{DSK}<Users>kaplan>Local>medley3.5>git-medley>sources>IOCHAR.;5 90395
|
||||
(FILECREATED "10-Jul-2022 16:52:14"
|
||||
{DSK}<users>kaplan>local>medley3.5>working-medley>sources>IOCHAR.;39 99224
|
||||
|
||||
changes to%: (VARS IOCHARCOMS)
|
||||
:CHANGES-TO (FNS FILEPOS \SLOWFILEPOS FFILEPOS)
|
||||
|
||||
previous date%: "24-Oct-2021 23:57:27"
|
||||
{DSK}<Users>kaplan>Local>medley3.5>git-medley>sources>IOCHAR.;4)
|
||||
:PREVIOUS-DATE " 1-Jul-2022 11:55:50"
|
||||
{DSK}<users>kaplan>local>medley3.5>working-medley>sources>IOCHAR.;36)
|
||||
|
||||
|
||||
(* ; "
|
||||
@@ -32,7 +32,7 @@ Copyright (c) 1981-1988, 1990-1991, 2018, 2020 by Venue & Xerox Corporation.
|
||||
(DECLARE%: EVAL@COMPILE (PROP GLOBALVAR UPPERCASEARRAY)
|
||||
DONTCOPY
|
||||
(GLOBALVARS \TRANSPARENT)))
|
||||
(COMS (FNS FILEPOS FFILEPOS \SETUP.FFILEPOS)
|
||||
(COMS (FNS FILEPOS FFILEPOS \SETUP.FFILEPOS \SLOWFILEPOS)
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY (RESOURCES \FFDELTA1 \FFDELTA2 \FFPATCHAR)
|
||||
(CONSTANTS (\MAX.PATTERN.SIZE 128)
|
||||
(\MIN.PATTERN.SIZE 3)
|
||||
@@ -50,7 +50,7 @@ Copyright (c) 1981-1988, 1990-1991, 2018, 2020 by Venue & Xerox Corporation.
|
||||
|
||||
|
||||
(* ;;
|
||||
" Note: this might not be relevant to users with local time servers that do the right thing.")
|
||||
" Note: this might not be relevant to users with local time servers that do the right thing.")
|
||||
|
||||
(INITVARS (\TimeZoneComp 8)
|
||||
(\BeginDST 74)
|
||||
@@ -187,13 +187,12 @@ Copyright (c) 1981-1988, 1990-1991, 2018, 2020 by Venue & Xerox Corporation.
|
||||
(DECLARE%: EVAL@COMPILE
|
||||
|
||||
(PUTPROPS \CATRANSLATE MACRO (OPENLAMBDA (CABASE CASIZE CAFAT CHAR)
|
||||
(COND
|
||||
((ILEQ CHAR CASIZE)(* ;
|
||||
"If it's in the table, use the table value")
|
||||
(\GETBASEBYTE CABASE CHAR))
|
||||
(T (* ;
|
||||
"Off the end -- assume it's itself")
|
||||
CHAR))))
|
||||
(COND
|
||||
((ILEQ CHAR CASIZE) (* ;
|
||||
"If it's in the table, use the table value")
|
||||
(\GETBASECHAR CAFAT CABASE CHAR))
|
||||
(T (* ; "Off the end -- assume it's itself")
|
||||
CHAR))))
|
||||
)
|
||||
)
|
||||
(DEFINEQ
|
||||
@@ -256,339 +255,511 @@ DONTCOPY
|
||||
(DEFINEQ
|
||||
|
||||
(FILEPOS
|
||||
[LAMBDA (STR FILE START END SKIP TAIL CASEARRAY) (* ; "Edited 10-Aug-2020 21:44 by rmk:")
|
||||
(* Pavel "12-Oct-86 15:13")
|
||||
[LAMBDA (PATTERN FILE START END SKIP TAIL CASEARRAY)
|
||||
|
||||
(* ;; "RMK: Added coercion from internal XCCS string to UTF8 if searching a UTF8 file")
|
||||
(* ;; "Edited 10-Jul-2022 16:51 by rmk")
|
||||
|
||||
(* ;; "NB: this function now works on non-PAGEMAPPED files. It must use only IO functions that respect that.")
|
||||
(* ;; "Edited 1-Jul-2022 11:55 by rmk")
|
||||
|
||||
(PROG ((SKIPCHAR (AND SKIP (CHCON1 SKIP)))
|
||||
[CA (fetch (ARRAYP BASE) of (COND
|
||||
[CASEARRAY
|
||||
(COND
|
||||
((AND (ARRAYP CASEARRAY)
|
||||
(EQ (fetch (ARRAYP TYP) of
|
||||
CASEARRAY
|
||||
)
|
||||
\ST.BYTE))
|
||||
CASEARRAY)
|
||||
(T (CASEARRAY CASEARRAY]
|
||||
(T \TRANSPARENT]
|
||||
(STREAM (\GETSTREAM FILE 'INPUT))
|
||||
CHAR FIRSTCHAR STRBASE STRINDEX PATLEN PATINDEX ORGFILEPTR LASTINDEX STARTBYTE ENDBYTE
|
||||
BIGENDBYTE STARTSEG ENDSEG)
|
||||
(CL:WHEN (EQ :UTF8 (\EXTERNALFORMAT STREAM))
|
||||
(SETQ STR (XTOUSTRING STR)))
|
||||
[COND
|
||||
((LITATOM STR)
|
||||
(SETQ STRBASE (fetch (LITATOM PNAMEBASE) of STR))
|
||||
(SETQ STRINDEX 1)
|
||||
(SETQ PATLEN (fetch (LITATOM PNAMELENGTH) of STR)))
|
||||
(T (OR (STRINGP STR)
|
||||
(SETQ STR (MKSTRING STR)))
|
||||
(SETQ STRBASE (fetch (STRINGP BASE) of STR))
|
||||
(SETQ STRINDEX (fetch (STRINGP OFFST) of STR))
|
||||
(SETQ PATLEN (fetch (STRINGP LENGTH) of STR]
|
||||
(* ;
|
||||
"calculate start addr and set file ptr.")
|
||||
[SETQ STARTBYTE (COND
|
||||
(START (COND
|
||||
((NOT (AND (FIXP START)
|
||||
(IGEQ START 0)))
|
||||
(LISPERROR "ILLEGAL ARG" START)))
|
||||
(SETQ ORGFILEPTR (\GETFILEPTR STREAM))
|
||||
(\SETFILEPTR STREAM START)
|
||||
START)
|
||||
(T (SETQ ORGFILEPTR (\GETFILEPTR STREAM]
|
||||
(* ;
|
||||
"calculate the character address of the character after the last possible match.")
|
||||
[SETQ ENDBYTE (ADD1 (COND
|
||||
((NULL END) (* ; "Default is end of file")
|
||||
(IDIFFERENCE (\GETEOFPTR STREAM)
|
||||
PATLEN))
|
||||
((IGEQ END 0) (* ; "Absolute byte pointer given")
|
||||
(IMIN END (IDIFFERENCE (\GETEOFPTR STREAM)
|
||||
PATLEN)))
|
||||
((IGREATERP PATLEN (IMINUS END))
|
||||
(* ;
|
||||
"END is too far, use eof less length")
|
||||
(IDIFFERENCE (\GETEOFPTR STREAM)
|
||||
PATLEN))
|
||||
(T (IDIFFERENCE (IPLUS (\GETEOFPTR STREAM)
|
||||
END 1)
|
||||
PATLEN]
|
||||
(* ;; "Edited 25-Jun-2022 22:51 by rmk: The original version was a byte-level searcher, this upgrades to character searching as determined by the external format of the stream. (It is also a bit faster than the original).")
|
||||
|
||||
(* ;; "use STARTBYTE and ENDBYTE instead of START and END because vm functions shouldn't change their arguments.")
|
||||
(* ;; "This provides accurate results if the stream's external format is stable, wherein each character code has a unique byte representation. If the stream's format is unstable (i.e. XCCS runcoding), then the result is accurate if the stream's initial charset (or other contextual information) is correct for the START byte position.")
|
||||
|
||||
(COND
|
||||
((IGEQ STARTBYTE ENDBYTE) (* ; "nothing to search")
|
||||
(GO FAILED)))
|
||||
(SETQ LASTINDEX PATLEN)
|
||||
SKIPLP
|
||||
(* ;
|
||||
"set the first character to FIRSTCHAR, handling leading skips.")
|
||||
(COND
|
||||
((EQ LASTINDEX 0) (* ; "null case")
|
||||
(GO FOUNDIT))
|
||||
((EQ (SETQ FIRSTCHAR (\GETBASEBYTE CA (\GETBASEBYTE STRBASE STRINDEX)))
|
||||
SKIPCHAR) (* ;
|
||||
"first character in pattern is skip.")
|
||||
(SETQ LASTINDEX (SUB1 LASTINDEX))
|
||||
(\BIN STREAM) (* ; "Move forward a character.")
|
||||
(add STRINDEX 1)
|
||||
(add STARTBYTE 1)
|
||||
(GO SKIPLP)))
|
||||
(SETQ LASTINDEX (IPLUS LASTINDEX STRINDEX)) (* ;
|
||||
"Used for end of pattern check, comparing against current INDEX")
|
||||
[COND
|
||||
((SMALLP ENDBYTE)
|
||||
(SETQ STARTSEG (SETQ ENDSEG 0)))
|
||||
(T
|
||||
(* ;; "The search will be in the large integers at least part of the time, so split the start and end fileptrs into hi and lo parts. The `segment' size we choose is smaller than 2^16 so that we are still smallp near the boundary (can get around that here by decrementing everyone, but can't in FFILEPOS). Note that STARTBYTE and ENDBYTE are never actually used as file ptrs, just for counting.")
|
||||
(* ;; "Otherwise, there may be some bad matches and some missing matches. The slow case will be accurate in those cases (and a NIL return for the format's \FORMATBYTESTRING function will kick it into the slow case (about 10 times slower). This always defers to the slow case if SKIP or CASEARRAY are non-NIL.")
|
||||
|
||||
(SETQ ENDSEG (FOLDLO ENDBYTE FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ BIGENDBYTE (IMOD ENDBYTE FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTSEG (FOLDLO STARTBYTE FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTBYTE (IMOD STARTBYTE FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ ENDBYTE (COND
|
||||
((EQ STARTSEG ENDSEG)
|
||||
BIGENDBYTE)
|
||||
(T
|
||||
(* ;; "(Original algorithm advanced the pattern (and the stream starting position) over leading skips, presumably to speed up the search. A foolish (and complex) optimization, since it would just mean that you would cover the intervening characters in a different way.)")
|
||||
|
||||
(* ;; "In different segments, so we'll have to search all the way to the end of this seg; hence, `end' is currently as big as it gets")
|
||||
(* ;;
|
||||
"New interface features: TAIL=BOTH means return a dotted pair of the (start . end) of the match")
|
||||
|
||||
FILEPOS.SEGMENT.SIZE]
|
||||
FIRSTCHARLP
|
||||
(* ;; "CASEARRAY=T forces the slow case, as if CASEARRAY=(CASEARRAU) - transparent.")
|
||||
|
||||
(PROG ((STREAM (\GETSTREAM FILE 'INPUT))
|
||||
STREAMLEN ORGFILEPTR PATSTR PATLEN PATBASE PATLEN FIRSTINDEX LASTINDEX PATFIRSTBYTE
|
||||
STARTBYTEPOS ENDBYTEPOS)
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "Decode the start and end parameters, set the starting filepointer.")
|
||||
|
||||
(* ;; "Set STARTBYTEPOS and ENDBYTEPOS instead of resetting START and END because vm functions shouldn't change their arguments.")
|
||||
|
||||
(SETQ ORGFILEPTR (\GETFILEPTR STREAM))
|
||||
(SETQ STARTBYTEPOS (COND
|
||||
(START (CL:UNLESS (AND (FIXP START)
|
||||
(IGEQ START 0))
|
||||
(LISPERROR "ILLEGAL ARG" START))
|
||||
(\SETFILEPTR STREAM START)
|
||||
START)
|
||||
(T ORGFILEPTR)))
|
||||
(SETQ STREAMLEN (\GETEOFPTR STREAM))
|
||||
[SETQ ENDBYTEPOS (ADD1 (COND
|
||||
((NULL END) (* ; "Default is end of file ")
|
||||
STREAMLEN)
|
||||
((IGEQ END 0) (* ; "Absolute byte pointer given")
|
||||
(IMIN END STREAMLEN))
|
||||
(T (IPLUS STREAMLEN END 1]
|
||||
|
||||
(* ;; "STARTBYTEPOS is the position of the first matchable byte = (SETFILEPTR SBP)(BIN)")
|
||||
|
||||
(* ;; "ENDBYTEPOS here is the position one after the last possible start (not the position of the final byte of the last possible match). That is, the match itself can go further than ENDBYTEPOS")
|
||||
|
||||
(CL:WHEN (IGREATERP STARTBYTEPOS ENDBYTEPOS) (* ; "nothing to search ")
|
||||
(GO FAILED))
|
||||
(CL:WHEN (EQ (NCHARS PATTERN)
|
||||
0)
|
||||
|
||||
(* ;; "Empty string: succed. Already positioned at STARTBYTEPOS")
|
||||
|
||||
(RETURN STARTBYTEPOS))
|
||||
(CL:WHEN [OR CASEARRAY (AND SKIP (STRPOS SKIP PATTERN))
|
||||
(NOT (SETQ PATSTR (\FORMATBYTESTRING STREAM PATTERN]
|
||||
(RETURN (OR (\SLOWFILEPOS PATTERN STREAM STARTBYTEPOS ENDBYTEPOS SKIP TAIL CASEARRAY)
|
||||
(GO FAILED))))
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "Now we're in the fast case: No SKIP and no CASEARRAY, and we were able to map the search string to a stable sequence of file bytes. ")
|
||||
|
||||
(SETQ PATLEN (ffetch (STRINGP LENGTH) of PATSTR))
|
||||
(CL:WHEN (IGREATERP (SUB1 PATLEN)
|
||||
(IDIFFERENCE STREAMLEN ENDBYTEPOS))
|
||||
|
||||
(* ;; "EOF guard; needed to wait for actual pattern length")
|
||||
|
||||
(SETQ ENDBYTEPOS (IDIFFERENCE STREAMLEN (SUB1 PATLEN)))
|
||||
(CL:WHEN (IGEQ STARTBYTEPOS ENDBYTEPOS)
|
||||
(GO FAILED)))
|
||||
(SETQ PATBASE (ffetch (STRINGP BASE) of PATSTR))
|
||||
(SETQ FIRSTINDEX (ffetch (STRINGP OFFST) of PATSTR))
|
||||
(SETQ LASTINDEX (IPLUS FIRSTINDEX (SUB1 PATLEN)))
|
||||
(SETQ PATFIRSTBYTE (\GETBASEBYTE PATBASE FIRSTINDEX))
|
||||
(ADD FIRSTINDEX 1) (* ;
|
||||
"Start at the second byte when the first one matched. ")
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "The stream keeps track of its byte position, but we must count down ourselves so that we don't go beyond ENDBYTEPOS (would be nice if we could construct a substream). Use hi/lo arithmetic to avoid large integers on big byte regions.")
|
||||
|
||||
(* ;; "A loop of nomatch-match sequences")
|
||||
|
||||
(BIND (NBYTES _ (IDIFFERENCE ENDBYTEPOS STARTBYTEPOS))
|
||||
NBYTESHI NBYTESLO FIRST (SETQ NBYTESHI (FOLDLO NBYTES FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ NBYTESLO (IMOD NBYTES FILEPOS.SEGMENT.SIZE))
|
||||
DO (DO
|
||||
(* ;; "Find next FIRSTBYTE")
|
||||
|
||||
(CL:WHEN (ILEQ NBYTESLO 0) (* ; "Finished this segment ")
|
||||
(CL:WHEN (EQ NBYTESHI 0)
|
||||
(GO FAILED)) (* ; "Roll over to a new segment")
|
||||
(add NBYTESLO FILEPOS.SEGMENT.SIZE)
|
||||
(add NBYTESHI -1))
|
||||
(ADD NBYTESLO -1) (* ; "Decrement the byte count")
|
||||
REPEATUNTIL (EQ PATFIRSTBYTE (\BIN STREAM)))
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "Found PATFIRSTBYTE, enter match loop.")
|
||||
|
||||
(FOR I FROM FIRSTINDEX TO LASTINDEX
|
||||
DO (CL:UNLESS (EQ (\GETBASEBYTE PATBASE I)
|
||||
(\BIN STREAM))
|
||||
|
||||
(* ;; "Match failed: Go back to second position and try again")
|
||||
|
||||
(\INCFILEPTR STREAM (SUB1 (IDIFFERENCE FIRSTINDEX I)))
|
||||
(RETURN)) FINALLY
|
||||
|
||||
(* ;;
|
||||
"Ran off the end: complete match, get out of the outer loop")
|
||||
|
||||
(GO FOUNDIT)))
|
||||
FOUNDIT
|
||||
|
||||
|
||||
(* ;; "STARTBYTE is the possible beginning of a match. the file ptr of the file is always at STARTBYTE position when the FIRSTCHAR loop is passed.")
|
||||
(* ;; "The stream's charset should be set to the charset corresponding to the return byte-position. We haven't been tracking it, but if we are returning the tail pointer, then the stream's character set must be the same as the character set of the last character o fPATTERN.")
|
||||
|
||||
(COND
|
||||
((EQ STARTBYTE ENDBYTE) (* ; "end of this part of search")
|
||||
(COND
|
||||
((EQ STARTSEG ENDSEG) (* ; "failed")
|
||||
(GO FAILED))) (* ;
|
||||
"Finished this segment, roll over into new one")
|
||||
(SETQ STARTBYTE 0) (* ;
|
||||
"= STARTBYTE-FILEPOS.SEGMENT.SIZE")
|
||||
[COND
|
||||
((EQ (add STARTSEG 1)
|
||||
ENDSEG) (* ;
|
||||
"Entering final segment, so set ENDBYTE to actual end instead of segment end")
|
||||
(COND
|
||||
((EQ (SETQ ENDBYTE BIGENDBYTE)
|
||||
0)
|
||||
(GO FAILED]
|
||||
(GO FIRSTCHARLP))
|
||||
((NEQ FIRSTCHAR (\GETBASEBYTE CA (\BIN STREAM)))
|
||||
(add STARTBYTE 1)
|
||||
(GO FIRSTCHARLP)))
|
||||
(SETQ PATINDEX STRINDEX)
|
||||
MATCHLP
|
||||
(* ;
|
||||
"At this point, STR is matched thru offset PATINDEX")
|
||||
(COND
|
||||
((EQ (SETQ PATINDEX (ADD1 PATINDEX))
|
||||
LASTINDEX) (* ; "matched for entire length")
|
||||
(GO FOUNDIT))
|
||||
((OR (EQ (SETQ CHAR (\GETBASEBYTE CA (\GETBASEBYTE STRBASE PATINDEX)))
|
||||
(\GETBASEBYTE CA (\BIN STREAM)))
|
||||
(EQ CHAR SKIPCHAR)) (* ;
|
||||
"Char from file matches char from STR")
|
||||
(GO MATCHLP))
|
||||
(T (* ;
|
||||
"Match failed, so we have to start again with first char")
|
||||
(\SETFILEPTR STREAM (IDIFFERENCE (\GETFILEPTR STREAM)
|
||||
(IDIFFERENCE PATINDEX STRINDEX)))
|
||||
(* ;; "Getting the character set for the start of the match is a little trickier. We know the character set at the byte that starts the beginning of the match (= character set of PATTERN's first character. If we set the stream to that charset, then back up one character, that should get it right. ")
|
||||
|
||||
(* ;; "Back up over the chars we have just read in trying to match, less one. I.e. go back to one past the previous starting point")
|
||||
(* ;; "This should only be necessary for an unstable format, maybe don't bother if it isn't XCCS. There is another special case here for XCCS: if the charset is 255 at the start (=2 byte encoding), then we assume that it didn't change, and nothing to worry about.")
|
||||
|
||||
(add STARTBYTE 1)
|
||||
(GO FIRSTCHARLP)))
|
||||
FOUNDIT
|
||||
(* ;
|
||||
"set fileptr, adjust for beginning skips and return proper value.")
|
||||
[COND
|
||||
((NOT TAIL) (* ;
|
||||
"Fileptr wants to be at start of string")
|
||||
(\SETFILEPTR STREAM (IDIFFERENCE (\GETFILEPTR STREAM)
|
||||
PATLEN]
|
||||
(RETURN (\GETFILEPTR STREAM))
|
||||
(RETURN (IF TAIL
|
||||
THEN (CL:UNLESS (EQ NSCHARSETSHIFT (ffetch (STREAM CHARSET) of STREAM))
|
||||
(freplace (STREAM CHARSET) of STREAM with (\CHARSET (NTHCHARCODE
|
||||
PATTERN -1))))
|
||||
(CL:IF (EQ TAIL 'BOTH)
|
||||
(CONS (IDIFFERENCE (\GETFILEPTR STREAM)
|
||||
PATLEN)
|
||||
(\GETFILEPTR STREAM))
|
||||
(\GETFILEPTR STREAM))
|
||||
ELSE
|
||||
(* ;;
|
||||
"Fileptr wants to be where the match started, PATLEN back from where the match ended")
|
||||
|
||||
(\INCFILEPTR STREAM (IMINUS PATLEN))
|
||||
(SETQ STARTBYTEPOS (\GETFILEPTR STREAM))
|
||||
(CL:UNLESS (EQ NSCHARSETSHIFT (ffetch (STREAM CHARSET) of STREAM))
|
||||
(freplace (STREAM CHARSET) of STREAM with (\CHARSET (CHCON1 PATTERN)))
|
||||
(\BACKCCODE STREAM) (* ; "Should fix the charset")
|
||||
(\SETFILEPTR STREAM STARTBYTEPOS))
|
||||
STARTBYTEPOS))
|
||||
FAILED
|
||||
(* ;
|
||||
"return the fileptr to its initial position.")
|
||||
(\SETFILEPTR STREAM ORGFILEPTR)
|
||||
(\SETFILEPTR STREAM ORGFILEPTR) (* ;
|
||||
"return the fileptr to its initial position. We didn't jigger the original charset")
|
||||
(RETURN NIL])
|
||||
|
||||
(FFILEPOS
|
||||
[LAMBDA (PATTERN FILE START END SKIP TAIL CASEARRAY) (* ; "Edited 10-Aug-2020 21:44 by rmk:")
|
||||
[LAMBDA (PATTERN FILE START END SKIP TAIL CASEARRAY)
|
||||
|
||||
(* ;; "RMK: Added coercion from internal XCCS string to UTF8 if searching a UTF8 file")
|
||||
(* Pavel "12-Oct-86 15:20")
|
||||
(PROG ([STREAM (\GETSTREAM (OR FILE (INPUT]
|
||||
PATBASE PATOFFSET PATLEN ORGFILEPTR STARTOFFSET ENDOFFSET BIGENDOFFSET STARTSEG ENDSEG EOF
|
||||
)
|
||||
(* ;; "Edited 10-Jul-2022 10:17 by rmk")
|
||||
|
||||
(* ;; "Edited 1-Jul-2022 11:55 by rmk")
|
||||
|
||||
(* ;; "Edited 23-Jun-2022 08:50 by rmk: CASEARRAY is now also a slow (FFILEPOS) case. Fast case now works for arbitrary external formats")
|
||||
|
||||
(* ;; "Edited 10-Aug-2020 21:44 by rmk:")
|
||||
|
||||
(* ;; "RMK: Added coercion from internal XCCS string to UTF8 if searching a UTF8 file")
|
||||
(* Pavel "12-Oct-86 15:20")
|
||||
(PROG ((STREAM (\GETSTREAM FILE 'INPUT))
|
||||
BYTEPATTERN BPATBASE BPATOFFSET BPATLEN ORGFILEPTR STARTBYTEPOS ENDBYTEPOS BIGENDOFFSET
|
||||
STARTSEG ENDSEG EOF)
|
||||
(CL:WHEN [OR SKIP CASEARRAY (NOT (fetch PAGEMAPPED of (fetch (STREAM DEVICE) of STREAM)))
|
||||
(NULL (SETQ BYTEPATTERN (\FORMATBYTESTRING STREAM PATTERN]
|
||||
(* ; "Slow case--use FILEPOS")
|
||||
(GO TRYFILEPOS)) (* ;
|
||||
"calculate start addr and set file ptr.")
|
||||
(SETQ BPATBASE (fetch (STRINGP BASE) of BYTEPATTERN))
|
||||
(SETQ BPATOFFSET (fetch (STRINGP OFFST) of BYTEPATTERN))
|
||||
(SETQ BPATLEN (fetch (STRINGP LENGTH) of BYTEPATTERN))
|
||||
(COND
|
||||
(SKIP (* ; "Slow case--use FILEPOS")
|
||||
(GO TRYFILEPOS))
|
||||
((NOT (fetch PAGEMAPPED of (fetch (STREAM DEVICE) of STREAM)))
|
||||
(* ;
|
||||
"This is a non-page-oriented file. Use FILEPOS instead.")
|
||||
(GO TRYFILEPOS))) (* ;
|
||||
"calculate start addr and set file ptr.")
|
||||
(CL:WHEN (EQ :UTF8 (\EXTERNALFORMAT STREAM))
|
||||
(SETQ PATTERN (XTOUSTRING PATTERN)))
|
||||
[COND
|
||||
((LITATOM PATTERN)
|
||||
(SETQ PATBASE (fetch (LITATOM PNAMEBASE) of PATTERN))
|
||||
(SETQ PATOFFSET 1)
|
||||
(SETQ PATLEN (fetch (LITATOM PNAMELENGTH) of PATTERN)))
|
||||
(T (OR (STRINGP PATTERN)
|
||||
(SETQ PATTERN (MKSTRING PATTERN)))
|
||||
(SETQ PATBASE (fetch (STRINGP BASE) of PATTERN))
|
||||
(SETQ PATOFFSET (fetch (STRINGP OFFST) of PATTERN))
|
||||
(SETQ PATLEN (fetch (STRINGP LENGTH) of PATTERN]
|
||||
(COND
|
||||
((OR (IGREATERP PATLEN \MAX.PATTERN.SIZE)
|
||||
(ILESSP PATLEN \MIN.PATTERN.SIZE))
|
||||
((OR (IGREATERP BPATLEN \MAX.PATTERN.SIZE)
|
||||
(ILESSP BPATLEN \MIN.PATTERN.SIZE))
|
||||
(GO TRYFILEPOS)))
|
||||
(SETQ ORGFILEPTR (\GETFILEPTR STREAM))
|
||||
(SETQ STARTOFFSET (IPLUS (COND
|
||||
(START (COND
|
||||
((NOT (AND (FIXP START)
|
||||
(IGEQ START 0)))
|
||||
(LISPERROR "ILLEGAL ARG" START)))
|
||||
START)
|
||||
(T ORGFILEPTR))
|
||||
(SUB1 PATLEN))) (* ;
|
||||
"STARTOFFSET is the address of the character corresponding to the last character of PATTERN.")
|
||||
(SETQ EOF (\GETEOFPTR STREAM)) (* ;
|
||||
"calculate the character address of the character after the last possible match.")
|
||||
[SETQ ENDOFFSET (COND
|
||||
((NULL END) (* ; "Default is end of file")
|
||||
EOF)
|
||||
(T (IMIN (IPLUS (COND
|
||||
((ILESSP END 0)
|
||||
(IPLUS EOF END 1))
|
||||
(T END))
|
||||
PATLEN)
|
||||
EOF]
|
||||
(SETQ STARTBYTEPOS (IPLUS (COND
|
||||
(START (COND
|
||||
((NOT (AND (FIXP START)
|
||||
(IGEQ START 0)))
|
||||
(LISPERROR "ILLEGAL ARG" START)))
|
||||
START)
|
||||
(T ORGFILEPTR))
|
||||
(SUB1 BPATLEN))) (* ;
|
||||
"STARTBYTEPOS is the address of the character corresponding to the last character of PATTERN.")
|
||||
(SETQ EOF (\GETEOFPTR STREAM)) (* ;
|
||||
"calculate the character address of the character after the last possible match.")
|
||||
[SETQ ENDBYTEPOS (COND
|
||||
((NULL END) (* ; "Default is end of file")
|
||||
EOF)
|
||||
(T (IMIN (IPLUS (COND
|
||||
((ILESSP END 0)
|
||||
(IPLUS EOF END 1))
|
||||
(T END))
|
||||
BPATLEN)
|
||||
EOF]
|
||||
|
||||
(* ;; "use STARTOFFSET and ENDOFFSET instead of START and END because vm functions shouldn't change their arguments.")
|
||||
(* ;; "use STARTBYTEPOS and ENDBYTEPOS instead of START and END because vm functions shouldn't change their arguments.")
|
||||
|
||||
(COND
|
||||
((IGEQ STARTOFFSET ENDOFFSET) (* ; "nothing to search")
|
||||
((IGEQ STARTBYTEPOS ENDBYTEPOS) (* ; "nothing to search")
|
||||
(RETURN))
|
||||
((ILESSP (IDIFFERENCE ENDOFFSET STARTOFFSET)
|
||||
\MIN.SEARCH.LENGTH) (* ;
|
||||
"too small to make FFILEPOS worthwhile")
|
||||
((ILESSP (IDIFFERENCE ENDBYTEPOS STARTBYTEPOS)
|
||||
\MIN.SEARCH.LENGTH) (* ;
|
||||
"too small to make FFILEPOS worthwhile")
|
||||
(GO TRYFILEPOS)))
|
||||
(\SETFILEPTR STREAM STARTOFFSET)
|
||||
[RETURN (GLOBALRESOURCE
|
||||
(\FFDELTA1 \FFDELTA2 \FFPATCHAR)
|
||||
(PROG ((CASE (fetch (ARRAYP BASE)
|
||||
of (COND
|
||||
[CASEARRAY (COND
|
||||
((AND (ARRAYP CASEARRAY)
|
||||
(EQ (fetch (ARRAYP TYP)
|
||||
of CASEARRAY)
|
||||
\ST.BYTE))
|
||||
CASEARRAY)
|
||||
(T (CASEARRAY CASEARRAY]
|
||||
(T \TRANSPARENT))))
|
||||
(DELTA1 (fetch (ARRAYP BASE) of \FFDELTA1))
|
||||
(DELTA2 (fetch (ARRAYP BASE) of \FFDELTA2))
|
||||
(PATCHAR (fetch (ARRAYP BASE) of \FFPATCHAR))
|
||||
(MAXPATINDEX (SUB1 PATLEN))
|
||||
CHAR CURPATINDEX LASTCHAR INC)
|
||||
(\SETFILEPTR STREAM STARTBYTEPOS)
|
||||
[RETURN (GLOBALRESOURCE (\FFDELTA1 \FFDELTA2 \FFPATCHAR)
|
||||
(PROG ((DELTA1 (fetch (ARRAYP BASE) of \FFDELTA1))
|
||||
(DELTA2 (fetch (ARRAYP BASE) of \FFDELTA2))
|
||||
(PATCHAR (fetch (ARRAYP BASE) of \FFPATCHAR))
|
||||
(MAXPATINDEX (SUB1 BPATLEN))
|
||||
CHAR CURPATINDEX LASTCHAR INC)
|
||||
|
||||
(* ;; "Use Boyer-Moore string search algorithm. Use two auxiliary tables, DELTA1 and DELTA2, to tell how far ahead to move in the file when a partial match fails. DELTA1 contains, for each character code, the distance of that character from the right end of the pattern, or PATLEN if the character does not occur in the pattern. DELTA2 contains, for each character position in the pattern, how far ahead to move such that the partial substring discovered to the right of the position now matches some other substring (to the left) in the pattern. PATCHAR is just PATTERN translated thru CASEARRAY")
|
||||
(* ;; "Use Boyer-Moore string search algorithm. Use two auxiliary tables, DELTA1 and DELTA2, to tell how far ahead to move in the file when a partial match fails. DELTA1 contains, for each character code, the distance of that character from the right end of the pattern, or PATLEN if the character does not occur in the pattern. DELTA2 contains, for each character position in the pattern, how far ahead to move such that the partial substring discovered to the right of the position now matches some other substring (to the left) in the pattern. PATCHAR is just PATTERN translated thru CASEARRAY")
|
||||
|
||||
(\SETUP.FFILEPOS PATBASE PATOFFSET PATLEN PATCHAR DELTA1 DELTA2 CASE)
|
||||
[COND
|
||||
((SMALLP ENDOFFSET)
|
||||
(SETQ STARTSEG (SETQ ENDSEG 0)))
|
||||
(T
|
||||
(* ;; "The search will be in the large integers at least part of the time, so split the start and end fileptrs into hi and lo parts. The `segment' size we choose is smaller than 2^16 so that we are still smallp near the boundary. Note that STARTOFFSET and ENDOFFSET are never actually used as file ptrs, just for counting.")
|
||||
(\SETUP.FFILEPOS BPATBASE BPATOFFSET BPATLEN PATCHAR DELTA1 DELTA2)
|
||||
[COND
|
||||
((SMALLP ENDBYTEPOS)
|
||||
(SETQ STARTSEG (SETQ ENDSEG 0)))
|
||||
(T
|
||||
|
||||
(SETQ ENDSEG (FOLDLO ENDOFFSET FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ BIGENDOFFSET (MOD ENDOFFSET FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTSEG (FOLDLO STARTOFFSET FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTOFFSET (MOD STARTOFFSET FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ ENDOFFSET (COND
|
||||
((EQ STARTSEG ENDSEG)
|
||||
BIGENDOFFSET)
|
||||
(T
|
||||
(* ;; "The search will be in the large integers at least part of the time, so split the start and end fileptrs into hi and lo parts. The `segment' size we choose is smaller than 2^16 so that we are still smallp near the boundary. Note that STARTBYTEPOS and ENDBYTEPOS are never actually used as file ptrs, just for counting.")
|
||||
|
||||
(* ;; "In different segments, so we'll have to search all the way to the end of this seg; hence, `end' is currently as big as it gets")
|
||||
(SETQ ENDSEG (FOLDLO ENDBYTEPOS FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ BIGENDOFFSET (MOD ENDBYTEPOS FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTSEG (FOLDLO STARTBYTEPOS FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ STARTBYTEPOS (MOD STARTBYTEPOS FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ ENDBYTEPOS (COND
|
||||
((EQ STARTSEG ENDSEG)
|
||||
BIGENDOFFSET)
|
||||
(T
|
||||
|
||||
FILEPOS.SEGMENT.SIZE]
|
||||
(SETQ LASTCHAR (GETBASEBYTE PATCHAR MAXPATINDEX))
|
||||
FIRSTCHARLP
|
||||
(COND
|
||||
[(IGEQ STARTOFFSET ENDOFFSET) (* ; "End of this chunk")
|
||||
(COND
|
||||
((EQ STARTSEG ENDSEG) (* ; "failed")
|
||||
(GO FAILED))
|
||||
(T (* ;
|
||||
"Finished this segment, roll over into new one")
|
||||
(add STARTSEG 1)
|
||||
(SETQ STARTOFFSET (IDIFFERENCE STARTOFFSET FILEPOS.SEGMENT.SIZE))
|
||||
(* ;; "In different segments, so we'll have to search all the way to the end of this seg; hence, `end' is currently as big as it gets")
|
||||
|
||||
FILEPOS.SEGMENT.SIZE]
|
||||
(SETQ LASTCHAR (GETBASEBYTE PATCHAR MAXPATINDEX))
|
||||
FIRSTCHARLP
|
||||
(COND
|
||||
[(IGEQ STARTBYTEPOS ENDBYTEPOS)
|
||||
(* ; "End of this chunk")
|
||||
(COND
|
||||
((EQ STARTSEG ENDSEG)
|
||||
(SETQ ENDOFFSET BIGENDOFFSET)))
|
||||
(GO FIRSTCHARLP]
|
||||
((NEQ (SETQ CHAR (GETBASEBYTE CASE (\BIN STREAM)))
|
||||
LASTCHAR)
|
||||
(add STARTOFFSET (SETQ INC (GETBASEBYTE DELTA1 CHAR)))
|
||||
(OR (EQ INC 1)
|
||||
(\INCFILEPTR STREAM (SUB1 INC)))
|
||||
(* ;
|
||||
"advance file pointer accordingly (\BIN already advanced it one)")
|
||||
(GO FIRSTCHARLP)))
|
||||
(SETQ CURPATINDEX (SUB1 MAXPATINDEX))
|
||||
MATCHLP
|
||||
(COND
|
||||
((ILESSP CURPATINDEX 0)
|
||||
(GO FOUNDIT)))
|
||||
(\DECFILEPTR STREAM 2) (* ; "back up to read previous char")
|
||||
(COND
|
||||
((NEQ (SETQ CHAR (GETBASEBYTE CASE (\BIN STREAM)))
|
||||
(GETBASEBYTE PATCHAR CURPATINDEX))
|
||||
(* ;
|
||||
"Mismatch, advance by greater of delta1 and delta2")
|
||||
(add STARTOFFSET (IDIFFERENCE (SETQ INC (IMAX (GETBASEBYTE DELTA1
|
||||
CHAR)
|
||||
(GETBASEBYTE DELTA2
|
||||
CURPATINDEX)))
|
||||
(IDIFFERENCE MAXPATINDEX CURPATINDEX)))
|
||||
(OR (EQ INC 1)
|
||||
(\INCFILEPTR STREAM (SUB1 INC)))
|
||||
(GO FIRSTCHARLP)))
|
||||
(SETQ CURPATINDEX (SUB1 CURPATINDEX))
|
||||
(GO MATCHLP)
|
||||
FOUNDIT
|
||||
(* ;
|
||||
"set fileptr, adjust for beginning skips and return proper value.")
|
||||
(\INCFILEPTR STREAM (COND
|
||||
(TAIL (* ; "Put fileptr at end of string")
|
||||
(SUB1 PATLEN))
|
||||
(T (* ;
|
||||
"back up over the last char we looked at, i.e. the first char of string")
|
||||
-1)))
|
||||
(RETURN (\GETFILEPTR STREAM))
|
||||
FAILED
|
||||
(* ;
|
||||
"return the fileptr to its initial position.")
|
||||
(\SETFILEPTR STREAM ORGFILEPTR)
|
||||
(RETURN NIL]
|
||||
((EQ STARTSEG ENDSEG) (* ; "failed")
|
||||
(GO FAILED))
|
||||
(T (* ;
|
||||
"Finished this segment, roll over into new one")
|
||||
(add STARTSEG 1)
|
||||
(SETQ STARTBYTEPOS (IDIFFERENCE STARTBYTEPOS
|
||||
FILEPOS.SEGMENT.SIZE))
|
||||
(COND
|
||||
((EQ STARTSEG ENDSEG)
|
||||
(SETQ ENDBYTEPOS BIGENDOFFSET)))
|
||||
(GO FIRSTCHARLP]
|
||||
((NEQ (SETQ CHAR (\BIN STREAM))
|
||||
LASTCHAR)
|
||||
(add STARTBYTEPOS (SETQ INC (GETBASEBYTE DELTA1 CHAR)))
|
||||
(OR (EQ INC 1)
|
||||
(\INCFILEPTR STREAM (SUB1 INC)))
|
||||
(* ;
|
||||
"advance file pointer accordingly (\BIN already advanced it one)")
|
||||
(GO FIRSTCHARLP)))
|
||||
(SETQ CURPATINDEX (SUB1 MAXPATINDEX))
|
||||
MATCHLP
|
||||
(COND
|
||||
((ILESSP CURPATINDEX 0)
|
||||
(GO FOUNDIT)))
|
||||
(\DECFILEPTR STREAM 2) (* ; "back up to read previous char")
|
||||
(COND
|
||||
((NEQ (SETQ CHAR (\BIN STREAM))
|
||||
(GETBASEBYTE PATCHAR CURPATINDEX))
|
||||
(* ;
|
||||
"Mismatch, advance by greater of delta1 and delta2")
|
||||
(add STARTBYTEPOS (IDIFFERENCE (SETQ INC (IMAX (GETBASEBYTE DELTA1
|
||||
CHAR)
|
||||
(GETBASEBYTE DELTA2
|
||||
CURPATINDEX)
|
||||
))
|
||||
(IDIFFERENCE MAXPATINDEX CURPATINDEX)))
|
||||
(OR (EQ INC 1)
|
||||
(\INCFILEPTR STREAM (SUB1 INC)))
|
||||
(GO FIRSTCHARLP)))
|
||||
(SETQ CURPATINDEX (SUB1 CURPATINDEX))
|
||||
(GO MATCHLP)
|
||||
FOUNDIT
|
||||
|
||||
|
||||
(* ;; "Unlike FILEPOS, it appears that the file is now positioned just after the first byte of the match. See note there about charsets.")
|
||||
|
||||
(RETURN (IF TAIL
|
||||
THEN (CL:UNLESS (EQ NSCHARSETSHIFT (ffetch (STREAM CHARSET
|
||||
)
|
||||
of STREAM))
|
||||
(freplace (STREAM CHARSET) of STREAM
|
||||
with (\CHARSET (NTHCHARCODE PATTERN -1))))
|
||||
(\INCFILEPTR STREAM (SUB1 BPATLEN))
|
||||
(SETQ ENDBYTEPOS (\GETFILEPTR STREAM))
|
||||
(CL:IF (EQ TAIL 'BOTH)
|
||||
(CONS (IDIFFERENCE ENDBYTEPOS BPATLEN)
|
||||
ENDBYTEPOS)
|
||||
ENDBYTEPOS)
|
||||
ELSE
|
||||
(* ;;
|
||||
"Fileptr wants to be where the match started, 1 back from where the match ended")
|
||||
|
||||
(\INCFILEPTR STREAM -1)
|
||||
(SETQ STARTBYTEPOS (\GETFILEPTR STREAM))
|
||||
(CL:UNLESS (EQ NSCHARSETSHIFT (ffetch (STREAM CHARSET)
|
||||
of STREAM))
|
||||
(freplace (STREAM CHARSET) of STREAM
|
||||
with (\CHARSET (CHCON1 PATTERN)))
|
||||
(\BACKCCODE STREAM)
|
||||
(* ; "Should fix the charset")
|
||||
(\SETFILEPTR STREAM STARTBYTEPOS))
|
||||
STARTBYTEPOS))
|
||||
FAILED
|
||||
(\SETFILEPTR STREAM ORGFILEPTR)
|
||||
(* ;
|
||||
"return the fileptr to its initial position.")
|
||||
(RETURN NIL]
|
||||
TRYFILEPOS
|
||||
(RETURN (FILEPOS PATTERN STREAM START END SKIP TAIL CASEARRAY])
|
||||
|
||||
(\SETUP.FFILEPOS
|
||||
(LAMBDA (PATBASE PATOFFSET PATLEN PATCHAR DELTA1 DELTA2 CASE) (* jop%: "25-Sep-86 11:44") (* ;;; "Set up PATCHAR, DELTA1 and DELTA2 arrays from string. This is a separate function currently so I can gather stats on it") (PROG ((PATLEN,PATLEN (IPLUS (LLSH PATLEN BITSPERBYTE) PATLEN)) (MAXPATINDEX (SUB1 PATLEN)) CHAR) (for I from 0 to (FOLDLO \MAXCHAR BYTESPERWORD) do (PUTBASE DELTA1 I PATLEN,PATLEN)) (* ;; "DELTA1 initially all PATLEN, the default for chars not in the pattern. I assume array is word-aligned") (for I from 0 to MAXPATINDEX do (PUTBASEBYTE PATCHAR I (SETQ CHAR (GETBASEBYTE CASE (GETBASEBYTE PATBASE (IPLUS PATOFFSET I))))) (* ; "Translate STR now so we don't have to do it repeatedly") (PUTBASEBYTE DELTA1 CHAR (IDIFFERENCE MAXPATINDEX I)) (* ; "DELTA1 = how far ahead to move when we mismatch with this char")) (* ;; "Now set up DELTA2. Scan pattern backwards. For each character, we want to find the rightmost reoccurrence of the substring consisting of the chars to the right of the current char. This is slightly different than Boyer-Moore, in that we do not insist that it be the rightmost reoccurrence that is not preceded by the current char. Small difference, noticeable only in patterns that contain multiple occurrences of tails of the pattern. The following loop calculates DELTA2 in almost the obvious way, using the observation that DELTA2 is strictly increasing (by our definition) as the pattern index decreases. This algorithm is potentially quadratic, as it amounts to searching a string (PATTERN, backwards) for a given substring in the 'dumb' way; fortunately, it is rarely so in practice for 'normal' patterns") (for P from (SUB1 MAXPATINDEX) to 0 by -1 bind (LASTD2 _ 1) (LASTMATCHPOS _ MAXPATINDEX) do (PUTBASEBYTE DELTA2 P (SETQ LASTD2 (COND ((OR (IGEQ LASTD2 PATLEN) (EQ (GETBASEBYTE PATCHAR (IDIFFERENCE MAXPATINDEX LASTD2)) (GETBASEBYTE PATCHAR (ADD1 P)))) (* ;; "The last time around we matched a terminal substring somehow, and now the next char matches the char before that substring, so DELTA2 is just one more, i.e. the match continues. Once we've overflowed the pattern, the 'match' continues trivially") (ADD1 LASTD2)) (T (do (SETQ LASTMATCHPOS (SUB1 LASTMATCHPOS)) repeatuntil (for I from MAXPATINDEX to (ADD1 P) by -1 as J from LASTMATCHPOS to 0 by -1 always (EQ (GETBASEBYTE PATCHAR I) (GETBASEBYTE PATCHAR J)))) (* ; "Substring from P+1 onward matches substring that ends at LASTMATCHPOS") (IPLUS (IDIFFERENCE MAXPATINDEX LASTMATCHPOS) (IDIFFERENCE MAXPATINDEX P)))))))))
|
||||
)
|
||||
[LAMBDA (PATBASE PATOFFSET PATLEN PATCHAR DELTA1 DELTA2)
|
||||
|
||||
(* ;; "Edited 24-Jun-2022 16:32 by rmk: Removing CASE argument. That forces the \SLOWFILEPOS, because the the alternative stream matches can't be anticipated.")
|
||||
(* jop%: "25-Sep-86 11:44")
|
||||
|
||||
(* ;;; "Set up PATCHAR, DELTA1 and DELTA2 arrays from string. This is a separate function currently so I can gather stats on it")
|
||||
|
||||
(PROG ((PATLEN,PATLEN (IPLUS (LLSH PATLEN BITSPERBYTE)
|
||||
PATLEN))
|
||||
(MAXPATINDEX (SUB1 PATLEN))
|
||||
CHAR)
|
||||
(for I from 0 to (FOLDLO \MAXCHAR BYTESPERWORD) do (PUTBASE DELTA1 I PATLEN,PATLEN))
|
||||
|
||||
(* ;; "DELTA1 initially all PATLEN, the default for chars not in the pattern. I assume array is word-aligned")
|
||||
|
||||
(for I from 0 to MAXPATINDEX do (SETQ CHAR (GETBASEBYTE PATBASE (IPLUS PATOFFSET I)))
|
||||
(PUTBASEBYTE PATCHAR I CHAR)
|
||||
(PUTBASEBYTE DELTA1 CHAR (IDIFFERENCE MAXPATINDEX I))
|
||||
(* ;
|
||||
"DELTA1 = how far ahead to move when we mismatch with this char")
|
||||
)
|
||||
|
||||
(* ;; "Now set up DELTA2. Scan pattern backwards. For each character, we want to find the rightmost reoccurrence of the substring consisting of the chars to the right of the current char. This is slightly different than Boyer-Moore, in that we do not insist that it be the rightmost reoccurrence that is not preceded by the current char. Small difference, noticeable only in patterns that contain multiple occurrences of tails of the pattern. The following loop calculates DELTA2 in almost the obvious way, using the observation that DELTA2 is strictly increasing (by our definition) as the pattern index decreases. This algorithm is potentially quadratic, as it amounts to searching a string (PATTERN, backwards) for a given substring in the 'dumb' way; fortunately, it is rarely so in practice for 'normal' patterns")
|
||||
|
||||
(for P from (SUB1 MAXPATINDEX) to 0 by -1 bind (LASTD2 _ 1)
|
||||
(LASTMATCHPOS _ MAXPATINDEX)
|
||||
do (PUTBASEBYTE DELTA2 P
|
||||
(SETQ LASTD2
|
||||
(COND
|
||||
([OR (IGEQ LASTD2 PATLEN)
|
||||
(EQ (GETBASEBYTE PATCHAR (IDIFFERENCE MAXPATINDEX LASTD2))
|
||||
(GETBASEBYTE PATCHAR (ADD1 P]
|
||||
|
||||
(* ;; "The last time around we matched a terminal substring somehow, and now the next char matches the char before that substring, so DELTA2 is just one more, i.e. the match continues. Once we've overflowed the pattern, the 'match' continues trivially")
|
||||
|
||||
(ADD1 LASTD2))
|
||||
(T [do (SETQ LASTMATCHPOS (SUB1 LASTMATCHPOS))
|
||||
repeatuntil (for I from MAXPATINDEX to (ADD1 P) by -1 as J
|
||||
from LASTMATCHPOS to 0 by -1
|
||||
always (EQ (GETBASEBYTE PATCHAR I)
|
||||
(GETBASEBYTE PATCHAR J]
|
||||
(* ;
|
||||
"Substring from P+1 onward matches substring that ends at LASTMATCHPOS")
|
||||
(IPLUS (IDIFFERENCE MAXPATINDEX LASTMATCHPOS)
|
||||
(IDIFFERENCE MAXPATINDEX P])
|
||||
|
||||
(\SLOWFILEPOS
|
||||
[LAMBDA (PATTERN STREAM STARTBYTEPOS ENDBYTEPOS SKIP TAIL CASEARRAY)
|
||||
|
||||
(* ;; "Edited 10-Jul-2022 16:50 by rmk")
|
||||
|
||||
(* ;; "Edited 1-Jul-2022 10:51 by rmk")
|
||||
|
||||
(* ;; "Edited 29-Jun-2022 13:43 by rmk: The slow case when either SKIP or TAIL is specified. Those operate only on character codes, not on individual bytes of the external format, so the file has to be decoded with generic character functions.")
|
||||
|
||||
(* ;; "CASEARRAY is assumed only to map ASCII, but that is independent of the logic here.")
|
||||
|
||||
(PROG ((SKIPCODE (CL:WHEN SKIP (CHCON1 SKIP)))
|
||||
PATBASE PATLEN PATFATP FIRSTINDEX LASTINDEX SKIPCODE PATFIRSTCODE NFIRSTCODEBYTES NPBYTES
|
||||
CABASE CASIZE CAFAT STARTCHARSET (ORGCHARSET (ffetch (STREAM CHARSET) of STREAM)))
|
||||
(DECLARE (SPECVARS NFIRSTCODEBYTES NPBYTES))
|
||||
(CL:WHEN (AND CASEARRAY (NEQ T CASEARRAY))
|
||||
(CL:UNLESS [AND (ARRAYP CASEARRAY)
|
||||
(OR (EQ \ST.BYTE (ffetch (ARRAYP TYP) of CASEARRAY))
|
||||
(SETQ CAFAT (EQ \ST.POS16 (ffetch (ARRAYP TYP) of CASEARRAY]
|
||||
(LISPERROR "ILLEGAL ARG" CASEARRAY))
|
||||
(SETQ CABASE (FETCH (ARRAYP BASE) OF CASEARRAY))
|
||||
(SETQ CASIZE (FETCH (ARRAYP LENGTH) OF CASEARRAY))
|
||||
(SETQ PATTERN (CONCAT PATTERN)) (* ;
|
||||
"Map all STR characters thru the case array")
|
||||
(FOR C INSTRING PATTERN AS I FROM 1 DO (RPLCHARCODE PATTERN I
|
||||
(\CATRANSLATE CABASE CASIZE CAFAT C))))
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "PATSTR now has case-mapped characters")
|
||||
|
||||
[COND
|
||||
((LITATOM PATTERN)
|
||||
(SETQ PATBASE (ffetch (LITATOM PNAMEBASE) of PATTERN))
|
||||
(SETQ PATLEN (ffetch (LITATOM PNAMELENGTH) of PATTERN))
|
||||
(SETQ FIRSTINDEX 1)
|
||||
(SETQ PATFATP (ffetch (LITATOM FATPNAMEP) of PATTERN)))
|
||||
(T (CL:UNLESS (STRINGP PATTERN)
|
||||
(SETQ PATTERN (MKSTRING PATTERN)))
|
||||
(SETQ PATBASE (ffetch (STRINGP BASE) of PATTERN))
|
||||
(SETQ PATLEN (ffetch (STRINGP LENGTH) of PATTERN))
|
||||
(SETQ FIRSTINDEX (ffetch (STRINGP OFFST) of PATTERN))
|
||||
(SETQ PATFATP (ffetch (STRINGP FATSTRINGP) of PATTERN]
|
||||
(SETQ LASTINDEX (IPLUS FIRSTINDEX (SUB1 PATLEN)))
|
||||
(SETQ PATFIRSTCODE (\GETBASECHAR PATFATP PATBASE FIRSTINDEX))
|
||||
(ADD FIRSTINDEX 1) (* ;
|
||||
"Start at the second character after the first one matched. ")
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "A loop of nomatch-match sequences")
|
||||
|
||||
(* ;;
|
||||
"EOFGUARD saves a little testing, assumes no character in any encoding takes more than 10 bytes.")
|
||||
|
||||
(BIND STREAMCODE NBYTESHI NBYTESLO SECONDCHARSET (NBYTES _ (IDIFFERENCE ENDBYTEPOS
|
||||
STARTBYTEPOS))
|
||||
FIRST (SETQ NBYTESHI (FOLDLO NBYTES FILEPOS.SEGMENT.SIZE))
|
||||
(SETQ NBYTESLO (IMOD NBYTES FILEPOS.SEGMENT.SIZE))
|
||||
DO (DO
|
||||
(* ;; "Find next FIRSTCHAR")
|
||||
|
||||
(CL:WHEN (ILEQ NBYTESLO 0) (* ; "Finished this segment ")
|
||||
(CL:WHEN (EQ NBYTESHI 0)
|
||||
(GO FAILED)) (* ; "Roll over to a new segment")
|
||||
(add NBYTESLO FILEPOS.SEGMENT.SIZE)
|
||||
(add NBYTESHI -1))
|
||||
|
||||
(* ;; "Guard \INCCODE against EOF, only when we are getting close")
|
||||
|
||||
(CL:WHEN (AND (EQ NBYTESHI 0)
|
||||
(ILEQ NBYTESLO 10)
|
||||
(NULL (\PEEKCCODE STREAM T)))
|
||||
(GO FAILED))
|
||||
(SETQ STARTCHARSET (ffetch (STREAM CHARSET) of STREAM))
|
||||
(SETQ STREAMCODE (\INCCODE.EOLC STREAM NIL 'NFIRSTCODEBYTES 0))
|
||||
(ADD NBYTESLO NFIRSTCODEBYTES) (* ;
|
||||
"Decrement the character's byte count")
|
||||
(CL:WHEN (EQ PATFIRSTCODE SKIPCODE) (* ; "Pattern starts with skip")
|
||||
(RETURN))
|
||||
(CL:WHEN CABASE
|
||||
(SETQ STREAMCODE (\CATRANSLATE CABASE CASIZE CAFAT STREAMCODE)))
|
||||
REPEATUNTIL (EQ STREAMCODE PATFIRSTCODE))
|
||||
(SETQ SECONDCHARSET (ffetch (STREAM CHARSET) of STREAM))
|
||||
|
||||
(* ;; "")
|
||||
|
||||
(* ;; "Found PATFIRSTCODE, match the rest")
|
||||
|
||||
(* ;;
|
||||
"The matching loop must fail at EOF, otherwise either match or return to firstchar loop.")
|
||||
|
||||
(* ;;
|
||||
"The EOF guard is \PEEKCCODE (no error), we only want to bother when we might be getting close.")
|
||||
|
||||
(SETQ NPBYTES 0)
|
||||
(FOR I PATCODE (EOFGUARD _ (AND (EQ NBYTESHI 0)
|
||||
(ILEQ NBYTESLO 10))) FROM FIRSTINDEX TO LASTINDEX
|
||||
DO (CL:WHEN (AND EOFGUARD (NULL (\PEEKCCODE STREAM T)))
|
||||
(GO FAILED))
|
||||
(SETQ PATCODE (\GETBASECHAR PATFATP PATBASE I))
|
||||
(SETQ STREAMCODE (\INCCODE.EOLC STREAM NIL 'NPBYTES NPBYTES))
|
||||
(CL:UNLESS (EQ PATCODE SKIPCODE)
|
||||
(CL:WHEN CABASE
|
||||
(SETQ STREAMCODE (\CATRANSLATE CABASE CASIZE CAFAT STREAMCODE)))
|
||||
(CL:UNLESS (EQ STREAMCODE PATCODE)
|
||||
|
||||
(* ;; "Match failed: Go back to second position and try again")
|
||||
|
||||
(\INCFILEPTR STREAM NPBYTES)
|
||||
(freplace (STREAM CHARSET) of STREAM with SECONDCHARSET)
|
||||
(RETURN))) FINALLY (GO FOUNDIT)))
|
||||
FOUNDIT
|
||||
|
||||
|
||||
(* ;; "The CHARSET should be accurate in the tail case. We have to adjust for the start case.")
|
||||
|
||||
(RETURN (SELECTQ TAIL
|
||||
(NIL (* ;
|
||||
"Fileptr wants to be where the match started")
|
||||
(freplace (STREAM CHARSET) of STREAM with STARTCHARSET)
|
||||
(\INCFILEPTR STREAM (IPLUS NPBYTES NFIRSTCODEBYTES))
|
||||
(\GETFILEPTR STREAM))
|
||||
(BOTH (CONS (IPLUS (\GETFILEPTR STREAM)
|
||||
NPBYTES NFIRSTCODEBYTES)
|
||||
(\GETFILEPTR STREAM)))
|
||||
(\GETFILEPTR STREAM)))
|
||||
FAILED
|
||||
(freplace (STREAM CHARSET) of STREAM with ORGCHARSET)
|
||||
(RETURN NIL])
|
||||
)
|
||||
(DECLARE%: EVAL@COMPILE DONTCOPY
|
||||
(DECLARE%: EVAL@COMPILE
|
||||
@@ -1321,7 +1492,7 @@ DONTCOPY
|
||||
)
|
||||
|
||||
(DEFOPTIMIZER DATEFORMAT (&REST X)
|
||||
(KWOTE (CONS 'DATEFORMAT X)))
|
||||
(KWOTE (CONS 'DATEFORMAT X)))
|
||||
|
||||
|
||||
|
||||
@@ -1383,15 +1554,15 @@ DONTCOPY
|
||||
(PUTPROPS IOCHAR COPYRIGHT ("Venue & Xerox Corporation" 1981 1982 1983 1984 1985 1986 1987 1988 1990
|
||||
1991 2018 2020))
|
||||
(DECLARE%: DONTCOPY
|
||||
(FILEMAP (NIL (3484 7278 (CHCON 3494 . 4344) (UNPACK 4346 . 5240) (DCHCON 5242 . 6509) (DUNPACK 6511
|
||||
. 7276)) (7279 18794 (UALPHORDER 7289 . 7385) (ALPHORDER 7387 . 9190) (CONCAT 9192 . 9837) (
|
||||
CONCATCODES 9839 . 10025) (PACKC 10027 . 12630) (PACK 12632 . 13211) (PACK* 13213 . 14935) (\PACK.ITEM
|
||||
14937 . 15392) (STRPOS 15394 . 18792)) (18796 19085 (XCL:PACK 18796 . 19085)) (19087 19337 (XCL:PACK*
|
||||
19087 . 19337)) (20055 22446 (STRPOSL 20065 . 21691) (MAKEBITTABLE 21693 . 22444)) (22608 23085 (
|
||||
CASEARRAY 22618 . 22808) (UPPERCASEARRAY 22810 . 23083)) (23407 47009 (FILEPOS 23417 . 33329) (
|
||||
FFILEPOS 33331 . 44444) (\SETUP.FFILEPOS 44446 . 47007)) (47797 89044 (DATE 47807 . 47893) (DATEFORMAT
|
||||
47895 . 47987) (GDATE 47989 . 48100) (IDATE 48102 . 59773) (\IDATESCANTOKEN 59775 . 61054) (
|
||||
\IDATE-PARSE-MONTH 61056 . 64752) (\OUTDATE 64754 . 77502) (\OUTDATE-STRING 77504 . 78119) (\RPLRIGHT
|
||||
78121 . 78359) (\UNPACKDATE 78361 . 84152) (\PACKDATE 84154 . 87474) (\DTSCAN 87476 . 87618) (\ISDST?
|
||||
87620 . 88127) (\CHECKDSTCHANGE 88129 . 89042)))))
|
||||
(FILEMAP (NIL (3524 7318 (CHCON 3534 . 4384) (UNPACK 4386 . 5280) (DCHCON 5282 . 6549) (DUNPACK 6551
|
||||
. 7316)) (7319 18834 (UALPHORDER 7329 . 7425) (ALPHORDER 7427 . 9230) (CONCAT 9232 . 9877) (
|
||||
CONCATCODES 9879 . 10065) (PACKC 10067 . 12670) (PACK 12672 . 13251) (PACK* 13253 . 14975) (\PACK.ITEM
|
||||
14977 . 15432) (STRPOS 15434 . 18832)) (18836 19125 (XCL:PACK 18836 . 19125)) (19127 19377 (XCL:PACK*
|
||||
19127 . 19377)) (20015 22406 (STRPOSL 20025 . 21651) (MAKEBITTABLE 21653 . 22404)) (22568 23045 (
|
||||
CASEARRAY 22578 . 22768) (UPPERCASEARRAY 22770 . 23043)) (23367 55842 (FILEPOS 23377 . 32618) (
|
||||
FFILEPOS 32620 . 44841) (\SETUP.FFILEPOS 44843 . 48626) (\SLOWFILEPOS 48628 . 55840)) (56630 97877 (
|
||||
DATE 56640 . 56726) (DATEFORMAT 56728 . 56820) (GDATE 56822 . 56933) (IDATE 56935 . 68606) (
|
||||
\IDATESCANTOKEN 68608 . 69887) (\IDATE-PARSE-MONTH 69889 . 73585) (\OUTDATE 73587 . 86335) (
|
||||
\OUTDATE-STRING 86337 . 86952) (\RPLRIGHT 86954 . 87192) (\UNPACKDATE 87194 . 92985) (\PACKDATE 92987
|
||||
. 96307) (\DTSCAN 96309 . 96451) (\ISDST? 96453 . 96960) (\CHECKDSTCHANGE 96962 . 97875)))))
|
||||
STOP
|
||||
|
||||
Reference in New Issue
Block a user