Files
seta75D 7c4988eac0 Init
2021-10-11 19:38:01 -03:00

1506 lines
37 KiB
C
Executable File

/*
* Copyright (c) 1994, by Sun Microsystems, Inc.
*
* This code is MKS code ported to Solaris with minimum modifications so that
* upgrades from MKS will readily integrate. Avoid modification if possible!
*/
#ident "@(#)regcomp.c 1.15 95/10/05 SMI"
/*
* regcomp, regwcomp, and regfree
*
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Copyright 1985, 1992 by Mortice Kern Systems Inc. All rights reserved.
*
* This Software is unpublished, valuable, confidential property of
* Mortice Kern Systems Inc. Use is authorized only in accordance
* with the terms and conditions of the source licence agreement
* protecting this Software. Any unauthorized use or disclosure of
* this Software is strictly prohibited and will result in the
* termination of the licence agreement.
*
* If you have any questions, please consult your supervisor.
*
* Modifications by Eric Gisin and Alex White.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
*
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*
**** This has been modified to be a subset of the P1003.2 regex library ****
*
* Summary of changes:
* - adopt the POSIX 1003.2 interface, with some extensions.
* - add function, regerror(regex_t*), to get error message string.
* - distinguish between basic and extended REs with REG_EXTENDED.
* - case insensitive match if REG_ICASE.
* "grep -i" is half as fast with REG_ICASE.
* - concatenate STRING string in regbranch() instead of regatom().
* - rewrite the handling of \0, |, and ) terminators.
* - recognize ^, $, *, etc. only where appropriate (subject to change).
* - use struct regnode instead of linear byte stream to represent NFA.
* there are three reasons for this:
* code is much more readable,
* it is a bit faster - (a|b) is 33% faster.
* the NFA is in a format suitable for conversion to DFA.
* - add implicit ".*" node to beginning of unanchored REs.
* - add BREAK node to optimize ".*".
* - add REG_ANCHOR (for grep -x).
* - add [:alpha:], etc. bracket expressions.
* - replace recursion in regmatch with a back-tracking stack.
* now (a|b) is 2-3 times faster.
* - remove regrepeat; the new regmatch is faster.
* - use bit vectors for character classes. 2-4 times faster.
* - regexec now trys all branches, for longest match
* - regex.c split regcomp/regfree into regcomp.c, regexec into regexec.c
* - multi-character collating element support
* - multibyte support
* - Make {} constructs work on non-atoms
*/
#ifdef M_RCSID
#ifndef lint
static char rcsID[] = "$Header: /u/rd/src/libc/regex/rcs/regcomp.c 1.39 1995/03/29 21:35:49 rodney Exp $";
#endif
#endif
#include "mks.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <regex.h>
#include "reg.h"
#include <stdarg.h>
#include <m_collate.h>
#include <errno.h>
/*
* Tunable parameters.
*/
/* The maximum size of the string in a STRING node */
#define REG_MAXSTR 255
/*
* Flags to be passed up and down.
*/
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x1 /* Known never to match null string. */
#define CLOSEPAR 0x2 /* Subexpression is closed */
#define NOWIDTH 0x4 /* Known never to match anything */
#define NOTOKEN (-1) /* indicates no terminator in token */
/*l
* We allocate nodes in blocks of 5 for efficiency.
* It also allows us to iterate over the nodes in an NFA.
*/
#define REGNODES 5 /* no. nodes for simple RE */
typedef struct _regblock block;
struct _regblock {
block *next;
node nodes[REGNODES];
};
#ifdef REG_CESC
static struct {
char code;
char character;
} cesc_map[] = {
{'a', '\a'},
{'b', '\b'},
{'f', '\f'},
{'n', '\n'},
{'r', '\r'},
{'t', '\t'},
{'v', '\v'},
{ 0 , '\0'},
};
#endif /* REG_CESC */
static block *regblockp; /* current allocation block */
static int regnodei; /* index of current regnode */
/*
* Global work variables for regcomp().
*/
static jmp_buf err; /* jump back to error hander */
static int regflags; /* recomp flags */
static wuchar_t *regbegin; /* Input RE */
static wuchar_t *regparse; /* Input-scan pointer. */
static int token; /* branch terminator: \0, |, or ). */
static uchar parflags[10]; /* flags of parenthesized sub-exprs */
static node *optlist; /* list of ANY nodes in ".*"s */
static int pardepth; /* count of open sub-exprs */
static node *reg(regex_t *r, int *flagp);
static node *regbranch(regex_t *r, int *flagp);
static node *regpiece(regex_t *r, int *flagp);
static node *regatom(register regex_t *r, int *flagp);
static node *regclass(regex_t *r);
static node *regnode(int op);
static m_collel_t regelement(void);
static node *regjoin(node *head, node *tail);
static block *regblock(void);
static node *regrepeat(register node *atom);
static void optimize(void);
/*
* Porting mods
*/
extern void * _loaded_coll_;
wchar_t * reg_m_mbstowcsdup(const char *s); /* ZZZ */
/*
* REG_WCHAROPT - this flag is passed in the re_flags field by regcomp to
* regwcomp to indicate it was called by regcomp. Note that this flag
* must not conflict with the regcomp flags defined in regex.h.
*/
#define REG_WCHAROPT 0xF0000000
void _wchar_opt(wchar_t *temp_pattern, const wchar_t *pattern);
/*l
* Character classes.
* For each [] bracketted expression, we create a bit map for all the
* characters in the character set that it can match.
* Note that the maximum process code is *not* say 256 in single byte
* code sets, but 256 + # of multi-character collating elements.
* For wide chars, the assumption is that we only support 2 byte
* wchar_t's, since 4 byte ones will take unusable amounts of storage.
* For wide chars, the multi-character collating elements are assumed
* to be allocated an unused portion of the wide array space. e.g.
* no encoding that I know of uses 0xFF as the first half of its process
* code, so 0xFFyy could encode all the multi-character collating elements.
* In practice, you will however allocate some unused part to keep the
* maximum down; such that a full 8K need not be used.
* However, even if it is, 8K is not an unreasonable size to use.
*/
static char *cl; /* Current class being built */
/*f
* Allocate and clear to zero the bit array.
*/
static void
classbegin(void)
{
m_collel_t max = m_maxcoll();
if ((cl = (char *)calloc(1, (unsigned short) ((unsigned short)max+
CHAR_BIT-1)/ (unsigned short) CHAR_BIT)) == NULL)
longjmp(err, REG_ESPACE);
}
/*f
* Set the cth bit in the character class. If icase set, then we set
* both upper and lower case of the value.
*/
static void
classadd(m_collel_t c, int icase)
{
wchar_t wc;
if (c == -1 || c > 0xFFFF)
longjmp(err, REG_ECOLLATE);
/* Don't add illegal characters to list */
if ((wc = _cetowc(_loaded_coll_, c)) < 0)
return;
if (icase && !m_ismccollel(c)) {
if (iswlower(wc))
classadd(_wctoce (_loaded_coll_, towupper(wc)), 0);
else if (iswupper(wc))
classadd(_wctoce (_loaded_coll_, towlower(wc)), 0);
}
cl[(unsigned)c/CHAR_BIT] |= 1<<(unsigned)c%CHAR_BIT;
}
/*f
* Save the class bits array.
*/
static char *
classclose(void)
{
return cl;
}
/*f
* Free up memory from a class structure.
*/
static void
classfree(register char *clp)
{
free(clp);
}
int
classtest(char * CLP, m_collel_t C)
{
int val;
if (C == -1)
return (0);
val =
((CLP)[(unsigned)(C)/CHAR_BIT] & (1<<(unsigned)(C)%CHAR_BIT));
return (val);
}
#ifdef DEBUG
/*f
* regprint - printable representation of opcode
*/
void
regprint(np)
register node *np;
{
register char *p = NULL;
printf("%p: ", (void *)np);
switch (np->type) {
case RETURN:
printf("RETURN %d", np->u.i);
break;
case BOL: p = "BOL"; break;
case BOL_NL: p = "BOL_NL"; break;
case EOL: p = "EOL"; break;
case EOL_NL: p = "EOL_NL"; break;
case ANY: p = "ANY"; break;
case ANY_NL: p = "ANY_NL"; break;
case ANYOF: p = "ANYOF"; break;
case ANYBUT: p = "ANYBUT"; break;
case ANYBUT_NL: p = "ANYBUT_NL"; break;
case FAIL: p = "FAIL"; break;
case EMPTY: p = "EMPTY"; break;
case SCAN_NL: p = "SCAN_NL"; break;
case SCAN: p = "SCAN"; break;
case REPEATSTART: p = "REPEATSTART"; break;
case ANYOF2:
printf(m_smalls("ANYOF2 '%C' or '%C'\n"),
np->u.c[0], np->u.c[1]);
break;
case BRANCH:
printf("BRANCH %p", (void *)np->u.np);
break;
case CHAR:
printf(m_smalls("CHAR '%C'"), np->u.c[0]);
break;
case STRING:
printf(m_smalls("STRING \"%S\""), np->u.str);
break;
#ifdef REG_WORDS
case WORDB:
p = "WORDB";
break;
#endif
case SETSP:
printf("SETSP %d", np->u.i);
break;
case SETEP:
printf("SETEP %d", np->u.i);
break;
case EXPAND:
printf("EXPAND %d", np->u.i);
break;
case BREAK:
printf(m_smalls("BREAK '%C'"), np->u.c[0]);
break;
case BREAK_NL:
printf(m_smalls("BREAK_NL '%C'"), np->u.c[0]);
break;
case SCANTO:
printf(m_smalls("SCANTO '%C'"), np->u.c[0]);
break;
case SEPR:
printf("SEPR %d", np->u.i);
break;
case PUSH:
printf("PUSH %d", np->u.i);
break;
case POP:
printf("POP %d", np->u.i);
break;
case REPEATINFO:
printf("REPEATINFO {%d,%d:%d}:%d", np->U.min, np->U.max,
np->U.unbound, np->U.n);
break;
case REPEAT:
printf("REPEAT %p", (void *)np->u.np);
break;
case REPEATCHECK:
printf(m_smalls("REPEATCHECK \"%s\""),
np->u.str == NULL ? "(NULL)": (char *)np->u.str);
break;
case REPEATPOP:
printf("REPEATPOP");
break;
case REPEATPOP2:
printf("REPEATPOP_2");
break;
case MINMAXSIZE:
printf("MINMAXSIZE");
break;
default:
printf("*GOK* %d", np->type);
break;
}
if (p != NULL)
printf("%s", p);
printf(" -> %p\n", (void *) np->next);
}
/*f
* regdump - dump a regex onto stdout in vaguely comprehensible form
*/
static void
regdump(regex_t *r)
{
register block *bp;
register node *np;
register int i;
for (bp = r->re_block; bp != NULL; bp = bp->next)
for (i = REGNODES, np = bp->nodes; i--; np++) {
if (np == r->re_node)
printf("*");
regprint(np);
}
printf("flags %x, ", r->re_flags);
if (r->re_regmust != NULL)
printf("longest \"%S\" ", r->re_regmust);
if (r->re_flags&REG_MUST)
printf("(must have)");
printf("\n");
}
#endif /* DEBUG */
#ifdef M_I18N_MB
extern void * _loaded_coll_;
/*
* Can't use the mks library function, because regex is in the system libs.
*/
wchar_t *
reg_m_mbstowcsdup(const char *s)
{
int n;
wchar_t *w;
n = strlen(s) + 1;
if ((w = (wchar_t *)malloc(n * sizeof(wchar_t))) == NULL)
return(NULL);
if (_loaded_coll_ != NULL) {
if (mbstowcs(w, s, n) == -1)
return(NULL);
} else {
int i;
wchar_t *wp = w;
unsigned char *sp = (unsigned char *) s;
for (i=0; i < n; i++)
*wp++ = *sp++;
}
return w;
}
#endif
int
regcomp(register regex_t *r, const char *pattern, int uflags)
{
wchar_t *wpattern;
int i;
_reginit(); /* Solaris porting mod - load coll table */
if ((wpattern = reg_m_mbstowcsdup(pattern)) == NULL)
return errno == EILSEQ ? REG_ECHAR : REG_ESPACE;
i = regwcomp(r, wpattern, uflags | REG_WCHAROPT);
free(wpattern);
return i;
}
/*f
* regcomp - compile a regular expression into internal code
*
* Beware that the optimization-preparation code in here knows about some
* of the structure of the compiled regex.
*/
int
regwcomp(register regex_t *r, const wchar_t *pattern, int uflags)
{
register node *ret, *np;
register size_t i;
int flags, errnum;
wchar_t temp_pattern[1024];
_reginit(); /* Solaris porting mod - load coll table */
M_INVARIANTINIT();
/* catch syntax and internal errors */
if ((errnum = setjmp(err)) != 0) {
/* fill up last block */
while (regnodei != REGNODES)
(void) regnode(RETURN);
regfree(r);
return errnum;
}
if (pattern == NULL)
longjmp(err, REG_EFATAL);
r->re_block = regblockp = regblock();
r->re_nsub = 0;
regflags = uflags;
/*
* If the REG_WCHAROPT flag is set, then we've been called by
* regcomp, and any wide characters have already been optimized.
* Also, if the collation table is loaded, we don't optimize the
* the wide characters. Otherwise, we call _wchar_opt to
* optimize the input characters.
*/
if (uflags & REG_WCHAROPT || _loaded_coll_ != NULL) {
regbegin = regparse = (wuchar_t *)pattern;
} else {
_wchar_opt(temp_pattern, pattern);
regbegin = regparse = (wuchar_t *)temp_pattern;
}
optlist = NULL;
for (i = 0; i < 10; i++)
parflags[i] = 0;
pardepth = 0;
if ((ret = reg(r, &flags)) == NULL)
longjmp(err, REG_BADPAT);
if (token != '\0')
longjmp(err, (token==')') ? REG_EPAREN : REG_BADPAT);
/*
* Find longest literal string that must appear.
* Resolve ties in favor of later strings, since
* the BREAK check works with the beginning of the r.e.
* and avoiding duplication strengthens checking. Not a
* strong reason, but sufficient in the absence of others.
*/
i = 0; /* length of longest string */
r->re_regmust = NULL;
for (np = ret; np != NULL; np = np->next) {
if (np->type == CHAR && 1 >= i) {
i = 1;
r->re_regmust = (wchar_t*)np->u.c;
}
if (np->type == STRING && wcslen((wchar_t*)np->u.str) >= i) {
i = wcslen((wchar_t*)np->u.str);
r->re_regmust = (wchar_t*)np->u.str;
}
}
/* prepend SETSP node to set sub[0].sp */
np = regnode(SETSP);
np->u.i = 0;
np->next = ret;
ret = np;
/* append SETEP node to set sub[0].ep */
np = regnode(SETEP);
np->u.i = 0;
regjoin(ret, np);
if (regflags&REG_ANCHOR)
regjoin(ret, regnode((regflags&REG_NEWLINE) ? EOL_NL : EOL));
else if (ret->next->type != BOL && ret->next->type != BOL_NL) {
np = regnode(SCAN);
np->next = ret;
np->u.np = optlist; /* do SCANTO optimization */
optlist = np;
ret = np;
} else if (ret->next->type == BOL_NL) {
np = regnode(SCAN_NL);
np->next = ret;
ret = np;
}
/* append RETURN 1 node to signal success */
np = regnode(RETURN);
np->u.i = 1;
np->next = NULL;
regjoin(ret, np);
/* optimize ".*"s to use BREAKs */
optimize();
/* unless RE starts with a string, do regmust optimization */
if (r->re_regmust != NULL) {
#if 0 /* regmust always wins */
if (ret->type != SCANTO)
#endif
regflags |= REG_MUST;
}
/* fill up last block */
while (regnodei != REGNODES)
(void) regnode(RETURN);
r->re_node = ret;
r->re_flags = regflags;
#ifdef DEBUG
if (regflags&REG_DEBUG)
regdump(r);
#endif
return 0;
}
/*f
* reg - regular expression, top level
*
* implements alteration.
*/
static node *
reg(regex_t *r, int *flagp)
{
register node *ret = NULL;
register node *br, *np;
register node *end;
int flags;
*flagp = HASWIDTH; /* Tentatively. */
/* Pick up the branches, linking them together. */
token = NOTOKEN;
br = regbranch(r, &flags);
*flagp = flags;
#ifndef REG_EXTENDED
return br;
#else
if (token != '|')
return br;
*flagp &= ~(~flags&HASWIDTH);
end = regnode(EMPTY);
while (1) {
/* create BRANCH node for this branch */
regjoin(br, end);
np = regnode(BRANCH);
np->u.np = br;
if (ret == NULL)
ret = np;
else
regjoin(ret, np);
if (token != '|')
break;
token = NOTOKEN;
br = regbranch(r, &flags);
*flagp &= ~(~flags&HASWIDTH);
}
/* fail after all branches attempted */
br = regnode(FAIL);
br->next = end;
regjoin(ret, br);
return ret;
#endif /* REG_EXTENDED */
}
/*f
* regbranch - one alternative of an | operator
*
* Implements the concatenation operator.
*/
static node *
regbranch(regex_t *r, int *flagp)
{
register node *ret = NULL;
register node *last, *piece;
wuchar_t *strend;
int flags;
*flagp = WORST; /* Tentatively. */
last = NULL;
while ((piece = regpiece(r, &flags)) != NULL) {
*flagp |= flags&HASWIDTH;
if (last == NULL) { /* First piece. */
ret = last = piece;
continue;
}
/* Join adjacent CHAR nodes into STRING */
/* We only let a STRING grow REG_MAXSTR chars long */
if (piece->type == CHAR && piece->next == NULL &&
last->next == NULL) {
if (last->type == CHAR) {
int c = last->u.c[0];
last->type = STRING;
last->u.str = strend = (wuchar_t *)
malloc(sizeof(wchar_t)*(REG_MAXSTR+1));
if (strend == NULL)
longjmp(err, REG_ESPACE);
*strend++ = c;
}
if (last->type == STRING &&
strend < last->u.str+REG_MAXSTR) {
*strend++ = piece->u.c[0];
*strend = '\0';
regnodei--; /* free the current CHAR node */
continue;
}
}
regjoin(last, piece);
last = piece;
}
if (ret == NULL)
ret = regnode(EMPTY);
for (last = ret; last != NULL; last = last->next)
if (last->type == STRING) {
/* free the unused portion of STRING nodes */
last->u.str = strend = (wuchar_t *)
realloc((void *)last->u.str,
sizeof(wchar_t)*(wcslen((wchar_t*)last->u.str)+1));
if (strend == NULL)
longjmp(err, REG_ESPACE);
}
return ret;
}
/*f
* regpiece - something followed by possible [*+?{]
* we allow \+ and \? in basic REs as an extension.
*/
static node *
regpiece(regex_t *r, int *flagp)
{
register node *np, *ret, *atom, *repy, *tp;
register wuchar_t op, opp;
register int i;
unsigned int min = 0, max = 0; /* {min,max} values */
int unbound = 0; /* {min,} unbound repeat */
int flags;
int backset;
/* Pick off one atom to match */
atom = regatom(r, &flags);
if (atom == NULL)
return (NULL);
/* Is the atom followed by some form of repetition? */
op = *regparse++;
op = M_INVARIANT(op);
backset = 2;
if (!(regflags&REG_EXTENDED))
/* BASIC: * is ok, but {, +, and ? must be preceeded by \ */
if (op == '*')
;
else if (op == '\\' &&
(M_INVARIANT(*regparse) == '{'
|| *regparse == '+' || *regparse == '?')) {
op = *regparse++, op = M_INVARIANT(op);
backset = 3;
}
else
op = 0;
/*
* Wasn't something that looked like a repetition?
* Reverse the character we just picked up, and return the
* whole piece of this RE. Also, if it looked like one of the
* '*', '?', or '+' repeaters and it's following a BOL then
* treat it as an ordinary character; if it's '{' following a
* BOL don't treat it as an ordinary character (safer from the
* user standpoint. POSIX 2.8.4.1.2 says this behavior falls into
* the undefined zone, but this matches historical practice well.
* !Nack!, XPG4 counters this and says '*', '+' or '?' at BOL or
* after a '^' at BOL or after a '|' alternation is an error
* (REG_BADRPT or REG_BADPAT) in ERE's; same with BRE's except for
* '*' which is done historically and '\?' is to treated as a literal;
* mostly handled in regatom().
* Before the 'atom' is returned alone or wrapped in a repeat
* construct, put a PUSH, POP and SEPR before it. The PUSH/POP
* preserves the offsets of the subexpression if we need to do
* some backtracking and the SEPR clears the pointers as we go forward
* because nested subexpressions need to be reset for each iteration
* of the outer subexpression in case it does not participate in
* the match while the outer one does.
*/
/* Don't allow repetition of empty operand. */
if ((flags & NOWIDTH) && (op == '+' || op == '{' ||
(op == '?' && (regflags&REG_EXTENDED)) ||
(op == '*' && (regflags&REG_EXTENDED)) )) {
/* +, ?, or { } has no atom/RE to repeat */
longjmp(err, REG_BADRPT);
}
else if (!(op == '*' || op == '+' || op == '?' || op == '{')
|| (M_INVARIANT(regparse[-backset]) == '^' &&
(regparse-backset) == regbegin && op != '{') ) {
regparse--;
*flagp = flags;
if (atom->type == PUSH) {
ret = regnode(PUSH);
ret->u.i = atom->u.i;
tp = ret->next = regnode(POP);
tp->u.i = atom->u.i;
tp->next = regnode(SEPR);
tp = tp->next;
tp->u.i = atom->u.i;
tp->next = atom;
atom = ret;
}
return (atom);
}
ret = NULL; /* to be sure */
/*
* Now look at repetition type, and do something with it.
* No default, because we know op is *+?{
* *, + and ? are now all converted to a {min,max} variation
* because of null matching when going forward after a backtrack.
*/
switch (op) {
case '*':
min = 0;
max = 255;
unbound = 1;
break;
case '+':
min = 1;
max = 255;
unbound = 1;
break;
case '?':
min = 0;
max = 1;
unbound = 0;
break;
case '{': /* } */
/* {min[,[max]]}: Parse off minimum */
if (!iswdigit(*regparse) && *regparse != '\0')
longjmp(err, REG_BADBR);
while ((op = *regparse++) != '\0' && iswdigit(op))
min = min*10 + op-'0';
/* Parse off maximum, or max same as min */
if (op == ',') {
max = 0;
while ((op = *regparse++) != '\0' && iswdigit(op))
max = max*10 + op-'0';
if (max == 0 && regparse[-2] != '0') {
max = min;
unbound = 1;
}
} else
max = min;
/* Parse off trailing delimiter */
if (op == '\0')
longjmp(err, REG_EBRACE);
if (M_INVARIANT(op) == '\\') /* \{ \} and \{ } are both valid */
op = *regparse++;
if (M_INVARIANT(op) != '}')
longjmp(err, REG_BADBR);
/* Check too large: RE_DUP_MAX is UCHAR_MAX */
if (min > UCHAR_MAX || max > UCHAR_MAX || max < min)
longjmp(err, REG_BADBR);
break;
}
if (ret == NULL) {
/*
* This is a code match from ipsu 1.55.1.6: If we haven't
* generated the repetition loop yet, then we need to
* generate a REPEATSTART, [REPEATCHECK,] REPEATINFO,
* REPEAT loop. For the weird construct we're building
* here, see the execution phase code.
*/
if (atom->type == PUSH) {
ret = regnode(PUSH);
ret->u.i = atom->u.i;
tp = ret->next = regnode(POP);
tp->u.i = atom->u.i;
tp->next = regnode(SEPR);
tp = tp->next;
tp->u.i = atom->u.i;
repy = np = tp->next = regnode(REPEATSTART);
np = np->next = regnode(REPEATCHECK);
}
else {
repy = ret = regnode(REPEATSTART);
/* add a check for a possible empty match */
np = ret->next = regnode(REPEATCHECK);
}
np->u.str = NULL;
/*if (min > 1)
min = 1;*/
np = np->next = regnode(REPEATINFO);
np->U.min = min;
np->U.max = unbound ? UCHAR_MAX : max;
np->U.n = 0;
np->U.unbound = unbound;
np->next = regnode(REPEAT);
np->next->u.np = atom;
regjoin(atom, repy->next);
}
*flagp = (min == 0) ? (WORST) : (WORST|HASWIDTH);
#if 0
/* P1003.2 does not specify whether these should be literal or an error */
if (ISMULT(*regparse))
return NULL; /*"nested *?+"*/
#endif
return ret;
}
/*f
* Generate a repetition node.
*/
static node *
regrepeat(register node *atom)
{
register node *np;
np = regnode(BRANCH);
np->u.np = atom; /* branch to atom then continue */
regjoin(atom, np); /* link atom to BRANCH */
/* if ".*" add to post-optimization list */
if ((atom->type == ANY || atom->type == ANY_NL) && atom->next == np) {
atom->u.np = optlist;
optlist = atom;
}
return np;
}
/*f
* optimize - optimize ".*" when we know following char
* In the ANY (ANY_NL) node, the arg is not normally used; we use u.np to
* chain the ANY nodes together so we can find them all here.
*
* Replace the ANY (ANY_NL) with a faster BREAK (BREAK_NL),
* or the SCAN with a SCANTO.
* We depend on ANY+1 == BREAK, SCAN+1 == SCANTO, ANY_NL+1 == BREAK_NL
*/
static void
optimize(void)
{
while (optlist != NULL) {
register node *np;
node *any = optlist;
optlist = optlist->u.np;
/* any->next is always BRANCH -- find next relevant node */
np = any->next->next; /* the EMPTY node */
while (np->type==EMPTY || np->type==WORDB ||
np->type==SETSP || np->type==PUSH || np->type==POP)
np = np->next;
/*
* Change ANY->BREAK, ANY_NL->BREAK_NL, SCAN->SCANTO
* ANY/ANY_NL/SCAN don't normally care about the current
* character -- if the following node does, we do this
* optimization so we can use strchr at runtime.
*/
any->type++; /* Assume we know next char */
if (np->type == CHAR)
any->u.c[0] = np->u.c[0];
else if (np->type == STRING)
any->u.c[0] = np->u.str[0];
else if (np->type == EOL)
any->u.c[0] = '\0';
else
any->type--; /* mis-assumed */
}
}
/*f
* regatom - the lowest level - pick off one character, backreference,
* subexpression, bracketed RE, start-of-line, ...
*/
static node *
regatom(register regex_t *r, int *flagp)
{
register node *ret;
register int op;
int quoted = 0;
int i;
*flagp = WORST; /* Tentatively. */
/* Look for backreference, subexpression, and newline */
op = *regparse++;
op = M_INVARIANT(op);
if (op == '\\') {
op = *regparse++;
if (op == '\0') {
longjmp(err, REG_EESCAPE);
} else if (op == '(' || op == ')') {
if (regflags&REG_EXTENDED)
quoted++;
} else if (op == '{') {
if (regflags&REG_EXTENDED)
goto Default;
else
longjmp(err, REG_BADRPT);
} else if (op == '*') {
goto Default;
} else if (op == '+') {
if (regflags&REG_EXTENDED)
goto Default;
else
longjmp(err, REG_BADRPT);
} else if (op == '?') {
if (regflags&REG_EXTENDED)
goto Default;
else if (regbegin == &regparse[-2])
goto Default;
else
longjmp(err, REG_BADRPT);
} else if ('1' <= op && op <= '9') {
int n = op - '0';
if (!(parflags[n]&CLOSEPAR))
longjmp(err, REG_ESUBREG);
ret = regnode(EXPAND);
ret->u.i = n;
*flagp = parflags[n];
return ret;
#ifdef REG_CESC
} else if (op == 'x') {
/* hex escape sequence */
long n;
errno = 0;
n = strtol(regparse, &regparse, 16);
if ((n == LONG_MAX || n == LONG_MIN)
&& errno == ERANGE) {
longjmp(err, REG_BADPAT);
}
ret = regnode(CHAR);
ret->u.c[0] = n & 0xFF;
ret->u.c[1] = '\0';
return ret;
} else if (op == '0') {
/* octal escape sequence */
int n = 0;
for (i = 0; '0' <= *regparse && *regparse <= '7'
&& i < 3; i += 1) {
n = (n << 3) + (*regparse++ - '0');
}
ret = regnode(CHAR);
ret->u.c[0] = n;
ret->u.c[1] = '\0';
return ret;
#endif /* REG_CESC */
} else
quoted++;
} else {
if (op == '(' || op == '|' || op == '{') {
if (!(regflags&REG_EXTENDED))
quoted++;
} else if (op == ')') {
/* Unmatched )'s are ordinary characters in ERE's */
if (!(regflags&REG_EXTENDED) || pardepth == 0 ) {
quoted++;
}
}
}
if (quoted) {
#ifdef REG_WORDS
switch (op) {
case '<':
ret = regnode(WORDB);
ret->u.i = 0;
*flagp = NOWIDTH;
return ret;
case '>':
ret = regnode(WORDB);
ret->u.i = 1;
*flagp = NOWIDTH;
return ret;
}
#endif
#ifdef REG_CESC
for (i = 0; cesc_map[i].code != '\0'; i += 1) {
if (cesc_map[i].code == op) {
ret = regnode(CHAR);
ret->u.c[0] = cesc_map[i].character;
ret->u.c[1] = '\0';
return ret;
}
}
#endif
goto Default;
}
switch (op) {
case '^':
if (regparse-1 == regbegin || regflags&REG_EXTENDED) {
ret = regnode((regflags&REG_NEWLINE) ? BOL_NL : BOL);
*flagp = NOWIDTH;
}
else
goto Default;
break;
case '$':
if (*regparse == '\0' || regflags&REG_EXTENDED) {
ret = regnode((regflags&REG_NEWLINE) ? EOL_NL : EOL);
*flagp = NOWIDTH;
}
else
goto Default;
break;
case '.':
ret = regnode((regflags&REG_NEWLINE) ? ANY_NL : ANY);
*flagp = HASWIDTH;
break;
case '[':
ret = regclass(r);
op = *regparse++;
if (M_INVARIANT(op) != ']')
longjmp(err, REG_EBRACK);
*flagp = HASWIDTH;
break;
case '(':
op = ++(r->re_nsub);
pardepth++;
ret = reg(r, flagp);
pardepth--;
if (op < 10)
parflags[op] = *flagp|CLOSEPAR;
if (token != ')')
longjmp(err, REG_EPAREN);
{
node *np, *re;
/* set subexpression start and end positions */
re = regnode(SETSP);
re->next = ret;
re->u.i = op;
ret = re;
re = regnode(SETEP);
re->u.i = op;
regjoin(ret, re);
/* generate PUSH and POP to save and restore */
re = regnode(PUSH);
re->next = np = regnode(POP);
re->u.i = np->u.i = op;
np->next = ret;
ret = re;
}
break;
case '\0':
case '|':
case ')':
token = op; /* handled higher up */
ret = NULL;
break;
case '?':
case '+':
case '{':
if (!(regflags & REG_EXTENDED))
goto Default;
longjmp(err, REG_BADRPT); /* wasn't quoted out as a literal */
break;
case '*':
if (!(regflags & REG_EXTENDED)) {
if (regparse-1 == regbegin ||
(regparse-2 == regbegin
&& M_INVARIANT(regparse[-2]) == '^') ||
(regparse-3 >= regbegin
&& M_INVARIANT(regparse[-2]) == '('
&& M_INVARIANT(regparse[-3]) == '\\'))
/* fall through */ ;
else
longjmp(err, REG_BADRPT);
/* not a literal because of RE context */
}
else
longjmp(err, REG_BADRPT);
/* wasn't quoted out as a literal */
default:
Default:
regparse--;
op = *regparse++;
*flagp = HASWIDTH;
if (regflags&REG_ICASE && iswalpha(op)) {
ret = regnode(ANYOF2);
ret->u.c[0] = op;
ret->u.c[1] = iswlower(op) ? towupper(op) :
iswupper(op) ? towlower(op) : op;
} else {
ret = regnode(CHAR);
ret->u.c[0] = op;
ret->u.c[1] = '\0';
}
break;
}
return ret;
}
/*f
* regclass - parse character class (bracket expression)
*/
static node *
regclass(regex_t *r)
{
register node *np;
register m_collel_t cbeg, cend;
if (M_INVARIANT(*regparse) == '^') { /* Complement of range. */
np = regnode((regflags&REG_NEWLINE) ? ANYBUT_NL : ANYBUT);
regparse++;
} else
np = regnode(ANYOF);
np->u.clp = NULL;
classbegin();
/*
* Process any leading - or ] as a special case.
*/
if (M_INVARIANT(*regparse) == ']' || *regparse == '-') {
if ((cbeg = regelement()) == -1)
longjmp(err, REG_ECOLLATE);
cend = 0;
while (*regparse == '-' &&
M_INVARIANT(regparse[1]) != ']' && regparse[1] != '\0') {
m_collel_t n;
m_collel_t *rp;
regparse++;
if ((cend = regelement()) == -1)
longjmp(err, REG_ECOLLATE);
/* Range beginning/ending in [==] or [::] not valid */
if (cbeg == 0 || cend == 0
|| (n = m_collrange(cbeg, cend, &rp)) <= 0 ||
n == (m_collel_t) -1)
longjmp(err, REG_ERANGE);
/* Add each character in the range */
while (n--)
classadd(*rp++, regflags&REG_ICASE);
cbeg = cend;
}
if (cend == 0 && cbeg > 0) /* not range & not added */
classadd(cbeg, regflags&REG_ICASE);
}
/*
* Process until end of bracketed expression, looking for ranges.
*/
while (*regparse != '\0' && M_INVARIANT(*regparse) != ']') {
if ((cbeg = regelement()) == -1)
longjmp(err, REG_ECOLLATE);
cend = 0;
while (*regparse == '-'
&& M_INVARIANT(regparse[1]) != ']'
&& M_INVARIANT(regparse[1]) != '\0') {
int n;
m_collel_t *rp;
regparse++;
if ((cend = regelement()) == -1)
longjmp(err, REG_ECOLLATE);
/* Range beginning/ending in [==] or [::] not valid */
if (cbeg == 0 || cend == 0
|| (n = m_collrange(cbeg, cend, &rp)) <= 0)
longjmp(err, REG_ERANGE);
/* Add each character in the range */
while (n--)
classadd(*rp++, regflags&REG_ICASE);
cbeg = cend;
}
if (cend == 0 && cbeg > 0) /* not range & not added */
classadd(cbeg, regflags&REG_ICASE);
}
np->u.clp = classclose();
return np;
}
/*f
* regelement - parse element of char class, either alone or in range.
* returns 0 if all the characters are already added to the class by
* calling classadd; otherwise a character or collating element to add.
* 0 implies that the element is not valid as an end of a range because
* it has multiple values
* e.g. [[=a=]-z], [[:alpha:]-z] are not valid, returns 0;
* but [[.ch.]-z] is valid.
*/
static m_collel_t
regelement(void)
{
register m_collel_t c;
register wuchar_t *cp;
char *brack;
wuchar_t *p;
int m, n;
/*
* Simple case: If it isn't [., [=, or [:, just return the single
* character.
*/
if (M_INVARIANT(*regparse) != '[' ||
((c=regparse[1]) != '.' && c != '=' && c != ':'))
return (_wctoce(_loaded_coll_, *regparse++));
regparse += 2;
/*
* Look forward for bracket of matching type.
* We can't nest the same type brackets: [.[.ch.]h.] or similar
* is never valid: the only reason we do this at all is for [=[.ch.]=]
* Any nesting is meaningless inside [..] or [::]; and only [..]
* means anything inside [==].
*/
for (cp = regparse; *cp != '\0'; cp++)
if (*cp == c && M_INVARIANT(cp[1]) == ']')
break;
if (*cp == '\0')
longjmp(err, REG_EBRACK);
/*
* Copy the contents of the [ ... ] out.
* It would be so much easier to just stuff a \0 in place,
* but regparse is the argument to regcomp, which the standard defines
* to be a const *, so it might really be un-stuffable.
* Need to explicitly check whether there is an expression in the
* bracket. That is, [..], and [==] are invalid r.e.'s.
*/
if ((n = cp - regparse) <= 0)
longjmp(err, REG_ECOLLATE);
#ifdef M_I18N_MB
/*
* Since the mks i18n routines work on narrow chars,
* convert here.
*/
if ((brack = malloc(m = n * (MB_LEN_MAX + 1))) == NULL)
longjmp(err, REG_ESPACE);
if ((p = (wuchar_t *)malloc(sizeof(wchar_t)*(n+1))) == NULL) {
free(brack);
longjmp(err, REG_ESPACE);
}
for (n = 0; regparse[n] != '\0'; n++) {
if (regparse[n] == c && M_INVARIANT(regparse[n+1]) == ']')
break;
p[n] = regparse[n];
}
if (!n) {
free(brack);
free(p);
longjmp(err, REG_ECHAR);
}
p[n] = 0;
if ((n = wcstombs(brack, p, m)) == -1) {
free(brack);
free(p);
longjmp(err, REG_ECHAR);
}
free(p);
#else
if ((brack = malloc(n+1)) == NULL)
longjmp(err, REG_ESPACE);
memcpy(brack, regparse, n);
#endif
brack[n] = '\0';
switch (c) {
case ':': { /* [: character-class-name :] */
m_collel_t n;
m_collel_t *ccl;
n = m_cclass(brack, &ccl);
free(brack);
if (n == (m_collel_t) -1)
longjmp(err, REG_ECTYPE);
while (n-- > 0)
if ((c = *ccl++) != '\0')
classadd(c, regflags&REG_ICASE);
c = 0;
break;
}
case '.': /* [. collating-symbol .] */
c = m_strtocoll(brack);
free(brack);
if (c == (m_collel_t) -1)
longjmp(err, REG_ECOLLATE);
break;
/*l
* Equivalence class is normally a single character; which may
* expand to multiple characters and some multi-character
* collating elements. There doesn't appear to be any reason that
* it can't be a collating-element: [=[.ch.]=] should produce all
* chars and collating elements equivalent to the collating element ch.
* Note that as of D11.2 of posix.2, the regular expression grammar
* in 2.8.5 doesn't make any sense.
*/
case '=': { /* [= equivalence-class =] */
int n;
m_collel_t *rp;
/*
* We accept either [=ch=], or [=[.ch.]=].
* It isn't obvious which the standard requires/allows.
* If it starts with [, then recurse to do this.
*/
if (M_INVARIANT(*brack) == '[') {
free(brack); /* In case longjmp out of regelement */
/* Try for [=[.ch.]=] style */
c = regelement();
/*
* regelement will return 0 if it was [==] or [::],
* otherwise return the appropriate multi-character
* collating element.
* regparse when done must point at our ending point
* (the =]) or we have something like [=[.ch.]x=].
* Why isn't there a special error message like
* REG_EEQUIV?
*/
if (c == 0 || regparse != cp)
longjmp(err, REG_ECOLLATE);
} else {
/* Try for [=ch=] */
c = m_strtocoll(brack);
free(brack);
if (c == (m_collel_t) -1)
longjmp(err, REG_ECOLLATE);
}
n = m_collequiv(c, &rp);
if (n!= (m_collel_t) -1)
while (n-- > 0)
classadd(*rp++, regflags & REG_ICASE);
c = 0;
break;
}
}
regparse = cp+2;
return (c);
}
/*f
* regnode - allocate and initialize a node
*/
static node *
regnode(int op)
{
register node *np;
if (regnodei == REGNODES)
regblockp = regblockp->next = regblock();
np = &regblockp->nodes[regnodei++];
np->type = op;
np->next = NULL;
return np;
}
/*f
* regblock - allocate a block of nodes.
*/
static block *
regblock(void)
{
register block *bp;
bp = (block*) malloc(sizeof(block));
if (bp == NULL)
longjmp(err, REG_ESPACE);
bp->next = NULL;
regnodei = 0;
return bp;
}
/*f
* regjoin - concatenate two node lists.
*/
static node *
regjoin(node *head, node *tail)
{
register node *np, *nplast = NULL;
if (head == NULL)
return tail;
for (np = head; np != NULL; np = np->next)
nplast = np;
nplast->next = tail;
return head;
}
/*f
* Free a regular expression. Cycle thru all the nodes freeing all the
* memory associated with them.
*/
void
regfree(regex_t *r)
{
register block *bp;
register node *np;
register int i;
block *nextbp;
if (r->re_block == NULL)
return;
for (bp = r->re_block; bp != NULL; bp = nextbp) {
for (i = REGNODES, np = bp->nodes; i--; np++)
switch (np->type) {
case STRING:
if (np->u.str != NULL)
free((void *)np->u.str);
break;
case ANYOF: case ANYBUT: case ANYBUT_NL:
if (np->u.clp != NULL)
classfree(np->u.clp);
break;
}
nextbp = bp->next;
free((void *) bp);
}
r->re_block = NULL;
}
/*
* wchar_opt - Scan the string for 8 bit chars converted to wide chars
*
* This routine is to be invoked only in "C" locale. It scans the string
* to see if there are wide characters over 255 - that means that it
* was a character in the range of 128-255 that got converted via
* wctomb into a wide char. In that case convert back to the 8 bit value
* and assign it directly to the wide character string.
*/
void
_wchar_opt(wchar_t *temp_pattern, const wchar_t *pattern)
{
unsigned char temp[16];
while (*pattern) {
if (*pattern > 255) {
if (wctomb((char *)temp, *pattern) != 1)
longjmp(err, REG_ECOLLATE);
else
*temp_pattern = (wchar_t) temp[0];
} else
*temp_pattern = *pattern;
temp_pattern++;
pattern++;
}
*temp_pattern = (wchar_t) '\0';
}