Files
Arquivotheca.AIX-4.1.3/bos/usr/ccs/lib/libc/__regcomp_std.c
seta75D d6fe8fe829 Init
2021-10-11 22:19:34 -03:00

1718 lines
44 KiB
C

static char sccsid[] = "@(#)46 1.12.1.13 src/bos/usr/ccs/lib/libc/__regcomp_std.c, libcpat, bos41J, 9511A_all 2/23/95 17:04:34";
/*
* COMPONENT_NAME: libcpat
*
* FUNCTIONS: CLEARBIT
* CLEARBITC
* OPPOSITE
* OVERFLOW
* SETBIT
* SETBITC
* __regcomp_std
* bracket
* bracketw
* enlarge
* intl_expr
* wgtstring
*
* ORIGINS: 27
*
* IBM CONFIDENTIAL -- (IBM Confidential Restricted when
* combined with the aggregated modules for this product)
* SOURCE MATERIALS
*
* (C) COPYRIGHT International Business Machines Corp. 1991,1995
* All Rights Reserved
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
#define _ILS_MACROS
#include <sys/types.h>
#include <sys/localedef.h>
#include <ctype.h>
#include <limits.h>
#include <regex.h>
#include <stdlib.h>
#include <string.h>
#include "reglocal.h"
#include "patlocal.h"
#include "libc_msg.h"
/*
 * Get the weight string for the character with index "index".
 * Expects a collation handle named "hdl" in scope at the point of use.
 * The argument is parenthesized so that any expression may be passed.
 */
#define wgtstring(index) __OBJ_DATA(hdl)->co_subs[(index)].tgt_wgt_str[0]
/************************************************************************/
/* External data defined in regexec() */
/************************************************************************/
extern int __reg_bits[]; /* bitmask for [] bitmap */
/************************************************************************/
/* RE compilation definitions */
/************************************************************************/
/*
 * Expand the compiled pattern buffer if fewer than "x" bytes remain.
 * Relies on the caller's locals pe, pp, pp_start, plastce, ppat, pattern
 * and preg.  If the buffer cannot be enlarged, records the error offset
 * and returns REG_ESPACE from the enclosing function.
 * "x" is parenthesized so that any expression may be passed safely.
 */
#define OVERFLOW(x)	{ \
		if (pe - pp < (x)) \
			{ \
			enlarge((x), &pp_start, &pe, &pp, &plastce); \
			if (pp_start == NULL) \
				{ \
				preg->re_erroff = \
				  (uchar_t *)ppat - (uchar_t *)pattern - 1; \
				return (REG_ESPACE); \
				} \
			} \
		}
#define PATTERN_EXP	128	/* compiled pattern expansion (bytes)	*/
/*
 * Prepare the last compiled expression (plastce) for a new repeat factor.
 *
 *  - If it already carries a repetition interval, strip the two min/max
 *    bytes that follow the code byte by sliding the whole payload down two
 *    bytes.  The copy must run until "pto" reaches the new end of pattern
 *    ("pto < pp"); the former "pto > pp" test moved only a single byte
 *    (corrupting multi-byte payloads such as CC_I_CHAR or a bitmap) and
 *    never terminated for elements with no payload at all.
 *  - If it is a CC_STRING / CC_I_STRING, uncouple the final character into
 *    its own CC_CHAR / CC_I_CHAR element so the repeat applies to that
 *    character only.
 *
 * Uses the caller's locals plastce, pp, pto and pfrom.
 */
#define REPEAT_CHECK	{ \
		if (((*plastce & CR_MASK) == CR_INTERVAL) || \
		    ((*plastce & CR_MASK) == CR_INTERVAL_ALL)) \
			{ \
			pp -= 2; \
			pto = plastce + 1; \
			pfrom = pto + 2; \
			while (pto < pp) \
				*pto++ = *pfrom++; \
			} \
		else if (*plastce == CC_STRING) \
			{ \
			plastce[1]--; \
			plastce = pp - 1; \
			*pp = pp[-1]; \
			pp[-1] = CC_CHAR; \
			pp++; \
			} \
		else if (*plastce == CC_I_STRING) \
			{ \
			plastce[1]--; \
			plastce = pp - 2; \
			*pp = pp[-1]; \
			pp[-1] = pp[-2]; \
			pp[-2] = CC_I_CHAR; \
			pp++; \
			} \
		}
/*
 * Set the bit for file code "c" in the C-locale [] bitmap at "pp".
 * Arguments are parenthesized so expressions such as "k | 1" index the
 * intended byte and bit.
 */
#define SETBITC(pp,c)	{ \
		*((pp) + ((c) >> 3)) |= __reg_bits[(c) & 7]; \
		}
/*
 * Clear the bit for file code "c" in the C-locale [] bitmap at "pp".
 * Arguments are parenthesized so expressions such as "k | 1" index the
 * intended byte and bit.
 */
#define CLEARBITC(pp,c)	{ \
		*((pp) + ((c) >> 3)) &= ~(__reg_bits[(c) & 7]); \
		}
/*
 * Set the bit for unique collating weight "ucoll" in the ILS [] bitmap at
 * "pp".  The bitmap is indexed relative to MIN_UCOLL; the offset is left
 * in the caller's local "delta".  Arguments are parenthesized so that an
 * expression argument is offset as a whole.
 */
#define SETBIT(pp,ucoll) \
		{ \
		delta = (ucoll) - MIN_UCOLL; \
		*((pp) + (delta >> 3)) \
			|= __reg_bits[delta & 7]; \
		}
/*
 * Clear the bit for unique collating weight "ucoll" in the ILS [] bitmap
 * at "pp".  The bitmap is indexed relative to MIN_UCOLL; the offset is
 * left in the caller's local "delta".  Arguments are parenthesized so that
 * an expression argument is offset as a whole.
 */
#define CLEARBIT(pp,ucoll) \
		{ \
		delta = (ucoll) - MIN_UCOLL; \
		*((pp) + (delta >> 3)) \
			&= ~(__reg_bits[delta & 7]); \
		}
/*
 * When ignoring case, also set the ILS bitmap bit of the opposite-case
 * form of wide character "wc".
 *
 *  - icase	nonzero when REG_ICASE processing is wanted
 *  - pp	ILS bitmap
 *  - ucoll	caller's scratch lvalue; receives the u.c.w. of the
 *		opposite-case character
 *  - wc	the character already placed in the bitmap
 *  - wc2	caller's scratch lvalue; receives the opposite-case character
 *
 * Nothing is set when "wc" has no distinct upper/lower form or when the
 * opposite-case weight lies outside MIN_UCOLL..MAX_UCOLL.
 * The nested tests are fully braced so the macro is safe next to "else".
 */
#define OPPOSITE(icase,pp,ucoll,wc,wc2) \
		{ \
		if ((icase) != 0) \
			{ \
			if (((wc2 = towupper(wc)) != (wc)) || \
			    ((wc2 = towlower(wc)) != (wc))) \
				{ \
				ucoll = __wcuniqcollwgt(wc2); \
				if (ucoll >= MIN_UCOLL && ucoll <= MAX_UCOLL) \
					{ \
					SETBIT(pp, ucoll) \
					} \
				} \
			} \
		}
/************************************************************************/
/* Internal function prototypes */
/************************************************************************/
static int bracket( /* convert [bracket] to bitmap */
uchar_t *,		/* ppat:  pattern text following the '['	*/
uchar_t **,		/* pnext: receives ptr past ']' (or error point)*/
uchar_t *,		/* pp:    bitmap to fill in			*/
int);			/* cflags: regcomp() flags			*/
static void enlarge( /* enlarge compiled pattern buffer */
int,			/* x:        number of bytes needed		*/
uchar_t **,		/* pp_start: start of pattern buffer (updated)	*/
uchar_t **,		/* pe:       end of pattern buffer (updated)	*/
uchar_t **,		/* pp:       next free slot (updated)		*/
uchar_t **);		/* plastce:  last compiled expr (updated)	*/
static int bracketw( /* convert [bracket] to bitmap */
_LC_collate_objhdl_t,	/* hdl:   collation table handle		*/
uchar_t *,		/* ppat:  pattern text following the '['	*/
uchar_t **,		/* pnext: receives ptr to next pattern byte	*/
uchar_t *,		/* pp:    bitmap to fill in			*/
regex_t *,		/* preg:  compiled pattern structure		*/
uchar_t *);		/* pmap:  character map, or NULL		*/
/* NOTE(review): intl_expr is defined outside the portion of the file	*/
/* reviewed here; its parameter roles are not documented for that reason.*/
static int intl_expr( /* decode internationalization [] */
_LC_collate_objhdl_t,
regex_t *,
uchar_t,
uchar_t *,
uchar_t *,
wchar_t *,
wchar_t *);
/************************************************************************/
/* __regcomp_std()- Compile RE pattern					*/
/*		  - valid for all locales and any codeset		*/
/*									*/
/*		  - hdl		ptr to __lc_collate table		*/
/*		  - preg	ptr to structure for compiled pattern	*/
/*		  - pattern	ptr to RE pattern			*/
/*		  - cflags	regcomp() flags				*/
/*									*/
/* Returns 0 on success, otherwise a REG_* status with preg->re_erroff	*/
/* set to the offending offset within "pattern".			*/
/*									*/
/* Every error return issued through REGCOMP_FAIL releases both work	*/
/* buffers (compiled pattern and character map) and resets		*/
/* preg->re_map so no dangling pointer is left behind.  The OVERFLOW	*/
/* macro still returns REG_ESPACE on its own.				*/
/************************************************************************/

/* free both work buffers, clear the map pointer, return status "err" */
#define REGCOMP_FAIL(err)	do { \
		free(pp_start); \
		free(pmap); \
		preg->re_map = NULL; \
		return (err); \
		} while (0)

int
__regcomp_std(_LC_collate_objhdl_t hdl, regex_t *preg, const char *pattern,
	int cflags)
{
	int	altloc[_REG_SUBEXP_MAX+1]; /* offset to last alternate	*/
	int	be_size;	/* [bracket] bitmap size		*/
	int	c;		/* pattern character			*/
	int	c2;		/* opposite case pattern character	*/
	int	delta;		/* SETBIT unique collating value offset	*/
	int	do_all;		/* set if {m,} is used.			*/
	int	eol[_REG_SUBEXP_MAX+1]; /* EOL anchor offset in pattern	*/
	int	ere;		/* extended RE flag			*/
	int	first;		/* logical beginning of pattern		*/
	int	first_BOL;	/* set when the first ^ is found	*/
	int	i;		/* loop index				*/
	int	icase;		/* ignore case flag			*/
	int	idx;		/* current subexpression index		*/
	int	isfirst;	/* first expression flag		*/
	int	maxri;		/* maximum repetition interval		*/
	int	mb_cur_max;	/* in memory copy of MB_CUR_MAX		*/
	int	minri;		/* minimum repetition interval		*/
	int	nsub;		/* highest subexpression index		*/
	uchar_t	*palt;		/* expand pattern ptr			*/
	uchar_t	*pe;		/* ptr to end of compiled pattern space	*/
	uchar_t	*pfrom;		/* expand pattern ptr			*/
	uchar_t	*plastce;	/* ptr to last compiled expression	*/
	uchar_t	*pmap;		/* ptr to character map table		*/
	uchar_t	*pp;		/* ptr to next compiled RE pattern slot	*/
	uchar_t	*pp_start;	/* ptr to start of compiled RE pattern	*/
	uchar_t	*ppat;		/* ptr to next RE pattern byte		*/
	uchar_t	*pri;		/* ptr to repetition interval		*/
	uchar_t	*psubidx;	/* ptr to current subidx entry		*/
	uchar_t	*pto;		/* expand pattern ptr			*/
	int	sblocale;	/* is this a single byte locale?	*/
	int	wclen;		/* length of character			*/
	uchar_t	sol[_REG_SUBEXP_MAX+1]; /* don't clear "first"		*/
	int	stat;		/* bracket() return status		*/
	uchar_t	subidx[_REG_SUBEXP_MAX+1]; /* active subexpression index*/
	wchar_t	wc;		/* a wide character			*/
	wchar_t	wc2;		/* opposite case pattern wide character	*/

/*
 * Allocate initial RE compiled pattern buffer
 * OVERFLOW(X) will expand buffer as required
 * If either allocation fails, release the one that succeeded
 */
	pmap = (uchar_t *)malloc(256*(sizeof(uchar_t)));
	pp = (uchar_t *)malloc(PATTERN_EXP);
	if (pp == NULL || pmap == NULL)
		{
		free(pp);
		free(pmap);
		preg->re_erroff = 0;
		return (REG_ESPACE);
		}
	pp_start = pp;
	pe = pp + PATTERN_EXP - 1;
/*
 * Other initialization
 */
	memset(preg, 0, sizeof(regex_t));
	preg->re_cflags = cflags;
	preg->re_ucoll[0] = MIN_UCOLL;
	preg->re_ucoll[1] = MAX_UCOLL;
	icase = cflags & REG_ICASE;
	ere = cflags & REG_EXTENDED;
	nsub = 0;
	plastce = NULL;
	preg->re_lsub[0] = 0;
	psubidx = subidx;
	*psubidx = 0;
	altloc[0] = 0;
	idx = 0;
	first = 0;
	first_BOL = 0;
	isfirst = 0;
	eol[0] = 0;
	sol[0] = 0;
	preg->re_map = pmap;
	mb_cur_max = MB_CUR_MAX;
	if (mb_cur_max == 1)
		{
		sblocale = 1;
		wclen = 1;
		}
	else
		sblocale = 0;
/*
 * BIG LOOP to process all characters in RE pattern
 * stop on NUL
 * return on any error
 * set character map for all characters which satisfy the pattern
 * expand pattern space now if large element won't fit
 */
	ppat = (uchar_t *)pattern;
	while ((c = *ppat++) != '\0')
		{
		OVERFLOW(10)
		switch(c)
			{
/*
 * match a single character
 *   error if preceded by ERE $
 *   if multibyte locale, set wclen and get wide character
 *   otherwise wclen is always set to 1
 *   if case sensitive pattern
 *     if single byte character
 *       if no previous pattern, add CC_CHAR code to pattern
 *       if previous pattern is CC_CHAR, convert to CC_STRING
 *       if previous pattern is CC_STRING, add to end of string
 *       otherwise add CC_CHAR code to pattern
 *     if multibyte character
 *       add CC_WCHAR code to pattern
 *
 *   if ignore case pattern
 *     if single byte character
 *       determine opposite case of pattern character
 *       if no opposite case, treat as case sensitive
 *       if no previous pattern, add CC_I_CHAR code to pattern
 *       if previous pattern is CC_I_CHAR, convert to CC_I_STRING
 *       if previous pattern is CC_I_STRING, add to end of string
 *       otherwise add CC_I_CHAR code to pattern
 *     if multibyte character
 *       determine opposite case of pattern character
 *       if no opposite case, process as case sensitive
 *       otherwise add CC_I_WCHAR code to pattern
 */
		default:
		cc_char:
			if (sblocale == 0)
				{
				wclen = mbtowc(&wc, (const char *)(ppat-1), mb_cur_max);
				if (wclen < 0)
					{
					preg->re_erroff = ppat - pattern - 1;
					REGCOMP_FAIL(REG_ECHAR);
					}
				}
			if (icase == 0)
				{
				if (wclen == 1)
					{
					if (plastce == NULL)
						{
						plastce = pp;
						*pp++ = CC_CHAR;
						*pp++ = c;
						if (isfirst++ == 0)
							pmap[c] = 1;
						}
					else if (*plastce == CC_CHAR)
						{
						*plastce = CC_STRING;
						*pp++ = plastce[1];
						plastce[1] = 2;
						*pp++ = c;
						}
					else if (*plastce == CC_STRING && plastce[1] < 255)
						{
						plastce[1]++;
						*pp++ = c;
						}
					else
						{
						plastce = pp;
						*pp++ = CC_CHAR;
						*pp++ = c;
						if (isfirst++ == 0)
							pmap[c] = 1;
						}
					}	/* if wclen == 1 */
				else
					{	/* multibyte character */
		cc_wchar:
					plastce = pp;
					*pp++ = CC_WCHAR;
					*pp++ = wclen;
					*pp++ = c;
					while (--wclen > 0)
						*pp++ = *ppat++;
					if (isfirst++ == 0)
						pmap[c] = 1;
					}
				}	/* if icase == 0 */
			else
				{
				if (wclen == 1)
					{
					c2 = toupper(c);
					if (c2 == c)
						c2 = tolower(c);
					if (plastce == NULL)
						{
						plastce = pp;
						*pp++ = CC_I_CHAR;
						*pp++ = c;
						*pp++ = c2;
						if (isfirst++ == 0)
							{
							pmap[c] = 1;
							pmap[c2] = 1;
							}
						}
					else if (*plastce == CC_I_CHAR)
						{
						*plastce = CC_I_STRING;
						*pp++ = plastce[2];
						plastce[2] = plastce[1];
						plastce[1] = 2;
						*pp++ = c;
						*pp++ = c2;
						}
					else if (*plastce == CC_I_STRING && plastce[1] < 255)
						{
						plastce[1]++;
						*pp++ = c;
						*pp++ = c2;
						}
					else
						{
						plastce = pp;
						*pp++ = CC_I_CHAR;
						*pp++ = c;
						*pp++ = c2;
						if (isfirst++ == 0)
							{
							pmap[c] = 1;
							pmap[c2] = 1;
							}
						}
					}	/* if single byte char */
				else
					{	/* multibyte case */
					/*
					 * wc is a wide character, so the wide
					 * case-mapping functions must be used
					 * for both directions (tolower() is
					 * only defined for unsigned char/EOF)
					 */
					if (((wc2 = towupper(wc)) == wc) &&
					    ((wc2 = towlower(wc)) == wc))
						goto cc_wchar;
					plastce = pp;
					*pp++ = CC_I_WCHAR;
					*pp++ = wclen;
					*pp++ = c;
					while (--wclen > 0)
						*pp++ = *ppat++;
					wclen = wctomb((char *)pp, wc2);
					if (wclen < 0)
						{
						wclen = 1;
						*pp = wc2;
						}
					if (isfirst++ == 0)
						{
						pmap[c] = 1;
						pmap[*pp] = 1;
						}
					pp += wclen;
					}
				}	/* if case sensitive */
			first++;
			continue;
/*
 * If we can use the smaller CC_BITMAP, use it:
 *   bracket expression
 *     always use 256-bit bitmap - indexed by file code
 *     decode pattern into list of characters which satisfy the [] expression
 *     error if invalid [] expression
 *     add CC_BITMAP to pattern
 *     set character map for each bit set in bitmap
 * otherwise
 *   ILS bracket expression
 *     bitmap size is based upon min/max unique collating value
 *     zero fill bitmap - yes its big for kanji
 *     error if invalid bracket expression
 *     add CC_WBITMAP
 *     set character map for each bit set in bitmap, however must
 *       convert bits from unique collation weight to file code
 *     Note: only use first byte of multibyte languages
 */
		case '[':
			if ((strcmp(setlocale(LC_COLLATE,NULL),"C") == 0) ||
			    ((__OBJ_DATA(hdl)->co_coltbl == NULL) &&
			     (MAX_PC - MIN_PC < BITMAP_LEN * NBBY)))
				{
				OVERFLOW(BITMAP_LEN+1)
				plastce = pp;
				*pp++ = CC_BITMAP;
				memset(pp, 0, BITMAP_LEN);
				stat = bracket(ppat, &pto, pp, cflags);
				if (stat != 0)
					{
					preg->re_erroff = pto - pattern - 1;
					REGCOMP_FAIL(stat);
					}
				ppat = pto;
				pto = pp;
				pp += BITMAP_LEN;
				if (isfirst++ == 0)
					{
					pfrom = pmap;
					do
						{
						if (*pto != 0)
							for (i=0; i<8; i++)
								if ((*pto & __reg_bits[i]) != 0)
									pfrom[i] = 1;
						pfrom += 8;
						}
					while (++pto < pp);
					}
				}
			else	/* now do CC_WBITMAP */
				{
				be_size = ((MAX_UCOLL - MIN_UCOLL) / NBBY) + 1;
				OVERFLOW(be_size+1)
				memset(pp+1, 0, be_size);
				stat = bracketw(hdl, ppat, &pto, pp+1, preg,
						isfirst == 0 ? pmap : NULL);
				if (stat != 0)
					{
					preg->re_erroff = (char *)pto - (char *)pattern;
					REGCOMP_FAIL(stat);
					}
				ppat = pto;
				plastce = pp;
				*pp++ = CC_WBITMAP;
				if (isfirst++ == 0)
					{
					wchar_t	ucoll;		/* unique collating value */
					wchar_t	min_ucoll;	/* minimum u.c.w	*/
					wchar_t	max_ucoll;	/* maximum u.c.w	*/
					uchar_t	filecode[MB_LEN_MAX]; /* pc -> fc	*/

					min_ucoll = MIN_UCOLL;
					max_ucoll = MAX_UCOLL;
					for (i = MIN_PC; i <= MAX_PC; i++)
						{
						ucoll = __wcuniqcollwgt(i);
						if (ucoll >= min_ucoll && ucoll <= max_ucoll)
							{
							delta = ucoll - min_ucoll;
							if ((*(pp + (delta >> 3)) & __reg_bits[delta & 7]) != 0)
								{
								if (sblocale != 0)
									pmap[i] = 1;
								else if (wctomb((char *)filecode, i) < 1)
									pmap[i & 0xff] = 1;
								else
									pmap[*filecode] = 1;
								}
							}
						}
					}
				pp += be_size;
				}
			first++;
			continue;
/*
 * zero or more matches of previous expression
 *   error if no valid previous expression for ERE
 *   ordinary character if no valid previous expression for BRE
 *   specify CR_STAR for previous expression repeat factor
 */
		case '*':
			if (plastce == NULL)
				{
				if (ere == 0)
					goto cc_char;
				else
					{
					preg->re_erroff = ppat - pattern - 1;
					REGCOMP_FAIL(REG_BADRPT);
					}
				}
			REPEAT_CHECK
			isfirst = 0;
			*plastce = (*plastce & ~CR_MASK) | CR_STAR;
			continue;
/*
 * match any character except NUL
 *   error if preceded by ERE $
 *   add CC_DOT code to pattern if REG_NEWLINE is not set & single byte locale
 *   add CC_DOTREG code to pattern if REG_NEWLINE is set & single byte locale
 *   add CC_WDOT code to pattern if multibyte locale
 *   set all map bits
 */
		case '.':
			plastce = pp;
			if (sblocale != 0)
				{
				if ((cflags & REG_NEWLINE) != 0)
					*pp++ = CC_DOTREG;
				else
					*pp++ = CC_DOT;
				}
			else
				*pp++ = CC_WDOT;
			if (isfirst++ == 0)
				memset(pmap, (int)1, (int)256);
			first++;
			continue;
/*
 * match beginning of line
 *   error if preceded by ERE $
 *   ordinary character if not
 *     first thing in BRE
 *     first thing is a subexpression BRE
 *   add CC_BOL to pattern
 *   set all map bits
 */
		case '^':
			if (first != 0 && ere == 0)
				goto cc_char;
			if (first_BOL && ere == 0 && *(pp-1) == CC_BOL)
				{
				first++;
				goto cc_char;
				}
			if (isfirst++ == 0)
				{
				plastce = NULL;
				memset(pmap, (int)1, (int)256);
				}
			first_BOL++;
			*pp++ = CC_BOL;
			continue;
/*
 * match end of line
 *   error if preceded by ERE $
 *   normal character if not last thing in BRE
 *   save $ offset in pattern for later testing and error reporting
 *   add CC_EOL to pattern
 */
		case '$':
			if ((ere == 0) && (*ppat != '\\') && (*ppat != '\0'))
				goto cc_char;
			eol[idx] = ppat - pattern;
			plastce = NULL;
			*pp++ = CC_EOL;
			if (isfirst++ == 0)
				{
				if ((cflags & REG_NEWLINE) != 0)
					pmap['\n'] = 1;
				pmap[0] = 1;
				}
			continue;
/*
 * backslash
 *   error if followed by NUL
 *   protects next ERE character
 *   introduces special BRE characters
 *     processing is based upon next character
 *       (	start subexpression
 *       )	end subexpression
 *       {	repetition interval
 *       1-9	backreference
 *       other	ordinary character
 */
		case '\\':
			c = *ppat++;
			if (c == 0)
				{
				preg->re_erroff = ppat - pattern - 1;
				REGCOMP_FAIL(REG_EESCAPE);
				}
			if (ere != 0)
				goto cc_char;
			switch (c)
				{
/*
 * start subexpression
 *   error if too many subexpressions
 *   save start information concerning this subexpression
 *   add CC_SUBEXP to pattern
 *   subexpression data follows up to ending CC_SUBEXP_E
 */
			case '(':
			lparen:
				if (nsub++ >= _REG_SUBEXP_MAX)
					{
					preg->re_erroff = ppat - pattern - 1;
					REGCOMP_FAIL(REG_EPAREN);
					}
				if (nsub > preg->__maxsub)
					preg->__maxsub = nsub;
				*++psubidx = nsub;
				eol[nsub] = 0;
				altloc[nsub] = 0;
				if (first == 0)
					sol[nsub] = 0;
				else
					sol[nsub] = 1;
				idx = nsub;
				plastce = NULL;
				*pp++ = CC_SUBEXP;
				*pp++ = nsub;
				preg->re_lsub[nsub] = (void *)(pp - pp_start);
				preg->re_esub[nsub] = NULL;
				continue;
/*
 * end subexpression
 *   error if no matching start subexpression BRE
 *   regular character if no matching start subexpression ERE
 *   save end information concerning this subexpression
 *   add CC_SUBEXP_E to pattern
 */
			case ')':
			rparen:
				if (--psubidx < subidx)
					{
					if (ere)
						{
						psubidx = subidx;
						goto cc_char;
						}
					preg->re_erroff = ppat - pattern - 1;
					REGCOMP_FAIL(REG_EPAREN);
					}
				preg->re_esub[idx] = (void *)(pp - pp_start);
				plastce = pp;
				*pp++ = CC_SUBEXP_E;
				*pp++ = idx;
				idx = *psubidx;
				first++;
				continue;
/*
 * repetition interval match of previous expression
 *   treat characters as themselves if no previous expression
 *   \{m\}   matches exactly m occurrences
 *   \{m,\}  matches at least m occurrences
 *   \{m,n\} matches m through n occurrences
 *   error if invalid sequence or previous expression already has * or {}
 *   insert two bytes for min/max after pattern code
 *   specify CR_INTERVAL for previous expression repeat factor
 *
 *   counts stop accumulating once they exceed RE_DUP_MAX; such values are
 *   rejected below anyway and this keeps a long digit string from
 *   overflowing an int
 */
			case '{':
				do_all = 0;
				if (plastce == NULL)
					{
					c = '\\';
					ppat--;
					goto cc_char;
					}
				pri = ppat;
				minri = 0;
				while ((c2 = *pri++) >= '0' && c2 <= '9')
					if (minri <= RE_DUP_MAX)
						minri = minri * 10 + c2 - '0';
				/**** first, lets check if we didn't convert anything ****/
				if ((pri == ppat+1) || (c2 == '\0'))
					{
					preg->re_erroff = ppat - pattern;
					REGCOMP_FAIL((c2=='\0')? REG_EBRACE : REG_BADBR);
					}
				if (c2 == '\\' && *pri == '}')
					{
					pri++;
					maxri = minri;
					}
				else if (c2 != ',')
					{
					preg->re_erroff = pri - pattern - 1;
					REGCOMP_FAIL(REG_BADBR);
					}
				else if (*pri == '\\' && pri[1] == '}')
					{
					pri += 2;
					do_all = 1;
					maxri = minri;
					}
				else
					{
					maxri = 0;
					while ((c2 = *pri++) >= '0' && c2 <= '9')
						if (maxri <= RE_DUP_MAX)
							maxri = maxri * 10 + c2 - '0';
					if (c2 != '\\' || *pri != '}')
						{
						preg->re_erroff = pri - pattern - 1;
						REGCOMP_FAIL((c2=='\0')? REG_EBRACE : REG_BADBR);
						}
					pri++;
					}
				if (minri > maxri || maxri > RE_DUP_MAX || *pri == '*' ||
				    (*plastce & CR_MASK) != 0)
					{
					preg->re_erroff = ppat - pattern;
					REGCOMP_FAIL(REG_BADBR);
					}
				maxri -= minri;
				ppat = pri;
				REPEAT_CHECK
				/* open a two byte gap after the code byte for min/max */
				pp += 2;
				pto = pp - 1;
				pfrom = pto - 2;
				do
					*pto-- = *pfrom--;
				while (pfrom > plastce);
				if (do_all)
					*plastce = (*plastce & ~CR_MASK) | CR_INTERVAL_ALL;
				else
					*plastce = (*plastce & ~CR_MASK) | CR_INTERVAL;
				plastce[1] = minri;
				plastce[2] = maxri;
				if (minri == 0)
					isfirst = 0;
				continue;
/*
 * subexpression backreference
 *   error if subexpression not completed yet
 *   add CC_BACKREF to pattern if case sensitive
 *   add CC_I_BACKREF or CC_I_WBACKREF to pattern if ignore case
 */
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				c -= '0';
				if (c > nsub || preg->re_esub[c] == NULL)
					{
					preg->re_erroff = ppat - pattern - 1;
					REGCOMP_FAIL(REG_ESUBREG);
					}
				plastce = pp;
				if (icase == 0)
					*pp++ = CC_BACKREF;
				else if (sblocale != 0)
					*pp++ = CC_I_BACKREF;
				else
					*pp++ = CC_I_WBACKREF;
				*pp++ = c;
				first++;
				continue;
/*
 * not a special character
 *   treat as ordinary character
 */
			default:
				goto cc_char;
				}
/*
 * start subexpression for ERE
 *   do same as \( for BRE
 *   treat as ordinary character for BRE
 */
		case '(':
			if (ere != 0)
				goto lparen;
			goto cc_char;
/*
 * end subexpression for ERE
 *   do same as \) for BRE
 *   treat as ordinary character for BRE
 */
		case ')':
			if (ere != 0)
				goto rparen;
			goto cc_char;
/*
 * zero or one match of previous expression
 *   ordinary character for BRE
 *   error if no valid previous expression
 *   ignore if previous expression already has *
 *   specify CR_QUESTION for previous expression repeat factor
 */
		case '?':
			if (ere == 0)
				goto cc_char;
			if (plastce == NULL)
				{
				preg->re_erroff = ppat - pattern - 1;
				REGCOMP_FAIL(REG_BADRPT);
				}
			if ((*plastce & CR_MASK) > CR_QUESTION)
				continue;
			REPEAT_CHECK
			*plastce = (*plastce & ~CR_MASK) | CR_QUESTION;
			isfirst = 0;
			continue;
/*
 * one or more matches of previous expression
 *   ordinary character for BRE
 *   error if no valid previous expression
 *   ignore if previous expression already has * or ?
 *   specify CR_PLUS for previous expression repeat factor
 */
		case '+':
			if (ere == 0)
				goto cc_char;
			if (plastce == NULL)
				{
				preg->re_erroff = ppat - pattern - 1;
				REGCOMP_FAIL(REG_BADRPT);
				}
			if ((*plastce & CR_MASK) > CR_PLUS)
				continue;
			REPEAT_CHECK
			*plastce = (*plastce & ~CR_MASK) | CR_PLUS;
			continue;
/*
 * repetition interval match of previous expression
 *   ordinary character for BRE
 *   {m}   matches exactly m occurrences
 *   {m,}  matches at least m occurrences
 *   {m,n} matches m through n occurrences
 *   treat characters as themselves if invalid sequence
 *   ignore if previous expression already has * or ? or +
 *   error if valid {} does not have previous expression
 *   insert two bytes for min/max after pattern code
 *   specify CR_INTERVAL for previous expression repeat factor
 *
 * XPG4 leaves '{' undefined for EREs when it is not part of a valid
 * repetition interval, so every malformed interval below falls back to
 * treating '{' as an ordinary character instead of returning
 * REG_BADPAT/REG_BADBR.
 *
 * counts stop accumulating once they exceed RE_DUP_MAX; such values are
 * rejected below anyway and this keeps a long digit string from
 * overflowing an int
 */
		case '{':
			do_all = 0;
			if (ere == 0)
				goto cc_char;
			pri = ppat;
			minri = 0;
			while ((c2 = *pri++) >= '0' && c2 <= '9')
				if (minri <= RE_DUP_MAX)
					minri = minri * 10 + c2 - '0';
			if (pri == ppat+1)
				goto cc_char;
			if (c2 == '}')
				maxri = minri;
			else if (c2 != ',')
				goto cc_char;
			else if (*pri == '}')
				{
				do_all = 1;
				maxri = minri;
				pri++;
				}
			else
				{
				maxri = 0;
				while ((c2 = *pri++) >= '0' && c2 <= '9')
					if (maxri <= RE_DUP_MAX)
						maxri = maxri * 10 + c2 - '0';
				if (c2 != '}')
					goto cc_char;
				}
			if (minri > maxri || maxri > RE_DUP_MAX)
				goto cc_char;
			maxri -= minri;
			if (plastce == NULL)
				{
				preg->re_erroff = ppat - pattern - 1;
				REGCOMP_FAIL(REG_BADBR);
				}
			ppat = pri;
			if ((*plastce & CR_MASK) > CR_INTERVAL_ALL)
				continue;
			REPEAT_CHECK
			/* open a two byte gap after the code byte for min/max */
			pp += 2;
			pto = pp - 1;
			pfrom = pto - 2;
			do
				*pto-- = *pfrom--;
			while (pfrom > plastce);
			if (do_all)
				*plastce = (*plastce & ~CR_MASK) | CR_INTERVAL_ALL;
			else
				*plastce = (*plastce & ~CR_MASK) | CR_INTERVAL;
			plastce[1] = minri;
			plastce[2] = maxri;
			if (minri == 0)
				isfirst = 0;
			continue;
/*
 * begin alternate expression
 *   treat <vertical-line> as normal character if
 *     1) BRE
 *     2) not followed by another expression
 *     3) beginning of pattern
 *     4) no previous expression
 *   insert leading CC_ALTERNATE if this is first alternative at this level
 *     compensate affected begin/end subexpression offsets
 *   compute delta offset from last CC_ALTERNATE to this one
 *   add CC_ALTERNATE_E to pattern, terminating previous alternative
 *   add CC_ALTERNATE to pattern, starting next alternative
 *   indicate now at end-of-line position
 *   indicate now at beginning-of-line if not blocked by previous expression
 */
		case '|':
			if (ere == 0 || *ppat == ')' || *ppat == '\0' || ppat == pattern+1 ||
			    (plastce == NULL && ppat[-2] != '^' && ppat[-2] != '$'))
				goto cc_char;
			palt = pp_start + (size_t)preg->re_lsub[idx];
			if (altloc[idx] == 0)
				{
				/* open a three byte gap at the start of this level */
				pp += 3;
				pto = pp - 1;
				pfrom = pto - 3;
				do
					*pto-- = *pfrom--;
				while (pfrom >= palt);
				*palt = CC_ALTERNATE;
				palt[1] = 0;
				palt[2] = 0;
				if (psubidx == subidx)
					{
					for (i=1; i<=nsub; i++)
						{
						preg->re_lsub[i] = (void *)((size_t)(preg->re_lsub[i]) + 3);
						preg->re_esub[i] = (void *)((size_t)(preg->re_esub[i]) + 3);
						}
					}
				else
					{
					for (i=*psubidx; i<=nsub; i++)
						if (preg->re_esub[i] != NULL)
							{
							preg->re_lsub[i] = (void *)((size_t)(preg->re_lsub[i]) + 3);
							preg->re_esub[i] = (void *)((size_t)(preg->re_esub[i]) + 3);
							}
					}
				}
			else
				palt = altloc[idx] + pp_start;
			i = pp - palt - 1;
			palt[1] = i >> 8;
			palt[2] = i & 0xff;
			*pp++ = CC_ALTERNATE_E;
			*pp++ = idx;
			altloc[idx] = pp - pp_start;
			*pp++ = CC_ALTERNATE;
			*pp++ = 0;
			*pp++ = 0;
			plastce = NULL;
			eol[idx] = 0;
			if (sol[idx] == 0)
				{
				first = 0;
				isfirst = 0;
				}
			continue;
			}	/* end of switch */
		}		/* end of while */
/*
 * Return error if missing ending subexpression
 */
	if (psubidx != subidx)
		{
		preg->re_erroff = ppat - pattern - 1;
		REGCOMP_FAIL(REG_EPAREN);
		}
/*
 * Set all map bits to prevent regexec() failure if
 * "first" expression not defined yet
 *   1) empty pattern
 *   2) last expression has *, ?, or {0,}
 */
	if (isfirst == 0)
		memset(pmap, (int)1, (int)256);
/*
 * No problems so add trailing end-of-pattern compile code
 * There is always supposed to be room for this
 */
	*pp++ = CC_EOP;
/*
 * Convert beginning/ending subexpression offsets to addresses
 * Change first subexpression expression to start of subexpression
 */
	preg->re_lsub[0] = pp_start;
	preg->re_esub[0] = pp - 1;
	for (i=1; i<=nsub; i++)
		{
		preg->re_lsub[i] = pp_start + (size_t)preg->re_lsub[i] - 2;
		preg->re_esub[i] = pp_start + (size_t)preg->re_esub[i];
		}
/*
 * Define remaining RE structure and return status
 */
	preg->re_comp = (void *)pp_start;
	preg->re_len = pp - pp_start;
	if ((cflags & REG_NOSUB) == 0)
		preg->re_nsub = nsub;
	return (0);
}

#undef REGCOMP_FAIL
/************************************************************************/
/* bracket	- convert [] expression into compiled RE pattern	*/
/*		  (256-bit bitmap indexed by file code)			*/
/*									*/
/*		- ppat		ptr to pattern (just past the '[')	*/
/*		- pnext		receives ptr to pattern address		*/
/*				following the [] on success, or the	*/
/*				error position on failure		*/
/*		- pp		ptr to compiled RE pattern (bitmap)	*/
/*		- cflags	__regcomp() flags			*/
/*									*/
/* Returns 0 on success, otherwise REG_EBRACK, REG_ECTYPE,		*/
/* REG_ECOLLATE or REG_ERANGE.						*/
/************************************************************************/

/*
 * bracket_mark - set the bitmap bit of file code "ch"; when "fold" is
 * nonzero also set the bit of its opposite case character
 */
static void
bracket_mark(uchar_t *pp, int ch, int fold)
{
	int	other;		/* opposite case of ch			*/

	SETBITC(pp, ch)
	if (fold == 0)
		return;
	other = toupper(ch);
	if (other == ch)
		other = tolower(ch);
	SETBITC(pp, other)
}

static int
bracket(uchar_t *ppat, uchar_t **pnext, uchar_t *pp, int cflags)
{
	char	name[CLASS_SIZE+1];	/* [: :] text with terminating NUL */
	int	ch;		/* file code of current pattern character */
	int	fold;		/* ignore case flag			*/
	int	in_range;	/* nonzero after an unresolved <hyphen>	*/
	int	intro;		/* character following an inner '['	*/
	int	k;		/* loop index				*/
	int	negate;		/* nonmatching bitmap			*/
	int	start;		/* previous character for range expr	*/
	uchar_t	*cur;		/* ptr into [] expression		*/
	char	*pname;		/* ptr into name[]			*/
	uchar_t	*range_end;	/* ptr to end point in range expression	*/
	wctype_t handle;	/* character class handle for is_wctype	*/

	fold = cflags & REG_ICASE;
	cur = ppat;
	negate = 0;
	in_range = 0;
	start = 0;
	range_end = NULL;
/*
 * A leading <circumflex> selects a nonmatching expression
 */
	if (*cur == '^')
		{
		cur++;
		negate++;
		}
/*
 * A leading <hyphen> or <right-bracket> stands for itself rather than
 * acting as range operator or [] terminator
 */
	if (*cur == '-' || *cur == ']')
		{
		start = *cur++;
		SETBITC(pp, start)
		}
/*
 * Walk the [] expression until the closing ] is found
 *   each step consumes one of
 *     a) the terminator ]
 *     b) a <hyphen> opening a range
 *     c) a character class [: :]
 *     d) an equivalence class [= =] or collating symbol [. .]
 *        (only a single one-byte character is accepted)
 *     e) any other single character
 */
	while ((ch = *cur++) != '\0')
		{
		if (ch == ']')
			{
			/*
			 * terminator
			 *   a pending <hyphen> (expression ends with -]) is literal
			 *   for a nonmatching expression invert the bitmap, never
			 *   allow NUL, and exclude newline under REG_NEWLINE
			 */
			if (in_range != 0)
				{
				SETBITC(pp, in_range)
				}
			if (negate != 0)
				{
				for (k = 0; k < BITMAP_LEN; k++)
					pp[k] = ~pp[k];
				pp[0] &= 0xfe;
				if ((cflags & REG_NEWLINE) != 0)
					pp[1] &= 0xfb;
				}
			*pnext = cur;
			return (0);
			}

		if (ch == '-' && in_range == 0)
			{
			/* remember the <hyphen>; the next element is the end point */
			in_range = ch;
			range_end = cur;
			continue;
			}

		if (ch != '[')
			{
			/* ordinary character (including a second <hyphen>) */
			bracket_mark(pp, ch, fold);
			}
		else
			{
			intro = *cur++;
			if (intro == ':')
				{
				/*
				 * [: :] character class
				 *   copy the class name into a NUL terminated buffer
				 *   error if empty, too long, or unknown
				 *   mark every character 1..255 belonging to the class
				 */
				pname = name;
				while (!(*cur == ':' && cur[1] == ']'))
					{
					if (*cur == '\0')
						{
						*pnext = cur - 1;
						return (REG_EBRACK);
						}
					if (pname >= &name[CLASS_SIZE-1])
						{
						*pnext = cur;
						return (REG_ECTYPE);
						}
					*pname++ = *cur++;
					}
				if (pname == name)
					{
					*pnext = cur;
					return (REG_ECTYPE);
					}
				*pname = '\0';
				if ((handle = get_wctype(name)) == -1)
					{
					*pnext = cur;
					return (REG_ECTYPE);
					}
				cur += 2;
				for (k = 1; k <= 255; k++)
					if (is_wctype(k, handle) != 0)
						bracket_mark(pp, k, fold);
				/* a class can never be a range end point */
				ch = 0;
				}
			else if (intro == '=' || intro == '.')
				{
				/*
				 * [= =] or [. .]
				 *   must hold exactly one character followed by the
				 *   matching terminator pair
				 */
				if (*cur == '\0' || cur[1] != intro || cur[2] != ']')
					{
					*pnext = cur;
					return (REG_ECOLLATE);
					}
				ch = *cur;
				cur += 3;
				bracket_mark(pp, ch, fold);
				}
			else
				{
				/* plain '[': give back the look-ahead character */
				cur--;
				bracket_mark(pp, ch, fold);
				}
			}
/*
 * Range handling
 *   start is the file code of the range start point (0 if none)
 *   ch is the file code of the element just processed (end point)
 *   error if there is no start point or it exceeds the end point
 *   mark every character strictly between the two end points
 */
		if (in_range == 0)
			{
			start = ch;
			continue;
			}
		in_range = 0;
		if (start > ch || start == 0)
			{
			*pnext = range_end;
			return (REG_ERANGE);
			}
		for (k = start + 1; k < ch; k++)
			bracket_mark(pp, k, fold);
		start = 0;
		}
/*
 * ran off the end of the pattern without finding <right-bracket>
 */
	*pnext = cur - 1;
	return (REG_EBRACK);
}
/************************************************************************/
/* enlarge	- enlarge compiled pattern buffer			*/
/*									*/
/*		- x		# of new bytes needed in pattern buf	*/
/*		- pp_start	ptr to starting address of pattern buf	*/
/*		- pe		ptr to ending address of pattern buf	*/
/*		- pp		ptr to next free slot in pattern buf	*/
/*		- plastce	ptr to last compiled pattern code	*/
/*									*/
/* The buffer grows by PATTERN_EXP bytes at a time until at least "x"	*/
/* additional bytes are available.  On success all four pointers are	*/
/* rebased onto the new buffer and the old one is freed.		*/
/*									*/
/* On allocation failure *pp_start is set to NULL and the old buffer	*/
/* is freed as well: the caller only learns about the failure through	*/
/* the NULL and has no other copy of the old start address, so the	*/
/* buffer would otherwise be leaked.  *pe, *pp and *plastce are left	*/
/* untouched in that case and must not be dereferenced.			*/
/************************************************************************/
static void
enlarge(int x, uchar_t **pp_start, uchar_t **pe, uchar_t **pp, uchar_t **plastce)
{
	size_t	old_len;	/* previous length (bytes)		*/
	size_t	new_len;	/* new length (bytes)			*/
	uchar_t	*old_start;	/* previous pp_start			*/
	uchar_t	*new_start;	/* new pp_start				*/

	old_start = *pp_start;
	old_len = *pe - old_start + 1;
	new_len = old_len + PATTERN_EXP;
	while (new_len < old_len + x)
		new_len += PATTERN_EXP;
	new_start = (uchar_t *)malloc(new_len);
	if (new_start == NULL)
		{
		free(old_start);
		*pp_start = NULL;
		return;
		}
	memcpy(new_start, old_start, old_len);
	*pp_start = new_start;
	*pe = new_start + new_len - 1;
	*pp = (*pp - old_start) + new_start;
	if (*plastce != NULL)
		*plastce = (*plastce - old_start) + new_start;
	free(old_start);
	return;
}
/************************************************************************/
/* bracketw - convert [bracket expression] into compiled RE bitmap */
/************************************************************************/
/*
 * hdl    collation object handle, passed through to intl_expr()
 * ppat   first byte following the opening <open-bracket>
 * pnext  out: on success, the byte following the closing <close-bracket>;
 *        on failure, a position within the pattern related to the error
 * pp     bitmap to fill in; bits are addressed by unique collating value
 * preg   supplies re_cflags (REG_ICASE and REG_NEWLINE are examined)
 * pmap   if not NULL, the entry for the first byte of every [. .]
 *        collating symbol is set to 1
 *
 * Returns 0 when the closing <close-bracket> is reached, otherwise
 * REG_ECOLLATE, REG_EBRACK, REG_ECTYPE, REG_ERANGE, REG_ECHAR, or the
 * non-zero status returned by intl_expr().
 */
static int
bracketw(_LC_collate_objhdl_t hdl, uchar_t *ppat, uchar_t **pnext, uchar_t *pp,
    regex_t *preg, uchar_t *pmap)
{
    int     dashflag;       /* in the middle of a range expression  */
    int     delta;          /* SETBIT unique collating value offset */
    int     i;              /* loop index for range of bits         */
    int     icase;          /* ignore case flag                     */
    wchar_t max_ucoll;      /* maximum unique collating value       */
    wchar_t min_ucoll;      /* minimum unique collating value       */
    int     mb_cur_max;     /* local copy of MB_CUR_MAX             */
    int     neg;            /* nonmatching bitmap flag              */
    uchar_t *pb;            /* ptr to [bracket expression]          */
    uchar_t *pclass;        /* class[] ptr                          */
    uchar_t *pdash;         /* ptr to <hyphen> in range expression  */
    wchar_t prev_min_ucoll; /* previous character min_ucoll         */
    wchar_t sv_wc;          /* save wc of previous character        */
    uchar_t *pxor;          /* pattern ptr to xor nonmatching []    */
    int     stat;           /* intl_expr return status              */
    wchar_t ucoll;          /* unique collating value of lowercase  */
    wchar_t wc;             /* character process code               */
    wchar_t wc2;            /* OPPOSITE character process code      */
    int     wclen;          /* # bytes in next character            */
    uchar_t class[CLASS_SIZE]; /* [ ] text with terminating <NUL>   */

    pb = ppat;
    dashflag = 0;
    prev_min_ucoll = 0;     /* only read after a range start has been seen */
    mb_cur_max = MB_CUR_MAX;
    icase = preg->re_cflags & REG_ICASE;

    /*
     * <circumflex> defines a nonmatching bracket expression if it is
     * the first [bracket expression] character
     *
     * neg is assigned outright in both branches; incrementing it here
     * would read an uninitialized automatic variable
     */
    if (*pb == '^')
    {
        pb++;
        neg = 1;
    }
    else
        neg = 0;

    /*
     * determine process code of next character
     * leading <circumflex> means nonmatching []
     *
     * use next byte if invalid multibyte character detected
     * determine min/max unique collating value of next character
     *
     * next character can be one of the following
     *   a) single collating element (any single character)
     *   b) equivalence character ([= =])
     *   c) character class ([: :])
     *   d) collating symbol ([. .])
     */
    while ((wclen = mbtowc(&wc, pb, mb_cur_max)) > 0)
    {
        pb += wclen;
        min_ucoll = __wcuniqcollwgt(wc);
        max_ucoll = min_ucoll;
        switch (wc)
        {
        /*
         * single character collating element
         * invalid if has an out-of-range unique collating value (meaning
         * it is not considered for collation)
         * set bitmap associated with character's unique collating value
         */
        default:
        coll_ele:
            if (min_ucoll < MIN_UCOLL || min_ucoll > MAX_UCOLL)
            {
                *pnext = pb - wclen;
                return (REG_ECOLLATE);
            }
            SETBIT(pp, min_ucoll);
            OPPOSITE(icase, pp, ucoll, wc, wc2);
            break;

        /*
         * <hyphen> defines a range expression a-z if it is surrounded by a
         * valid range expression
         * it is treated as itself if the first or last character within
         * the [bracket expression]
         */
        case '-':
            if ((dashflag != 0) ||
                ((neg == 0 && pb == ppat + 1) || (neg != 0 && pb == ppat + 2)) ||
                (*pb == ']'))
                goto coll_ele;
            dashflag++;
            pdash = pb - 1;
            /* skip the range bookkeeping below; the start point is kept */
            continue;

        /*
         * <open-bracket> initiates one of the following internationalization
         * character expressions:
         *   a) [= =] equivalence character class
         *   b) [. .] collation symbol
         *   c) [: :] character class
         *
         * it is treated as itself if not followed by one of the three
         * special characters <equal-sign>, <period>, or <colon>
         *
         * move contents of [ ] to a <NUL> terminated string
         * set bitmap bits by calling intl_expr
         * min/max will return with valid values if not character class
         *
         * set pmap bit for first byte of collation symbol because loop
         * in bracketw() does not know about collation symbols in v3.2
         */
        case '[':
            if (*pb != '=' && *pb != '.' && *pb != ':')
                goto coll_ele;
            *pnext = pb++;          /* *pnext now addresses the delimiter */
            pclass = class;
            while (1)
            {
                if (*pb == '\0')
                {
                    *pnext = pb;
                    return (REG_EBRACK);
                }
                if (*pb == **pnext && pb[1] == ']')
                    break;
                /*
                 * keep the last element of class[] free for the
                 * terminating <NUL> stored after the loop
                 */
                if (pclass >= &class[CLASS_SIZE - 1])
                    return (REG_ECTYPE);
                *pclass++ = *pb++;
            }
            if (pclass == class)
                return (REG_ECTYPE);
            *pclass = '\0';
            pb += 2;                /* step over the closing delimiter and ']' */
            stat = intl_expr(hdl, preg, (char)**pnext, class, pp,
                &min_ucoll, &max_ucoll);
            if (stat != 0)
                return (stat);
            if (pmap != NULL && **pnext == '.')
                pmap[*class] = 1;
            break;

        /*
         * <close-bracket> is treated as itself if it is the first character
         * within the [bracket expression]
         * otherwise it correctly ends this [bracket expression]
         * set
         * complement the final bitmap if nonmatching [bracket expression]
         * making sure <NUL> is not allowed to match, and <newline> does
         * not match if REG_NEWLINE is set
         */
        case ']':
            if ((neg == 0 && pb == ppat + 1) || (neg != 0 && pb == ppat + 2))
                goto coll_ele;
            if (neg != 0)
            {
                pxor = pp + ((MAX_UCOLL - MIN_UCOLL) / NBBY);
                for (; pxor >= pp; pxor--)
                    *pxor = ~*pxor;
                ucoll = __wcuniqcollwgt('\0');
                if (ucoll >= MIN_UCOLL && ucoll <= MAX_UCOLL)
                    CLEARBIT(pp, ucoll)
                if ((preg->re_cflags & REG_NEWLINE) != 0)
                {
                    ucoll = __wcuniqcollwgt('\n');
                    if (ucoll >= MIN_UCOLL && ucoll <= MAX_UCOLL)
                        CLEARBIT(pp, ucoll)
                }
            }
            *pnext = pb;
            return (0);
        } /* end of switch */

        /*
         * a range expression a-z sets all of the bitmap bits between the starting
         * and ending point of the range.  The range is invalid if
         *   a) either the starting or ending point has a non-collating collating value
         *   b) the starting point collating value is greater than the ending point
         *   c) either end point is a character class
         *   d) the starting point is a previous range expression
         * if ignoring case, set bit associated with opposite case of each character
         */
        if (dashflag != 0)
        {
            dashflag = 0;
            if (prev_min_ucoll < MIN_UCOLL ||
                max_ucoll > MAX_UCOLL ||
                prev_min_ucoll > max_ucoll)
            {
                *pnext = pdash;
                return (REG_ERANGE);
            }
            /* Loop thru all process codes looking for wgts between the two endpoints */
            for (i = MIN_PC; i <= MAX_PC; i++)
            {
                ucoll = __wcuniqcollwgt(i);
                if (ucoll >= prev_min_ucoll && ucoll <= max_ucoll)
                {
                    SETBIT(pp, ucoll);
                    OPPOSITE(icase, pp, ucoll, i, wc2);
                }
            }
            /* zero start value: a following <hyphen> range is checked against it */
            min_ucoll = 0;
        }
        prev_min_ucoll = min_ucoll;
        sv_wc = wc;
    } /* end of while */

    /*
     * return with error when <NUL> or invalid character detected
     */
    *pnext = pb;
    if (wclen < 0)
        return (REG_ECHAR);
    return (REG_EBRACK);
}
/************************************************************************/
/* intl_expr - decode [ ] internationalization character expression */
/************************************************************************/
/*
 * hdl    collation object handle (used by wgtstring() and _mbucoll())
 * preg   supplies re_cflags (REG_ICASE is examined)
 * type   expression delimiter: '=', '.' or ':'
 * pexpr  <NUL> terminated text found between the delimiters
 * pp     bitmap to fill in; bits are addressed by unique collating value
 * pmin   out: minimum unique collating value of the expression;
 *        set to zero for a character class
 * pmax   out: maximum unique collating value of the expression;
 *        not modified for a character class
 *
 * Returns 0 on success, otherwise REG_ECHAR, REG_ECOLLATE or REG_ECTYPE.
 */
static int
intl_expr(_LC_collate_objhdl_t hdl, regex_t *preg, uchar_t type,
    uchar_t *pexpr, uchar_t *pp, wchar_t *pmin, wchar_t *pmax)
{
    int      delta;         /* SETBIT variable                          */
    wint_t   i;             /* loop index                               */
    int      icase;         /* ignore case flag                         */
    wchar_t  ocoll;         /* opposite case collating weight           */
    wchar_t  pcoll;         /* primary collating weight                 */
    uchar_t  *pend;         /* ptr to end of collating element + 1      */
    wchar_t  *pwgt;         /* ptr to collation weight table            */
    wchar_t  *tmpwgt;       /* temp collating weight table ptr          */
    wchar_t  ucoll;         /* unique collating weight                  */
    wchar_t  wc;            /* process code of pexpr character          */
    wchar_t  wc2;           /* OPPOSITE character process code          */
    int      wclen;         /* # bytes in pexpr character               */
    wctype_t wctype;        /* character class handle for is_wctype     */
    uchar_t  lcexpr[NL_NMAX*MB_LEN_MAX]; /* lowercase [. .]             */

    icase = preg->re_cflags & REG_ICASE;
    switch (type)
    {
    /*
     * equivalence class [= =]
     * treat invalid collating element as collating symbol
     * set bitmap bits for all characters in the equivalence class
     * if ignoring case, set bit associated with opposite case version of [= =]
     * define min/max unique collating values for the equivalence class
     */
    case '=':
        wclen = mbtowc(&wc, pexpr, MB_CUR_MAX);
        if (wclen < 0)
            return (REG_ECHAR);
        /* more than one character: handle the text as a collating symbol */
        if (pexpr[wclen] != '\0')
            goto co_symbol;
        pwgt = __wccollwgt(wc);
        if (pwgt == NULL)
            return (REG_ECOLLATE);
        *pmax = pwgt[_UCW_ORDER];
        pcoll = *pwgt;
        if ((*pmax < MIN_UCOLL || *pmax > MAX_UCOLL) && (pcoll != 0))
            return (REG_ECOLLATE);
        *pmin = *pmax;
        {
            /* These indexes are for 1 to many mappings. */
            int  idx_ec;    /* index to wgt string for equivalence class sought */
            int  idx_chk;   /* same, but for character being checked            */
            char *wstr_ec;  /* weight string for equivalence class sought       */
            char *wstr_chk; /* same, but for string for character being checked */
            int  substring;

            /*
             * NOTE(review): idx_ec and idx_chk are both read from slot [0],
             * which is equal to _SUB_STRING whenever they are read, so the
             * strcmp() below always compares an entry with itself -- confirm
             * which slot is meant to carry the weight string index.
             */
            substring = (pcoll == _SUB_STRING);
            if (substring)
            {
                idx_ec = pwgt[0];
                wstr_ec = wgtstring(idx_ec);    /* We'll need this later. */
            }
            for (i = MIN_PC; i <= MAX_PC; i++)
            {
                tmpwgt = __wccollwgt(i);        /* weight table */
                /* same guard as applied to the lookup of wc above */
                if (tmpwgt == NULL)
                    continue;
                ucoll = tmpwgt[_UCW_ORDER];     /* unique weight */
                if (substring && tmpwgt[0] == _SUB_STRING)
                {
                    idx_chk = tmpwgt[0];            /* weight string index */
                    wstr_chk = wgtstring(idx_chk);  /* weight string */
                }
                /* do primary wgts match (even if they're _SUB_STRING) */
                if (tmpwgt[0] == pcoll && (ucoll >= MIN_UCOLL && ucoll <= MAX_UCOLL))
                {
                    if (substring && strcmp(wstr_ec, wstr_chk) != 0)
                        continue;   /* substrings don't match */
                    /* same equivalence class */
                    SETBIT(pp, ucoll);
                    OPPOSITE(icase, pp, ocoll, i, wc2);
                    if (ucoll < *pmin)
                        *pmin = ucoll;
                    if (ucoll > *pmax)
                        *pmax = ucoll;
                }
            }
        }
        break;

    /*
     * collating symbol [. .]
     * set single bitmap bit for the collation symbol
     * define min/max as collation symbol unique collating value
     * (no opposite case bit is set here, even when ignoring case)
     *
     * return error if invalid collation symbol
     */
    case '.':
    co_symbol:
        ucoll = _mbucoll(hdl, pexpr, &pend);
        if (ucoll < MIN_UCOLL || ucoll > MAX_UCOLL)
            return (REG_ECOLLATE);
        /* the whole text must have been consumed as one collating element */
        if (*pend != '\0')
            return (REG_ECOLLATE);
        SETBIT(pp, ucoll);
        *pmin = ucoll;
        *pmax = ucoll;
        break;

    /*
     * character class [: :]
     * return error if undefined in current locale
     * set bitmap bit for each process code with the class characteristic
     * if ignoring case, set bit associated with opposite case version of [: :]
     * define min unique collating value as zero so character class
     * cannot be used with a range expression
     */
    case ':':
        wctype = get_wctype(pexpr);
        /* assumes get_wctype() reports an unknown class name as -1 */
        if (wctype == (wctype_t)-1)
            return (REG_ECTYPE);
        for (i = 1; i <= MAX_PC; i++)
            if (is_wctype(i, wctype) != 0)
            {
                ucoll = __wcuniqcollwgt(i);
                if (ucoll >= MIN_UCOLL && ucoll <= MAX_UCOLL)
                {
                    SETBIT(pp, ucoll);
                    OPPOSITE(icase, pp, ocoll, i, wc2);
                }
            }
        *pmin = 0;
        break;

    /* any other delimiter: nothing to set, min/max left unchanged */
    default:
        break;
    }
    return (0);
}