diff options
Diffstat (limited to 'Src/pattern.c')
-rw-r--r-- | Src/pattern.c | 1116 |
1 files changed, 656 insertions, 460 deletions
diff --git a/Src/pattern.c b/Src/pattern.c index 048e3d3ec..914479847 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -1,5 +1,5 @@ /* - * glob.c - filename generation + * pattern.c - pattern matching * * This file is part of zsh, the Z shell. * @@ -50,13 +50,28 @@ */ #include "zsh.mdh" -#include "pattern.pro" /* - * Globbing flags: lower 8 bits gives approx count + * The following union is used mostly for alignment purposes. + * Normal nodes are longs, while certain nodes take a char * as an argument; + * here we make sure that they both work out to the same length. + * The compiled regexp we construct consists of upats stuck together; + * anything else to be added (strings, numbers) is stuck after and + * then aligned to a whole number of upat units. + * + * Note also that offsets are in terms of the sizes of these things. */ -#define C_LCMATCHUC 0x0100 -#define C_IGNCASE 0x0200 +union upat { + long l; + unsigned char *p; +}; + +typedef union upat *Upat; + +#include "pattern.pro" + +/* Number of active parenthesised expressions allowed in backreferencing */ +#define NSUBEXP 9 /* definition number opnd? meaning */ #define P_END 0x00 /* no End of program. */ @@ -154,36 +169,17 @@ #define PP_UNKWN 13 #define PP_RANGE 14 -/* Align everything to the pointer type. */ -typedef char *zalign_t; - -#define P_OP(p) (*(long *)(p) & 0xff) -#define P_NEXT(p) (*(long *)(p) >> 8) -#define P_OPERAND(p) ((p) + sizeof(zalign_t)) -#define P_ISBRANCH(p) (*(long *)(p) & 0x20) -#define P_ISEXCLUDE(p) ((*(long *)(p) & 0x30) == 0x30) -#define P_NOTDOT(p) (*(long *)(p) & 0x40) +#define P_OP(p) ((p)->l & 0xff) +#define P_NEXT(p) ((p)->l >> 8) +#define P_OPERAND(p) ((p) + 1) +#define P_ISBRANCH(p) ((p)->l & 0x20) +#define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30) +#define P_NOTDOT(p) ((p)->l & 0x40) #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */ #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */ #define P_PURESTR 0x04 /* Can be matched with a strcmp */ -/* - * pointer is end string to be parsed? - * a bit dire because of extendedglob possibilities: - * we need to make sure a ~ at the end of a string isn't mistaken - * for an excluder or lots of emacs users get very cross. - */ -#define ISENDCHAR(X) (!*(X) || ((patflags & PAT_FILE) && *(X) == '/') || \ - *(X) == Bar || *(X) == Outpar || \ - (isset(EXTENDEDGLOB) && \ - (*(X) == Hat || \ - (*(X) == Tilde && \ - !(!(X)[1] || ((patflags & PAT_FILE) && \ - (X)[1] == '/') || \ - (X)[1] == Bar || (X)[1] == Outpar || \ - (X)[1] == Tilde))))) - /* Next character after one which may be a Meta (x is any char *) */ #define METANEXT(x) (*(x) == Meta ? (x)+2 : (x)+1) /* @@ -202,6 +198,36 @@ typedef zlong zrange_t; typedef unsigned long zrange_t; #endif +/* + * Characters which terminate a pattern segment. We actually use + * a pointer patendseg which skips the first character if we are not + * parsing a file pattern. + * Note that the size of this and the next array are hard-wired + * via the definitions. + */ + +static char endseg[] = { + '/', /* file only */ + '\0', Bar, Outpar, /* all patterns */ + Tilde /* extended glob only */ +}; + +#define PATENDSEGLEN_NORM 4 +#define PATENDSEGLEN_EXT 5 + +/* Characters which terminate a simple string */ + +static char endstr[] = { + '/', /* file only */ + '\0', Bar, Outpar, Quest, Star, Inbrack, Inpar, Inang, + /* all patterns */ + Tilde, Hat, Pound /* extended glob only */ +}; + +#define PATENDSTRLEN_NORM 9 +#define PATENDSTRLEN_EXT 12 + + /* Default size for pattern buffer */ #define P_DEF_ALLOC 256 @@ -212,6 +238,10 @@ static char *patcode; /* point of code emission */ static long patsize; /* size of code */ static char *patout; /* start of code emission string */ static long patalloc; /* size allocated for same */ +static char *patendseg; /* characters ending segment */ +static int patendseglen; /* length of same */ +static char *patendstr; /* characters ending plain string */ +static int patendstrlen; /* length of sameo */ /* Flags used in both compilation and execution */ static int patflags; /* flags passed down to patcompile */ @@ -226,8 +256,8 @@ patadd(char *add, int ch, long n, int noalgn) /* Make sure everything gets aligned unless we get noalgn. */ long newpatsize = patsize + n; if (!noalgn) - newpatsize = (newpatsize + sizeof(zalign_t) - 1) & - ~(sizeof(zalign_t) - 1); + newpatsize = (newpatsize + sizeof(union upat) - 1) & + ~(sizeof(union upat) - 1); if (patalloc < newpatsize) { long newpatalloc = 2*(newpatsize > patalloc ? newpatsize : patalloc); @@ -245,7 +275,7 @@ patadd(char *add, int ch, long n, int noalgn) } static long rn_offs; -/* operates on poiners, returns a pointer */ +/* operates on pointers to union upat, returns a pointer */ #define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \ (P_OP(p) == P_BACK) ? \ ((p)-rn_offs) : ((p)+rn_offs) : NULL) @@ -262,24 +292,18 @@ patcompstart(void) /* Top level pattern compilation subroutine */ /**/ -Patprog +mod_export Patprog patcompile(char *exp, int inflags, char **endexp) { - int flags, len; + int flags = 0, len = 0; long startoff; - char *pscan, *lng; + Upat pscan; + char *lng, *strp = NULL; Patprog p; - DPUTS(sizeof(long) > sizeof(zalign_t), "BUG: patprog alignment too small"); - -#ifdef BACKREFERENCES - startoff = (inflags & PAT_BACKR) ? sizeof(struct patprog) : - sizeof(struct patprog_short); -#else startoff = sizeof(struct patprog); -#endif /* Ensure alignment of start of program string */ - startoff = (startoff + sizeof(zalign_t) - 1) & ~(sizeof(zalign_t) - 1); + startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1); /* Allocate reasonable sized chunk if none, reduce size if too big */ if (patalloc != P_DEF_ALLOC) @@ -287,10 +311,23 @@ patcompile(char *exp, int inflags, char **endexp) patcode = patout + startoff; patsize = patcode - patout; patstart = patparse = exp; + /* + * Note global patnpar numbers parentheses 1..9, while patnpar + * in struct is actual count of parentheses. + */ patnpar = 1; - patflags = inflags; + patflags = inflags & ~PAT_PURES; + + patendseg = endseg; + patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM; + patendstr = endstr; + patendstrlen = isset(EXTENDEDGLOB) ? PATENDSTRLEN_EXT : PATENDSTRLEN_NORM; if (!(patflags & PAT_FILE)) { + patendseg++; + patendstr++; + patendseglen--; + patendstrlen--; remnulargs(exp); patglobflags = 0; } @@ -299,65 +336,88 @@ patcompile(char *exp, int inflags, char **endexp) */ ((Patprog)patout)->globflags = patglobflags; - if (patflags & PAT_ANY) - flags = 0; - else if (patcompswitch(0, &flags) == 0) - return NULL; + if (!(patflags & PAT_ANY)) { + /* Look for a really pure string, with no tokens at all. */ + if (!patglobflags) + for (strp = exp; *strp && + (!(patflags & PAT_FILE) || *strp != '/') && !itok(*strp); + strp++) + ; + if (!strp || (*strp && *strp != '/')) { + /* No, do normal compilation. */ + strp = NULL; + if (patcompswitch(0, &flags) == 0) + return NULL; + } else { + /* Yes, copy the string and skip compilation altogether */ + patparse = strp; + len = strp - exp; + patadd(exp, 0, len + 1, 0); + patout[startoff + len] = '\0'; + patflags |= PAT_PURES; + } + } /* end of compilation: safe to use pointers */ p = (Patprog)patout; p->startoff = startoff; p->patstartch = '\0'; p->globend = patglobflags; - p->flags = (patflags & ~PAT_PURES); + p->flags = patflags; p->mustoff = 0; p->size = patsize; - p->patmlen = 0; - pscan = patout + startoff; + p->patmlen = len; + p->patnpar = patnpar-1; - if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { - /* only one top level choice */ - pscan = P_OPERAND(pscan); + if (!strp) { + pscan = (Upat)(patout + startoff); - if (flags & P_PURESTR) { - /* - * The pattern can be matched with a simple strncmp/strcmp. - * Careful in case we've overwritten the node for the next ptr. - */ - char *dst = patout + startoff, *next; - p->flags |= PAT_PURES; - for (; pscan; pscan = next) { - next = PATNEXT(pscan); - if (P_OP(pscan) == P_EXACTLY) { - char *opnd = P_OPERAND(pscan); - while ((*dst = *opnd++)) - dst++; + if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { + /* only one top level choice */ + pscan = P_OPERAND(pscan); + + if (flags & P_PURESTR) { + /* + * The pattern can be matched with a simple strncmp/strcmp. + * Careful in case we've overwritten the node for the next ptr. + */ + char *dst = patout + startoff; + Upat next; + p->flags |= PAT_PURES; + for (; pscan; pscan = next) { + next = PATNEXT(pscan); + if (P_OP(pscan) == P_EXACTLY) { + char *opnd = (char *)P_OPERAND(pscan); + while ((*dst = *opnd++)) + dst++; + } } - } - *dst++ = '\0'; - p->size = dst - patout; - /* patmlen is reall strlen, don't include null byte */ - p->patmlen = p->size - startoff - 1; - } else { - /* starting point info */ - if (P_OP(pscan) == P_EXACTLY && !p->globflags) - p->patstartch = *P_OPERAND(pscan); - /* Find the longest literal string in something expensive. - * This is itself not all that cheap if we have case-insensitive - * matching or approximation, so don't. - */ - if ((flags & P_HSTART) && !p->globflags) { - lng = NULL; - len = 0; - for (; pscan; pscan = PATNEXT(pscan)) - if (P_OP(pscan) == P_EXACTLY && - strlen((char *)P_OPERAND(pscan)) >= len) { - lng = P_OPERAND(pscan); - len = strlen((char *)P_OPERAND(pscan)); + *dst++ = '\0'; + p->size = dst - patout; + /* patmlen is really strlen, don't include null byte */ + p->patmlen = p->size - startoff - 1; + } else { + /* starting point info */ + if (P_OP(pscan) == P_EXACTLY && !p->globflags) + p->patstartch = *(char *)P_OPERAND(pscan); + /* + * Find the longest literal string in something expensive. + * This is itself not all that cheap if we have + * case-insensitive matching or approximation, so don't. + */ + if ((flags & P_HSTART) && !p->globflags) { + lng = NULL; + len = 0; + for (; pscan; pscan = PATNEXT(pscan)) + if (P_OP(pscan) == P_EXACTLY && + strlen((char *)P_OPERAND(pscan)) >= len) { + lng = (char *)P_OPERAND(pscan); + len = strlen(lng); + } + if (lng) { + p->mustoff = lng - patout; + p->patmlen = len; } - if (lng) { - p->mustoff = lng - patout; - p->patmlen = len; } } } @@ -367,8 +427,13 @@ patcompile(char *exp, int inflags, char **endexp) * The pattern was compiled in a fixed buffer: unless told otherwise, * we stick the compiled pattern on the heap. This is necessary * for files where we will often be compiling multiple segments at once. + * But if we get the ZDUP flag w always put it in zalloc()ed memory. */ - if (!(patflags & PAT_STATIC)) { + if (patflags & PAT_ZDUP) { + Patprog newp = (Patprog)zalloc(patsize); + memcpy((char *)newp, (char *)p, patsize); + p = newp; + } else if (!(patflags & PAT_STATIC)) { Patprog newp = (Patprog)zhalloc(patsize); memcpy((char *)newp, (char *)p, patsize); p = newp; @@ -389,26 +454,22 @@ static long patcompswitch(int paren, int *flagp) { long starter, br, ender, excsync = 0; -#ifdef BACKREFERENCES int parno = 0; -#endif - int flags, gfchanged = 0, savflags = patflags, savglobflags = patglobflags; - char *ptr; + int flags, gfchanged = 0, savglobflags = patglobflags; + Upat ptr; *flagp = 0; -#ifdef BACKREFERENCES - if (paren && (patflags & PAT_BACKR)) { + if (paren && (patglobflags & GF_BACKREF) && patnpar <= NSUBEXP) { /* * parenthesized: make an open node. * We can only refer to the first nine parentheses. * For any others, we just use P_OPEN on its own; there's * no gain in arbitrarily limiting the number of parentheses. */ - parno = patnpar >= NSUBEXP ? 0 : patnpar++; + parno = patnpar++; starter = patnode(P_OPEN + parno); } else -#endif starter = 0; br = patnode(P_BRANCH); @@ -424,17 +485,16 @@ patcompswitch(int paren, int *flagp) *flagp |= flags & (P_HSTART|P_PURESTR); while (*patparse == Bar || - (isset(EXTENDEDGLOB) && - *patparse == Tilde && patparse[1] && patparse[1] != Bar && - patparse[1] != Outpar && patparse[1] != Tilde && - !((patflags & PAT_FILE) && patparse[1] == '/'))) { + (isset(EXTENDEDGLOB) && *patparse == Tilde && + (patparse[1] == '/' || + !memchr(patendseg, patparse[1], patendseglen)))) { int tilde = *patparse++ == Tilde; long gfnode = 0, newbr; *flagp &= ~P_PURESTR; if (tilde) { - unsigned char *unull = NULL; + union upat up; /* excsync remembers the P_EXCSYNC node before a chain of * exclusions: all point back to this. only the * original (non-excluded) branch gets a trailing P_EXCSYNC. @@ -455,10 +515,16 @@ patcompswitch(int paren, int *flagp) patglobflags &= ~0xff; br = patnode(!(patflags & PAT_FILET) || paren ? P_EXCLUDE : P_EXCLUDP); - patadd((char *)&unull, 0, sizeof(unull), 0); + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); /* / is not treated as special if we are at top level */ - if (!paren) - patflags &= ~PAT_FILE; + if (!paren && *patendseg == '/') { + tilde++; + patendseg++; + patendseglen--; + patendstr++; + patendstrlen--; + } } else { excsync = 0; br = patnode(P_BRANCH); @@ -485,16 +551,23 @@ patcompswitch(int paren, int *flagp) * No gfchanged, as nothing to follow branch at top * level. */ + union upat up; gfnode = patnode(P_GFLAGS); - patadd((char *)&patglobflags, 0, sizeof(long), - 0); + up.l = patglobflags; + patadd((char *)&up, 0, sizeof(union upat), 0); } } else { patglobflags = savglobflags; } } newbr = patcompbranch(&flags); - patflags = savflags; + if (tilde == 2) { + /* restore special treatment of / */ + patendseg--; + patendseglen++; + patendstr--; + patendstrlen++; + } if (!newbr) return 0; if (gfnode) @@ -513,21 +586,16 @@ patcompswitch(int paren, int *flagp) * branch at that point would indicate the current choices continue, * which they don't. */ -#ifdef BACKREFERENCES - ender = patnode(paren ? (patflags & PAT_BACKR) ? P_CLOSE+parno - : P_NOTHING : P_END); -#else - ender = patnode(paren ? P_NOTHING : P_END); -#endif + ender = patnode(paren ? parno ? P_CLOSE+parno : P_NOTHING : P_END); pattail(starter, ender); /* * Hook the tails of the branches to the closing node, * except for exclusions which terminate where they are. */ - for (ptr = patout + starter; ptr; ptr = PATNEXT(ptr)) + for (ptr = (Upat)patout + starter; ptr; ptr = PATNEXT(ptr)) if (!P_ISEXCLUDE(ptr)) - patoptail(ptr-patout, ender); + patoptail(ptr-(Upat)patout, ender); /* check for proper termination */ if ((paren && *patparse++ != Outpar) || @@ -566,12 +634,9 @@ patcompbranch(int *flagp) *flagp = P_PURESTR; starter = chain = 0; - while (*patparse && !((patflags & PAT_FILE) && *patparse == '/') && - *patparse != Bar && *patparse != Outpar && - (!isset(EXTENDEDGLOB) || *patparse != Tilde || - !patparse[1] || patparse[1] == Bar || patparse[1] == Outpar - || patparse[1] == Tilde || - ((patflags & PAT_FILE) && patparse[1] == '/'))) { + while (!memchr(patendseg, *patparse, patendseglen) || + (*patparse == Tilde && patparse[1] != '/' && + memchr(patendseg, patparse[1], patendseglen))) { if (isset(EXTENDEDGLOB) && ((!isset(SHGLOB) && (*patparse == Inpar && patparse[1] == Pound)) || @@ -601,8 +666,10 @@ patcompbranch(int *flagp) */ if (oldglobflags != patglobflags) { /* Flags changed */ + union upat up; latest = patnode(P_GFLAGS); - patadd((char *)&patglobflags, 0, sizeof(int), 0); + up.l = patglobflags; + patadd((char *)&up, 0, sizeof(union upat), 0); } else { /* No effect. */ continue; @@ -663,17 +730,37 @@ patgetglobflags(char **strp) case 'l': /* Lowercase in pattern matches lower or upper in target */ - patglobflags = (patglobflags & ~C_IGNCASE) | C_LCMATCHUC; + patglobflags = (patglobflags & ~GF_IGNCASE) | GF_LCMATCHUC; break; case 'i': /* Fully case insensitive */ - patglobflags = (patglobflags & ~C_LCMATCHUC) | C_IGNCASE; + patglobflags = (patglobflags & ~GF_LCMATCHUC) | GF_IGNCASE; break; case 'I': /* Restore case sensitivity */ - patglobflags &= ~(C_LCMATCHUC|C_IGNCASE); + patglobflags &= ~(GF_LCMATCHUC|GF_IGNCASE); + break; + + case 'b': + /* Make backreferences */ + patglobflags |= GF_BACKREF; + break; + + case 'B': + /* Don't make backreferences */ + patglobflags &= ~GF_BACKREF; + break; + + case 'm': + /* Make references to complete match */ + patglobflags |= GF_MATCHREF; + break; + + case 'M': + /* Don't */ + patglobflags &= ~GF_MATCHREF; break; default: @@ -696,156 +783,105 @@ static long patcomppiece(int *flagp) { long starter, next, pound, op; - int flags, kshchar; - unsigned char *ptr; - - starter = patcompatom(&flags, &kshchar); - if (!starter) - return 0; - - if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) && - (kshchar <= 0 || kshchar == '@' || kshchar == '!')) { - *flagp = flags; - return starter; - } - - /* too much at once doesn't currently work */ - if (kshchar && pound) - return 0; - - if (kshchar == '*') { - op = P_ONEHASH; - *flagp = P_HSTART; - } else if (kshchar == '+') { - op = P_TWOHASH; - *flagp = P_HSTART; - } else if (kshchar == '?') { - op = 0; - *flagp = 0; - } else if (*++patparse == Pound) { - op = P_TWOHASH; - patparse++; - *flagp = P_HSTART; - } else { - op = P_ONEHASH; - *flagp = P_HSTART; - } - - /* - * Note optimizations with pointers into P_NOTHING branches: some - * should logically point to next node after current piece. - * - * Backtracking is also encoded in a slightly obscure way: the - * code emitted ensures we test the non-empty branch of complex - * patterns before the empty branch on each repetition. Hence - * each time we fail on a non-empty branch, we try the empty branch, - * which is equivalent to backtracking. - */ - if ((flags & P_SIMPLE) && op == P_ONEHASH && - P_OP(patout+starter) == P_ANY) { - /* Optimize ?# to *. Silly thing to do, since who would use - * use ?# ? But it makes the later code shorter. - */ - long *lptr = (long *)(patout + starter); - *lptr = (*lptr & ~0xff) | P_STAR; - } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) { - /* Don't simplify if we need to look for approximations. */ - patinsert(op, starter, NULL, 0); - } else if (op == P_ONEHASH) { - /* Emit x# as (x&|), where & means "self". */ - ptr = NULL; - patinsert(P_WBRANCH, starter, (char *)&ptr, sizeof(ptr)); - /* Either x */ - patoptail(starter, patnode(P_BACK)); /* and loop */ - patoptail(starter, starter); /* back */ - pattail(starter, patnode(P_BRANCH)); /* or */ - pattail(starter, patnode(P_NOTHING)); /* null. */ - } else if (op == P_TWOHASH) { - /* Emit x## as x(&|) where & means "self". */ - next = patnode(P_WBRANCH); /* Either */ - ptr = NULL; - patadd((char *)&ptr , 0, sizeof(ptr), 0); - pattail(starter, next); - pattail(patnode(P_BACK), starter); /* loop back */ - pattail(next, patnode(P_BRANCH)); /* or */ - pattail(starter, patnode(P_NOTHING)); /* null. */ - } else if (kshchar == '?') { - /* Emit ?(x) as (x|) */ - patinsert(P_BRANCH, starter, NULL, 0); /* Either x */ - pattail(starter, patnode(P_BRANCH)); /* or */ - next = patnode(P_NOTHING); /* null */ - pattail(starter, next); - patoptail(starter, next); - } - if (*patparse == Pound) - return 0; - - return starter; -} - -/* - * Parse lowest level pattern. If doing ordinary characters, we - * gobble a complete string as far as we can get. - * kshcharp returns a character found before an Inpar, for handling - * as a closure. - */ - -/**/ -static long -patcompatom(int *flagp, int *kshcharp) -{ - long starter; - int patch, flags, len, ch; + int flags, flags2, kshchar, len, ch, patch; + union upat up; char *nptr, *str0, cbuf[2]; zrange_t from, to; - *flagp = 0; + flags = 0; str0 = patparse; for (;;) { - /* check kshglob here */ - *kshcharp = '\0'; + /* + * Check if we have a string. First, we need to make sure + * the string doesn't introduce a ksh-like parenthesised expression. + */ + kshchar = '\0'; if (isset(KSHGLOB) && *patparse && patparse[1] == Inpar) { - if (strchr("?*+!@", (char)*patparse)) - *kshcharp = STOUC(*patparse); + if (strchr("?*+!@", *patparse)) + kshchar = STOUC(*patparse); else if (*patparse == Star || *patparse == Quest) - *kshcharp = STOUC(ztokens[*patparse - Pound]); + kshchar = STOUC(ztokens[*patparse - Pound]); } - if (patparse > str0) { - /* - * This is up here instead of at the end to simplify the - * kshglob bracket testing. Note patparse doesn't - * get incremented till afterwards. - */ - if (ISENDCHAR(patparse) || *kshcharp || *patparse == Quest || - *patparse == Star || *patparse == Inbrack || - (*patparse == Inpar && !isset(SHGLOB)) || - *patparse == Inang || - (isset(EXTENDEDGLOB) && *patparse == Pound)) - break; - else { - METAINC(patparse); - continue; - } - } + /* + * End of string (or no string at all) if ksh-type parentheses, + * or special character, unless that character is a tilde and + * the character following is an end-of-segment character. Thus + * tildes are not special if there is nothing following to + * be excluded. + */ + if (kshchar || (memchr(patendstr, *patparse, patendstrlen) && + (*patparse != Tilde || + patparse[1] == '/' || + !memchr(patendseg, patparse[1], patendseglen)))) + break; + + METAINC(patparse); + } - if (*kshcharp) + if (patparse > str0) { + /* Ordinary string: cancel kshchar lookahead */ + kshchar = '\0'; + /* + * Assume it matches a simple string until we find otherwise. + */ + flags |= P_PURESTR; + DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece."); + /* more than one character matched? */ + len = str0 + (*str0 == Meta ? 2 : 1) < patparse; + /* + * If we have more than one character, a following hash only + * applies to the last, so decrement. + */ + if (isset(EXTENDEDGLOB) && *patparse == Pound && len) + patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; + /* + * If len is 1, we can't have an active # following, so doesn't + * matter that we don't make X in `XX#' simple. + */ + if (!len) + flags |= P_SIMPLE; + starter = patnode(P_EXACTLY); + /* add enough space including null byte */ + len = patparse - str0; + patadd(str0, 0, len + 1, 0); + nptr = (char *)P_OPERAND((Upat)patout + starter); + nptr[len] = '\0'; + /* + * It's much simpler to turn off pure string mode for + * any case-insensitive or approximate matching; usually, + * that is correct, or they wouldn't have been turned on. + * However, we need to make sure we match a "." or ".." + * in a file name as a pure string. There's a minor bug + * that this will also apply to something like + * ..(#a1).. (i.e. the (#a1) has no effect), but if you're + * going to write funny patterns, you get no sympathy from me. + */ + if (patglobflags && + (!(patflags & PAT_FILE) || (strcmp(nptr, ".") && + strcmp(nptr, "..")))) + flags &= ~P_PURESTR; + for (; *nptr; METAINC(nptr)) + if (itok(*nptr)) + *nptr = ztokens[*nptr - Pound]; + } else { + if (kshchar) patparse++; patch = *patparse; METAINC(patparse); switch(patch) { case Quest: - *flagp |= P_SIMPLE; - return patnode(P_ANY); + flags |= P_SIMPLE; + starter = patnode(P_ANY); break; case Star: /* kshchar is used as a sign that we can't have #'s. */ - *kshcharp = -1; - return patnode(P_STAR); + kshchar = -1; + starter = patnode(P_STAR); break; case Inbrack: - *flagp |= P_SIMPLE; + flags |= P_SIMPLE; if (*patparse == Hat || *patparse == '^' || *patparse == '!') { patparse++; starter = patnode(P_ANYBUT); @@ -919,15 +955,14 @@ patcompatom(int *flagp, int *kshcharp) patparse++; /* terminate null string and fix alignment */ patadd(NULL, 0, 1, 0); - return starter; break; case Inpar: /* is this how to treat parentheses in SHGLOB? */ - if (isset(SHGLOB) && !*kshcharp) + if (isset(SHGLOB) && !kshchar) return 0; - if (*kshcharp == '!') { + if (kshchar == '!') { /* This is nasty, we should really either handle all - * kshglobbing upstairs or down here. But most of the + * kshglobbing below or here. But most of the * others look like non-ksh patterns, while this one * doesn't, so we handle it here and leave the rest. * We treat it like an extendedglob ^, except that @@ -939,12 +974,11 @@ patcompatom(int *flagp, int *kshcharp) * the expense of allowing the user to do things * they shouldn't. */ - if (!(starter = patcompnot(1, &flags))) + if (!(starter = patcompnot(1, &flags2))) return 0; - } else if (!(starter = patcompswitch(1, &flags))) + } else if (!(starter = patcompswitch(1, &flags2))) return 0; - *flagp |= flags & P_HSTART; - return starter; + flags |= flags2 & P_HSTART; break; case Inang: /* Numeric glob */ @@ -990,68 +1024,101 @@ patcompatom(int *flagp, int *kshcharp) * Mention in manual that matching digits with [...] * is more efficient. */ - return starter; break; case Pound: - if (!isset(EXTENDEDGLOB)) - break; + DPUTS(!isset(EXTENDEDGLOB), "BUG: # not treated as string"); + /* + * A hash here is an error; it should follow something + * repeatable. + */ return 0; break; #ifdef DEBUG - case Bar: - case Outpar: - case '\0': - dputs("BUG: wrong character in patcompatom."); + default: + dputs("BUG: character not handled in patcomppiece"); return 0; break; #endif } } - /* Simple string: cancel kshchar lookahead */ - *kshcharp = '\0'; - /* - * Assume it matches a simple string until we find otherwise. - */ - *flagp |= P_PURESTR; - DPUTS(patparse == str0, "BUG: matched nothing in patcompatom."); - /* more than one character matched? */ - len = str0 + (*str0 == Meta ? 2 : 1) < patparse; + if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) && + (kshchar <= 0 || kshchar == '@' || kshchar == '!')) { + *flagp = flags; + return starter; + } + + /* too much at once doesn't currently work */ + if (kshchar && pound) + return 0; + + if (kshchar == '*') { + op = P_ONEHASH; + *flagp = P_HSTART; + } else if (kshchar == '+') { + op = P_TWOHASH; + *flagp = P_HSTART; + } else if (kshchar == '?') { + op = 0; + *flagp = 0; + } else if (*++patparse == Pound) { + op = P_TWOHASH; + patparse++; + *flagp = P_HSTART; + } else { + op = P_ONEHASH; + *flagp = P_HSTART; + } + /* - * Ordinary string of characters. - * If we have more than one character, a following hash only - * applies to the last, so decrement. - */ - if (isset(EXTENDEDGLOB) && *patparse == Pound && len) - patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; - /* if len is 1, we can't have an active # following, so doesn't - * matter that we don't make X in `XX#' simple. + * Note optimizations with pointers into P_NOTHING branches: some + * should logically point to next node after current piece. + * + * Backtracking is also encoded in a slightly obscure way: the + * code emitted ensures we test the non-empty branch of complex + * patterns before the empty branch on each repetition. Hence + * each time we fail on a non-empty branch, we try the empty branch, + * which is equivalent to backtracking. */ - if (!len) - *flagp |= P_SIMPLE; - starter = patnode(P_EXACTLY); - while (str0 < patparse) { - if (*str0 == Meta) { - cbuf[0] = *str0++; - cbuf[1] = *str0++; - } else { - cbuf[0] = itok(*str0) ? ztokens[*str0 - Pound] : *str0; - str0++; - } - ch = UNMETA(cbuf); - /* - * HACK: this cause a string consisting of any number of - * dots in files to be matched exactly, even with approximation. - * We just want to limit it to the first two. + if ((flags & P_SIMPLE) && op == P_ONEHASH && + P_OP((Upat)patout+starter) == P_ANY) { + /* Optimize ?# to *. Silly thing to do, since who would use + * use ?# ? But it makes the later code shorter. */ - if (((patglobflags & C_IGNCASE) && (islower(ch) || isupper(ch))) || - ((patglobflags & C_LCMATCHUC) && islower(ch)) || - ((patglobflags & 0xff) && !((patflags & PAT_FILE) && ch == '.'))) - *flagp &= ~P_PURESTR; - patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); + Upat uptr = (Upat)patout + starter; + uptr->l = (uptr->l & ~0xff) | P_STAR; + } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) { + /* Don't simplify if we need to look for approximations. */ + patinsert(op, starter, NULL, 0); + } else if (op == P_ONEHASH) { + /* Emit x# as (x&|), where & means "self". */ + up.p = NULL; + patinsert(P_WBRANCH, starter, (char *)&up, sizeof(up)); + /* Either x */ + patoptail(starter, patnode(P_BACK)); /* and loop */ + patoptail(starter, starter); /* back */ + pattail(starter, patnode(P_BRANCH)); /* or */ + pattail(starter, patnode(P_NOTHING)); /* null. */ + } else if (op == P_TWOHASH) { + /* Emit x## as x(&|) where & means "self". */ + next = patnode(P_WBRANCH); /* Either */ + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); + pattail(starter, next); + pattail(patnode(P_BACK), starter); /* loop back */ + pattail(next, patnode(P_BRANCH)); /* or */ + pattail(starter, patnode(P_NOTHING)); /* null. */ + } else if (kshchar == '?') { + /* Emit ?(x) as (x|) */ + patinsert(P_BRANCH, starter, NULL, 0); /* Either x */ + pattail(starter, patnode(P_BRANCH)); /* or */ + next = patnode(P_NOTHING); /* null */ + pattail(starter, next); + patoptail(starter, next); } - /* null terminate and fix alignment */ - patadd(NULL, 0, 1, 0); + if (*patparse == Pound) + return 0; + return starter; } @@ -1064,7 +1131,7 @@ patcompatom(int *flagp, int *kshcharp) static long patcompnot(int paren, int *flagsp) { - unsigned char *unull = NULL; + union upat up; long excsync, br, excl, n, starter; int dummy; @@ -1076,7 +1143,8 @@ patcompnot(int paren, int *flagsp) excsync = patnode(P_EXCSYNC); pattail(br, excsync); pattail(starter, excl = patnode(P_EXCLUDE)); - patadd((char *)&unull, 0, sizeof(unull), 0); + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy)))) return 0; pattail(br, patnode(P_EXCEND)); @@ -1093,9 +1161,11 @@ patcompnot(int paren, int *flagsp) static long patnode(long op) { - long starter = patcode - patout; + long starter = (Upat)patcode - (Upat)patout; + union upat up; - patadd((char *)&op, 0, sizeof(long), 0); + up.l = op; + patadd((char *)&up, 0, sizeof(union upat), 0); return starter; } @@ -1109,22 +1179,22 @@ static void patinsert(long op, int opnd, char *xtra, int sz) { char *src, *dst, *opdst; - long buf, *lptr; + union upat buf, *lptr; - buf = 0; - patadd((char *)&buf, 0, sizeof(long), 0); + buf.l = 0; + patadd((char *)&buf, 0, sizeof(buf), 0); if (sz) patadd(xtra, 0, sz, 0); - src = patcode - sizeof(long) - sz; + src = patcode - sizeof(union upat) - sz; dst = patcode; - opdst = patout + opnd; + opdst = patout + opnd * sizeof(union upat); while (src > opdst) *--dst = *--src; /* A cast can't be an lvalue */ - lptr = (long *)opdst; - *lptr = op; - opdst += sizeof(long); + lptr = (Upat)opdst; + lptr->l = op; + opdst += sizeof(union upat); while (sz--) *opdst++ = *xtra++; } @@ -1135,10 +1205,10 @@ patinsert(long op, int opnd, char *xtra, int sz) static void pattail(long p, long val) { - char *scan, *temp; - long offset, *lptr; + Upat scan, temp; + long offset; - scan = patout + p; + scan = (Upat)patout + p; for (;;) { if (!(temp = PATNEXT(scan))) break; @@ -1146,10 +1216,9 @@ pattail(long p, long val) } offset = (P_OP(scan) == P_BACK) - ? (scan - patout) - val : val - (scan - patout); + ? (scan - (Upat)patout) - val : val - (scan - (Upat)patout); - lptr = (long *)scan; - *lptr |= offset << 8; + scan->l |= offset << 8; } /* do pattail, but on operand of first argument; nop if operandless */ @@ -1157,14 +1226,14 @@ pattail(long p, long val) /**/ static void patoptail(long p, long val) { - char *ptr = patout + p; + Upat ptr = (Upat)patout + p; int op = P_OP(ptr); if (!p || !P_ISBRANCH(ptr)) return; if (op == P_BRANCH) pattail(P_OPERAND(p), val); else - pattail(P_OPERAND(p) + sizeof(char *), val); + pattail(P_OPERAND(p) + 1, val); } @@ -1178,11 +1247,20 @@ char *patinput; /* String input pointer */ /* Length of input string, plus null byte, if needed */ static int patinlen; -#ifdef BACKREFERENCES -static char **patstartp; /* Pointer to backref starts */ -static char **patendp; /* Pointer to backref ends */ -static int parsfound; /* parentheses found */ -#endif + +/* + * Offset of string at which we are trying to match. + * This is added in to the positions recorded in patbeginp and patendp + * when we are looking for substrings. Currently this only happens + * in the parameter substitution code. + */ +/**/ +int patoffset; + +static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */ +static char *patendp[NSUBEXP]; /* Pointer to backref ends */ +static int parsfound; /* parentheses (with backrefs) found */ + static int globdots; /* Glob initial dots? */ /* @@ -1204,15 +1282,28 @@ pattrystart(void) } /**/ -int +mod_export int pattry(Patprog prog, char *string) { -#ifdef BACKREFERENCES - int i; + return pattryrefs(prog, string, NULL, NULL, NULL); +} + +/* The last three arguments are used to report the positions for the + * backreferences. On entry, *nump should contain the maximum number + * positions to report. */ + +/**/ +mod_export int +pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) +{ + int i, maxnpos = 0; char **sp, **ep; -#endif char *progstr = (char *)prog + prog->startoff; + if (nump) { + maxnpos = *nump; + *nump = 0; + } /* inherited from domatch, but why, exactly? */ if (*string == Nularg) string++; @@ -1248,40 +1339,111 @@ pattry(Patprog prog, char *string) errsfound = 0; } globdots = !(patflags & PAT_NOGLD); -#ifdef BACKREFERENCES parsfound = 0; - if (patflags & PAT_BACKR) { - patstartp = prog->ppStartp; - patendp = prog->ppEndp; - } else { - patstartp = patendp = NULL; - } -#endif - if (patmatch(progstr)) { -#ifdef BACKREFERENCES - if (patflags & PAT_BACKR) { - prog->ppStartp[0] = string; - prog->ppEndp[0] = patinput; - - sp = patstartp+1; - ep = patendp + 1; - for (i = 1; i < NSUBEXP; i++) { - if (!(parsfound & (1 << (i - 1)))) - *sp = 0; - if (!(parsfound & (1 << (i + 15)))) - *ep = 0; - sp++; - ep++; - } - - } -#endif + if (patmatch((Upat)progstr)) { /* * we were lazy and didn't save the globflags if an exclusion * failed, so set it now */ patglobflags = prog->globend; + /* + * Should we clear backreferences and matches on a failed + * match? + */ + if ((patglobflags & GF_MATCHREF) && !(patflags & PAT_FILE)) { + /* + * m flag: for global match. This carries no overhead + * in the pattern matching part. + */ + char *str; + int mlen = ztrsub(patinput, patinstart); + + str = ztrduppfx(patinstart, patinput - patinstart); + setsparam("MATCH", str); + setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS))); + setiparam("MEND", + (zlong)(mlen + patoffset + !isset(KSHARRAYS) - 1)); + } + if (prog->patnpar && nump) { + /* + * b flag: for backreferences using parentheses. Reported + * directly. + */ + *nump = prog->patnpar; + + sp = patbeginp; + ep = patendp; + + for (i = 0; i < prog->patnpar && i < maxnpos; i++) { + if (parsfound & (1 << i)) { + if (begp) + *begp++ = ztrsub(*sp, patinstart) + patoffset; + if (endp) + *endp++ = ztrsub(*ep, patinstart) + patoffset - 1; + } else { + if (begp) + *begp++ = -1; + if (endp) + *endp++ = -1; + } + + sp++; + ep++; + } + } else if (prog->patnpar && !(patflags & PAT_FILE)) { + /* + * b flag: for backreferences using parentheses. + */ + int palen = prog->patnpar+1; + char **matcharr, **mbeginarr, **mendarr; + char numbuf[DIGBUFSIZE]; + + matcharr = zcalloc(palen*sizeof(char *)); + mbeginarr = zcalloc(palen*sizeof(char *)); + mendarr = zcalloc(palen*sizeof(char *)); + + sp = patbeginp; + ep = patendp; + + for (i = 0; i < prog->patnpar; i++) { + if (parsfound & (1 << i)) { + matcharr[i] = ztrduppfx(*sp, *ep - *sp); + /* + * mbegin and mend give indexes into the string + * in the standard notation, i.e. respecting + * KSHARRAYS, and with the end index giving + * the last character, not one beyond. + * For example, foo=foo; [[ $foo = (f)oo ]] gives + * (without KSHARRAYS) indexes 1 and 1, which + * corresponds to indexing as ${foo[1,1]}. + */ + sprintf(numbuf, "%ld", + (long)(ztrsub(*sp, patinstart) + + patoffset + + !isset(KSHARRAYS))); + mbeginarr[i] = ztrdup(numbuf); + sprintf(numbuf, "%ld", + (long)(ztrsub(*ep, patinstart) + + patoffset + + !isset(KSHARRAYS) - 1)); + mendarr[i] = ztrdup(numbuf); + } else { + /* Pattern wasn't set: either it was in an + * unmatched branch, or a hashed parenthesis + * that didn't match at all. + */ + matcharr[i] = ztrdup(""); + mbeginarr[i] = ztrdup("-1"); + mendarr[i] = ztrdup("-1"); + } + sp++; + ep++; + } + setaparam("match", matcharr); + setaparam("mbegin", mbeginarr); + setaparam("mend", mendarr); + } return 1; } else return 0; @@ -1293,10 +1455,10 @@ pattry(Patprog prog, char *string) * comes from the input string, the second the current pattern. */ #define CHARMATCH(chin, chpa) (chin == chpa || \ - ((patglobflags & C_IGNCASE) ? \ + ((patglobflags & GF_IGNCASE) ? \ ((isupper(chin) ? tolower(chin) : chin) == \ (isupper(chpa) ? tolower(chpa) : chpa)) : \ - (patglobflags & C_LCMATCHUC) ? \ + (patglobflags & GF_LCMATCHUC) ? \ (islower(chpa) && toupper(chpa) == chin) : 0)) /* @@ -1314,11 +1476,12 @@ static char *exactpos; */ /**/ -int -patmatch(char *prog) +static int +patmatch(Upat prog) { /* Current and next nodes */ - char *scan = prog, *next, *opnd, *save, *start; + Upat scan = prog, next, opnd; + char *start, *save, *chrop; int savglobflags, op, no, min, nextch, fail = 0, saverrsfound; zrange_t from, to, comp; @@ -1338,36 +1501,38 @@ patmatch(char *prog) break; case P_EXACTLY: /* - * acts as nothing if *opnd is null: this is used by + * acts as nothing if *chrop is null: this is used by * approx code. */ - opnd = exactpos ? exactpos : P_OPERAND(scan); + chrop = exactpos ? exactpos : (char *)P_OPERAND(scan); exactpos = NULL; - while (*opnd && *patinput) { + while (*chrop && *patinput) { int chin = STOUC(UNMETA(patinput)); - int chpa = STOUC(UNMETA(opnd)); + int chpa = STOUC(UNMETA(chrop)); if (!CHARMATCH(chin, chpa)) { fail = 1; break; } - METAINC(opnd); + METAINC(chrop); METAINC(patinput); } - if (*opnd) { - exactpos = opnd; + if (*chrop) { + exactpos = chrop; fail = 1; } break; case P_ANYOF: if (!*patinput || - !patmatchrange(P_OPERAND(scan), STOUC(UNMETA(patinput)))) + !patmatchrange((char *)P_OPERAND(scan), + STOUC(UNMETA(patinput)))) fail = 1; else METAINC(patinput); break; case P_ANYBUT: if (!*patinput || - patmatchrange(P_OPERAND(scan), STOUC(UNMETA(patinput)))) + patmatchrange((char *)P_OPERAND(scan), + STOUC(UNMETA(patinput)))) fail = 1; else METAINC(patinput); @@ -1384,12 +1549,12 @@ patmatch(char *prog) * the first attempt. */ op = P_OP(scan); - start = P_OPERAND(scan); + start = (char *)P_OPERAND(scan); from = to = 0; if (op != P_NUMTO) { #ifdef ZSH_64_BIT_TYPE /* We can't rely on pointer alignment being good enough. */ - memcpy((char *)&from, (char *)start, sizeof(zrange_t)); + memcpy((char *)&from, start, sizeof(zrange_t)); #else from = *((zrange_t *) start); #endif @@ -1414,7 +1579,8 @@ patmatch(char *prog) return 1; } if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*P_OPERAND(next))) && !(patglobflags & 0xff)) + !idigit(STOUC(*(char *)P_OPERAND(next))) && + !(patglobflags & 0xff)) return 0; patinput = --save; no++; @@ -1434,7 +1600,8 @@ patmatch(char *prog) if (patmatch(next)) return 1; if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*P_OPERAND(next))) && !(patglobflags & 0xff)) + !idigit(STOUC(*(char *)P_OPERAND(next))) && + !(patglobflags & 0xff)) return 0; patinput = --save; no++; @@ -1447,9 +1614,8 @@ patmatch(char *prog) case P_BACK: break; case P_GFLAGS: - patglobflags = *(int *)P_OPERAND(scan); + patglobflags = P_OPERAND(scan)->l; break; -#ifdef BACKREFERENCES case P_OPEN: case P_OPEN+1: case P_OPEN+2: @@ -1464,13 +1630,12 @@ patmatch(char *prog) save = patinput; if (patmatch(next)) { - DPUTS(!patstartp, "patstartp not set for backreferencing"); /* - * Don't set ppStartp if some later invocation of + * Don't set patbeginp if some later invocation of * the same parentheses already has. */ if (no && !(parsfound & (1 << (no - 1)))) { - patstartp[no] = save; + patbeginp[no-1] = save; parsfound |= 1 << (no - 1); } return 1; @@ -1493,25 +1658,24 @@ patmatch(char *prog) if (patmatch(next)) { DPUTS(!patendp, "patendp not set for backreferencing"); if (no && !(parsfound & (1 << (no + 15)))) { - patendp[no] = save; + patendp[no-1] = save; parsfound |= 1 << (no + 15); } return 1; } else return 0; break; -#endif case P_EXCSYNC: - /* See the P_EXCLUDE code below for where syncstrp comes from */ + /* See the P_EXCLUDE code below for where syncptr comes from */ { - unsigned char **syncstrp, *syncptr; - char *after; + unsigned char *syncptr; + Upat after; after = P_OPERAND(scan); DPUTS(!P_ISEXCLUDE(after), "BUG: EXCSYNC not followed by EXCLUDE."); - syncstrp = (unsigned char **)P_OPERAND(after); - DPUTS(!*syncstrp, "BUG: EXCSYNC not handled by EXCLUDE"); - syncptr = *syncstrp + (patinput - patinstart); + DPUTS(!P_OPERAND(after)->p, + "BUG: EXCSYNC not handled by EXCLUDE"); + syncptr = P_OPERAND(after)->p + (patinput - patinstart); /* * If we already matched from here, this time we fail. * See WBRANCH code for story about error count. @@ -1570,15 +1734,14 @@ patmatch(char *prog) * The pointer also tells us where the asserted * pattern matched for use by the exclusion. */ - unsigned char **syncstrp, *oldsyncstr; + Upat syncstrp; + unsigned char *oldsyncstr; char *matchpt = NULL; int ret, savglobdots, matchederrs = 0; -#ifdef BACKREFERENCES int savparsfound = parsfound; -#endif DPUTS(P_OP(scan) == P_WBRANCH, "BUG: excluded WBRANCH"); - syncstrp = (unsigned char **)P_OPERAND(next); + syncstrp = P_OPERAND(next); /* * Unlike WBRANCH, each test at the same exclude * sync point (due to an external loop) is separate, @@ -1586,14 +1749,15 @@ patmatch(char *prog) * (foo~bar)(foo~bar)... from the exclusion point * of view, so we use a different sync string. */ - oldsyncstr = *syncstrp; + oldsyncstr = syncstrp->p; if (!patinlen) patinlen = strlen(patinstart)+1; - *syncstrp = (unsigned char *)zcalloc(patinlen); + syncstrp->p = (unsigned char *)zcalloc(patinlen); while ((ret = patmatch(P_OPERAND(scan)))) { unsigned char *syncpt; char savchar, *testptr; - int savforce = forceerrs; + char *savpatinstart = patinstart; + int savforce = forceerrs, savpatinlen = patinlen; forceerrs = -1; savglobdots = globdots; matchederrs = errsfound; @@ -1607,9 +1771,9 @@ patmatch(char *prog) * possibilities for approximation have been * checked.) */ - for (syncpt = *syncstrp; !*syncpt; syncpt++) + for (syncpt = syncstrp->p; !*syncpt; syncpt++) ; - testptr = patinstart + (syncpt - *syncstrp); + testptr = patinstart + (syncpt - syncstrp->p); DPUTS(testptr > matchpt, "BUG: EXCSYNC failed"); savchar = *testptr; *testptr = '\0'; @@ -1625,7 +1789,7 @@ patmatch(char *prog) */ patglobflags &= ~0xff; errsfound = 0; - opnd = P_OPERAND(next) + sizeof(char *); + opnd = P_OPERAND(next) + 1; if (P_OP(next) == P_EXCLUDP && pathpos) { /* * top level exclusion with a file, @@ -1638,21 +1802,23 @@ patmatch(char *prog) zalloc(pathpos + patinlen); strcpy(buf, pathbuf); strcpy(buf + pathpos, patinput); - patinput = buf; + patinput = patinstart = buf; + patinlen += pathpos; } if (patmatch(opnd)) { ret = 0; -#ifdef BACKREFERENCES /* * Another subtlety: if we exclude the * match, any parentheses just found * become invalidated. */ parsfound = savparsfound; -#endif } - if (buf) + if (buf) { + patinstart = savpatinstart; + patinlen = savpatinlen; zfree(buf, pathpos + patinlen); + } if (!ret) break; next = PATNEXT(next); @@ -1666,8 +1832,8 @@ patmatch(char *prog) patglobflags = savglobflags; errsfound = saverrsfound; } - zfree((char *)*syncstrp, patinlen); - *syncstrp = oldsyncstr; + zfree((char *)syncstrp->p, patinlen); + syncstrp->p = oldsyncstr; if (ret) { patinput = matchpt; errsfound = matchederrs; @@ -1678,7 +1844,8 @@ patmatch(char *prog) ; } else { int ret = 1, pfree = 0; - unsigned char **ptrp = NULL, *ptr; + Upat ptrp = NULL; + unsigned char *ptr; if (P_OP(scan) == P_WBRANCH) { /* * This is where we make sure that we are not @@ -1696,15 +1863,14 @@ patmatch(char *prog) * there is already a 1, then the test fails. */ opnd = P_OPERAND(scan); - ptrp = (unsigned char **)opnd; - opnd += sizeof(unsigned char *); - if (!*ptrp) { + ptrp = opnd++; + if (!ptrp->p) { if (!patinlen) patinlen = strlen((char *)patinstart)+1; - *ptrp = (unsigned char *)zcalloc(patinlen); + ptrp->p = (unsigned char *)zcalloc(patinlen); pfree = 1; } - ptr = *ptrp + (patinput - patinstart); + ptr = ptrp->p + (patinput - patinstart); /* * Without approximation, this is just a @@ -1712,7 +1878,7 @@ patmatch(char *prog) * need to know how many errors there were * last time we made the test. If errsfound * is now smaller than it was, hence we can - * maker more approximations in the remaining + * make more approximations in the remaining * code, we continue with the test. * (This is why the max number of errors is * 254, not 255.) @@ -1725,8 +1891,8 @@ patmatch(char *prog) if (ret) ret = patmatch(opnd); if (pfree) { - zfree((char *)*ptrp, patinlen); - *ptrp = NULL; + zfree((char *)ptrp->p, patinlen); + ptrp->p = NULL; } if (ret) return 1; @@ -1769,12 +1935,38 @@ patmatch(char *prog) return 0; no = patrepeat(P_OPERAND(scan)); } - /* Lookahead to avoid useless matches */ - if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) - nextch = STOUC(UNMETA(P_OPERAND(next))); - else - nextch = -1; min = (op == P_TWOHASH) ? 1 : 0; + /* + * Lookahead to avoid useless matches. This is not possible + * with approximation. + */ + if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) { + char *nextop = (char *)P_OPERAND(next); + /* + * If that P_EXACTLY is last (common in simple patterns, + * such as *.c), then it can be only be matched at one + * point in the test string, so record that. + */ + if (P_OP(PATNEXT(next)) == P_END && + !(patflags & PAT_NOANCH)) { + int ptlen = strlen(patinput); + int oplen = strlen(nextop); + /* Are we in the right range? */ + if (oplen > strlen(min ? METANEXT(start) : start) || + oplen < ptlen) + return 0; + /* Yes, just position appropriately and test. */ + patinput += ptlen - oplen; + if (patinput > start && patinput[-1] == Meta) { + /* doesn't align properly, no go */ + return 0; + } + /* Continue loop with P_EXACTLY test. */ + break; + } + nextch = STOUC(UNMETA(nextop)); + } else + nextch = -1; save = patinput; savglobflags = patglobflags; saverrsfound = errsfound; @@ -2010,13 +2202,13 @@ patmatchrange(char *range, int ch) /* repeatedly match something simple and say how many times */ /**/ -static int patrepeat(char *p) +static int patrepeat(Upat p) { int count = 0, tch, inch; char *scan, *opnd; scan = patinput; - opnd = P_OPERAND(p); + opnd = (char *)P_OPERAND(p); switch(P_OP(p)) { #ifdef DEBUG @@ -2056,6 +2248,16 @@ static int patrepeat(char *p) return count; } +/* Free a patprog. */ + +/**/ +mod_export void +freepatprog(Patprog prog) +{ + if (prog && prog != dummy_patprog1 && prog != dummy_patprog2) + zfree(prog, prog->size); +} + /**/ #ifdef ZSH_PAT_DEBUG @@ -2067,7 +2269,8 @@ static int patrepeat(char *p) static void patdump(Patprog r) { - char *s, *next, *codestart, *base, op = P_EXACTLY; + char *s, *base, op = P_EXACTLY; + Upat up, codestart, next; base = (char *)r; s = base + r->startoff; @@ -2075,13 +2278,14 @@ patdump(Patprog r) if (r->flags & PAT_PURES) { printf("STRING:%s\n", (char *)s); } else { - codestart = s; + codestart = (Upat)s; while (op != P_END) { - op = P_OP(s); - printf("%2d%s", s-codestart, patprop(s)); - next = PATNEXT(s); - printf("(%d)", next ? (s-codestart)+(next-s) : 0); - s += sizeof(zalign_t); + up = (Upat)s; + op = P_OP(up); + printf("%2d%s", up-codestart, patprop(up)); + next = PATNEXT(up); + printf("(%d)", next ? next-codestart : 0); + s += sizeof(union upat); if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) { while (*s != '\0') { if (itok(*s)) { @@ -2107,16 +2311,15 @@ patdump(Patprog r) s += sizeof(zrange_t); } } else if (op == P_GFLAGS) { - long *lptr = (long *)s; - printf("%ld, %ld", *lptr & ~0xff, *lptr & 0xff); - s += sizeof(long); + printf("%ld, %ld", (++up)->l & ~0xff, (++up)->l & 0xff); + s += sizeof(union upat); } else if (op == P_WBRANCH || op == P_EXCLUDE || op == P_EXCLUDP) { - s += sizeof(char *); + s += sizeof(union upat); } putchar('\n'); - s = base + (((s - base) + sizeof(zalign_t) - 1) & - ~(sizeof(zalign_t) - 1)); + s = base + (((s - base) + sizeof(union upat) - 1) & + ~(sizeof(union upat) - 1)); } } @@ -2125,18 +2328,16 @@ patdump(Patprog r) printf("start `%c' ", r->patstartch); if (!(r->flags & PAT_NOANCH)) printf("EOL-anchor "); -#ifdef BACKREFERENCES - if (r->flags & PAT_BACKR) - printf("backreferences "); -#endif + if (r->patnpar) + printf("%d active backreferences ", r->patnpar); if (r->mustoff) printf("must have \"%s\"", (char *)r + r->mustoff); printf("\n"); if (r->globflags) { printf("Globbing flags: "); - if (r->globflags & C_LCMATCHUC) + if (r->globflags & GF_LCMATCHUC) printf("LC matches UC "); - if (r->globflags & C_IGNCASE) + if (r->globflags & GF_IGNCASE) printf("Ignore case"); printf("\n"); if (r->globflags & 0xff) @@ -2146,7 +2347,7 @@ patdump(Patprog r) /**/ static char * -patprop(char *op) +patprop(Upat op) { char *p = NULL; static char buf[50]; @@ -2258,16 +2459,11 @@ int bin_patdebug(char *name, char **args, char *ops, int func) { Patprog prog; - int ret = 0, flags; + int ret = 0; tokenize(*args); -#ifdef BACKREFERENCES - flags = ops['b'] ? PAT_BACKR : 0; -#else - flags = 0; -#endif - if (!(prog = patcompile((char *)*args, flags, 0))) + if (!(prog = patcompile((char *)*args, 0, 0))) return 1; if (ops['p'] || !args[1]) { patdump(prog); |