From 13862569077a80821c2272e9e484ad6a36010846 Mon Sep 17 00:00:00 2001 From: Tanaka Akira Date: Tue, 14 Sep 1999 14:54:09 +0000 Subject: zsh-workers/7825 --- Src/pattern.c | 342 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 204 insertions(+), 138 deletions(-) (limited to 'Src/pattern.c') diff --git a/Src/pattern.c b/Src/pattern.c index 832d8fda0..e5c0a0cb3 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -70,11 +70,8 @@ typedef union upat *Upat; #include "pattern.pro" -/* - * Globbing flags: lower 8 bits gives approx count - */ -#define C_LCMATCHUC 0x0100 -#define C_IGNCASE 0x0200 +/* Number of active parenthesised expressions allowed in backreferencing */ +#define NSUBEXP 9 /* definition number opnd? meaning */ #define P_END 0x00 /* no End of program. */ @@ -205,8 +202,8 @@ typedef unsigned long zrange_t; * Characters which terminate a pattern segment. We actually use * a pointer patendseg which skips the first character if we are not * parsing a file pattern. - * Note that the size of this and the next array are hard-wired into - * patcompile. + * Note that the size of this and the next array are hard-wired + * via the definitions. */ static char endseg[] = { @@ -215,6 +212,9 @@ static char endseg[] = { Tilde /* extended glob only */ }; +#define PATENDSEGLEN_NORM 4 +#define PATENDSEGLEN_EXT 5 + /* Characters which terminate a simple string */ static char endstr[] = { @@ -224,6 +224,10 @@ static char endstr[] = { Tilde, Hat, Pound /* extended glob only */ }; +#define PATENDSTRLEN_NORM 9 +#define PATENDSTRLEN_EXT 12 + + /* Default size for pattern buffer */ #define P_DEF_ALLOC 256 @@ -291,18 +295,13 @@ patcompstart(void) Patprog patcompile(char *exp, int inflags, char **endexp) { - int flags, len; + int flags = 0, len = 0; long startoff; Upat pscan; - char *lng; + char *lng, *strp = NULL; Patprog p; -#ifdef BACKREFERENCES - startoff = (inflags & PAT_BACKR) ? sizeof(struct patprog) : - sizeof(struct patprog_short); -#else startoff = sizeof(struct patprog); -#endif /* Ensure alignment of start of program string */ startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1); @@ -312,13 +311,17 @@ patcompile(char *exp, int inflags, char **endexp) patcode = patout + startoff; patsize = patcode - patout; patstart = patparse = exp; + /* + * Note global patnpar numbers parentheses 1..9, while patnpar + * in struct is actual count of parentheses. + */ patnpar = 1; - patflags = inflags; + patflags = inflags & ~PAT_PURES; patendseg = endseg; - patendseglen = isset(EXTENDEDGLOB) ? 5 : 4; + patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM; patendstr = endstr; - patendstrlen = isset(EXTENDEDGLOB) ? 12 : 9; + patendstrlen = isset(EXTENDEDGLOB) ? PATENDSTRLEN_EXT : PATENDSTRLEN_NORM; if (!(patflags & PAT_FILE)) { patendseg++; @@ -333,66 +336,87 @@ patcompile(char *exp, int inflags, char **endexp) */ ((Patprog)patout)->globflags = patglobflags; - if (patflags & PAT_ANY) - flags = 0; - else if (patcompswitch(0, &flags) == 0) - return NULL; + if (!(patflags & PAT_ANY)) { + /* Look for a really pure string, with no tokens at all. */ + for (strp = exp; *strp && + (!(patflags & PAT_FILE) || *strp != '/') && !itok(*strp); + strp++) + ; + if (*strp && *strp != '/') { + /* No, do normal compilation. */ + strp = NULL; + if (patcompswitch(0, &flags) == 0) + return NULL; + } else { + /* Yes, copy the string and skip compilation altogether */ + patparse = strp; + len = strp - exp; + patadd(exp, 0, len + 1, 0); + patout[startoff + len] = '\0'; + patflags |= PAT_PURES; + } + } /* end of compilation: safe to use pointers */ p = (Patprog)patout; p->startoff = startoff; p->patstartch = '\0'; p->globend = patglobflags; - p->flags = (patflags & ~PAT_PURES); + p->flags = patflags; p->mustoff = 0; p->size = patsize; - p->patmlen = 0; - pscan = (Upat)(patout + startoff); + p->patmlen = len; + p->patnpar = patnpar-1; - if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { - /* only one top level choice */ - pscan = P_OPERAND(pscan); + if (!strp) { + pscan = (Upat)(patout + startoff); - if (flags & P_PURESTR) { - /* - * The pattern can be matched with a simple strncmp/strcmp. - * Careful in case we've overwritten the node for the next ptr. - */ - char *dst = patout + startoff; - Upat next; - p->flags |= PAT_PURES; - for (; pscan; pscan = next) { - next = PATNEXT(pscan); - if (P_OP(pscan) == P_EXACTLY) { - char *opnd = (char *)P_OPERAND(pscan); - while ((*dst = *opnd++)) - dst++; + if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { + /* only one top level choice */ + pscan = P_OPERAND(pscan); + + if (flags & P_PURESTR) { + /* + * The pattern can be matched with a simple strncmp/strcmp. + * Careful in case we've overwritten the node for the next ptr. + */ + char *dst = patout + startoff; + Upat next; + p->flags |= PAT_PURES; + for (; pscan; pscan = next) { + next = PATNEXT(pscan); + if (P_OP(pscan) == P_EXACTLY) { + char *opnd = (char *)P_OPERAND(pscan); + while ((*dst = *opnd++)) + dst++; + } } - } - *dst++ = '\0'; - p->size = dst - patout; - /* patmlen is really strlen, don't include null byte */ - p->patmlen = p->size - startoff - 1; - } else { - /* starting point info */ - if (P_OP(pscan) == P_EXACTLY && !p->globflags) - p->patstartch = *(char *)P_OPERAND(pscan); - /* Find the longest literal string in something expensive. - * This is itself not all that cheap if we have case-insensitive - * matching or approximation, so don't. - */ - if ((flags & P_HSTART) && !p->globflags) { - lng = NULL; - len = 0; - for (; pscan; pscan = PATNEXT(pscan)) - if (P_OP(pscan) == P_EXACTLY && - strlen((char *)P_OPERAND(pscan)) >= len) { - lng = (char *)P_OPERAND(pscan); - len = strlen(lng); + *dst++ = '\0'; + p->size = dst - patout; + /* patmlen is really strlen, don't include null byte */ + p->patmlen = p->size - startoff - 1; + } else { + /* starting point info */ + if (P_OP(pscan) == P_EXACTLY && !p->globflags) + p->patstartch = *(char *)P_OPERAND(pscan); + /* + * Find the longest literal string in something expensive. + * This is itself not all that cheap if we have + * case-insensitive matching or approximation, so don't. + */ + if ((flags & P_HSTART) && !p->globflags) { + lng = NULL; + len = 0; + for (; pscan; pscan = PATNEXT(pscan)) + if (P_OP(pscan) == P_EXACTLY && + strlen((char *)P_OPERAND(pscan)) >= len) { + lng = (char *)P_OPERAND(pscan); + len = strlen(lng); + } + if (lng) { + p->mustoff = lng - patout; + p->patmlen = len; } - if (lng) { - p->mustoff = lng - patout; - p->patmlen = len; } } } @@ -424,26 +448,22 @@ static long patcompswitch(int paren, int *flagp) { long starter, br, ender, excsync = 0; -#ifdef BACKREFERENCES int parno = 0; -#endif int flags, gfchanged = 0, savglobflags = patglobflags; Upat ptr; *flagp = 0; -#ifdef BACKREFERENCES - if (paren && (patflags & PAT_BACKR)) { + if (paren && (patglobflags & GF_BACKREF) && patnpar <= NSUBEXP) { /* * parenthesized: make an open node. * We can only refer to the first nine parentheses. * For any others, we just use P_OPEN on its own; there's * no gain in arbitrarily limiting the number of parentheses. */ - parno = patnpar >= NSUBEXP ? 0 : patnpar++; + parno = patnpar++; starter = patnode(P_OPEN + parno); } else -#endif starter = 0; br = patnode(P_BRANCH); @@ -559,12 +579,7 @@ patcompswitch(int paren, int *flagp) * branch at that point would indicate the current choices continue, * which they don't. */ -#ifdef BACKREFERENCES - ender = patnode(paren ? (patflags & PAT_BACKR) ? P_CLOSE+parno - : P_NOTHING : P_END); -#else - ender = patnode(paren ? P_NOTHING : P_END); -#endif + ender = patnode(paren ? parno ? P_CLOSE+parno : P_NOTHING : P_END); pattail(starter, ender); /* @@ -708,17 +723,37 @@ patgetglobflags(char **strp) case 'l': /* Lowercase in pattern matches lower or upper in target */ - patglobflags = (patglobflags & ~C_IGNCASE) | C_LCMATCHUC; + patglobflags = (patglobflags & ~GF_IGNCASE) | GF_LCMATCHUC; break; case 'i': /* Fully case insensitive */ - patglobflags = (patglobflags & ~C_LCMATCHUC) | C_IGNCASE; + patglobflags = (patglobflags & ~GF_LCMATCHUC) | GF_IGNCASE; break; case 'I': /* Restore case sensitivity */ - patglobflags &= ~(C_LCMATCHUC|C_IGNCASE); + patglobflags &= ~(GF_LCMATCHUC|GF_IGNCASE); + break; + + case 'b': + /* Make backreferences */ + patglobflags |= GF_BACKREF; + break; + + case 'B': + /* Don't make backreferences */ + patglobflags &= ~GF_BACKREF; + break; + + case 'm': + /* Make references to complete match */ + patglobflags |= GF_MATCHREF; + break; + + case 'M': + /* Don't */ + patglobflags &= ~GF_MATCHREF; break; default: @@ -1204,11 +1239,20 @@ char *patinput; /* String input pointer */ /* Length of input string, plus null byte, if needed */ static int patinlen; -#ifdef BACKREFERENCES -static char **patstartp; /* Pointer to backref starts */ -static char **patendp; /* Pointer to backref ends */ -static int parsfound; /* parentheses found */ -#endif + +/* + * Offset of string at which we are trying to match. + * This is added in to the positions recorded in patbeginp and patendp + * when we are looking for substrings. Currently this only happens + * in the parameter substitution code. + */ +/**/ +int patoffset; + +static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */ +static char *patendp[NSUBEXP]; /* Pointer to backref ends */ +static int parsfound; /* parentheses (with backrefs) found */ + static int globdots; /* Glob initial dots? */ /* @@ -1233,10 +1277,8 @@ pattrystart(void) int pattry(Patprog prog, char *string) { -#ifdef BACKREFERENCES int i; char **sp, **ep; -#endif char *progstr = (char *)prog + prog->startoff; /* inherited from domatch, but why, exactly? */ @@ -1274,40 +1316,78 @@ pattry(Patprog prog, char *string) errsfound = 0; } globdots = !(patflags & PAT_NOGLD); -#ifdef BACKREFERENCES parsfound = 0; - if (patflags & PAT_BACKR) { - patstartp = prog->ppStartp; - patendp = prog->ppEndp; - } else { - patstartp = patendp = NULL; - } -#endif if (patmatch((Upat)progstr)) { -#ifdef BACKREFERENCES - if (patflags & PAT_BACKR) { - prog->ppStartp[0] = string; - prog->ppEndp[0] = patinput; - - sp = patstartp+1; - ep = patendp + 1; - for (i = 1; i < NSUBEXP; i++) { - if (!(parsfound & (1 << (i - 1)))) - *sp = 0; - if (!(parsfound & (1 << (i + 15)))) - *ep = 0; - sp++; - ep++; - } - - } -#endif /* * we were lazy and didn't save the globflags if an exclusion * failed, so set it now */ patglobflags = prog->globend; + /* + * Should we clear backreferences and matches on a failed + * match? + */ + if ((patglobflags & GF_MATCHREF) && !(patflags & PAT_FILE)) { + /* + * m flag: for global match. This carries no overhead + * in the pattern matching part. + */ + char *str; + int len = patinput - patinstart; + + PERMALLOC { + str = dupstrpfx(patinstart, len); + } LASTALLOC; + setsparam("MATCH", str); + setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS))); + setiparam("MEND", + (zlong)(len + patoffset + !isset(KSHARRAYS) - 1)); + } + if (prog->patnpar && !(patflags & PAT_FILE)) { + /* + * b flag: for backreferences using parentheses. + */ + int palen = prog->patnpar+1; + char **matcharr, **mbeginarr, **mendarr; + char numbuf[DIGBUFSIZE]; + + matcharr = zcalloc(palen*sizeof(char *)); + mbeginarr = zcalloc(palen*sizeof(char *)); + mendarr = zcalloc(palen*sizeof(char *)); + + sp = patbeginp; + ep = patendp; + + PERMALLOC { + for (i = 0; i < prog->patnpar; i++) { + DPUTS(!*sp || !*ep, "BUG: backrefs not set."); + matcharr[i] = dupstrpfx(*sp, *ep - *sp); + /* + * mbegin and mend give indexes into the string + * in the standard notation, i.e. respecting + * KSHARRAYS, and with the end index giving + * the last character, not one beyond. + * For example, foo=foo; [[ $foo = (f)oo ]] gives + * (without KSHARRAYS) indexes 1 and 1, which + * corresponds to indexing as ${foo[1,1]}. + */ + sprintf(numbuf, "%ld", + (long)((*sp - patinstart) + patoffset + + !isset(KSHARRAYS))); + mbeginarr[i] = ztrdup(numbuf); + sprintf(numbuf, "%ld", + (long)((*ep - patinstart) + patoffset + + !isset(KSHARRAYS) - 1)); + mendarr[i] = ztrdup(numbuf); + sp++; + ep++; + } + } LASTALLOC; + setaparam("match", matcharr); + setaparam("mbegin", mbeginarr); + setaparam("mend", mendarr); + } return 1; } else return 0; @@ -1319,10 +1399,10 @@ pattry(Patprog prog, char *string) * comes from the input string, the second the current pattern. */ #define CHARMATCH(chin, chpa) (chin == chpa || \ - ((patglobflags & C_IGNCASE) ? \ + ((patglobflags & GF_IGNCASE) ? \ ((isupper(chin) ? tolower(chin) : chin) == \ (isupper(chpa) ? tolower(chpa) : chpa)) : \ - (patglobflags & C_LCMATCHUC) ? \ + (patglobflags & GF_LCMATCHUC) ? \ (islower(chpa) && toupper(chpa) == chin) : 0)) /* @@ -1480,7 +1560,6 @@ patmatch(Upat prog) case P_GFLAGS: patglobflags = P_OPERAND(scan)->l; break; -#ifdef BACKREFERENCES case P_OPEN: case P_OPEN+1: case P_OPEN+2: @@ -1495,13 +1574,12 @@ patmatch(Upat prog) save = patinput; if (patmatch(next)) { - DPUTS(!patstartp, "patstartp not set for backreferencing"); /* - * Don't set ppStartp if some later invocation of + * Don't set patbeginp if some later invocation of * the same parentheses already has. */ if (no && !(parsfound & (1 << (no - 1)))) { - patstartp[no] = save; + patbeginp[no-1] = save; parsfound |= 1 << (no - 1); } return 1; @@ -1524,14 +1602,13 @@ patmatch(Upat prog) if (patmatch(next)) { DPUTS(!patendp, "patendp not set for backreferencing"); if (no && !(parsfound & (1 << (no + 15)))) { - patendp[no] = save; + patendp[no-1] = save; parsfound |= 1 << (no + 15); } return 1; } else return 0; break; -#endif case P_EXCSYNC: /* See the P_EXCLUDE code below for where syncptr comes from */ { @@ -1605,9 +1682,7 @@ patmatch(Upat prog) unsigned char *oldsyncstr; char *matchpt = NULL; int ret, savglobdots, matchederrs = 0; -#ifdef BACKREFERENCES int savparsfound = parsfound; -#endif DPUTS(P_OP(scan) == P_WBRANCH, "BUG: excluded WBRANCH"); syncstrp = P_OPERAND(next); @@ -1674,14 +1749,12 @@ patmatch(Upat prog) } if (patmatch(opnd)) { ret = 0; -#ifdef BACKREFERENCES /* * Another subtlety: if we exclude the * match, any parentheses just found * become invalidated. */ parsfound = savparsfound; -#endif } if (buf) zfree(buf, pathpos + patinlen); @@ -2184,18 +2257,16 @@ patdump(Patprog r) printf("start `%c' ", r->patstartch); if (!(r->flags & PAT_NOANCH)) printf("EOL-anchor "); -#ifdef BACKREFERENCES - if (r->flags & PAT_BACKR) - printf("backreferences "); -#endif + if (r->patnpar) + printf("%d active backreferences ", r->patnpar); if (r->mustoff) printf("must have \"%s\"", (char *)r + r->mustoff); printf("\n"); if (r->globflags) { printf("Globbing flags: "); - if (r->globflags & C_LCMATCHUC) + if (r->globflags & GF_LCMATCHUC) printf("LC matches UC "); - if (r->globflags & C_IGNCASE) + if (r->globflags & GF_IGNCASE) printf("Ignore case"); printf("\n"); if (r->globflags & 0xff) @@ -2317,16 +2388,11 @@ int bin_patdebug(char *name, char **args, char *ops, int func) { Patprog prog; - int ret = 0, flags; + int ret = 0; tokenize(*args); -#ifdef BACKREFERENCES - flags = ops['b'] ? PAT_BACKR : 0; -#else - flags = 0; -#endif - if (!(prog = patcompile((char *)*args, flags, 0))) + if (!(prog = patcompile((char *)*args, 0, 0))) return 1; if (ops['p'] || !args[1]) { patdump(prog); -- cgit 1.4.1