diff options
-rw-r--r-- | Doc/Zsh/expn.yo | 38 | ||||
-rwxr-xr-x | Misc/globtests | 2 | ||||
-rw-r--r-- | Src/parse.c | 6 | ||||
-rw-r--r-- | Src/pattern.c | 698 |
4 files changed, 413 insertions, 331 deletions
diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo index f78c959bb..ebb95a833 100644 --- a/Doc/Zsh/expn.yo +++ b/Doc/Zsh/expn.yo @@ -1089,13 +1089,16 @@ not in the given set. item(tt(<)[var(x)]tt(-)[var(y)]tt(>))( Matches any number in the range var(x) to var(y), inclusive. Either of the numbers may be omitted to make the range open-ended; -hence `tt(<->)' matches any number. +hence `tt(<->)' matches any number. To match individual digits, the +tt([)...tt(]) form is more efficient. ) item(tt(LPAR())...tt(RPAR()))( Matches the enclosed pattern. This is used for grouping. If the tt(KSH_GLOB) option is set, then a `tt(@)', `tt(*)', `tt(+)', `tt(?)' or `tt(!)' immediately preceding -the `tt(LPAR())' is treated specially, as detailed below. +the `tt(LPAR())' is treated specially, as detailed below. The option +tt(SH_GLOB) prevents bare parentheses from being used in this way, though +the tt(KSH_GLOB) option is still available. Note that grouping cannot currently extend over multiple directories: a `tt(/)' separating a directory terminates processing of the current group; processing resumes after the end of the group. @@ -1120,7 +1123,7 @@ This has lower precedence than any operator except `tt(|)', so `tt(*/*~foo/bar)' will search for all files in all directories in `tt(.)' and then exclude `tt(foo/bar)' if there was such a match. Multiple patterns can be excluded by -`var(foo)tt(~LPAR())var(bar)tt(|)var(baz)tt(RPAR())'. +`var(foo)tt(~)var(bar)tt(~)var(baz)'. In the exclusion pattern (var(y)), `tt(/)' and `tt(.)' are not treated specially the way they usually are in globbing. ) @@ -1128,13 +1131,19 @@ item(var(x)tt(#))( (Requires tt(EXTENDED_GLOB) to be set.) Matches zero or more occurrences of the pattern var(x). This operator has high precedence; `tt(12#)' is equivalent to `tt(1(2#))', -rather than `tt((12)#)'. +rather than `tt((12)#)'. It is an error for an unquoted `tt(#)' to follow +something which cannot be repeated; this includes an empty string, a +pattern already followed by `tt(##)', or parentheses when part of a +tt(KSH_GLOB) pattern (for example, `tt(!LPAR())var(foo)tt(RPAR()#)' is +invalid and must be replaced by +`tt(*LPAR()!LPAR())var(foo)tt(RPAR()RPAR())'). ) item(var(x)tt(##))( (Requires tt(EXTENDED_GLOB) to be set.) Matches one or more occurrences of the pattern var(x). This operator has high precedence; `tt(12##)' is equivalent to `tt(1(2##))', -rather than `tt((12)##)'. +rather than `tt((12)##)'. No more than two active `tt(#)' characters may +appear together. ) enditem() subsect(ksh-like Glob Operators) @@ -1162,6 +1171,20 @@ Match anything but the expression in parentheses. (Like `tt(LPAR()^LPAR())...tt(RPAR()RPAR())'.) ) enditem() +subsect(Precedence) +cindex(precedence of glob operators) +The precedence of the operators given above is (highest) `tt(^)', `tt(/)', +`tt(~)', `tt(|)' (lowest); the +remaining operators are simply treated from left to right as part of a +string, with `tt(#)' and `tt(##)' applying to the shortest possible +preceeding unit (i.e. a character, `tt(?)', `tt([)...tt(])', +`tt(<)...tt(>)', or a parenthesised expression). As mentioned +above, a `tt(/)' used as a directory separator may not appear inside +parentheses, while a `tt(|)' must do so; in patterns used in other contexts +than filename generation (for example, in tt(case) statements and tests +within `tt([[)...tt(]])'), a `tt(/)' is not special; and `tt(/)' is also +not special after a `tt(~)' appearing outside parentheses in a filename +pattern. subsect(Globbing Flags) There are various flags which affect any text to their right up to the end of the enclosing group or to the end of the pattern; they require @@ -1258,7 +1281,10 @@ tt(LPAR()#a1)tt(RPAR()cat)tt(LPAR()LPAR()#a0)tt(RPAR()dog)tt(RPAR()fox) allows one error in total, which may not occur in the tt(dog) section, and the pattern tt(LPAR()#a1)tt(RPAR()cat)tt(LPAR()#a0)tt(RPAR()dog)tt(LPAR()#a1)tt(RPAR()fox) -is equivalent. +is equivalent. Note that the point at which an error is first found is the +crucial one for establishing whether to use approximation; for example, +tt((#a1)abc(#a0)xyz) will not match tt(abcdxyz), because the error occurs +at the `tt(x)', where approximation is turned off. subsect(Recursive Globbing) A pathname component of the form `tt(LPAR())var(foo)tt(/RPAR()#)' diff --git a/Misc/globtests b/Misc/globtests index b2fe8d8ee..a0f8184ec 100755 --- a/Misc/globtests +++ b/Misc/globtests @@ -16,6 +16,8 @@ while read res str pat; do done <<EOT # a few simple things certain nameless idiots have been known to mess up t foo~ foo~ +t foo~ (foo~) +t foo~ (foo~|) t foo.c *.c~boo* f foo.c *.c~boo*~foo* # closures diff --git a/Src/parse.c b/Src/parse.c index 658a66660..d0a9bc45f 100644 --- a/Src/parse.c +++ b/Src/parse.c @@ -623,6 +623,12 @@ par_case(Cmd c) } if (*s || pct || s == str + 1) YYERRORV; + /* Simplify pattern by removing surrounding (...) */ + sl = strlen(str); + DPUTS(str[1] != Inpar || str[sl-1] != Outpar, + "BUG: strange case pattern"); + str[sl-1] = '\0'; + chuck(str+1); break; } else { char *str2; diff --git a/Src/pattern.c b/Src/pattern.c index 048e3d3ec..bfc9102a5 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -1,5 +1,5 @@ /* - * glob.c - filename generation + * pattern.c - pattern matching * * This file is part of zsh, the Z shell. * @@ -50,6 +50,24 @@ */ #include "zsh.mdh" + +/* + * The following union is used mostly for alignment purposes. + * Normal nodes are longs, while certain nodes take a char * as an argument; + * here we make sure that they both work out to the same length. + * The compiled regexp we construct consists of upats stuck together; + * anything else to be added (strings, numbers) is stuck after and + * then aligned to a whole number of upat units. + * + * Note also that offsets are in terms of the sizes of these things. + */ +union upat { + long l; + unsigned char *p; +}; + +typedef union upat *Upat; + #include "pattern.pro" /* @@ -154,36 +172,17 @@ #define PP_UNKWN 13 #define PP_RANGE 14 -/* Align everything to the pointer type. */ -typedef char *zalign_t; - -#define P_OP(p) (*(long *)(p) & 0xff) -#define P_NEXT(p) (*(long *)(p) >> 8) -#define P_OPERAND(p) ((p) + sizeof(zalign_t)) -#define P_ISBRANCH(p) (*(long *)(p) & 0x20) -#define P_ISEXCLUDE(p) ((*(long *)(p) & 0x30) == 0x30) -#define P_NOTDOT(p) (*(long *)(p) & 0x40) +#define P_OP(p) ((p)->l & 0xff) +#define P_NEXT(p) ((p)->l >> 8) +#define P_OPERAND(p) ((p) + 1) +#define P_ISBRANCH(p) ((p)->l & 0x20) +#define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30) +#define P_NOTDOT(p) ((p)->l & 0x40) #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */ #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */ #define P_PURESTR 0x04 /* Can be matched with a strcmp */ -/* - * pointer is end string to be parsed? - * a bit dire because of extendedglob possibilities: - * we need to make sure a ~ at the end of a string isn't mistaken - * for an excluder or lots of emacs users get very cross. - */ -#define ISENDCHAR(X) (!*(X) || ((patflags & PAT_FILE) && *(X) == '/') || \ - *(X) == Bar || *(X) == Outpar || \ - (isset(EXTENDEDGLOB) && \ - (*(X) == Hat || \ - (*(X) == Tilde && \ - !(!(X)[1] || ((patflags & PAT_FILE) && \ - (X)[1] == '/') || \ - (X)[1] == Bar || (X)[1] == Outpar || \ - (X)[1] == Tilde))))) - /* Next character after one which may be a Meta (x is any char *) */ #define METANEXT(x) (*(x) == Meta ? (x)+2 : (x)+1) /* @@ -202,6 +201,29 @@ typedef zlong zrange_t; typedef unsigned long zrange_t; #endif +/* + * Characters which terminate a pattern segment. We actually use + * a pointer patendseg which skips the first character if we are not + * parsing a file pattern. + * Note that the size of this and the next array are hard-wired into + * patcompile. + */ + +static char endseg[] = { + '/', /* file only */ + '\0', Bar, Outpar, /* all patterns */ + Tilde /* extended glob only */ +}; + +/* Characters which terminate a simple string */ + +static char endstr[] = { + '/', /* file only */ + '\0', Bar, Outpar, Quest, Star, Inbrack, Inpar, Inang, + /* all patterns */ + Tilde, Hat, Pound /* extended glob only */ +}; + /* Default size for pattern buffer */ #define P_DEF_ALLOC 256 @@ -212,6 +234,10 @@ static char *patcode; /* point of code emission */ static long patsize; /* size of code */ static char *patout; /* start of code emission string */ static long patalloc; /* size allocated for same */ +static char *patendseg; /* characters ending segment */ +static int patendseglen; /* length of same */ +static char *patendstr; /* characters ending plain string */ +static int patendstrlen; /* length of sameo */ /* Flags used in both compilation and execution */ static int patflags; /* flags passed down to patcompile */ @@ -226,8 +252,8 @@ patadd(char *add, int ch, long n, int noalgn) /* Make sure everything gets aligned unless we get noalgn. */ long newpatsize = patsize + n; if (!noalgn) - newpatsize = (newpatsize + sizeof(zalign_t) - 1) & - ~(sizeof(zalign_t) - 1); + newpatsize = (newpatsize + sizeof(union upat) - 1) & + ~(sizeof(union upat) - 1); if (patalloc < newpatsize) { long newpatalloc = 2*(newpatsize > patalloc ? newpatsize : patalloc); @@ -245,7 +271,7 @@ patadd(char *add, int ch, long n, int noalgn) } static long rn_offs; -/* operates on poiners, returns a pointer */ +/* operates on pointers to union upat, returns a pointer */ #define PATNEXT(p) ((rn_offs = P_NEXT(p)) ? \ (P_OP(p) == P_BACK) ? \ ((p)-rn_offs) : ((p)+rn_offs) : NULL) @@ -267,11 +293,10 @@ patcompile(char *exp, int inflags, char **endexp) { int flags, len; long startoff; - char *pscan, *lng; + Upat pscan; + char *lng; Patprog p; - DPUTS(sizeof(long) > sizeof(zalign_t), "BUG: patprog alignment too small"); - #ifdef BACKREFERENCES startoff = (inflags & PAT_BACKR) ? sizeof(struct patprog) : sizeof(struct patprog_short); @@ -279,7 +304,7 @@ patcompile(char *exp, int inflags, char **endexp) startoff = sizeof(struct patprog); #endif /* Ensure alignment of start of program string */ - startoff = (startoff + sizeof(zalign_t) - 1) & ~(sizeof(zalign_t) - 1); + startoff = (startoff + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1); /* Allocate reasonable sized chunk if none, reduce size if too big */ if (patalloc != P_DEF_ALLOC) @@ -290,7 +315,16 @@ patcompile(char *exp, int inflags, char **endexp) patnpar = 1; patflags = inflags; + patendseg = endseg; + patendseglen = isset(EXTENDEDGLOB) ? 5 : 4; + patendstr = endstr; + patendstrlen = isset(EXTENDEDGLOB) ? 12 : 9; + if (!(patflags & PAT_FILE)) { + patendseg++; + patendstr++; + patendseglen--; + patendstrlen--; remnulargs(exp); patglobflags = 0; } @@ -313,7 +347,7 @@ patcompile(char *exp, int inflags, char **endexp) p->mustoff = 0; p->size = patsize; p->patmlen = 0; - pscan = patout + startoff; + pscan = (Upat)(patout + startoff); if (!(patflags & PAT_ANY) && P_OP(PATNEXT(pscan)) == P_END) { /* only one top level choice */ @@ -324,24 +358,25 @@ patcompile(char *exp, int inflags, char **endexp) * The pattern can be matched with a simple strncmp/strcmp. * Careful in case we've overwritten the node for the next ptr. */ - char *dst = patout + startoff, *next; + char *dst = patout + startoff; + Upat next; p->flags |= PAT_PURES; for (; pscan; pscan = next) { next = PATNEXT(pscan); if (P_OP(pscan) == P_EXACTLY) { - char *opnd = P_OPERAND(pscan); + char *opnd = (char *)P_OPERAND(pscan); while ((*dst = *opnd++)) dst++; } } *dst++ = '\0'; p->size = dst - patout; - /* patmlen is reall strlen, don't include null byte */ + /* patmlen is really strlen, don't include null byte */ p->patmlen = p->size - startoff - 1; } else { /* starting point info */ if (P_OP(pscan) == P_EXACTLY && !p->globflags) - p->patstartch = *P_OPERAND(pscan); + p->patstartch = *(char *)P_OPERAND(pscan); /* Find the longest literal string in something expensive. * This is itself not all that cheap if we have case-insensitive * matching or approximation, so don't. @@ -352,8 +387,8 @@ patcompile(char *exp, int inflags, char **endexp) for (; pscan; pscan = PATNEXT(pscan)) if (P_OP(pscan) == P_EXACTLY && strlen((char *)P_OPERAND(pscan)) >= len) { - lng = P_OPERAND(pscan); - len = strlen((char *)P_OPERAND(pscan)); + lng = (char *)P_OPERAND(pscan); + len = strlen(lng); } if (lng) { p->mustoff = lng - patout; @@ -393,7 +428,7 @@ patcompswitch(int paren, int *flagp) int parno = 0; #endif int flags, gfchanged = 0, savflags = patflags, savglobflags = patglobflags; - char *ptr; + Upat ptr; *flagp = 0; @@ -424,17 +459,15 @@ patcompswitch(int paren, int *flagp) *flagp |= flags & (P_HSTART|P_PURESTR); while (*patparse == Bar || - (isset(EXTENDEDGLOB) && - *patparse == Tilde && patparse[1] && patparse[1] != Bar && - patparse[1] != Outpar && patparse[1] != Tilde && - !((patflags & PAT_FILE) && patparse[1] == '/'))) { + (isset(EXTENDEDGLOB) && *patparse == Tilde && + !memchr(patendseg, patparse[1], patendseglen))) { int tilde = *patparse++ == Tilde; long gfnode = 0, newbr; *flagp &= ~P_PURESTR; if (tilde) { - unsigned char *unull = NULL; + union upat up; /* excsync remembers the P_EXCSYNC node before a chain of * exclusions: all point back to this. only the * original (non-excluded) branch gets a trailing P_EXCSYNC. @@ -455,7 +488,8 @@ patcompswitch(int paren, int *flagp) patglobflags &= ~0xff; br = patnode(!(patflags & PAT_FILET) || paren ? P_EXCLUDE : P_EXCLUDP); - patadd((char *)&unull, 0, sizeof(unull), 0); + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); /* / is not treated as special if we are at top level */ if (!paren) patflags &= ~PAT_FILE; @@ -485,9 +519,10 @@ patcompswitch(int paren, int *flagp) * No gfchanged, as nothing to follow branch at top * level. */ + union upat up; gfnode = patnode(P_GFLAGS); - patadd((char *)&patglobflags, 0, sizeof(long), - 0); + up.l = patglobflags; + patadd((char *)&up, 0, sizeof(union upat), 0); } } else { patglobflags = savglobflags; @@ -525,9 +560,9 @@ patcompswitch(int paren, int *flagp) * Hook the tails of the branches to the closing node, * except for exclusions which terminate where they are. */ - for (ptr = patout + starter; ptr; ptr = PATNEXT(ptr)) + for (ptr = (Upat)patout + starter; ptr; ptr = PATNEXT(ptr)) if (!P_ISEXCLUDE(ptr)) - patoptail(ptr-patout, ender); + patoptail(ptr-(Upat)patout, ender); /* check for proper termination */ if ((paren && *patparse++ != Outpar) || @@ -566,12 +601,9 @@ patcompbranch(int *flagp) *flagp = P_PURESTR; starter = chain = 0; - while (*patparse && !((patflags & PAT_FILE) && *patparse == '/') && - *patparse != Bar && *patparse != Outpar && - (!isset(EXTENDEDGLOB) || *patparse != Tilde || - !patparse[1] || patparse[1] == Bar || patparse[1] == Outpar - || patparse[1] == Tilde || - ((patflags & PAT_FILE) && patparse[1] == '/'))) { + while (!memchr(patendseg, *patparse, patendseglen) && + (*patparse != Tilde || + memchr(patendseg, patparse[1], patendseglen))) { if (isset(EXTENDEDGLOB) && ((!isset(SHGLOB) && (*patparse == Inpar && patparse[1] == Pound)) || @@ -601,8 +633,10 @@ patcompbranch(int *flagp) */ if (oldglobflags != patglobflags) { /* Flags changed */ + union upat up; latest = patnode(P_GFLAGS); - patadd((char *)&patglobflags, 0, sizeof(int), 0); + up.l = patglobflags; + patadd((char *)&up, 0, sizeof(union upat), 0); } else { /* No effect. */ continue; @@ -696,156 +730,104 @@ static long patcomppiece(int *flagp) { long starter, next, pound, op; - int flags, kshchar; - unsigned char *ptr; - - starter = patcompatom(&flags, &kshchar); - if (!starter) - return 0; - - if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) && - (kshchar <= 0 || kshchar == '@' || kshchar == '!')) { - *flagp = flags; - return starter; - } - - /* too much at once doesn't currently work */ - if (kshchar && pound) - return 0; - - if (kshchar == '*') { - op = P_ONEHASH; - *flagp = P_HSTART; - } else if (kshchar == '+') { - op = P_TWOHASH; - *flagp = P_HSTART; - } else if (kshchar == '?') { - op = 0; - *flagp = 0; - } else if (*++patparse == Pound) { - op = P_TWOHASH; - patparse++; - *flagp = P_HSTART; - } else { - op = P_ONEHASH; - *flagp = P_HSTART; - } - - /* - * Note optimizations with pointers into P_NOTHING branches: some - * should logically point to next node after current piece. - * - * Backtracking is also encoded in a slightly obscure way: the - * code emitted ensures we test the non-empty branch of complex - * patterns before the empty branch on each repetition. Hence - * each time we fail on a non-empty branch, we try the empty branch, - * which is equivalent to backtracking. - */ - if ((flags & P_SIMPLE) && op == P_ONEHASH && - P_OP(patout+starter) == P_ANY) { - /* Optimize ?# to *. Silly thing to do, since who would use - * use ?# ? But it makes the later code shorter. - */ - long *lptr = (long *)(patout + starter); - *lptr = (*lptr & ~0xff) | P_STAR; - } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) { - /* Don't simplify if we need to look for approximations. */ - patinsert(op, starter, NULL, 0); - } else if (op == P_ONEHASH) { - /* Emit x# as (x&|), where & means "self". */ - ptr = NULL; - patinsert(P_WBRANCH, starter, (char *)&ptr, sizeof(ptr)); - /* Either x */ - patoptail(starter, patnode(P_BACK)); /* and loop */ - patoptail(starter, starter); /* back */ - pattail(starter, patnode(P_BRANCH)); /* or */ - pattail(starter, patnode(P_NOTHING)); /* null. */ - } else if (op == P_TWOHASH) { - /* Emit x## as x(&|) where & means "self". */ - next = patnode(P_WBRANCH); /* Either */ - ptr = NULL; - patadd((char *)&ptr , 0, sizeof(ptr), 0); - pattail(starter, next); - pattail(patnode(P_BACK), starter); /* loop back */ - pattail(next, patnode(P_BRANCH)); /* or */ - pattail(starter, patnode(P_NOTHING)); /* null. */ - } else if (kshchar == '?') { - /* Emit ?(x) as (x|) */ - patinsert(P_BRANCH, starter, NULL, 0); /* Either x */ - pattail(starter, patnode(P_BRANCH)); /* or */ - next = patnode(P_NOTHING); /* null */ - pattail(starter, next); - patoptail(starter, next); - } - if (*patparse == Pound) - return 0; - - return starter; -} - -/* - * Parse lowest level pattern. If doing ordinary characters, we - * gobble a complete string as far as we can get. - * kshcharp returns a character found before an Inpar, for handling - * as a closure. - */ - -/**/ -static long -patcompatom(int *flagp, int *kshcharp) -{ - long starter; - int patch, flags, len, ch; + int flags, flags2, kshchar, len, ch, patch; + union upat up; char *nptr, *str0, cbuf[2]; zrange_t from, to; - *flagp = 0; + flags = 0; str0 = patparse; for (;;) { - /* check kshglob here */ - *kshcharp = '\0'; + /* + * Check if we have a string. First, we need to make sure + * the string doesn't introduce a ksh-like parenthesised expression. + */ + kshchar = '\0'; if (isset(KSHGLOB) && *patparse && patparse[1] == Inpar) { - if (strchr("?*+!@", (char)*patparse)) - *kshcharp = STOUC(*patparse); + if (strchr("?*+!@", *patparse)) + kshchar = STOUC(*patparse); else if (*patparse == Star || *patparse == Quest) - *kshcharp = STOUC(ztokens[*patparse - Pound]); + kshchar = STOUC(ztokens[*patparse - Pound]); } - if (patparse > str0) { - /* - * This is up here instead of at the end to simplify the - * kshglob bracket testing. Note patparse doesn't - * get incremented till afterwards. - */ - if (ISENDCHAR(patparse) || *kshcharp || *patparse == Quest || - *patparse == Star || *patparse == Inbrack || - (*patparse == Inpar && !isset(SHGLOB)) || - *patparse == Inang || - (isset(EXTENDEDGLOB) && *patparse == Pound)) - break; - else { - METAINC(patparse); - continue; - } - } + /* + * End of string (or no string at all) if ksh-type parentheses, + * or special character, unless that character is a tilde and + * the character following is an end-of-segment character. Thus + * tildes are not special if there is nothing following to + * be excluded. + */ + if (kshchar || (memchr(patendstr, *patparse, patendstrlen) && + (*patparse != Tilde || + !memchr(patendseg, patparse[1], patendseglen)))) + break; + + METAINC(patparse); + } - if (*kshcharp) + if (patparse > str0) { + /* Ordinary string: cancel kshchar lookahead */ + kshchar = '\0'; + /* + * Assume it matches a simple string until we find otherwise. + */ + flags |= P_PURESTR; + DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece."); + /* more than one character matched? */ + len = str0 + (*str0 == Meta ? 2 : 1) < patparse; + /* + * If we have more than one character, a following hash only + * applies to the last, so decrement. + */ + if (isset(EXTENDEDGLOB) && *patparse == Pound && len) + patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; + /* + * If len is 1, we can't have an active # following, so doesn't + * matter that we don't make X in `XX#' simple. + */ + if (!len) + flags |= P_SIMPLE; + starter = patnode(P_EXACTLY); + /* add enough space including null byte */ + len = patparse - str0; + patadd(str0, 0, len + 1, 0); + nptr = (char *)P_OPERAND((Upat)patout + starter); + nptr[len] = '\0'; + /* + * It's much simpler to turn off pure string mode for + * any case-insensitive or approximate matching; usually, + * that is correct, or they wouldn't have been turned on. + * However, we need to make sure we match a "." or ".." + * in a file name as a pure string. There's a minor bug + * that this will also apply to something like + * ..(#a1).. (i.e. the (#a1) has no effect), but if you're + * going to write funny patterns, you get no sympathy from me. + */ + if (patglobflags && + (!(patflags & PAT_FILE) || (strcmp(nptr, ".") && + strcmp(nptr, "..")))) + flags &= ~P_PURESTR; + for (; *nptr; METAINC(nptr)) + if (itok(*nptr)) + *nptr = ztokens[*nptr - Pound]; + } else { + if (kshchar) patparse++; patch = *patparse; METAINC(patparse); switch(patch) { case Quest: - *flagp |= P_SIMPLE; - return patnode(P_ANY); + flags |= P_SIMPLE; + starter = patnode(P_ANY); break; case Star: /* kshchar is used as a sign that we can't have #'s. */ - *kshcharp = -1; - return patnode(P_STAR); + kshchar = -1; + starter = patnode(P_STAR); break; case Inbrack: - *flagp |= P_SIMPLE; + flags |= P_SIMPLE; if (*patparse == Hat || *patparse == '^' || *patparse == '!') { patparse++; starter = patnode(P_ANYBUT); @@ -919,15 +901,14 @@ patcompatom(int *flagp, int *kshcharp) patparse++; /* terminate null string and fix alignment */ patadd(NULL, 0, 1, 0); - return starter; break; case Inpar: /* is this how to treat parentheses in SHGLOB? */ - if (isset(SHGLOB) && !*kshcharp) + if (isset(SHGLOB) && !kshchar) return 0; - if (*kshcharp == '!') { + if (kshchar == '!') { /* This is nasty, we should really either handle all - * kshglobbing upstairs or down here. But most of the + * kshglobbing below or here. But most of the * others look like non-ksh patterns, while this one * doesn't, so we handle it here and leave the rest. * We treat it like an extendedglob ^, except that @@ -939,12 +920,11 @@ patcompatom(int *flagp, int *kshcharp) * the expense of allowing the user to do things * they shouldn't. */ - if (!(starter = patcompnot(1, &flags))) + if (!(starter = patcompnot(1, &flags2))) return 0; - } else if (!(starter = patcompswitch(1, &flags))) + } else if (!(starter = patcompswitch(1, &flags2))) return 0; - *flagp |= flags & P_HSTART; - return starter; + flags |= flags2 & P_HSTART; break; case Inang: /* Numeric glob */ @@ -990,68 +970,101 @@ patcompatom(int *flagp, int *kshcharp) * Mention in manual that matching digits with [...] * is more efficient. */ - return starter; break; case Pound: - if (!isset(EXTENDEDGLOB)) - break; + DPUTS(!isset(EXTENDEDGLOB), "BUG: # not treated as string"); + /* + * A hash here is an error; it should follow something + * repeatable. + */ return 0; break; #ifdef DEBUG - case Bar: - case Outpar: - case '\0': - dputs("BUG: wrong character in patcompatom."); + default: + dputs("BUG: character not handled in patcomppiece"); return 0; break; #endif } } - /* Simple string: cancel kshchar lookahead */ - *kshcharp = '\0'; - /* - * Assume it matches a simple string until we find otherwise. - */ - *flagp |= P_PURESTR; - DPUTS(patparse == str0, "BUG: matched nothing in patcompatom."); - /* more than one character matched? */ - len = str0 + (*str0 == Meta ? 2 : 1) < patparse; + if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) && + (kshchar <= 0 || kshchar == '@' || kshchar == '!')) { + *flagp = flags; + return starter; + } + + /* too much at once doesn't currently work */ + if (kshchar && pound) + return 0; + + if (kshchar == '*') { + op = P_ONEHASH; + *flagp = P_HSTART; + } else if (kshchar == '+') { + op = P_TWOHASH; + *flagp = P_HSTART; + } else if (kshchar == '?') { + op = 0; + *flagp = 0; + } else if (*++patparse == Pound) { + op = P_TWOHASH; + patparse++; + *flagp = P_HSTART; + } else { + op = P_ONEHASH; + *flagp = P_HSTART; + } + /* - * Ordinary string of characters. - * If we have more than one character, a following hash only - * applies to the last, so decrement. - */ - if (isset(EXTENDEDGLOB) && *patparse == Pound && len) - patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; - /* if len is 1, we can't have an active # following, so doesn't - * matter that we don't make X in `XX#' simple. + * Note optimizations with pointers into P_NOTHING branches: some + * should logically point to next node after current piece. + * + * Backtracking is also encoded in a slightly obscure way: the + * code emitted ensures we test the non-empty branch of complex + * patterns before the empty branch on each repetition. Hence + * each time we fail on a non-empty branch, we try the empty branch, + * which is equivalent to backtracking. */ - if (!len) - *flagp |= P_SIMPLE; - starter = patnode(P_EXACTLY); - while (str0 < patparse) { - if (*str0 == Meta) { - cbuf[0] = *str0++; - cbuf[1] = *str0++; - } else { - cbuf[0] = itok(*str0) ? ztokens[*str0 - Pound] : *str0; - str0++; - } - ch = UNMETA(cbuf); - /* - * HACK: this cause a string consisting of any number of - * dots in files to be matched exactly, even with approximation. - * We just want to limit it to the first two. + if ((flags & P_SIMPLE) && op == P_ONEHASH && + P_OP((Upat)patout+starter) == P_ANY) { + /* Optimize ?# to *. Silly thing to do, since who would use + * use ?# ? But it makes the later code shorter. */ - if (((patglobflags & C_IGNCASE) && (islower(ch) || isupper(ch))) || - ((patglobflags & C_LCMATCHUC) && islower(ch)) || - ((patglobflags & 0xff) && !((patflags & PAT_FILE) && ch == '.'))) - *flagp &= ~P_PURESTR; - patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); + Upat uptr = (Upat)patout + starter; + uptr->l = (uptr->l & ~0xff) | P_STAR; + } else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) { + /* Don't simplify if we need to look for approximations. */ + patinsert(op, starter, NULL, 0); + } else if (op == P_ONEHASH) { + /* Emit x# as (x&|), where & means "self". */ + up.p = NULL; + patinsert(P_WBRANCH, starter, (char *)&up, sizeof(up)); + /* Either x */ + patoptail(starter, patnode(P_BACK)); /* and loop */ + patoptail(starter, starter); /* back */ + pattail(starter, patnode(P_BRANCH)); /* or */ + pattail(starter, patnode(P_NOTHING)); /* null. */ + } else if (op == P_TWOHASH) { + /* Emit x## as x(&|) where & means "self". */ + next = patnode(P_WBRANCH); /* Either */ + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); + pattail(starter, next); + pattail(patnode(P_BACK), starter); /* loop back */ + pattail(next, patnode(P_BRANCH)); /* or */ + pattail(starter, patnode(P_NOTHING)); /* null. */ + } else if (kshchar == '?') { + /* Emit ?(x) as (x|) */ + patinsert(P_BRANCH, starter, NULL, 0); /* Either x */ + pattail(starter, patnode(P_BRANCH)); /* or */ + next = patnode(P_NOTHING); /* null */ + pattail(starter, next); + patoptail(starter, next); } - /* null terminate and fix alignment */ - patadd(NULL, 0, 1, 0); + if (*patparse == Pound) + return 0; + return starter; } @@ -1064,7 +1077,7 @@ patcompatom(int *flagp, int *kshcharp) static long patcompnot(int paren, int *flagsp) { - unsigned char *unull = NULL; + union upat up; long excsync, br, excl, n, starter; int dummy; @@ -1076,7 +1089,8 @@ patcompnot(int paren, int *flagsp) excsync = patnode(P_EXCSYNC); pattail(br, excsync); pattail(starter, excl = patnode(P_EXCLUDE)); - patadd((char *)&unull, 0, sizeof(unull), 0); + up.p = NULL; + patadd((char *)&up, 0, sizeof(up), 0); if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy)))) return 0; pattail(br, patnode(P_EXCEND)); @@ -1093,9 +1107,11 @@ patcompnot(int paren, int *flagsp) static long patnode(long op) { - long starter = patcode - patout; + long starter = (Upat)patcode - (Upat)patout; + union upat up; - patadd((char *)&op, 0, sizeof(long), 0); + up.l = op; + patadd((char *)&up, 0, sizeof(union upat), 0); return starter; } @@ -1109,22 +1125,22 @@ static void patinsert(long op, int opnd, char *xtra, int sz) { char *src, *dst, *opdst; - long buf, *lptr; + union upat buf, *lptr; - buf = 0; - patadd((char *)&buf, 0, sizeof(long), 0); + buf.l = 0; + patadd((char *)&buf, 0, sizeof(buf), 0); if (sz) patadd(xtra, 0, sz, 0); - src = patcode - sizeof(long) - sz; + src = patcode - sizeof(union upat) - sz; dst = patcode; - opdst = patout + opnd; + opdst = patout + opnd * sizeof(union upat); while (src > opdst) *--dst = *--src; /* A cast can't be an lvalue */ - lptr = (long *)opdst; - *lptr = op; - opdst += sizeof(long); + lptr = (Upat)opdst; + lptr->l = op; + opdst += sizeof(union upat); while (sz--) *opdst++ = *xtra++; } @@ -1135,10 +1151,10 @@ patinsert(long op, int opnd, char *xtra, int sz) static void pattail(long p, long val) { - char *scan, *temp; - long offset, *lptr; + Upat scan, temp; + long offset; - scan = patout + p; + scan = (Upat)patout + p; for (;;) { if (!(temp = PATNEXT(scan))) break; @@ -1146,10 +1162,9 @@ pattail(long p, long val) } offset = (P_OP(scan) == P_BACK) - ? (scan - patout) - val : val - (scan - patout); + ? (scan - (Upat)patout) - val : val - (scan - (Upat)patout); - lptr = (long *)scan; - *lptr |= offset << 8; + scan->l |= offset << 8; } /* do pattail, but on operand of first argument; nop if operandless */ @@ -1157,14 +1172,14 @@ pattail(long p, long val) /**/ static void patoptail(long p, long val) { - char *ptr = patout + p; + Upat ptr = (Upat)patout + p; int op = P_OP(ptr); if (!p || !P_ISBRANCH(ptr)) return; if (op == P_BRANCH) pattail(P_OPERAND(p), val); else - pattail(P_OPERAND(p) + sizeof(char *), val); + pattail(P_OPERAND(p) + 1, val); } @@ -1258,7 +1273,7 @@ pattry(Patprog prog, char *string) } #endif - if (patmatch(progstr)) { + if (patmatch((Upat)progstr)) { #ifdef BACKREFERENCES if (patflags & PAT_BACKR) { prog->ppStartp[0] = string; @@ -1314,11 +1329,12 @@ static char *exactpos; */ /**/ -int -patmatch(char *prog) +static int +patmatch(Upat prog) { /* Current and next nodes */ - char *scan = prog, *next, *opnd, *save, *start; + Upat scan = prog, next, opnd; + char *start, *save, *chrop; int savglobflags, op, no, min, nextch, fail = 0, saverrsfound; zrange_t from, to, comp; @@ -1338,36 +1354,38 @@ patmatch(char *prog) break; case P_EXACTLY: /* - * acts as nothing if *opnd is null: this is used by + * acts as nothing if *chrop is null: this is used by * approx code. */ - opnd = exactpos ? exactpos : P_OPERAND(scan); + chrop = exactpos ? exactpos : (char *)P_OPERAND(scan); exactpos = NULL; - while (*opnd && *patinput) { + while (*chrop && *patinput) { int chin = STOUC(UNMETA(patinput)); - int chpa = STOUC(UNMETA(opnd)); + int chpa = STOUC(UNMETA(chrop)); if (!CHARMATCH(chin, chpa)) { fail = 1; break; } - METAINC(opnd); + METAINC(chrop); METAINC(patinput); } - if (*opnd) { - exactpos = opnd; + if (*chrop) { + exactpos = chrop; fail = 1; } break; case P_ANYOF: if (!*patinput || - !patmatchrange(P_OPERAND(scan), STOUC(UNMETA(patinput)))) + !patmatchrange((char *)P_OPERAND(scan), + STOUC(UNMETA(patinput)))) fail = 1; else METAINC(patinput); break; case P_ANYBUT: if (!*patinput || - patmatchrange(P_OPERAND(scan), STOUC(UNMETA(patinput)))) + patmatchrange((char *)P_OPERAND(scan), + STOUC(UNMETA(patinput)))) fail = 1; else METAINC(patinput); @@ -1384,12 +1402,12 @@ patmatch(char *prog) * the first attempt. */ op = P_OP(scan); - start = P_OPERAND(scan); + start = (char *)P_OPERAND(scan); from = to = 0; if (op != P_NUMTO) { #ifdef ZSH_64_BIT_TYPE /* We can't rely on pointer alignment being good enough. */ - memcpy((char *)&from, (char *)start, sizeof(zrange_t)); + memcpy((char *)&from, start, sizeof(zrange_t)); #else from = *((zrange_t *) start); #endif @@ -1414,7 +1432,8 @@ patmatch(char *prog) return 1; } if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*P_OPERAND(next))) && !(patglobflags & 0xff)) + !idigit(STOUC(*(char *)P_OPERAND(next))) && + !(patglobflags & 0xff)) return 0; patinput = --save; no++; @@ -1434,7 +1453,8 @@ patmatch(char *prog) if (patmatch(next)) return 1; if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*P_OPERAND(next))) && !(patglobflags & 0xff)) + !idigit(STOUC(*(char *)P_OPERAND(next))) && + !(patglobflags & 0xff)) return 0; patinput = --save; no++; @@ -1447,7 +1467,7 @@ patmatch(char *prog) case P_BACK: break; case P_GFLAGS: - patglobflags = *(int *)P_OPERAND(scan); + patglobflags = P_OPERAND(scan)->l; break; #ifdef BACKREFERENCES case P_OPEN: @@ -1502,16 +1522,16 @@ patmatch(char *prog) break; #endif case P_EXCSYNC: - /* See the P_EXCLUDE code below for where syncstrp comes from */ + /* See the P_EXCLUDE code below for where syncptr comes from */ { - unsigned char **syncstrp, *syncptr; - char *after; + unsigned char *syncptr; + Upat after; after = P_OPERAND(scan); DPUTS(!P_ISEXCLUDE(after), "BUG: EXCSYNC not followed by EXCLUDE."); - syncstrp = (unsigned char **)P_OPERAND(after); - DPUTS(!*syncstrp, "BUG: EXCSYNC not handled by EXCLUDE"); - syncptr = *syncstrp + (patinput - patinstart); + DPUTS(!P_OPERAND(after)->p, + "BUG: EXCSYNC not handled by EXCLUDE"); + syncptr = P_OPERAND(after)->p + (patinput - patinstart); /* * If we already matched from here, this time we fail. * See WBRANCH code for story about error count. @@ -1570,7 +1590,8 @@ patmatch(char *prog) * The pointer also tells us where the asserted * pattern matched for use by the exclusion. */ - unsigned char **syncstrp, *oldsyncstr; + Upat syncstrp; + unsigned char *oldsyncstr; char *matchpt = NULL; int ret, savglobdots, matchederrs = 0; #ifdef BACKREFERENCES @@ -1578,7 +1599,7 @@ patmatch(char *prog) #endif DPUTS(P_OP(scan) == P_WBRANCH, "BUG: excluded WBRANCH"); - syncstrp = (unsigned char **)P_OPERAND(next); + syncstrp = P_OPERAND(next); /* * Unlike WBRANCH, each test at the same exclude * sync point (due to an external loop) is separate, @@ -1586,10 +1607,10 @@ patmatch(char *prog) * (foo~bar)(foo~bar)... from the exclusion point * of view, so we use a different sync string. */ - oldsyncstr = *syncstrp; + oldsyncstr = syncstrp->p; if (!patinlen) patinlen = strlen(patinstart)+1; - *syncstrp = (unsigned char *)zcalloc(patinlen); + syncstrp->p = (unsigned char *)zcalloc(patinlen); while ((ret = patmatch(P_OPERAND(scan)))) { unsigned char *syncpt; char savchar, *testptr; @@ -1607,9 +1628,9 @@ patmatch(char *prog) * possibilities for approximation have been * checked.) */ - for (syncpt = *syncstrp; !*syncpt; syncpt++) + for (syncpt = syncstrp->p; !*syncpt; syncpt++) ; - testptr = patinstart + (syncpt - *syncstrp); + testptr = patinstart + (syncpt - syncstrp->p); DPUTS(testptr > matchpt, "BUG: EXCSYNC failed"); savchar = *testptr; *testptr = '\0'; @@ -1625,7 +1646,7 @@ patmatch(char *prog) */ patglobflags &= ~0xff; errsfound = 0; - opnd = P_OPERAND(next) + sizeof(char *); + opnd = P_OPERAND(next) + 1; if (P_OP(next) == P_EXCLUDP && pathpos) { /* * top level exclusion with a file, @@ -1666,8 +1687,8 @@ patmatch(char *prog) patglobflags = savglobflags; errsfound = saverrsfound; } - zfree((char *)*syncstrp, patinlen); - *syncstrp = oldsyncstr; + zfree((char *)syncstrp->p, patinlen); + syncstrp->p = oldsyncstr; if (ret) { patinput = matchpt; errsfound = matchederrs; @@ -1678,7 +1699,8 @@ patmatch(char *prog) ; } else { int ret = 1, pfree = 0; - unsigned char **ptrp = NULL, *ptr; + Upat ptrp = NULL; + unsigned char *ptr; if (P_OP(scan) == P_WBRANCH) { /* * This is where we make sure that we are not @@ -1696,15 +1718,14 @@ patmatch(char *prog) * there is already a 1, then the test fails. */ opnd = P_OPERAND(scan); - ptrp = (unsigned char **)opnd; - opnd += sizeof(unsigned char *); - if (!*ptrp) { + ptrp = opnd++; + if (!ptrp->p) { if (!patinlen) patinlen = strlen((char *)patinstart)+1; - *ptrp = (unsigned char *)zcalloc(patinlen); + ptrp->p = (unsigned char *)zcalloc(patinlen); pfree = 1; } - ptr = *ptrp + (patinput - patinstart); + ptr = ptrp->p + (patinput - patinstart); /* * Without approximation, this is just a @@ -1712,7 +1733,7 @@ patmatch(char *prog) * need to know how many errors there were * last time we made the test. If errsfound * is now smaller than it was, hence we can - * maker more approximations in the remaining + * make more approximations in the remaining * code, we continue with the test. * (This is why the max number of errors is * 254, not 255.) @@ -1725,8 +1746,8 @@ patmatch(char *prog) if (ret) ret = patmatch(opnd); if (pfree) { - zfree((char *)*ptrp, patinlen); - *ptrp = NULL; + zfree((char *)ptrp->p, patinlen); + ptrp->p = NULL; } if (ret) return 1; @@ -1769,12 +1790,38 @@ patmatch(char *prog) return 0; no = patrepeat(P_OPERAND(scan)); } - /* Lookahead to avoid useless matches */ - if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) - nextch = STOUC(UNMETA(P_OPERAND(next))); - else - nextch = -1; min = (op == P_TWOHASH) ? 1 : 0; + /* + * Lookahead to avoid useless matches. This is not possible + * with approximation. + */ + if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) { + char *nextop = (char *)P_OPERAND(next); + /* + * If that P_EXACTLY is last (common in simple patterns, + * such as *.c), then it can be only be matched at one + * point in the test string, so record that. + */ + if (P_OP(PATNEXT(next)) == P_END && + !(patflags & PAT_NOANCH)) { + int ptlen = strlen(patinput); + int oplen = strlen(nextop); + /* Are we in the right range? */ + if (oplen > strlen(min ? METANEXT(start) : start) || + oplen < ptlen) + return 0; + /* Yes, just position appropriately and test. */ + patinput += ptlen - oplen; + if (patinput > start && patinput[-1] == Meta) { + /* doesn't align properly, no go */ + return 0; + } + /* Continue loop with P_EXACTLY test. */ + break; + } + nextch = STOUC(UNMETA(nextop)); + } else + nextch = -1; save = patinput; savglobflags = patglobflags; saverrsfound = errsfound; @@ -2010,13 +2057,13 @@ patmatchrange(char *range, int ch) /* repeatedly match something simple and say how many times */ /**/ -static int patrepeat(char *p) +static int patrepeat(Upat p) { int count = 0, tch, inch; char *scan, *opnd; scan = patinput; - opnd = P_OPERAND(p); + opnd = (char *)P_OPERAND(p); switch(P_OP(p)) { #ifdef DEBUG @@ -2067,7 +2114,8 @@ static int patrepeat(char *p) static void patdump(Patprog r) { - char *s, *next, *codestart, *base, op = P_EXACTLY; + char *s, *base, op = P_EXACTLY; + Upat up, codestart, next; base = (char *)r; s = base + r->startoff; @@ -2075,13 +2123,14 @@ patdump(Patprog r) if (r->flags & PAT_PURES) { printf("STRING:%s\n", (char *)s); } else { - codestart = s; + codestart = (Upat)s; while (op != P_END) { - op = P_OP(s); - printf("%2d%s", s-codestart, patprop(s)); - next = PATNEXT(s); - printf("(%d)", next ? (s-codestart)+(next-s) : 0); - s += sizeof(zalign_t); + up = (Upat)s; + op = P_OP(up); + printf("%2d%s", up-codestart, patprop(up)); + next = PATNEXT(up); + printf("(%d)", next ? next-codestart : 0); + s += sizeof(union upat); if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) { while (*s != '\0') { if (itok(*s)) { @@ -2107,16 +2156,15 @@ patdump(Patprog r) s += sizeof(zrange_t); } } else if (op == P_GFLAGS) { - long *lptr = (long *)s; - printf("%ld, %ld", *lptr & ~0xff, *lptr & 0xff); - s += sizeof(long); + printf("%ld, %ld", (++up)->l & ~0xff, (++up)->l & 0xff); + s += sizeof(union upat); } else if (op == P_WBRANCH || op == P_EXCLUDE || op == P_EXCLUDP) { - s += sizeof(char *); + s += sizeof(union upat); } putchar('\n'); - s = base + (((s - base) + sizeof(zalign_t) - 1) & - ~(sizeof(zalign_t) - 1)); + s = base + (((s - base) + sizeof(union upat) - 1) & + ~(sizeof(union upat) - 1)); } } @@ -2146,7 +2194,7 @@ patdump(Patprog r) /**/ static char * -patprop(char *op) +patprop(Upat op) { char *p = NULL; static char buf[50]; |