From b115ca307adf7ef9dca13f6e7ce40768bc0b2b75 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Mon, 18 Oct 2004 11:56:14 +0000 Subject: 20500: Unmetafy patterns where possible and other minor pattern fixes --- Src/pattern.c | 673 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 484 insertions(+), 189 deletions(-) (limited to 'Src/pattern.c') diff --git a/Src/pattern.c b/Src/pattern.c index 13e6ca250..04452187d 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -47,6 +47,26 @@ * * Eagle-eyed readers will notice this is an altered version. Incredibly * sharp-eyed readers might even find bits that weren't altered. + * + * + * And I experienced a sense that, like certain regular + * expressions, seemed to match the day from beginning to end, so + * that I did not need to identify the parenthesised subexpression + * that told of dawn, nor the group of characters that indicated + * the moment when my grandfather returned home with news of + * Swann's departure for Paris; and the whole length of the month + * of May, as if matched by a closure, fitted into the buffer of my + * life with no sign of overflowing, turning the days, like a + * procession of insects that could consist of this or that + * species, into a random and unstructured repetition of different + * sequences, anchored from the first day of the month to the last + * in the same fashion as the weeks when I knew I would not see + * Gilberte and would search in vain for any occurrences of the + * string in the avenue of hawthorns by Tansonville, without my + * having to delimit explicitly the start or finish of the pattern. + * + * M. Proust, "In Search of Lost Files", + * bk I, "The Walk by Bourne's Place". */ #include "zsh.mdh" @@ -78,7 +98,7 @@ typedef union upat *Upat; #define P_EXCSYNC 0x01 /* no Test if following exclude already failed */ #define P_EXCEND 0x02 /* no Test if exclude matched orig branch */ #define P_BACK 0x03 /* no Match "", "next" ptr points backward. */ -#define P_EXACTLY 0x04 /* str Match this string. */ +#define P_EXACTLY 0x04 /* lstr Match this string. */ #define P_NOTHING 0x05 /* no Match empty string. */ #define P_ONEHASH 0x06 /* node Match this (simple) thing 0 or more times. */ #define P_TWOHASH 0x07 /* node Match this (simple) thing 1 or more times. */ @@ -103,10 +123,14 @@ typedef union upat *Upat; /* spaces left for P_OPEN+n,... for backreferences */ #define P_OPEN 0x80 /* no Mark this point in input as start of n. */ #define P_CLOSE 0x90 /* no Analogous to OPEN. */ -/* zl is the range type zrange_t: may be zlong or unsigned long - * char is a single char - * uc* is a pointer to unsigned char, used at run time and initialised +/* + * no no argument + * zr the range type zrange_t: may be zlong or unsigned long + * char a single char + * uc* a pointer to unsigned char, used at run time and initialised * to NULL. + * str null-terminated, metafied string + * lstr length as long then string, not null-terminated, unmetafied. */ /* @@ -179,16 +203,24 @@ typedef union upat *Upat; #define P_ISEXCLUDE(p) (((p)->l & 0x30) == 0x30) #define P_NOTDOT(p) ((p)->l & 0x40) +/* Specific to lstr type, i.e. P_EXACTLY. */ +#define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */ +#define P_LS_STR(p) ((char *)((p) + 2)) + /* Flags needed when pattern is executed */ #define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */ #define P_HSTART 0x02 /* Starts with # or ##'d pattern. */ #define P_PURESTR 0x04 /* Can be matched with a strcmp */ -/* Next character after one which may be a Meta (x is any char *) */ -#define METANEXT(x) (*(x) == Meta ? (x)+2 : (x)+1) /* * Increment pointer which may be on a Meta (x is a pointer variable), * returning the incremented value (i.e. like pre-increment). + * + * In future the following will need to refer to metafied multibyte + * characters. References to invidual characters are not turned + * into a macro when the characters is metafied (c.f. CHARREF() + * below then the character is not metafied) and will need tracking + * down. */ #define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1) /* @@ -254,13 +286,18 @@ static int patglobflags; /* globbing flags & approx */ /* Add n more characters, ensuring there is enough space. */ +enum { + PA_NOALIGN = 1, + PA_UNMETA = 2 +}; + /**/ static void -patadd(char *add, int ch, long n, int noalgn) +patadd(char *add, int ch, long n, int paflags) { - /* Make sure everything gets aligned unless we get noalgn. */ + /* Make sure everything gets aligned unless we get PA_NOALIGN. */ long newpatsize = patsize + n; - if (!noalgn) + if (!(paflags & PA_NOALIGN)) newpatsize = (newpatsize + sizeof(union upat) - 1) & ~(sizeof(union upat) - 1); if (patalloc < newpatsize) { @@ -272,8 +309,25 @@ patadd(char *add, int ch, long n, int noalgn) } patsize = newpatsize; if (add) { - while (n--) - *patcode++ = *add++; + if (paflags & PA_UNMETA) { + /* + * Unmetafy and untokenize the string as we go. + * The Meta characters in add aren't counted in n. + */ + while (n--) { + if (itok(*add)) + *patcode++ = ztokens[*add++ - Pound]; + else if (*add == Meta) { + add++; + *patcode++ = *add++ ^ 32; + } else { + *patcode++ = *add++; + } + } + } else { + while (n--) + *patcode++ = *add++; + } } else *patcode++ = ch; patcode = patout + patsize; @@ -297,13 +351,22 @@ patcompstart(void) patglobflags = GF_IGNCASE; } -/* Top level pattern compilation subroutine */ +/* + * Top level pattern compilation subroutine + * exp is a null-terminated, metafied string. + * inflags is an or of some PAT_* flags. + * endexp, if non-null, is set to a pointer to the end of the + * part of exp which was compiled. This is used when + * compiling patterns for directories which must be + * matched recursively. + */ /**/ mod_export Patprog patcompile(char *exp, int inflags, char **endexp) { - int flags = 0, len = 0; + int flags = 0; + long len = 0; long startoff; Upat pscan; char *lng, *strp = NULL; @@ -324,7 +387,7 @@ patcompile(char *exp, int inflags, char **endexp) * in struct is actual count of parentheses. */ patnpar = 1; - patflags = inflags & ~PAT_PURES; + patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP); patendseg = endseg; patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM; @@ -366,7 +429,12 @@ patcompile(char *exp, int inflags, char **endexp) if (patcompswitch(0, &flags) == 0) return NULL; } else { - /* Yes, copy the string and skip compilation altogether */ + /* + * Yes, copy the string, and skip compilation altogether. + * Null terminate for the benefit of globbing. + * Leave metafied both for globbing and for our own + * efficiency. + */ patparse = strp; len = strp - exp; patadd(exp, 0, len + 1, 0); @@ -404,19 +472,52 @@ patcompile(char *exp, int inflags, char **endexp) for (; pscan; pscan = next) { next = PATNEXT(pscan); if (P_OP(pscan) == P_EXACTLY) { - char *opnd = (char *)P_OPERAND(pscan); - while ((*dst = *opnd++)) - dst++; + char *opnd = P_LS_STR(pscan), *mtest; + long oplen = P_LS_LEN(pscan), ilen; + int nmeta = 0; + /* + * Unfortunately we unmetafied the string + * and we need to put any metacharacters + * back now we know it's a pure string. + * This shouldn't happen too often, it's + * just that there are some cases such + * as . and .. in files where we really + * need a pure string even if there are + * pattern characters flying around. + */ + for (mtest = opnd, ilen = oplen; ilen; + mtest++, ilen--) + if (imeta(*mtest)) + nmeta++; + if (nmeta) { + char *oldpatout = patout; + patadd(NULL, 0, nmeta, 0); + /* + * Yuk. + */ + p = (Patprog)patout; + opnd = patout + (opnd - oldpatout); + dst = patout + startoff; + } + + while (oplen--) { + if (imeta(*opnd)) { + *dst++ = Meta; + *dst++ = *opnd ^ 32; + } else { + *dst++ = *opnd++; + } + } } } - *dst++ = '\0'; p->size = dst - patout; - /* patmlen is really strlen, don't include null byte */ - p->patmlen = p->size - startoff - 1; + /* patmlen is really strlen. We don't need a null. */ + p->patmlen = p->size - startoff; } else { /* starting point info */ - if (P_OP(pscan) == P_EXACTLY && !p->globflags) - p->patstartch = *(char *)P_OPERAND(pscan); + if (P_OP(pscan) == P_EXACTLY && !p->globflags && + P_LS_LEN(pscan)) + p->patstartch = *P_LS_STR(pscan); /* * Find the longest literal string in something expensive. * This is itself not all that cheap if we have @@ -427,9 +528,9 @@ patcompile(char *exp, int inflags, char **endexp) len = 0; for (; pscan; pscan = PATNEXT(pscan)) if (P_OP(pscan) == P_EXACTLY && - (int)strlen((char *)P_OPERAND(pscan)) >= len) { - lng = (char *)P_OPERAND(pscan); - len = strlen(lng); + P_LS_LEN(pscan) >= len) { + lng = P_LS_STR(pscan); + len = P_LS_LEN(pscan); } if (lng) { p->mustoff = lng - patout; @@ -844,9 +945,9 @@ static long patcomppiece(int *flagp) { long starter = 0, next, pound, op; - int flags, flags2, kshchar, len, ch, patch; + int flags, flags2, kshchar, len, ch, patch, nmeta; union upat up; - char *nptr, *str0, cbuf[2]; + char *nptr, *str0, *ptr, cbuf[2]; zrange_t from, to; flags = 0; @@ -881,6 +982,9 @@ patcomppiece(int *flagp) } if (patparse > str0) { + long slen = patparse - str0; + int morelen; + /* Ordinary string: cancel kshchar lookahead */ kshchar = '\0'; /* @@ -889,25 +993,40 @@ patcomppiece(int *flagp) flags |= P_PURESTR; DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece."); /* more than one character matched? */ - len = str0 + (*str0 == Meta ? 2 : 1) < patparse; + morelen = str0 + (*str0 == Meta ? 2 : 1) < patparse; /* * If we have more than one character, a following hash only * applies to the last, so decrement. */ - if (isset(EXTENDEDGLOB) && *patparse == Pound && len) + if (isset(EXTENDEDGLOB) && *patparse == Pound && morelen) patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1; /* * If len is 1, we can't have an active # following, so doesn't * matter that we don't make X in `XX#' simple. */ - if (!len) + if (!morelen) flags |= P_SIMPLE; starter = patnode(P_EXACTLY); - /* add enough space including null byte */ - len = patparse - str0; - patadd(str0, 0, len + 1, 0); - nptr = (char *)P_OPERAND((Upat)patout + starter); - nptr[len] = '\0'; + + /* Get length of string without metafication. */ + nmeta = 0; + for (ptr = str0; ptr < patparse; ptr++) { + if (*ptr == Meta) { + nmeta++; + ptr++; + } + } + slen = (patparse - str0) - nmeta; + /* First add length, which is a long */ + patadd((char *)&slen, 0, sizeof(long), 0); + /* + * Then the string, not null terminated. + * Unmetafy and untokenize; pass the final length, + * which is what we need to allocate, i.e. not including + * a count for each Meta in the string. + */ + patadd(str0, 0, slen, PA_UNMETA); + nptr = P_LS_STR((Upat)patout + starter); /* * It's much simpler to turn off pure string mode for * any case-insensitive or approximate matching; usually, @@ -918,13 +1037,13 @@ patcomppiece(int *flagp) * ..(#a1).. (i.e. the (#a1) has no effect), but if you're * going to write funny patterns, you get no sympathy from me. */ - if (patglobflags && - (!(patflags & PAT_FILE) || (strcmp(nptr, ".") && - strcmp(nptr, "..")))) - flags &= ~P_PURESTR; - for (; *nptr; METAINC(nptr)) - if (itok(*nptr)) - *nptr = ztokens[*nptr - Pound]; + if (patglobflags) { + if (!(patflags & PAT_FILE)) + flags &= ~P_PURESTR; + else if (!(nptr[0] == '.' && + (slen == 1 || (nptr[1] == '.' && slen == 2)))) + flags &= ~P_PURESTR; + } } else { if (kshchar) patparse++; @@ -950,7 +1069,7 @@ patcomppiece(int *flagp) starter = patnode(P_ANYOF); if (*patparse == Outbrack) { patparse++; - patadd(NULL, ']', 1, 1); + patadd(NULL, ']', 1, PA_NOALIGN); } while (*patparse && *patparse != Outbrack) { /* Meta is not a token */ @@ -990,7 +1109,7 @@ patcomppiece(int *flagp) ch = PP_UNKWN; patparse = nptr + 2; if (ch != PP_UNKWN) - patadd(NULL, STOUC(Meta+ch), 1, 1); + patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN); continue; } if (itok(*patparse)) { @@ -1003,15 +1122,17 @@ patcomppiece(int *flagp) patparse++; if (*patparse == '-' && patparse[1] != Outbrack) { - patadd(NULL, STOUC(Meta+PP_RANGE), 1, 1); - patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); + patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN); + patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN); if (itok(*++patparse)) { - patadd(0, STOUC(ztokens[*patparse - Pound]), 1, 1); + patadd(0, STOUC(ztokens[*patparse - Pound]), 1, + PA_NOALIGN); } else - patadd(patparse, 0, (*patparse == Meta) ? 2 : 1, 1); + patadd(patparse, 0, (*patparse == Meta) ? 2 : 1, + PA_NOALIGN); METAINC(patparse); } else - patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1); + patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN); } if (*patparse != Outbrack) return 0; @@ -1309,19 +1430,11 @@ static void patoptail(long p, long val) */ static char *patinstart; /* Start of input string */ static char *patinend; /* End of input string */ +static char *patinput; /* String input pointer */ static char *patinpath; /* Full path for use with ~ exclusions */ - -/**/ -char *patinput; /* String input pointer */ - -/* - * Offset of string at which we are trying to match. - * This is added in to the positions recorded in patbeginp and patendp - * when we are looking for substrings. Currently this only happens - * in the parameter substitution code. - */ -/**/ -int patoffset; +static int patinlen; /* Length of last successful match. + * Includes count of Meta characters. + */ static char *patbeginp[NSUBEXP]; /* Pointer to backref beginnings */ static char *patendp[NSUBEXP]; /* Pointer to backref ends */ @@ -1329,9 +1442,24 @@ static int parsfound; /* parentheses (with backrefs) found */ static int globdots; /* Glob initial dots? */ +/* + * Macros which are currently trivial but are likely to be less + * so when we handle multibyte characters. They operate on + * umetafied strings. + */ + +/* Get a character from the start point in a string */ +#define CHARREF(x) (STOUC(*x)) +/* Get a pointer to the next character */ +#define CHARNEXT(x) (x+1) +/* Increment a pointer past the current character. */ +#define CHARINC(x) (x++) +/* Counter the number of characters between two pointers, largest first */ +#define CHARSUB(x,y) (x-y) + /* * The following need to be accessed in the globbing scanner for - * a multi-component file path. See horror story there. + * a multi-component file path. See horror story in glob.c. */ /**/ int errsfound; /* Total error count so far */ @@ -1347,23 +1475,59 @@ pattrystart(void) errsfound = 0; } +/* + * Test prog against null-terminated, metafied string. + */ + /**/ mod_export int pattry(Patprog prog, char *string) { - return pattryrefs(prog, string, NULL, NULL, NULL); + return pattryrefs(prog, string, -1, 0, NULL, NULL, NULL); +} + +/* + * Test prog against string of given length, no null termination + * but still metafied at this point. offset gives an offset + * to include in reported match indices + */ + +/**/ +mod_export int +pattrylen(Patprog prog, char *string, int len, int offset) +{ + return pattryrefs(prog, string, len, offset, NULL, NULL, NULL); } -/* The last three arguments are used to report the positions for the +/* + * Test prog against string with given length stringlen, which + * may be -1 to indicate a null-terminated string. The input + * string is metafied; the length is the raw string length, not the + * number of possibly metafied characters. + * + * offset is the position in the original string (not seen by + * the patter module) at which we are trying to match. + * This is added in to the positions recorded in patbeginp and patendp + * when we are looking for substrings. Currently this only happens + * in the parameter substitution code. + * + * Note this is a character offset, i.e. a metafied character + * counts as 1. + * + * The last three arguments are used to report the positions for the * backreferences. On entry, *nump should contain the maximum number - * positions to report. */ + * of positions to report. In this case the match, mbegin, mend + * arrays are not altered. + */ /**/ mod_export int -pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) +pattryrefs(Patprog prog, char *string, int stringlen, int patoffset, + int *nump, int *begp, int *endp) { - int i, maxnpos = 0, ret; - char **sp, **ep; + int i, maxnpos = 0, ret, needfullpath, unmetalen, unmetalenp; + int origlen; + char **sp, **ep, *tryalloced, *ptr; char *progstr = (char *)prog + prog->startoff; if (nump) { @@ -1374,12 +1538,87 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) if (*string == Nularg) string++; - patinstart = patinput = string; - patinend = patinstart + strlen(patinstart); + if (stringlen < 0) + stringlen = strlen(string); + origlen = stringlen; + + patflags = prog->flags; + /* + * For a top-level ~-exclusion, we will need the full + * path to exclude, so copy the path so far and append the + * current test string. + */ + needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos; + + /* Get the length of the full string when unmetafied. */ + unmetalen = ztrsub(string + stringlen, string); + if (needfullpath) + unmetalenp = ztrsub(pathbuf + pathpos, pathbuf); + else + unmetalenp = 0; + + DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)), + "rum sort of file exclusion"); + /* + * Partly for efficiency, and partly for the convenience of + * globbing, we don't unmetafy pure string patterns, and + * there's no reason to if the pattern is just a *. + */ + if (!(patflags & (PAT_PURES|PAT_ANY)) + && (needfullpath || unmetalen != stringlen)) { + /* + * We need to copy if we need to prepend the path so far + * (in which case we copy both chunks), or if we have + * Meta characters. + */ + char *dst; + int icopy, ncopy; + + dst = tryalloced = zalloc(unmetalen + unmetalenp); + + if (needfullpath) { + /* loop twice, copy path buffer first time */ + ptr = pathbuf; + ncopy = unmetalenp; + } else { + /* just loop once, copy string with unmetafication */ + ptr = string; + ncopy = unmetalen; + } + for (icopy = 0; icopy < 2; icopy++) { + for (i = 0; i < ncopy; i++) { + if (*ptr == Meta) { + ptr++; + *dst++ = *ptr++ ^ 32; + } else { + *dst++ = *ptr++; + } + } + if (!needfullpath) + break; + /* next time append test string to path so far */ + ptr = string; + ncopy = unmetalen; + } + + if (needfullpath) { + patinstart = tryalloced + unmetalenp; + patinpath = tryalloced; + } else { + patinstart = tryalloced; + patinpath = NULL; + } + stringlen = unmetalen; + } else { + patinstart = string; + tryalloced = patinpath = NULL; + } + + patinend = patinstart + stringlen; /* * From now on we do not require NULL termination of - * the test string. It is still metafied, as is string - * data in the prog. + * the test string. There should also be no more references + * to the variable string. */ if (prog->flags & (PAT_PURES|PAT_ANY)) { @@ -1399,11 +1638,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) * Testing a pure string. See if initial * components match. */ - int lendiff = (patinend - patinstart) - prog->patmlen; + int lendiff = stringlen - prog->patmlen; if (lendiff < 0) { /* No, the pattern string is too long. */ ret = 0; - } else if (!memcmp(progstr, string, prog->patmlen)) { + } else if (!memcmp(progstr, patinstart, prog->patmlen)) { /* * Initial component matches. Matches either * if lengths are the same or we are not anchored @@ -1420,47 +1659,61 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) * For files, we won't match initial "."s unless * glob_dots is set. */ - if ((prog->flags & PAT_NOGLD) && *string == '.') - return 0; - /* in case used for ${..#..} etc. */ - patinput = string + prog->patmlen; - /* if matching files, must update globbing flags */ - patglobflags = prog->globend; - return 1; - } else - return 0; + if ((prog->flags & PAT_NOGLD) && *patinstart == '.') { + ret = 0; + } else { + /* + * Remember the length in case used for ${..#..} etc. + * In this case, we didn't unmetafy the string. + */ + patinlen = (int)prog->patmlen; + /* if matching files, must update globbing flags */ + patglobflags = prog->globend; + } + } + + if (tryalloced) + zfree(tryalloced, unmetalen + unmetalenp); + + return ret; } else { /* * Test for a `must match' string, unless we're scanning for a match * in which case we don't need to do this each time. */ + ret = 1; if (!(prog->flags & PAT_SCAN) && prog->mustoff) { char *testptr; /* start pointer into test string */ char *teststop; /* last point from which we can match */ char *patptr = (char *)prog + prog->mustoff; - int patlen = strlen(patptr); + int patlen = prog->patmlen; int found = 0; - if (patlen > patinend - patinstart) { + if (patlen > stringlen) { /* Too long, can't match. */ - return 0; - } - teststop = patinend - patlen; + ret = 0; + } else { + teststop = patinend - patlen; - for (testptr = patinstart; testptr <= teststop; testptr++) - { - if (!memcmp(testptr, patptr, patlen)) { - found = 1; - break; + for (testptr = patinstart; testptr <= teststop; testptr++) + { + if (!memcmp(testptr, patptr, patlen)) { + found = 1; + break; + } } - } - if (!found) - return 0; + if (!found) + ret = 0; + } + } + if (!ret) { + if (tryalloced) + zfree(tryalloced, unmetalen + unmetalenp); + return 0; } - patflags = prog->flags; patglobflags = prog->globflags; if (!(patflags & PAT_FILE)) { forceerrs = -1; @@ -1469,26 +1722,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) globdots = !(patflags & PAT_NOGLD); parsfound = 0; - if ((patflags & PAT_HAS_EXCLUDP) && pathpos) { - /* - * For a top-level ~-exclusion, we will need the full - * path to exclude, so copy the path so far and append the - * current test string. - * - * There are some advantages in making patinstart etc. - * point into this new string; however, that gets confusing - * if we need patinput outside this file. That's - * not likely for files but I don't think it's worth - * the risk. - */ - int len = patinend - patinstart; - - patinpath = (char *)zalloc(pathpos + len); - memcpy(patinpath, pathbuf, pathpos); - memcpy(patinpath + pathpos, patinstart, len); - } else { - patinpath = NULL; - } + patinput = patinstart; if (patmatch((Upat)progstr)) { /* @@ -1496,6 +1730,23 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) * failed, so set it now */ patglobflags = prog->globend; + + /* + * Record length of successful match, including Meta + * characters. Do it here so that patmatchlen() can return + * it even if we delete the pattern strings. + */ + patinlen = patinput - patinstart; + /* + * Optimization: if we didn't find any Meta characters + * to begin with, we don't need to look for them now. + */ + if (unmetalen != origlen) { + for (ptr = patinstart; ptr < patinput; ptr++) + if (imeta(*ptr)) + patinlen++; + } + /* * Should we clear backreferences and matches on a failed * match? @@ -1504,15 +1755,18 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) /* * m flag: for global match. This carries no overhead * in the pattern matching part. + * + * Remember the test pattern is already unmetafied. */ char *str; - int mlen = ztrsub(patinput, patinstart); + int mlen = CHARSUB(patinput, patinstart); - str = ztrduppfx(patinstart, patinput - patinstart); + str = metafy(patinstart, patinput - patinstart, META_DUP); setsparam("MATCH", str); setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS))); setiparam("MEND", - (zlong)(mlen + patoffset + !isset(KSHARRAYS) - 1)); + (zlong)(mlen + patoffset + + !isset(KSHARRAYS) - 1)); } if (prog->patnpar && nump) { /* @@ -1527,9 +1781,10 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) for (i = 0; i < prog->patnpar && i < maxnpos; i++) { if (parsfound & (1 << i)) { if (begp) - *begp++ = ztrsub(*sp, patinstart) + patoffset; + *begp++ = CHARSUB(*sp, patinstart) + patoffset; if (endp) - *endp++ = ztrsub(*ep, patinstart) + patoffset - 1; + *endp++ = CHARSUB(*ep, patinstart) + patoffset + - 1; } else { if (begp) *begp++ = -1; @@ -1557,7 +1812,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) for (i = 0; i < prog->patnpar; i++) { if (parsfound & (1 << i)) { - matcharr[i] = ztrduppfx(*sp, *ep - *sp); + matcharr[i] = metafy(*sp, *ep - *sp, META_DUP); /* * mbegin and mend give indexes into the string * in the standard notation, i.e. respecting @@ -1568,12 +1823,12 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) * corresponds to indexing as ${foo[1,1]}. */ sprintf(numbuf, "%ld", - (long)(ztrsub(*sp, patinstart) + + (long)(CHARSUB(*sp, patinstart) + patoffset + !isset(KSHARRAYS))); mbeginarr[i] = ztrdup(numbuf); sprintf(numbuf, "%ld", - (long)(ztrsub(*ep, patinstart) + + (long)(CHARSUB(*ep, patinstart) + patoffset + !isset(KSHARRAYS) - 1)); mendarr[i] = ztrdup(numbuf); @@ -1593,19 +1848,31 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) setaparam("mbegin", mbeginarr); setaparam("mend", mendarr); } + ret = 1; } else ret = 0; - if (patinpath) { - zfree(patinpath, pathpos + (patinend - patinstart)); - patinpath = NULL; - } + if (tryalloced) + zfree(tryalloced, unmetalen + unmetalenp); return ret; } } +/* + * Return length of previous succesful match. This is + * in metafied bytes, i.e. includes a count of Meta characters. + * Unusual and futile attempt at modular encapsulation. + */ + +/**/ +int +patmatchlen(void) +{ + return patinlen; +} + /* * Match literal characters with case insensitivity test: the first * comes from the input string, the second the current pattern. @@ -1627,8 +1894,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp) * exactpos is used to remember how far down an exact string we have * matched, if we are doing approximation and can therefore redo from * the same point; we never need to otherwise. + * + * exactend is a pointer to the end of the string, which isn't + * null-terminated. */ -static char *exactpos; +static char *exactpos, *exactend; /* * Main matching routine. @@ -1643,7 +1913,7 @@ patmatch(Upat prog) { /* Current and next nodes */ Upat scan = prog, next, opnd; - char *start, *save, *chrop, *compend; + char *start, *save, *chrop, *chrend, *compend; int savglobflags, op, no, min, nextch, fail = 0, saverrsfound; zrange_t from, to, comp; @@ -1659,45 +1929,52 @@ patmatch(Upat prog) if (patinput == patinend) fail = 1; else - METAINC(patinput); + CHARINC(patinput); break; case P_EXACTLY: /* * acts as nothing if *chrop is null: this is used by * approx code. */ - chrop = exactpos ? exactpos : (char *)P_OPERAND(scan); + if (exactpos) { + chrop = exactpos; + chrend = exactend; + } else { + chrop = P_LS_STR(scan); + chrend = chrop + P_LS_LEN(scan); + } exactpos = NULL; - while (*chrop && patinput < patinend) { - int chin = STOUC(UNMETA(patinput)); - int chpa = STOUC(UNMETA(chrop)); + while (chrop < chrend && patinput < patinend) { + int chin = CHARREF(patinput); + int chpa = CHARREF(chrop); if (!CHARMATCH(chin, chpa)) { fail = 1; break; } - METAINC(chrop); - METAINC(patinput); + CHARINC(chrop); + CHARINC(patinput); } - if (*chrop) { + if (chrop < chrend) { exactpos = chrop; + exactend = chrend; fail = 1; } break; case P_ANYOF: if (patinput == patinend || !patmatchrange((char *)P_OPERAND(scan), - STOUC(UNMETA(patinput)))) + CHARREF(patinput))) fail = 1; else - METAINC(patinput); + CHARINC(patinput); break; case P_ANYBUT: if (patinput == patinend || patmatchrange((char *)P_OPERAND(scan), - STOUC(UNMETA(patinput)))) + CHARREF(patinput))) fail = 1; else - METAINC(patinput); + CHARINC(patinput); break; case P_NUMRNG: case P_NUMFROM: @@ -1771,7 +2048,8 @@ patmatch(Upat prog) return 1; } if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*(char *)P_OPERAND(next))) && + (!P_LS_LEN(next) || + !idigit(STOUC(*P_LS_STR(next)))) && !(patglobflags & 0xff)) return 0; patinput = --save; @@ -1791,7 +2069,7 @@ patmatch(Upat prog) case P_NUMANY: /* This is <->: any old set of digits, don't bother comparing */ start = patinput; - while (patinput < patinend && idigit(STOUC(*patinput))) + while (patinput < patinend && idigit(CHARREF(patinput))) patinput++; save = patinput; no = 0; @@ -1799,7 +2077,8 @@ patmatch(Upat prog) if (patmatch(next)) return 1; if (!no && P_OP(next) == P_EXACTLY && - !idigit(STOUC(*(char *)P_OPERAND(next))) && + (!P_LS_LEN(next) || + !idigit(CHARREF(P_LS_STR(next)))) && !(patglobflags & 0xff)) return 0; patinput = --save; @@ -1963,7 +2242,7 @@ patmatch(Upat prog) origpatinend = patinend; while ((ret = patmatch(P_OPERAND(scan)))) { unsigned char *syncpt; - char *savpatinstart, *savpatinend; + char *savpatinstart; int savforce = forceerrs; int savpatflags = patflags, synclen; forceerrs = -1; @@ -1998,7 +2277,6 @@ patmatch(Upat prog) patflags |= PAT_NOTEND; } savpatinstart = patinstart; - savpatinend = patinend; next = PATNEXT(scan); while (next && P_ISEXCLUDE(next)) { patinput = save; @@ -2013,14 +2291,15 @@ patmatch(Upat prog) opnd = P_OPERAND(next) + 1; if (P_OP(next) == P_EXCLUDP && patinpath) { /* - * top level exclusion with a file, + * Top level exclusion with a file, * applies to whole path so add the - * segments already matched + * segments already matched. + * We copied these in front of the + * test pattern, so patinend doesn't + * need moving. */ DPUTS(patinput != patinstart, "BUG: not at start excluding path"); - patinend = patinpath + pathpos + - (patinend - patinstart); patinput = patinstart = patinpath; } if (patmatch(opnd)) { @@ -2036,7 +2315,6 @@ patmatch(Upat prog) patinput = savpatinstart + (patinput - patinstart); patinstart = savpatinstart; - patinend = savpatinend; } if (!ret) break; @@ -2146,7 +2424,7 @@ patmatch(Upat prog) /* Note that no counts possibly metafied characters */ start = patinput; if (op == P_STAR) { - for (no = 0; patinput < patinend; METAINC(patinput)) + for (no = 0; patinput < patinend; CHARINC(patinput)) no++; /* simple optimization for reasonably common case */ if (P_OP(next) == P_END) @@ -2156,7 +2434,7 @@ patmatch(Upat prog) "BUG: wrong backtracking with approximation."); if (!globdots && P_NOTDOT(P_OPERAND(scan)) && patinput == patinstart && patinput < patinend && - *patinput == '.') + CHARREF(patinput) == '.') return 0; no = patrepeat(P_OPERAND(scan)); } @@ -2165,8 +2443,9 @@ patmatch(Upat prog) * Lookahead to avoid useless matches. This is not possible * with approximation. */ - if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) { - char *nextop = (char *)P_OPERAND(next); + if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) && + !(patglobflags & 0xff)) { + char *nextop = P_LS_STR(next); /* * If that P_EXACTLY is last (common in simple patterns, * such as *.c), then it can be only be matched at one @@ -2175,21 +2454,20 @@ patmatch(Upat prog) if (P_OP(PATNEXT(next)) == P_END && !(patflags & PAT_NOANCH)) { int ptlen = patinend - patinput; - int oplen = strlen(nextop); - int lenmatch = patinend - (min ? METANEXT(start) : start); + int lenmatch = patinend - (min ? CHARNEXT(start) : start); /* Are we in the right range? */ - if (oplen > lenmatch || oplen < ptlen) + if (P_LS_LEN(next) > lenmatch || P_LS_LEN(next) < ptlen) return 0; /* Yes, just position appropriately and test. */ - patinput += ptlen - oplen; - if (patinput > start && patinput[-1] == Meta) { - /* doesn't align properly, no go */ - return 0; - } + patinput += ptlen - P_LS_LEN(next); + /* + * Here we will need to be careful that patinput is not + * in the middle of a multibyte character. + */ /* Continue loop with P_EXACTLY test. */ break; } - nextch = STOUC(UNMETA(nextop)); + nextch = CHARREF(nextop); } else nextch = -1; save = patinput; @@ -2199,14 +2477,17 @@ patmatch(Upat prog) int charmatch_cache; if (nextch < 0 || (patinput < patinend && - CHARMATCH_EXPR(STOUC(UNMETA(patinput)), nextch))) { + CHARMATCH_EXPR(CHARREF(patinput), nextch))) { if (patmatch(next)) return 1; } no--; save--; - if (save > start && save[-1] == Meta) - save--; + /* + * Here we will need to make sure save is + * decremented properly to the start of + * the preceeding multibyte character. + */ patinput = save; patglobflags = savglobflags; errsfound = saverrsfound; @@ -2270,7 +2551,7 @@ patmatch(Upat prog) /* Try omitting a character from the input string */ if (patinput < patinend) { - METAINC(patinput); + CHARINC(patinput); /* If we are not on an exact match, then this is * our last gasp effort, so we can optimize out * the recursive call. @@ -2285,11 +2566,11 @@ patmatch(Upat prog) char *nextexact = savexact; DPUTS(!savexact || !*savexact, "BUG: exact match has not set exactpos"); - METAINC(nextexact); + CHARINC(nextexact); if (save < patinend) { char *nextin = save; - METAINC(nextin); + CHARINC(nextin); patglobflags = savglobflags; errsfound = saverrsfound; exactpos = savexact; @@ -2299,18 +2580,18 @@ patmatch(Upat prog) * exactpos */ if (save < patinend && nextin < patinend && - *nextexact) { - int cin0 = UNMETA(save); - int cpa0 = UNMETA(exactpos); - int cin1 = UNMETA(nextin); - int cpa1 = UNMETA(nextexact); + nextexact < exactend) { + int cin0 = CHARREF(save); + int cpa0 = CHARREF(exactpos); + int cin1 = CHARREF(nextin); + int cpa1 = CHARREF(nextexact); if (CHARMATCH(cin0, cpa1) && CHARMATCH(cin1, cpa0)) { patinput = nextin; - METAINC(patinput); + CHARINC(patinput); exactpos = nextexact; - METAINC(exactpos); + CHARINC(exactpos); if (patmatch(scan)) return 1; @@ -2333,12 +2614,13 @@ patmatch(Upat prog) exactpos = savexact; } + DPUTS(exactpos == exactend, "approximating too far"); /* * Try moving up the exact match pattern. * This must be the last attempt, so just loop * instead of calling recursively. */ - METAINC(exactpos); + CHARINC(exactpos); continue; } } @@ -2358,6 +2640,11 @@ patmatchrange(char *range, int ch) { int r1, r2; + /* + * Careful here: unlike other strings, range is a NULL-terminated, + * metafied string, because we need to treat the Posix and hyphenated + * ranges specially. + */ for (; *range; range++) { if (imeta(STOUC(*range))) { switch (STOUC(*range)-STOUC(Meta)) { @@ -2459,23 +2746,24 @@ static int patrepeat(Upat p) break; #endif case P_EXACTLY: - tch = STOUC(UNMETA(opnd)); + DPUTS(P_LS_LEN(p) != 1, "closure following more than one character"); + tch = CHARREF(P_LS_STR(p)); while (scan < patinend && - CHARMATCH_EXPR(STOUC(UNMETA(scan)), tch)) { + CHARMATCH_EXPR(CHARREF(scan), tch)) { count++; - METAINC(scan); + CHARINC(scan); } break; case P_ANYOF: - while (scan < patinend && patmatchrange(opnd, STOUC(UNMETA(scan)))) { + while (scan < patinend && patmatchrange(opnd, CHARREF(scan))) { count++; - METAINC(scan); + CHARINC(scan); } break; case P_ANYBUT: - while (scan < patinend && !patmatchrange(opnd, STOUC(UNMETA(scan)))) { + while (scan < patinend && !patmatchrange(opnd, CHARREF(scan))) { count++; - METAINC(scan); + CHARINC(scan); } break; #ifdef DEBUG @@ -2528,7 +2816,14 @@ patdump(Patprog r) next = PATNEXT(up); printf("(%d)", next ? next-codestart : 0); s += sizeof(union upat); - if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) { + if (op == P_EXACTLY) { + long llen = *(long *)s; + s += sizeof(long); + while (llen--) { + putchar(CHARREF(s)); + CHARINC(s); + } + } else if (op == P_ANYOF || op == P_ANYBUT) { while (*s != '\0') { if (itok(*s)) { if (*s == Meta + PP_RANGE) { -- cgit 1.4.1