about summary refs log tree commit diff
path: root/Src/pattern.c
diff options
context:
space:
mode:
authorPeter Stephenson <pws@users.sourceforge.net>2004-10-18 11:56:14 +0000
committerPeter Stephenson <pws@users.sourceforge.net>2004-10-18 11:56:14 +0000
commitb115ca307adf7ef9dca13f6e7ce40768bc0b2b75 (patch)
tree64a7e6568c6f082ae52e2e28e342d82a9947ad29 /Src/pattern.c
parentbc704ef05c532cff4a9ae49a22aa6cd35299cd5b (diff)
downloadzsh-b115ca307adf7ef9dca13f6e7ce40768bc0b2b75.tar.gz
zsh-b115ca307adf7ef9dca13f6e7ce40768bc0b2b75.tar.xz
zsh-b115ca307adf7ef9dca13f6e7ce40768bc0b2b75.zip
20500: Unmetafy patterns where possible and other minor pattern fixes
Diffstat (limited to 'Src/pattern.c')
-rw-r--r--Src/pattern.c673
1 files changed, 484 insertions, 189 deletions
diff --git a/Src/pattern.c b/Src/pattern.c
index 13e6ca250..04452187d 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -47,6 +47,26 @@
  *
  * Eagle-eyed readers will notice this is an altered version.  Incredibly
  * sharp-eyed readers might even find bits that weren't altered.
+ *
+ *
+ *      And I experienced a sense that, like certain regular
+ *      expressions, seemed to match the day from beginning to end, so
+ *      that I did not need to identify the parenthesised subexpression
+ *      that told of dawn, nor the group of characters that indicated
+ *      the moment when my grandfather returned home with news of
+ *      Swann's departure for Paris; and the whole length of the month
+ *      of May, as if matched by a closure, fitted into the buffer of my
+ *      life with no sign of overflowing, turning the days, like a
+ *      procession of insects that could consist of this or that
+ *      species, into a random and unstructured repetition of different
+ *      sequences, anchored from the first day of the month to the last
+ *      in the same fashion as the weeks when I knew I would not see
+ *      Gilberte and would search in vain for any occurrences of the
+ *      string in the avenue of hawthorns by Tansonville, without my
+ *      having to delimit explicitly the start or finish of the pattern.
+ *
+ *                                 M. Proust, "In Search of Lost Files",
+ *                                 bk I, "The Walk by Bourne's Place".
  */
 
 #include "zsh.mdh"
@@ -78,7 +98,7 @@ typedef union upat *Upat;
 #define P_EXCSYNC 0x01	/* no   Test if following exclude already failed */
 #define P_EXCEND  0x02	/* no   Test if exclude matched orig branch */
 #define	P_BACK	  0x03	/* no	Match "", "next" ptr points backward. */
-#define	P_EXACTLY 0x04	/* str	Match this string. */
+#define	P_EXACTLY 0x04	/* lstr	Match this string. */
 #define	P_NOTHING 0x05	/* no	Match empty string. */
 #define	P_ONEHASH 0x06	/* node	Match this (simple) thing 0 or more times. */
 #define	P_TWOHASH 0x07	/* node	Match this (simple) thing 1 or more times. */
@@ -103,10 +123,14 @@ typedef union upat *Upat;
 /* spaces left for P_OPEN+n,... for backreferences */
 #define	P_OPEN	  0x80	/* no	Mark this point in input as start of n. */
 #define	P_CLOSE	  0x90	/* no	Analogous to OPEN. */
-/* zl    is the range type zrange_t:  may be zlong or unsigned long
- * char is a single char
- * uc*   is a pointer to unsigned char, used at run time and initialised
+/*
+ * no    no argument
+ * zr    the range type zrange_t:  may be zlong or unsigned long
+ * char  a single char
+ * uc*   a pointer to unsigned char, used at run time and initialised
  *       to NULL.
+ * str   null-terminated, metafied string
+ * lstr  length as long then string, not null-terminated, unmetafied.
  */
 
 /*
@@ -179,16 +203,24 @@ typedef union upat *Upat;
 #define P_ISEXCLUDE(p)	(((p)->l & 0x30) == 0x30)
 #define P_NOTDOT(p)	((p)->l & 0x40)
 
+/* Specific to lstr type, i.e. P_EXACTLY. */
+#define P_LS_LEN(p)	((p)[1].l) /* can be used as lvalue */
+#define P_LS_STR(p)	((char *)((p) + 2))
+
 /* Flags needed when pattern is executed */
 #define P_SIMPLE        0x01	/* Simple enough to be #/## operand. */
 #define P_HSTART        0x02	/* Starts with # or ##'d pattern. */
 #define P_PURESTR	0x04	/* Can be matched with a strcmp */
 
-/* Next character after one which may be a Meta (x is any char *) */
-#define METANEXT(x)	(*(x) == Meta ? (x)+2 : (x)+1)
 /*
  * Increment pointer which may be on a Meta (x is a pointer variable),
  * returning the incremented value (i.e. like pre-increment).
+ *
+ * In future the following will need to refer to metafied multibyte
+ * characters.  References to invidual characters are not turned
+ * into a macro when the characters is metafied (c.f. CHARREF()
+ * below then the character is not metafied) and will need tracking
+ * down.
  */
 #define METAINC(x)	((x) += (*(x) == Meta) ? 2 : 1)
 /*
@@ -254,13 +286,18 @@ static int patglobflags;  /* globbing flags & approx */
 
 /* Add n more characters, ensuring there is enough space. */
 
+enum {
+    PA_NOALIGN = 1,
+    PA_UNMETA  = 2
+};
+
 /**/
 static void
-patadd(char *add, int ch, long n, int noalgn)
+patadd(char *add, int ch, long n, int paflags)
 {
-    /* Make sure everything gets aligned unless we get noalgn. */
+    /* Make sure everything gets aligned unless we get PA_NOALIGN. */
     long newpatsize = patsize + n;
-    if (!noalgn)
+    if (!(paflags & PA_NOALIGN))
 	newpatsize = (newpatsize + sizeof(union upat) - 1) &
 		      ~(sizeof(union upat) - 1);
     if (patalloc < newpatsize) {
@@ -272,8 +309,25 @@ patadd(char *add, int ch, long n, int noalgn)
     }
     patsize = newpatsize;
     if (add) {
-	while (n--)
-	    *patcode++ = *add++;
+	if (paflags & PA_UNMETA) {
+	    /*
+	     * Unmetafy and untokenize the string as we go.
+	     * The Meta characters in add aren't counted in n.
+	     */
+	    while (n--) {
+		if (itok(*add))
+		    *patcode++ = ztokens[*add++ - Pound];
+		else if (*add == Meta) {
+		    add++;
+		    *patcode++ = *add++ ^ 32;
+		} else {
+		    *patcode++ = *add++;
+		}
+	    }
+	} else {
+	    while (n--)
+		*patcode++ = *add++;
+	}
     } else
 	*patcode++ = ch;
     patcode = patout + patsize;
@@ -297,13 +351,22 @@ patcompstart(void)
 	patglobflags = GF_IGNCASE;
 }
 
-/* Top level pattern compilation subroutine */
+/*
+ * Top level pattern compilation subroutine
+ * exp is a null-terminated, metafied string.
+ * inflags is an or of some PAT_* flags.
+ * endexp, if non-null, is set to a pointer to the end of the
+ *   part of exp which was compiled.  This is used when
+ *   compiling patterns for directories which must be
+ *   matched recursively.
+ */
 
 /**/
 mod_export Patprog
 patcompile(char *exp, int inflags, char **endexp)
 {
-    int flags = 0, len = 0;
+    int flags = 0;
+    long len = 0;
     long startoff;
     Upat pscan;
     char *lng, *strp = NULL;
@@ -324,7 +387,7 @@ patcompile(char *exp, int inflags, char **endexp)
      * in struct is actual count of parentheses.
      */
     patnpar = 1;
-    patflags = inflags & ~PAT_PURES;
+    patflags = inflags & ~(PAT_PURES|PAT_HAS_EXCLUDP);
 
     patendseg = endseg;
     patendseglen = isset(EXTENDEDGLOB) ? PATENDSEGLEN_EXT : PATENDSEGLEN_NORM;
@@ -366,7 +429,12 @@ patcompile(char *exp, int inflags, char **endexp)
 	    if (patcompswitch(0, &flags) == 0)
 		return NULL;
 	} else {
-	    /* Yes, copy the string and skip compilation altogether */
+	    /*
+	     * Yes, copy the string, and skip compilation altogether.
+	     * Null terminate for the benefit of globbing.
+	     * Leave metafied both for globbing and for our own
+	     * efficiency.
+	     */
 	    patparse = strp;
 	    len = strp - exp;
 	    patadd(exp, 0, len + 1, 0);
@@ -404,19 +472,52 @@ patcompile(char *exp, int inflags, char **endexp)
 		for (; pscan; pscan = next) {
 		    next = PATNEXT(pscan);
 		    if (P_OP(pscan) == P_EXACTLY) {
-			char *opnd = (char *)P_OPERAND(pscan);
-			while ((*dst = *opnd++))
-			    dst++;
+			char *opnd = P_LS_STR(pscan), *mtest;
+			long oplen = P_LS_LEN(pscan), ilen;
+			int nmeta = 0;
+			/*
+			 * Unfortunately we unmetafied the string
+			 * and we need to put any metacharacters
+			 * back now we know it's a pure string.
+			 * This shouldn't happen too often, it's
+			 * just that there are some cases such
+			 * as . and .. in files where we really
+			 * need a pure string even if there are
+			 * pattern characters flying around.
+			 */
+			for (mtest = opnd, ilen = oplen; ilen;
+			     mtest++, ilen--)
+			    if (imeta(*mtest))
+				nmeta++;
+			if (nmeta) {
+			    char *oldpatout = patout;
+			    patadd(NULL, 0, nmeta, 0);
+			    /*
+			     * Yuk.
+			     */
+			    p = (Patprog)patout;
+			    opnd = patout + (opnd - oldpatout);
+			    dst = patout + startoff;
+			}
+
+			while (oplen--) {
+			    if (imeta(*opnd)) {
+				*dst++ = Meta;
+				*dst++ = *opnd ^ 32;
+			    } else {
+				*dst++ = *opnd++;
+			    }
+			}
 		    }
 		}
-		*dst++ = '\0';
 		p->size = dst - patout;
-		/* patmlen is really strlen, don't include null byte */
-		p->patmlen = p->size - startoff - 1;
+		/* patmlen is really strlen.  We don't need a null. */
+		p->patmlen = p->size - startoff;
 	    } else {
 		/* starting point info */
-		if (P_OP(pscan) == P_EXACTLY && !p->globflags)
-		    p->patstartch = *(char *)P_OPERAND(pscan);
+		if (P_OP(pscan) == P_EXACTLY && !p->globflags &&
+		    P_LS_LEN(pscan))
+		    p->patstartch = *P_LS_STR(pscan);
 		/*
 		 * Find the longest literal string in something expensive.
 		 * This is itself not all that cheap if we have
@@ -427,9 +528,9 @@ patcompile(char *exp, int inflags, char **endexp)
 		    len = 0;
 		    for (; pscan; pscan = PATNEXT(pscan))
 			if (P_OP(pscan) == P_EXACTLY &&
-			    (int)strlen((char *)P_OPERAND(pscan)) >= len) {
-			    lng = (char *)P_OPERAND(pscan);
-			    len = strlen(lng);
+			    P_LS_LEN(pscan) >= len) {
+			    lng = P_LS_STR(pscan);
+			    len = P_LS_LEN(pscan);
 			}
 		    if (lng) {
 			p->mustoff = lng - patout;
@@ -844,9 +945,9 @@ static long
 patcomppiece(int *flagp)
 {
     long starter = 0, next, pound, op;
-    int flags, flags2, kshchar, len, ch, patch;
+    int flags, flags2, kshchar, len, ch, patch, nmeta;
     union upat up;
-    char *nptr, *str0, cbuf[2];
+    char *nptr, *str0, *ptr, cbuf[2];
     zrange_t from, to;
 
     flags = 0;
@@ -881,6 +982,9 @@ patcomppiece(int *flagp)
     }
 
     if (patparse > str0) {
+	long slen = patparse - str0;
+	int morelen;
+
 	/* Ordinary string: cancel kshchar lookahead */
 	kshchar = '\0';
 	/*
@@ -889,25 +993,40 @@ patcomppiece(int *flagp)
 	flags |= P_PURESTR;
 	DPUTS(patparse == str0, "BUG: matched nothing in patcomppiece.");
 	/* more than one character matched? */
-	len = str0 + (*str0 == Meta ? 2 : 1) < patparse;
+	morelen = str0 + (*str0 == Meta ? 2 : 1) < patparse;
 	/*
 	 * If we have more than one character, a following hash only
 	 * applies to the last, so decrement.
 	 */
-	if (isset(EXTENDEDGLOB) && *patparse == Pound && len)
+	if (isset(EXTENDEDGLOB) && *patparse == Pound && morelen)
 	    patparse -= (patparse > str0 + 1 && patparse[-2] == Meta) ? 2 : 1;
 	/*
 	 * If len is 1, we can't have an active # following, so doesn't
 	 * matter that we don't make X in `XX#' simple.
 	 */
-	if (!len)
+	if (!morelen)
 	    flags |= P_SIMPLE;
 	starter = patnode(P_EXACTLY);
-	/* add enough space including null byte */
-	len = patparse - str0;
-	patadd(str0, 0, len + 1, 0);
-	nptr = (char *)P_OPERAND((Upat)patout + starter);
-	nptr[len] = '\0';
+
+	/* Get length of string without metafication. */
+	nmeta = 0;
+	for (ptr = str0; ptr < patparse; ptr++) {
+	    if (*ptr == Meta) {
+		nmeta++;
+		ptr++;
+	    }
+	}
+	slen = (patparse - str0) - nmeta;
+	/* First add length, which is a long */
+	patadd((char *)&slen, 0, sizeof(long), 0);
+	/*
+	 * Then the string, not null terminated.
+	 * Unmetafy and untokenize; pass the final length,
+	 * which is what we need to allocate, i.e. not including
+	 * a count for each Meta in the string.
+	 */
+	patadd(str0, 0, slen, PA_UNMETA);
+	nptr = P_LS_STR((Upat)patout + starter);
 	/*
 	 * It's much simpler to turn off pure string mode for
 	 * any case-insensitive or approximate matching; usually,
@@ -918,13 +1037,13 @@ patcomppiece(int *flagp)
 	 * ..(#a1).. (i.e. the (#a1) has no effect), but if you're
 	 * going to write funny patterns, you get no sympathy from me.
 	 */
-	if (patglobflags &&
-	    (!(patflags & PAT_FILE) || (strcmp(nptr, ".") &&
-					strcmp(nptr, ".."))))
-	    flags &= ~P_PURESTR;
-	for (; *nptr; METAINC(nptr))
-	    if (itok(*nptr))
-		*nptr = ztokens[*nptr - Pound];
+	if (patglobflags) {
+	    if (!(patflags & PAT_FILE))
+		flags &= ~P_PURESTR;
+	    else if (!(nptr[0] == '.' &&
+		       (slen == 1 || (nptr[1] == '.' && slen == 2))))
+		flags &= ~P_PURESTR;
+	}
     } else {
 	if (kshchar)
 	    patparse++;
@@ -950,7 +1069,7 @@ patcomppiece(int *flagp)
 		starter = patnode(P_ANYOF);
 	    if (*patparse == Outbrack) {
 		patparse++;
-		patadd(NULL, ']', 1, 1);
+		patadd(NULL, ']', 1, PA_NOALIGN);
 	    }
 	    while (*patparse && *patparse != Outbrack) {
 		/* Meta is not a token */
@@ -990,7 +1109,7 @@ patcomppiece(int *flagp)
 			    ch = PP_UNKWN;
 			patparse = nptr + 2;
 			if (ch != PP_UNKWN)
-			    patadd(NULL, STOUC(Meta+ch), 1, 1);
+			    patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
 			continue;
 		}
 		if (itok(*patparse)) {
@@ -1003,15 +1122,17 @@ patcomppiece(int *flagp)
 		patparse++;
 
 		if (*patparse == '-' && patparse[1] != Outbrack) {
-		    patadd(NULL, STOUC(Meta+PP_RANGE), 1, 1);
-		    patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1);
+		    patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
+		    patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
 		    if (itok(*++patparse)) {
-			patadd(0, STOUC(ztokens[*patparse - Pound]), 1, 1);
+			patadd(0, STOUC(ztokens[*patparse - Pound]), 1,
+			       PA_NOALIGN);
 		    } else
-			patadd(patparse, 0, (*patparse == Meta) ? 2 : 1, 1);
+			patadd(patparse, 0, (*patparse == Meta) ? 2 : 1,
+			       PA_NOALIGN);
 		    METAINC(patparse);
 		} else
-		    patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, 1);
+		    patadd(cbuf, 0, (cbuf[0] == Meta) ? 2 : 1, PA_NOALIGN);
 	    }
 	    if (*patparse != Outbrack)
 		return 0;
@@ -1309,19 +1430,11 @@ static void patoptail(long p, long val)
  */
 static char *patinstart;	/* Start of input string */
 static char *patinend;		/* End of input string */
+static char *patinput;		/* String input pointer */
 static char *patinpath;		/* Full path for use with ~ exclusions */
-
-/**/
-char *patinput;		/* String input pointer */
-
-/*
- * Offset of string at which we are trying to match.
- * This is added in to the positions recorded in patbeginp and patendp
- * when we are looking for substrings.  Currently this only happens
- * in the parameter substitution code.
- */
-/**/
-int patoffset;
+static int   patinlen;		/* Length of last successful match.
+				 * Includes count of Meta characters.
+				 */
 
 static char *patbeginp[NSUBEXP];	/* Pointer to backref beginnings */
 static char *patendp[NSUBEXP];		/* Pointer to backref ends */
@@ -1330,8 +1443,23 @@ static int parsfound;			/* parentheses (with backrefs) found */
 static int globdots;			/* Glob initial dots? */
 
 /*
+ * Macros which are currently trivial but are likely to be less
+ * so when we handle multibyte characters.  They operate on
+ * umetafied strings.
+ */
+
+/* Get a character from the start point in a string */
+#define CHARREF(x)	(STOUC(*x))
+/* Get  a pointer to the next character */
+#define CHARNEXT(x)	(x+1)
+/* Increment a pointer past the current character. */
+#define CHARINC(x)	(x++)
+/* Counter the number of characters between two pointers, largest first */
+#define CHARSUB(x,y)	(x-y)
+
+/*
  * The following need to be accessed in the globbing scanner for
- * a multi-component file path.  See horror story there.
+ * a multi-component file path.  See horror story in glob.c.
  */
 /**/
 int errsfound;				/* Total error count so far */
@@ -1347,23 +1475,59 @@ pattrystart(void)
     errsfound = 0;
 }
 
+/*
+ * Test prog against null-terminated, metafied string.
+ */
+
 /**/
 mod_export int
 pattry(Patprog prog, char *string)
 {
-    return pattryrefs(prog, string, NULL, NULL, NULL);
+    return pattryrefs(prog, string, -1, 0, NULL, NULL, NULL);
+}
+
+/*
+ * Test prog against string of given length, no null termination
+ * but still metafied at this point.  offset gives an offset
+ * to include in reported match indices
+ */
+
+/**/
+mod_export int
+pattrylen(Patprog prog, char *string, int len, int offset)
+{
+    return pattryrefs(prog, string, len, offset, NULL, NULL, NULL);
 }
 
-/* The last three arguments are used to report the positions for the
+/*
+ * Test prog against string with given length stringlen, which
+ * may be -1 to indicate a null-terminated string.  The input
+ * string is metafied; the length is the raw string length, not the
+ * number of possibly metafied characters.
+ *
+ * offset is the position in the original string (not seen by
+ * the patter module) at which we are trying to match.
+ * This is added in to the positions recorded in patbeginp and patendp
+ * when we are looking for substrings.  Currently this only happens
+ * in the parameter substitution code.
+ *
+ * Note this is a character offset, i.e. a metafied character
+ * counts as 1.
+ *
+ * The last three arguments are used to report the positions for the
  * backreferences. On entry, *nump should contain the maximum number
- * positions to report. */
+ * of positions to report.  In this case the match, mbegin, mend
+ * arrays are not altered.
+ */
 
 /**/
 mod_export int
-pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
+pattryrefs(Patprog prog, char *string, int stringlen, int patoffset,
+	   int *nump, int *begp, int *endp)
 {
-    int i, maxnpos = 0, ret;
-    char **sp, **ep;
+    int i, maxnpos = 0, ret, needfullpath, unmetalen, unmetalenp;
+    int origlen;
+    char **sp, **ep, *tryalloced, *ptr;
     char *progstr = (char *)prog + prog->startoff;
 
     if (nump) {
@@ -1374,12 +1538,87 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
     if (*string == Nularg)
 	string++;
 
-    patinstart = patinput = string;
-    patinend = patinstart + strlen(patinstart);
+    if (stringlen < 0)
+	stringlen = strlen(string);
+    origlen = stringlen;
+
+    patflags = prog->flags;
+    /*
+     * For a top-level ~-exclusion, we will need the full
+     * path to exclude, so copy the path so far and append the
+     * current test string.
+     */
+    needfullpath = (patflags & PAT_HAS_EXCLUDP) && pathpos;
+
+    /* Get the length of the full string when unmetafied. */
+    unmetalen = ztrsub(string + stringlen, string);
+    if (needfullpath)
+	unmetalenp = ztrsub(pathbuf + pathpos, pathbuf);
+    else
+	unmetalenp = 0;
+
+    DPUTS(needfullpath && (patflags & (PAT_PURES|PAT_ANY)),
+	  "rum sort of file exclusion");
+    /*
+     * Partly for efficiency, and partly for the convenience of
+     * globbing, we don't unmetafy pure string patterns, and
+     * there's no reason to if the pattern is just a *.
+     */
+    if (!(patflags & (PAT_PURES|PAT_ANY))
+	&& (needfullpath || unmetalen != stringlen)) {
+	/*
+	 * We need to copy if we need to prepend the path so far
+	 * (in which case we copy both chunks), or if we have
+	 * Meta characters.
+	 */
+	char *dst;
+	int icopy, ncopy;
+
+	dst = tryalloced = zalloc(unmetalen + unmetalenp);
+
+	if (needfullpath) {
+	    /* loop twice, copy path buffer first time */
+	    ptr = pathbuf;
+	    ncopy = unmetalenp;
+	} else {
+	    /* just loop once, copy string with unmetafication */
+	    ptr = string;
+	    ncopy = unmetalen;
+	}
+	for (icopy = 0; icopy < 2; icopy++) {
+	    for (i = 0; i < ncopy; i++) {
+		if (*ptr == Meta) {
+		    ptr++;
+		    *dst++ = *ptr++ ^ 32;
+		} else {
+		    *dst++ = *ptr++;
+		}
+	    }
+	    if (!needfullpath)
+		break;
+	    /* next time append test string to path so far */
+	    ptr = string;
+	    ncopy = unmetalen;
+	}
+
+	if (needfullpath) {
+	    patinstart = tryalloced + unmetalenp;
+	    patinpath = tryalloced;
+	} else {
+	    patinstart = tryalloced;
+	    patinpath = NULL;
+	}
+	stringlen = unmetalen;
+    } else {
+	patinstart = string;
+	tryalloced = patinpath = NULL;
+    }
+
+    patinend = patinstart + stringlen;
     /*
      * From now on we do not require NULL termination of
-     * the test string.  It is still metafied, as is string
-     * data in the prog.
+     * the test string.  There should also be no more references
+     * to the variable string.
      */
 
     if (prog->flags & (PAT_PURES|PAT_ANY)) {
@@ -1399,11 +1638,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 	     * Testing a pure string.  See if initial
 	     * components match.
 	     */
-	    int lendiff = (patinend - patinstart) - prog->patmlen;
+	    int lendiff = stringlen - prog->patmlen;
 	    if (lendiff < 0) {
 		/* No, the pattern string is too long. */
 		ret = 0;
-	    } else if (!memcmp(progstr, string, prog->patmlen)) {
+	    } else if (!memcmp(progstr, patinstart, prog->patmlen)) {
 		/*
 		 * Initial component matches.  Matches either
 		 * if lengths are the same or we are not anchored
@@ -1420,47 +1659,61 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 	     * For files, we won't match initial "."s unless
 	     * glob_dots is set.
 	     */
-	    if ((prog->flags & PAT_NOGLD) && *string == '.')
-		return 0;
-	    /* in case used for ${..#..} etc. */
-	    patinput = string + prog->patmlen;
-	    /* if matching files, must update globbing flags */
-	    patglobflags = prog->globend;
-	    return 1;
-	} else
-	    return 0;
+	    if ((prog->flags & PAT_NOGLD) && *patinstart == '.') {
+		ret = 0;
+	    } else {
+		/*
+		 * Remember the length in case used for ${..#..} etc.
+		 * In this case, we didn't unmetafy the string.
+		 */
+		patinlen = (int)prog->patmlen;
+		/* if matching files, must update globbing flags */
+		patglobflags = prog->globend;
+	    }
+	}
+
+	if (tryalloced)
+	    zfree(tryalloced, unmetalen + unmetalenp);
+
+	return ret;
     } else {
 	/*
 	 * Test for a `must match' string, unless we're scanning for a match
 	 * in which case we don't need to do this each time.
 	 */
+	ret = 1;
 	if (!(prog->flags & PAT_SCAN) && prog->mustoff)
 	{
 	    char *testptr;	/* start pointer into test string */
 	    char *teststop;	/* last point from which we can match */
 	    char *patptr = (char *)prog + prog->mustoff;
-	    int patlen = strlen(patptr);
+	    int patlen = prog->patmlen;
 	    int found = 0;
 
-	    if (patlen > patinend - patinstart) {
+	    if (patlen > stringlen) {
 		/* Too long, can't match. */
-		return 0;
-	    }
-	    teststop = patinend - patlen;
+		ret = 0;
+	    } else {
+		teststop = patinend - patlen;
 
-	    for (testptr = patinstart; testptr <= teststop; testptr++)
-	    {
-		if (!memcmp(testptr, patptr, patlen)) {
-		    found = 1;
-		    break;
+		for (testptr = patinstart; testptr <= teststop; testptr++)
+		{
+		    if (!memcmp(testptr, patptr, patlen)) {
+			found = 1;
+			break;
+		    }
 		}
-	    }
 
-	    if (!found)
-		return 0;
+		if (!found)
+		    ret = 0;
+	    }
+	}
+	if (!ret) {
+	    if (tryalloced)
+		zfree(tryalloced, unmetalen + unmetalenp);
+	    return 0;
 	}
 
-	patflags = prog->flags;
 	patglobflags = prog->globflags;
 	if (!(patflags & PAT_FILE)) {
 	    forceerrs = -1;
@@ -1469,26 +1722,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 	globdots = !(patflags & PAT_NOGLD);
 	parsfound = 0;
 
-	if ((patflags & PAT_HAS_EXCLUDP) && pathpos) {
-	    /*
-	     * For a top-level ~-exclusion, we will need the full
-	     * path to exclude, so copy the path so far and append the
-	     * current test string.
-	     *
-	     * There are some advantages in making patinstart etc.
-	     * point into this new string; however, that gets confusing
-	     * if we need patinput outside this file.  That's
-	     * not likely for files but I don't think it's worth
-	     * the risk.
-	     */
-	    int len = patinend - patinstart;
-
-	    patinpath = (char *)zalloc(pathpos + len);
-	    memcpy(patinpath, pathbuf, pathpos);
-	    memcpy(patinpath + pathpos, patinstart, len);
-	} else {
-	    patinpath = NULL;
-	}
+	patinput = patinstart;
 
 	if (patmatch((Upat)progstr)) {
 	    /*
@@ -1496,6 +1730,23 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 	     * failed, so set it now
 	     */
 	    patglobflags = prog->globend;
+
+	    /*
+	     * Record length of successful match, including Meta
+	     * characters.  Do it here so that patmatchlen() can return
+	     * it even if we delete the pattern strings.
+	     */
+	    patinlen = patinput - patinstart;
+	    /*
+	     * Optimization: if we didn't find any Meta characters
+	     * to begin with, we don't need to look for them now.
+	     */
+	    if (unmetalen != origlen) {
+		for (ptr = patinstart; ptr < patinput; ptr++)
+		    if (imeta(*ptr))
+			patinlen++;
+	    }
+
 	    /*
 	     * Should we clear backreferences and matches on a failed
 	     * match?
@@ -1504,15 +1755,18 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 		/*
 		 * m flag: for global match.  This carries no overhead
 		 * in the pattern matching part.
+		 *
+		 * Remember the test pattern is already unmetafied.
 		 */
 		char *str;
-		int mlen = ztrsub(patinput, patinstart);
+		int mlen = CHARSUB(patinput, patinstart);
 
-		str = ztrduppfx(patinstart, patinput - patinstart);
+		str = metafy(patinstart, patinput - patinstart, META_DUP);
 		setsparam("MATCH", str);
 		setiparam("MBEGIN", (zlong)(patoffset + !isset(KSHARRAYS)));
 		setiparam("MEND",
-			  (zlong)(mlen + patoffset + !isset(KSHARRAYS) - 1));
+			  (zlong)(mlen + patoffset +
+				  !isset(KSHARRAYS) - 1));
 	    }
 	    if (prog->patnpar && nump) {
 		/*
@@ -1527,9 +1781,10 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 		for (i = 0; i < prog->patnpar && i < maxnpos; i++) {
 		    if (parsfound & (1 << i)) {
 			if (begp)
-			    *begp++ = ztrsub(*sp, patinstart) + patoffset;
+			    *begp++ = CHARSUB(*sp, patinstart) + patoffset;
 			if (endp)
-			    *endp++ = ztrsub(*ep, patinstart) + patoffset - 1;
+			    *endp++ = CHARSUB(*ep, patinstart) + patoffset
+				- 1;
 		    } else {
 			if (begp)
 			    *begp++ = -1;
@@ -1557,7 +1812,7 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 
 		for (i = 0; i < prog->patnpar; i++) {
 		    if (parsfound & (1 << i)) {
-			matcharr[i] = ztrduppfx(*sp, *ep - *sp);
+			matcharr[i] = metafy(*sp, *ep - *sp, META_DUP);
 			/*
 			 * mbegin and mend give indexes into the string
 			 * in the standard notation, i.e. respecting
@@ -1568,12 +1823,12 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 			 * corresponds to indexing as ${foo[1,1]}.
 			 */
 			sprintf(numbuf, "%ld",
-				(long)(ztrsub(*sp, patinstart) + 
+				(long)(CHARSUB(*sp, patinstart) +
 				       patoffset +
 				       !isset(KSHARRAYS)));
 			mbeginarr[i] = ztrdup(numbuf);
 			sprintf(numbuf, "%ld",
-				(long)(ztrsub(*ep, patinstart) + 
+				(long)(CHARSUB(*ep, patinstart) +
 				       patoffset +
 				       !isset(KSHARRAYS) - 1));
 			mendarr[i] = ztrdup(numbuf);
@@ -1593,20 +1848,32 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
 		setaparam("mbegin", mbeginarr);
 		setaparam("mend", mendarr);
 	    }
+
 	    ret = 1;
 	} else
 	    ret = 0;
 
-	if (patinpath) {
-	    zfree(patinpath, pathpos + (patinend - patinstart));
-	    patinpath = NULL;
-	}
+	if (tryalloced)
+	    zfree(tryalloced, unmetalen + unmetalenp);
 
 	return ret;
     }
 }
 
 /*
+ * Return length of previous succesful match.  This is
+ * in metafied bytes, i.e. includes a count of Meta characters.
+ * Unusual and futile attempt at modular encapsulation.
+ */
+
+/**/
+int
+patmatchlen(void)
+{
+    return patinlen;
+}
+
+/*
  * Match literal characters with case insensitivity test:  the first
  * comes from the input string, the second the current pattern.
  */
@@ -1627,8 +1894,11 @@ pattryrefs(Patprog prog, char *string, int *nump, int *begp, int *endp)
  * exactpos is used to remember how far down an exact string we have
  * matched, if we are doing approximation and can therefore redo from
  * the same point; we never need to otherwise.
+ *
+ * exactend is a pointer to the end of the string, which isn't
+ * null-terminated.
  */
-static char *exactpos;
+static char *exactpos, *exactend;
 
 /*
  * Main matching routine.
@@ -1643,7 +1913,7 @@ patmatch(Upat prog)
 {
     /* Current and next nodes */
     Upat scan = prog, next, opnd;
-    char *start, *save, *chrop, *compend;
+    char *start, *save, *chrop, *chrend, *compend;
     int savglobflags, op, no, min, nextch, fail = 0, saverrsfound;
     zrange_t from, to, comp;
 
@@ -1659,45 +1929,52 @@ patmatch(Upat prog)
 	    if (patinput == patinend)
 		fail = 1;
 	    else
-		METAINC(patinput);
+		CHARINC(patinput);
 	    break;
 	case P_EXACTLY:
 	    /*
 	     * acts as nothing if *chrop is null:  this is used by
 	     * approx code.
 	     */
-	    chrop = exactpos ? exactpos : (char *)P_OPERAND(scan);
+	    if (exactpos) {
+		chrop = exactpos;
+		chrend = exactend;
+	    } else {
+		chrop = P_LS_STR(scan);
+		chrend = chrop + P_LS_LEN(scan);
+	    }
 	    exactpos = NULL;
-	    while (*chrop && patinput < patinend) {
-		int chin = STOUC(UNMETA(patinput));
-		int chpa = STOUC(UNMETA(chrop));
+	    while (chrop < chrend && patinput < patinend) {
+		int chin = CHARREF(patinput);
+		int chpa = CHARREF(chrop);
 		if (!CHARMATCH(chin, chpa)) {
 		    fail = 1;
 		    break;
 		}
-		METAINC(chrop);
-		METAINC(patinput);
+		CHARINC(chrop);
+		CHARINC(patinput);
 	    }
-	    if (*chrop) {
+	    if (chrop < chrend) {
 		exactpos = chrop;
+		exactend = chrend;
 		fail = 1;
 	    }
 	    break;
 	case P_ANYOF:
 	    if (patinput == patinend ||
 		!patmatchrange((char *)P_OPERAND(scan),
-			       STOUC(UNMETA(patinput))))
+			       CHARREF(patinput)))
 		fail = 1;
 	    else
-		METAINC(patinput);
+		CHARINC(patinput);
 	    break;
 	case P_ANYBUT:
 	    if (patinput == patinend ||
 		patmatchrange((char *)P_OPERAND(scan),
-			      STOUC(UNMETA(patinput))))
+			      CHARREF(patinput)))
 		fail = 1;
 	    else
-		METAINC(patinput);
+		CHARINC(patinput);
 	    break;
 	case P_NUMRNG:
 	case P_NUMFROM:
@@ -1771,7 +2048,8 @@ patmatch(Upat prog)
 		    return 1;
 		}
 		if (!no && P_OP(next) == P_EXACTLY &&
-		    !idigit(STOUC(*(char *)P_OPERAND(next))) &&
+		    (!P_LS_LEN(next) ||
+		     !idigit(STOUC(*P_LS_STR(next)))) &&
 		    !(patglobflags & 0xff))
 		    return 0;
 		patinput = --save;
@@ -1791,7 +2069,7 @@ patmatch(Upat prog)
 	case P_NUMANY:
 	    /* This is <->: any old set of digits, don't bother comparing */
 	    start = patinput;
-	    while (patinput < patinend && idigit(STOUC(*patinput)))
+	    while (patinput < patinend && idigit(CHARREF(patinput)))
 		patinput++;
 	    save = patinput;
 	    no = 0;
@@ -1799,7 +2077,8 @@ patmatch(Upat prog)
 		if (patmatch(next))
 		    return 1;
 		if (!no && P_OP(next) == P_EXACTLY &&
-		    !idigit(STOUC(*(char *)P_OPERAND(next))) &&
+		    (!P_LS_LEN(next) ||
+		     !idigit(CHARREF(P_LS_STR(next)))) &&
 		    !(patglobflags & 0xff))
 		    return 0;
 		patinput = --save;
@@ -1963,7 +2242,7 @@ patmatch(Upat prog)
 			origpatinend = patinend;
 			while ((ret = patmatch(P_OPERAND(scan)))) {
 			    unsigned char *syncpt;
-			    char *savpatinstart, *savpatinend;
+			    char *savpatinstart;
 			    int savforce = forceerrs;
 			    int savpatflags = patflags, synclen;
 			    forceerrs = -1;
@@ -1998,7 +2277,6 @@ patmatch(Upat prog)
 				patflags |= PAT_NOTEND;
 			    }
 			    savpatinstart = patinstart;
-			    savpatinend = patinend;
 			    next = PATNEXT(scan);
 			    while (next && P_ISEXCLUDE(next)) {
 				patinput = save;
@@ -2013,14 +2291,15 @@ patmatch(Upat prog)
 				opnd = P_OPERAND(next) + 1;
 				if (P_OP(next) == P_EXCLUDP && patinpath) {
 				    /*
-				     * top level exclusion with a file,
+				     * Top level exclusion with a file,
 				     * applies to whole path so add the
-				     * segments already matched
+				     * segments already matched.
+				     * We copied these in front of the
+				     * test pattern, so patinend doesn't
+				     * need moving.
 				     */
 				    DPUTS(patinput != patinstart,
 					  "BUG: not at start excluding path");
-				    patinend = patinpath + pathpos +
-					(patinend - patinstart);
 				    patinput = patinstart = patinpath;
 				}
 				if (patmatch(opnd)) {
@@ -2036,7 +2315,6 @@ patmatch(Upat prog)
 				    patinput = savpatinstart +
 					(patinput - patinstart);
 				    patinstart = savpatinstart;
-				    patinend = savpatinend;
 				}
 				if (!ret)
 				    break;
@@ -2146,7 +2424,7 @@ patmatch(Upat prog)
 	    /* Note that no counts possibly metafied characters */
 	    start = patinput;
 	    if (op == P_STAR) {
-		for (no = 0; patinput < patinend; METAINC(patinput))
+		for (no = 0; patinput < patinend; CHARINC(patinput))
 		    no++;
 		/* simple optimization for reasonably common case */
 		if (P_OP(next) == P_END)
@@ -2156,7 +2434,7 @@ patmatch(Upat prog)
 		      "BUG: wrong backtracking with approximation.");
 		if (!globdots && P_NOTDOT(P_OPERAND(scan)) &&
 		    patinput == patinstart && patinput < patinend &&
-		    *patinput == '.')
+		    CHARREF(patinput) == '.')
 		    return 0;
 		no = patrepeat(P_OPERAND(scan));
 	    }
@@ -2165,8 +2443,9 @@ patmatch(Upat prog)
 	     * Lookahead to avoid useless matches. This is not possible
 	     * with approximation.
 	     */
-	    if (P_OP(next) == P_EXACTLY && !(patglobflags & 0xff)) {
-		char *nextop = (char *)P_OPERAND(next);
+	    if (P_OP(next) == P_EXACTLY && P_LS_LEN(next) &&
+		!(patglobflags & 0xff)) {
+		char *nextop = P_LS_STR(next);
 		/*
 		 * If that P_EXACTLY is last (common in simple patterns,
 		 * such as *.c), then it can be only be matched at one
@@ -2175,21 +2454,20 @@ patmatch(Upat prog)
 		if (P_OP(PATNEXT(next)) == P_END &&
 		    !(patflags & PAT_NOANCH)) {
 		    int ptlen = patinend - patinput;
-		    int oplen = strlen(nextop);
-		    int lenmatch = patinend - (min ? METANEXT(start) : start);
+		    int lenmatch = patinend - (min ? CHARNEXT(start) : start);
 		    /* Are we in the right range? */
-		    if (oplen > lenmatch || oplen < ptlen)
+		    if (P_LS_LEN(next) > lenmatch || P_LS_LEN(next) < ptlen)
 			return 0;
 		    /* Yes, just position appropriately and test. */
-		    patinput += ptlen - oplen;
-		    if (patinput > start && patinput[-1] == Meta) {
-			/* doesn't align properly, no go */
-			return 0;
-		    }
+		    patinput += ptlen - P_LS_LEN(next);
+		    /*
+		     * Here we will need to be careful that patinput is not
+		     * in the middle of a multibyte character.
+		     */
 		    /* Continue loop with P_EXACTLY test. */
 		    break;
 		}
-		nextch = STOUC(UNMETA(nextop));
+		nextch = CHARREF(nextop);
 	    } else
 		nextch = -1;
 	    save = patinput;
@@ -2199,14 +2477,17 @@ patmatch(Upat prog)
 		int charmatch_cache;
 		if (nextch < 0 ||
 		    (patinput < patinend &&
-		     CHARMATCH_EXPR(STOUC(UNMETA(patinput)), nextch))) {
+		     CHARMATCH_EXPR(CHARREF(patinput), nextch))) {
 		    if (patmatch(next))
 			return 1;
 		}
 		no--;
 		save--;
-		if (save > start && save[-1] == Meta)
-		    save--;
+		/*
+		 * Here we will need to make sure save is
+		 * decremented properly to the start of
+		 * the preceeding multibyte character.
+		 */
 		patinput = save;
 		patglobflags = savglobflags;
 		errsfound = saverrsfound;
@@ -2270,7 +2551,7 @@ patmatch(Upat prog)
 
 		/* Try omitting a character from the input string */
 		if (patinput < patinend) {
-		    METAINC(patinput);
+		    CHARINC(patinput);
 		    /* If we are not on an exact match, then this is
 		     * our last gasp effort, so we can optimize out
 		     * the recursive call.
@@ -2285,11 +2566,11 @@ patmatch(Upat prog)
 		    char *nextexact = savexact;
 		    DPUTS(!savexact || !*savexact,
 			  "BUG: exact match has not set exactpos");
-		    METAINC(nextexact);
+		    CHARINC(nextexact);
 
 		    if (save < patinend) {
 			char *nextin = save;
-			METAINC(nextin);
+			CHARINC(nextin);
 			patglobflags = savglobflags;
 			errsfound = saverrsfound;
 			exactpos = savexact;
@@ -2299,18 +2580,18 @@ patmatch(Upat prog)
 			 * exactpos
 			 */
 			if (save < patinend && nextin < patinend &&
-			    *nextexact) {
-			    int cin0 = UNMETA(save);
-			    int cpa0 = UNMETA(exactpos);
-			    int cin1 = UNMETA(nextin);
-			    int cpa1 = UNMETA(nextexact);
+			    nextexact < exactend) {
+			    int cin0 = CHARREF(save);
+			    int cpa0 = CHARREF(exactpos);
+			    int cin1 = CHARREF(nextin);
+			    int cpa1 = CHARREF(nextexact);
 
 			    if (CHARMATCH(cin0, cpa1) &&
 				CHARMATCH(cin1, cpa0)) {
 				patinput = nextin;
-				METAINC(patinput);
+				CHARINC(patinput);
 				exactpos = nextexact;
-				METAINC(exactpos);
+				CHARINC(exactpos);
 				if (patmatch(scan))
 				    return 1;
 
@@ -2333,12 +2614,13 @@ patmatch(Upat prog)
 			exactpos = savexact;
 		    }
 
+		    DPUTS(exactpos == exactend, "approximating too far");
 		    /*
 		     * Try moving up the exact match pattern.
 		     * This must be the last attempt, so just loop
 		     * instead of calling recursively.
 		     */
-		    METAINC(exactpos);
+		    CHARINC(exactpos);
 		    continue;
 		}
 	    }
@@ -2358,6 +2640,11 @@ patmatchrange(char *range, int ch)
 {
     int r1, r2;
 
+    /*
+     * Careful here: unlike other strings, range is a NULL-terminated,
+     * metafied string, because we need to treat the Posix and hyphenated
+     * ranges specially.
+     */
     for (; *range; range++) {
 	if (imeta(STOUC(*range))) {
 	    switch (STOUC(*range)-STOUC(Meta)) {
@@ -2459,23 +2746,24 @@ static int patrepeat(Upat p)
 	break;
 #endif
     case P_EXACTLY:
-	tch = STOUC(UNMETA(opnd));
+	DPUTS(P_LS_LEN(p) != 1, "closure following more than one character");
+	tch = CHARREF(P_LS_STR(p));
 	while (scan < patinend &&
-	       CHARMATCH_EXPR(STOUC(UNMETA(scan)), tch)) {
+	       CHARMATCH_EXPR(CHARREF(scan), tch)) {
 	    count++;
-	    METAINC(scan);
+	    CHARINC(scan);
 	}
 	break;
     case P_ANYOF:
-	while (scan < patinend && patmatchrange(opnd, STOUC(UNMETA(scan)))) {
+	while (scan < patinend && patmatchrange(opnd, CHARREF(scan))) {
 	    count++;
-	    METAINC(scan);
+	    CHARINC(scan);
     	}
 	break;
     case P_ANYBUT:
-	while (scan < patinend && !patmatchrange(opnd, STOUC(UNMETA(scan)))) {
+	while (scan < patinend && !patmatchrange(opnd, CHARREF(scan))) {
 	    count++;
-	    METAINC(scan);
+	    CHARINC(scan);
     	}
 	break;
 #ifdef DEBUG
@@ -2528,7 +2816,14 @@ patdump(Patprog r)
 	    next = PATNEXT(up);
 	    printf("(%d)", next ? next-codestart : 0);
 	    s += sizeof(union upat);
-	    if (op == P_ANYOF || op == P_ANYBUT || op == P_EXACTLY) {
+	    if (op == P_EXACTLY) {
+		long llen = *(long *)s;
+		s += sizeof(long);
+		while (llen--) {
+		    putchar(CHARREF(s));
+		    CHARINC(s);
+		}
+	    } else if (op == P_ANYOF || op == P_ANYBUT) {
 		while (*s != '\0') {
 		    if (itok(*s)) {
 			if (*s == Meta + PP_RANGE) {