From e86b3cce47e62c263301322ce24b56cf04b8cdb8 Mon Sep 17 00:00:00 2001
From: Peter Stephenson <p.w.stephenson@ntlworld.com>
Date: Thu, 10 Sep 2015 20:05:48 +0100
Subject: 36478: Add [[:INCOMPLETE:]] and [[:INVALID:]] pattern tests.

---
 ChangeLog              |  6 ++++++
 Doc/Zsh/expn.yo        | 14 ++++++++++++++
 Src/Zle/comp.h         |  5 +++--
 Src/pattern.c          | 43 ++++++++++++++++++++++++++++++++++---------
 Src/zsh.h              | 17 ++++++++++++++---
 Test/D07multibyte.ztst |  6 ++++++
 6 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 448bbb4f9..3dfd8ba53 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2015-09-10  Peter Stephenson  <p.w.stephenson@ntlworld.com>
+
+	* 36478: Src/pattern.c, Src/zsh.h, Src/Zle/comp.h,
+	Doc/Zsh/expn.yo, Test/D07multibyte.ztst: add [[:INCOMPLETE:]] and
+	[[:INVALID:]] pattern tests.
+
 2015-09-10  Barton E. Schaefer  <schaefer@zsh.org>
 
 	* 36470: Src/Zle/zle_main.c: Auxiliary to 36468, return an empty
diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo
index d44b40a3b..5ea8610f2 100644
--- a/Doc/Zsh/expn.yo
+++ b/Doc/Zsh/expn.yo
@@ -1956,6 +1956,20 @@ ifzman(the zmanref(zshparam) manual page)\
 ifnzman(noderef(Parameters Used By The Shell))\
 .
 )
+item(tt([:INCOMPLETE:]))(
+Matches a byte that starts an incomplete multibyte character.
+Note that there may be a sequence of more than one byte that
+taken together form the prefix of a multibyte character.  To
+test for a potentially incomplete byte sequence, use the pattern
+`tt([[:INCOMPLETE:]]*)'.  This will never match a sequence starting
+with a valid multibyte character.
+)
+item(tt([:INVALID:]))(
+Matches a byte that does not start a valid multibyte character.
+Note this may be a continuation byte of an incomplete multibyte
+character as any part of a multibyte string consisting of invalid and
+incomplete multibyte characters is treated as single bytes.
+)
 item(tt([:WORD:]))(
 The character is treated as part of a word; this test is sensitive
 to the value of the tt(WORDCHARS) parameter
diff --git a/Src/Zle/comp.h b/Src/Zle/comp.h
index 34da2cabb..023c41814 100644
--- a/Src/Zle/comp.h
+++ b/Src/Zle/comp.h
@@ -202,8 +202,9 @@ struct cpattern {
  * TODO: this will change.
  */
 #ifdef MULTIBYTE_SUPPORT
-#define PATMATCHRANGE(r, c, ip, mtp)	mb_patmatchrange(r, c, ip, mtp)
-#define PATMATCHINDEX(r, i, cp, mtp)	mb_patmatchindex(r, i, cp, mtp)
+#define PATMATCHRANGE(r, c, ip, mtp)		\
+    mb_patmatchrange(r, c, ZMB_VALID, ip, mtp)
+#define PATMATCHINDEX(r, i, cp, mtp)    mb_patmatchindex(r, i, cp, mtp)
 #define CONVCAST(c)			((wchar_t)(c))
 #define CHR_INVALID			(WEOF)
 #else
diff --git a/Src/pattern.c b/Src/pattern.c
index b4ba33e49..3b55ccf1c 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -145,7 +145,7 @@ typedef union upat *Upat;
  *
  *  P_ANY, P_ANYOF:  the operand is a null terminated
  *    string.  Normal characters match as expected.  Characters
- *    in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate
+ *    in the range Meta+PP_ALPHA..Meta+PP_UNKWN do the appropriate
  *    Posix range tests.  This relies on imeta returning true for these
  *    characters.  We treat unknown POSIX ranges as never matching.
  *    PP_RANGE means the next two (possibly metafied) characters form
@@ -1119,7 +1119,7 @@ patgetglobflags(char **strp, long *assertp, int *ignore)
 static const char *colon_stuffs[]  = {
     "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph", 
     "lower", "print", "punct", "space", "upper", "xdigit", "IDENT",
-    "IFS", "IFSSPACE", "WORD", NULL
+    "IFS", "IFSSPACE", "WORD", "INCOMPLETE", "INVALID", NULL
 };
 
 /*
@@ -1870,9 +1870,9 @@ static int globdots;			/* Glob initial dots? */
 #ifdef MULTIBYTE_SUPPORT
 
 /* Get a character from the start point in a string */
-#define CHARREF(x, y)	charref((x), (y))
+#define CHARREF(x, y)	charref((x), (y), (int *)NULL)
 static wchar_t
-charref(char *x, char *y)
+charref(char *x, char *y, int *zmb_ind)
 {
     wchar_t wc;
     size_t ret;
@@ -1886,9 +1886,13 @@ charref(char *x, char *y)
 	/* Error. */
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
+	if (zmb_ind)
+	    *zmb_ind = (ret == MB_INVALID) ? ZMB_INVALID : ZMB_INCOMPLETE;
 	return WCHAR_INVALID(*x);
     }
 
+    if (zmb_ind)
+	*zmb_ind = ZMB_VALID;
     return wc;
 }
 
@@ -2580,10 +2584,11 @@ patmatch(Upat prog)
 		fail = 1;
 	    else {
 #ifdef MULTIBYTE_SUPPORT
-		wchar_t cr = CHARREF(patinput, patinend);
+		int zmb_ind;
+		wchar_t cr = charref(patinput, patinend, &zmb_ind);
 		char *scanop = (char *)P_OPERAND(scan);
 		if (patglobflags & GF_MULTIBYTE) {
-		    if (mb_patmatchrange(scanop, cr, NULL, NULL) ^
+		    if (mb_patmatchrange(scanop, cr, zmb_ind, NULL, NULL) ^
 			(P_OP(scan) == P_ANYOF))
 			fail = 1;
 		    else
@@ -3351,6 +3356,9 @@ patmatch(Upat prog)
  * The null-terminated specification is in range; the test
  * character is in ch.
  *
+ * zmb_ind is one of the ZMB_* enum values defined in zsh.h, indicating
+ * whether ch is a valid, incomplete or invalid multibyte character.
+ *
  * indptr is used by completion matching, which is why this
  * function is exported.  If indptr is not NULL we set *indptr
  * to the index of the character in the range string, adjusted
@@ -3367,7 +3375,7 @@ patmatch(Upat prog)
 
 /**/
 mod_export int
-mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
+mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
 {
     wchar_t r1, r2;
 
@@ -3476,6 +3484,14 @@ mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
 		    *indptr += r2 - r1;
 		}
 		break;
+	    case PP_INCOMPLETE:
+		if (zmb_ind == ZMB_INCOMPLETE)
+		    return 1;
+		break;
+	    case PP_INVALID:
+		if (zmb_ind == ZMB_INVALID)
+		    return 1;
+		break;
 	    case PP_UNKWN:
 		DPUTS(1, "BUG: unknown posix range passed through.\n");
 		break;
@@ -3545,6 +3561,8 @@ mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
 	    case PP_IFS:
 	    case PP_IFSSPACE:
 	    case PP_WORD:
+	    case PP_INCOMPLETE:
+	    case PP_INVALID:
 		if (!ind) {
 		    *mtp = swtype;
 		    return 1;
@@ -3698,6 +3716,10 @@ patmatchrange(char *range, int ch, int *indptr, int *mtp)
 		if (indptr && r1 < r2)
 		    *indptr += r2 - r1;
 		break;
+	    case PP_INCOMPLETE:
+	    case PP_INVALID:
+		/* Never true if not in multibyte mode */
+		break;
 	    case PP_UNKWN:
 		DPUTS(1, "BUG: unknown posix range passed through.\n");
 		break;
@@ -3768,6 +3790,8 @@ patmatchindex(char *range, int ind, int *chr, int *mtp)
 	    case PP_IFS:
 	    case PP_IFSSPACE:
 	    case PP_WORD:
+	    case PP_INCOMPLETE:
+	    case PP_INVALID:
 		if (!ind) {
 		    *mtp = swtype;
 		    return 1;
@@ -3851,9 +3875,10 @@ static int patrepeat(Upat p, char *charstart)
     case P_ANYBUT:
 	while (scan < patinend) {
 #ifdef MULTIBYTE_SUPPORT
-	    wchar_t cr = CHARREF(scan, patinend);
+	    int zmb_ind;
+	    wchar_t cr = charref(scan, patinend, &zmb_ind);
 	    if (patglobflags & GF_MULTIBYTE) {
-		if (mb_patmatchrange(opnd, cr, NULL, NULL) ^
+		if (mb_patmatchrange(opnd, cr, zmb_ind, NULL, NULL) ^
 		    (P_OP(p) == P_ANYOF))
 		    break;
 	    } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
diff --git a/Src/zsh.h b/Src/zsh.h
index a99c90065..4e2cb656e 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -1562,13 +1562,15 @@ typedef struct zpc_disables_save *Zpc_disables_save;
 #define PP_IFS    15
 #define PP_IFSSPACE   16
 #define PP_WORD   17
+#define PP_INCOMPLETE 18
+#define PP_INVALID 19
 /* Special value for last definition */
-#define PP_LAST   17
+#define PP_LAST   19
 
 /* Unknown type.  Not used in a valid token. */
-#define PP_UNKWN  18
+#define PP_UNKWN  20
 /* Range: token followed by the (possibly multibyte) start and end */
-#define PP_RANGE  19
+#define PP_RANGE  21
 
 /* Globbing flags: lower 8 bits gives approx count */
 #define GF_LCMATCHUC	0x0100
@@ -1577,6 +1579,15 @@ typedef struct zpc_disables_save *Zpc_disables_save;
 #define GF_MATCHREF	0x0800
 #define GF_MULTIBYTE	0x1000	/* Use multibyte if supported by build */
 
+enum {
+    /* Valid multibyte character from charref */
+    ZMB_VALID,
+    /* Incomplete multibyte character from charref */
+    ZMB_INCOMPLETE,
+    /* Invalid multibyte character from charref */
+    ZMB_INVALID
+};
+
 /* Dummy Patprog pointers. Used mainly in executable code, but the
  * pattern code needs to know about it, too. */
 
diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst
index 3fadd8066..ace191f06 100644
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@@ -525,3 +525,9 @@
     fi
   done
 0:Invalid characters in pattern matching
+
+  [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1
+  [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2
+  [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:INVALID:]] ]] || print fail 3
+  [[ $'\xe3\x83\x9b' = ? ]] || print fail 4
+0:Testing incomplete and invalid multibyte character components
-- 
cgit 1.4.1