From e86b3cce47e62c263301322ce24b56cf04b8cdb8 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Thu, 10 Sep 2015 20:05:48 +0100 Subject: 36478: Add [[:INCOMPLETE:]] and [[:INVALID:]] pattern tests. --- ChangeLog | 6 ++++++ Doc/Zsh/expn.yo | 14 ++++++++++++++ Src/Zle/comp.h | 5 +++-- Src/pattern.c | 43 ++++++++++++++++++++++++++++++++++--------- Src/zsh.h | 17 ++++++++++++++--- Test/D07multibyte.ztst | 6 ++++++ 6 files changed, 77 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index 448bbb4f9..3dfd8ba53 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2015-09-10 Peter Stephenson + + * 36478: Src/pattern.c, Src/zsh.h, Src/Zle/comp.h, + Doc/Zsh/expn.yo, Test/D07multibyte.ztst: add [[:INCOMPLETE:]] and + [[:INVALID:]] pattern tests. + 2015-09-10 Barton E. Schaefer * 36470: Src/Zle/zle_main.c: Auxiliary to 36468, return an empty diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo index d44b40a3b..5ea8610f2 100644 --- a/Doc/Zsh/expn.yo +++ b/Doc/Zsh/expn.yo @@ -1956,6 +1956,20 @@ ifzman(the zmanref(zshparam) manual page)\ ifnzman(noderef(Parameters Used By The Shell))\ . ) +item(tt([:INCOMPLETE:]))( +Matches a byte that starts an incomplete multibyte character. +Note that there may be a sequence of more than one byte that +taken together form the prefix of a multibyte character. To +test for a potentially incomplete byte sequence, use the pattern +`tt([[:INCOMPLETE:]]*)'. This will never match a sequence starting +with a valid multibyte character. +) +item(tt([:INVALID:]))( +Matches a byte that does not start a valid multibyte character. +Note this may be a continuation byte of an incomplete multibyte +character as any part of a multibyte string consisting of invalid and +incomplete multibyte characters is treated as single bytes. 
+) item(tt([:WORD:]))( The character is treated as part of a word; this test is sensitive to the value of the tt(WORDCHARS) parameter diff --git a/Src/Zle/comp.h b/Src/Zle/comp.h index 34da2cabb..023c41814 100644 --- a/Src/Zle/comp.h +++ b/Src/Zle/comp.h @@ -202,8 +202,9 @@ struct cpattern { * TODO: this will change. */ #ifdef MULTIBYTE_SUPPORT -#define PATMATCHRANGE(r, c, ip, mtp) mb_patmatchrange(r, c, ip, mtp) -#define PATMATCHINDEX(r, i, cp, mtp) mb_patmatchindex(r, i, cp, mtp) +#define PATMATCHRANGE(r, c, ip, mtp) \ + mb_patmatchrange(r, c, ZMB_VALID, ip, mtp) +#define PATMATCHINDEX(r, i, cp, mtp) mb_patmatchindex(r, i, cp, mtp) #define CONVCAST(c) ((wchar_t)(c)) #define CHR_INVALID (WEOF) #else diff --git a/Src/pattern.c b/Src/pattern.c index b4ba33e49..3b55ccf1c 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -145,7 +145,7 @@ typedef union upat *Upat; * * P_ANY, P_ANYOF: the operand is a null terminated * string. Normal characters match as expected. Characters - * in the range Meta+PP_ALPHA..Meta+PP_UNKNWN do the appropriate + * in the range Meta+PP_ALPHA..Meta+PP_UNKWN do the appropriate * Posix range tests. This relies on imeta returning true for these * characters. We treat unknown POSIX ranges as never matching. * PP_RANGE means the next two (possibly metafied) characters form @@ -1119,7 +1119,7 @@ patgetglobflags(char **strp, long *assertp, int *ignore) static const char *colon_stuffs[] = { "alpha", "alnum", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", "IDENT", - "IFS", "IFSSPACE", "WORD", NULL + "IFS", "IFSSPACE", "WORD", "INCOMPLETE", "INVALID", NULL }; /* @@ -1870,9 +1870,9 @@ static int globdots; /* Glob initial dots? 
*/ #ifdef MULTIBYTE_SUPPORT /* Get a character from the start point in a string */ -#define CHARREF(x, y) charref((x), (y)) +#define CHARREF(x, y) charref((x), (y), (int *)NULL) static wchar_t -charref(char *x, char *y) +charref(char *x, char *y, int *zmb_ind) { wchar_t wc; size_t ret; @@ -1886,9 +1886,13 @@ charref(char *x, char *y) /* Error. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); + if (zmb_ind) + *zmb_ind = (ret == MB_INVALID) ? ZMB_INVALID : ZMB_INCOMPLETE; return WCHAR_INVALID(*x); } + if (zmb_ind) + *zmb_ind = ZMB_VALID; return wc; } @@ -2580,10 +2584,11 @@ patmatch(Upat prog) fail = 1; else { #ifdef MULTIBYTE_SUPPORT - wchar_t cr = CHARREF(patinput, patinend); + int zmb_ind; + wchar_t cr = charref(patinput, patinend, &zmb_ind); char *scanop = (char *)P_OPERAND(scan); if (patglobflags & GF_MULTIBYTE) { - if (mb_patmatchrange(scanop, cr, NULL, NULL) ^ + if (mb_patmatchrange(scanop, cr, zmb_ind, NULL, NULL) ^ (P_OP(scan) == P_ANYOF)) fail = 1; else @@ -3351,6 +3356,9 @@ patmatch(Upat prog) * The null-terminated specification is in range; the test * character is in ch. * + * zmb_ind is one of the ZMB_* enum values defined in zsh.h, for + * indicating incomplete or invalid multibyte characters. + * * indptr is used by completion matching, which is why this * function is exported. 
If indptr is not NULL we set *indptr * to the index of the character in the range string, adjusted @@ -3367,7 +3375,7 @@ patmatch(Upat prog) /**/ mod_export int -mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp) +mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp) { wchar_t r1, r2; @@ -3476,6 +3484,14 @@ mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp) *indptr += r2 - r1; } break; + case PP_INCOMPLETE: + if (zmb_ind == ZMB_INCOMPLETE) + return 1; + break; + case PP_INVALID: + if (zmb_ind == ZMB_INVALID) + return 1; + break; case PP_UNKWN: DPUTS(1, "BUG: unknown posix range passed through.\n"); break; @@ -3545,6 +3561,8 @@ mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp) case PP_IFS: case PP_IFSSPACE: case PP_WORD: + case PP_INCOMPLETE: + case PP_INVALID: if (!ind) { *mtp = swtype; return 1; @@ -3698,6 +3716,10 @@ patmatchrange(char *range, int ch, int *indptr, int *mtp) if (indptr && r1 < r2) *indptr += r2 - r1; break; + case PP_INCOMPLETE: + case PP_INVALID: + /* Never true if not in multibyte mode */ + break; case PP_UNKWN: DPUTS(1, "BUG: unknown posix range passed through.\n"); break; @@ -3768,6 +3790,8 @@ patmatchindex(char *range, int ind, int *chr, int *mtp) case PP_IFS: case PP_IFSSPACE: case PP_WORD: + case PP_INCOMPLETE: + case PP_INVALID: if (!ind) { *mtp = swtype; return 1; @@ -3851,9 +3875,10 @@ static int patrepeat(Upat p, char *charstart) case P_ANYBUT: while (scan < patinend) { #ifdef MULTIBYTE_SUPPORT - wchar_t cr = CHARREF(scan, patinend); + int zmb_ind; + wchar_t cr = charref(scan, patinend, &zmb_ind); if (patglobflags & GF_MULTIBYTE) { - if (mb_patmatchrange(opnd, cr, NULL, NULL) ^ + if (mb_patmatchrange(opnd, cr, zmb_ind, NULL, NULL) ^ (P_OP(p) == P_ANYOF)) break; } else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^ diff --git a/Src/zsh.h b/Src/zsh.h index a99c90065..4e2cb656e 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1562,13 +1562,15 @@ typedef struct 
zpc_disables_save *Zpc_disables_save; #define PP_IFS 15 #define PP_IFSSPACE 16 #define PP_WORD 17 +#define PP_INCOMPLETE 18 +#define PP_INVALID 19 /* Special value for last definition */ -#define PP_LAST 17 +#define PP_LAST 19 /* Unknown type. Not used in a valid token. */ -#define PP_UNKWN 18 +#define PP_UNKWN 20 /* Range: token followed by the (possibly multibyte) start and end */ -#define PP_RANGE 19 +#define PP_RANGE 21 /* Globbing flags: lower 8 bits gives approx count */ #define GF_LCMATCHUC 0x0100 @@ -1577,6 +1579,15 @@ typedef struct zpc_disables_save *Zpc_disables_save; #define GF_MATCHREF 0x0800 #define GF_MULTIBYTE 0x1000 /* Use multibyte if supported by build */ +enum { + /* Valid multibyte character from charref */ + ZMB_VALID, + /* Incomplete multibyte character from charref */ + ZMB_INCOMPLETE, + /* Invalid multibyte character from charref */ + ZMB_INVALID +}; + /* Dummy Patprog pointers. Used mainly in executable code, but the * pattern code needs to know about it, too. */ diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index 3fadd8066..ace191f06 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -525,3 +525,9 @@ fi done 0:Invalid characters in pattern matching + + [[ $'\xe3' == [[:INCOMPLETE:]] ]] || print fail 1 + [[ $'\xe3\x83' == [[:INCOMPLETE:]][[:INVALID:]] ]] || print fail 2 + [[ $'\xe3\x83\x9b' != [[:INCOMPLETE:][:NVALID:]] ]] || print fail 3 + [[ $'\xe3\x83\x9b' = ? ]] || print fail 4 +0:Testing incomplete and invalid multibyte character components -- cgit 1.4.1