From f52795ea3e637396ac2bbdef8fa1b1b6f2088db6 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Fri, 4 Sep 2015 10:07:51 +0100 Subject: 36415: remap bytes from invalid multibyte characters. These now go to 0xdc00 + index. If wchar_t is a Unicode code point, this is by construction an invalid character within the Unicode range. If it isn't, we would hope the result was no worse than the current fudge. --- ChangeLog | 8 ++++++++ Src/pattern.c | 28 ++++++++++++++++++++++------ Test/D07multibyte.ztst | 17 +++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index f9822b9c8..860e09df4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2015-09-04 Peter Stephenson + + * 36415: Src/pattern.c, Test/D07multibyte.ztst: remap bytes from + invalid multibyte characters to 0xDC00 + index which is invalid + in Unicode. Strictly this only works if wchar_t is + ISO-10646-compliant, however it ought to be at least as good as + the current fudge in any case. + 2015-09-03 Peter Stephenson * 36416: Src/Zle/zle_refresh.c, Src/Zle/zle_utils.c: If diff --git a/Src/pattern.c b/Src/pattern.c index 7d38988a0..7457cbd23 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -224,6 +224,22 @@ typedef zlong zrange_t; typedef unsigned long zrange_t; #endif +#ifdef MULTIBYTE_SUPPORT +/* + * Handle a byte that's not part of a valid character. + * + * This range in Unicode is recommended for purposes of this + * kind as it corresponds to invalid characters. + * + * Note that this strictly only works if wchar_t represents + * Unicode code points, which isn't necessarily true; however, + * converting an invalid character into an unknown format is + * a bit tricky... + */ +#define WCHAR_INVALID(ch) \ + ((wchar_t) (0xDC00 + STOUC(ch))) +#endif /* MULTIBYTE_SUPPORT */ + /* * Array of characters corresponding to zpc_chars enum, which it must match. */ @@ -353,10 +369,10 @@ metacharinc(char **x) return wc; } - /* Error. Treat as single byte. 
*/ + /* Error. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) STOUC(*(*x)++); + return WCHAR_INVALID(*(*x)++); } #else @@ -1867,10 +1883,10 @@ charref(char *x, char *y) ret = mbrtowc(&wc, x, y-x, &shiftstate); if (ret == MB_INVALID || ret == MB_INCOMPLETE) { - /* Error. Treat as single byte. */ + /* Error. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) STOUC(*x); + return WCHAR_INVALID(*x); } return wc; @@ -1913,7 +1929,7 @@ charrefinc(char **x, char *y, int *z) size_t ret; if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80)) - return (wchar_t) STOUC(*(*x)++); + return WCHAR_INVALID(*(*x)++); ret = mbrtowc(&wc, *x, y-*x, &shiftstate); @@ -1922,7 +1938,7 @@ charrefinc(char **x, char *y, int *z) *z = 1; /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) STOUC(*(*x)++); + return WCHAR_INVALID(*(*x)++); } /* Nulls here are normal characters */ diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index 0e3e98d38..3fadd8066 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -508,3 +508,20 @@ cd .. } 0:cd with special characters + + test_array=( + '[[ \xcc = \xcc ]]' + '[[ \xcc != \xcd ]]' + '[[ \xcc != \ucc ]]' + '[[ \ucc = \ucc ]]' + '[[ \ucc = [\ucc] ]]' + '[[ \xcc != [\ucc] ]]' + # Not clear how useful the following is... + '[[ \xcc = [\xcc] ]]' + ) + for test in $test_array; do + if ! eval ${(g::)test} ; then + print -rl "Test $test failed" >&2 + fi + done +0:Invalid characters in pattern matching -- cgit 1.4.1