From f1923bdfa6300a0d32e3329eb2488447f76b8970 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Fri, 12 Jun 2015 09:30:39 +0100 Subject: Add non-metafied character length handling. Use this in regex module and add test using $'\ua0'. Rename mb_metacharinit() to mb_charinit() as it does not involve metafied characters. --- ChangeLog | 10 ++++++ Src/Modules/curses.c | 2 +- Src/Modules/regex.c | 33 +++++++++++++------ Src/Zle/complist.c | 2 +- Src/Zle/zle_utils.c | 2 +- Src/builtin.c | 4 +-- Src/glob.c | 14 ++++---- Src/hist.c | 2 +- Src/prompt.c | 2 +- Src/utils.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++---- Src/zsh.h | 12 ++++++- Test/D07multibyte.ztst | 13 ++++++++ 12 files changed, 154 insertions(+), 31 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4f96800b6..5b45e5e8f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2015-06-12 Peter Stephenson + + * 35448: Src/Modules/curses.c, Src/Modules/regex.c, + Src/Zle/complist.c, Src/Zle/zle_utils.c, Src/builtin.c, + Src/glob.c, Src/hist.c, Src/prompt.c, Src/utils.c, Src/zsh.h, + Test/D07multibyte.ztst: Add non-metafied character length + handling and use this for regex module. Add test. + Rename mb_metacharinit() to mb_charinit() since it doesn't + involve metafied characters. 
+ 2015-06-11 Peter Stephenson * 35442: Doc/Zsh/options.yo: multibyte option now on diff --git a/Src/Modules/curses.c b/Src/Modules/curses.c index 41ad2c6e4..62dbd55ea 100644 --- a/Src/Modules/curses.c +++ b/Src/Modules/curses.c @@ -765,7 +765,7 @@ zccmd_string(const char *nam, char **args) w = (ZCWin)getdata(node); #ifdef HAVE_WADDWSTR - mb_metacharinit(); + mb_charinit(); wptr = wstr = zhalloc((strlen(str)+1) * sizeof(wchar_t)); while (*str && (clen = mb_metacharlenconv(str, &wc))) { diff --git a/Src/Modules/regex.c b/Src/Modules/regex.c index ce57de986..94f523f32 100644 --- a/Src/Modules/regex.c +++ b/Src/Modules/regex.c @@ -115,6 +115,7 @@ zcond_regex_match(char **a, int id) } else { zlong offs; char *ptr; + int clen, leftlen; m = matches; s = metafy(lhstr + m->rm_so, m->rm_eo - m->rm_so, META_DUP); @@ -123,19 +124,25 @@ zcond_regex_match(char **a, int id) * Count the characters before the match. */ ptr = lhstr; + leftlen = m->rm_so; offs = 0; - MB_METACHARINIT(); - while (ptr < lhstr + m->rm_so) { + MB_CHARINIT(); + while (leftlen) { offs++; - ptr += MB_METACHARLEN(ptr); + clen = MB_CHARLEN(ptr, leftlen); + ptr += clen; + leftlen -= clen; } setiparam("MBEGIN", offs + !isset(KSHARRAYS)); /* * Add on the characters in the match. 
*/ - while (ptr < lhstr + m->rm_eo) { + leftlen = m->rm_eo - m->rm_so; + while (leftlen) { offs++; - ptr += MB_METACHARLEN(ptr); + clen = MB_CHARLEN(ptr, leftlen); + ptr += clen; + leftlen -= clen; } setiparam("MEND", offs + !isset(KSHARRAYS) - 1); if (nelem) { @@ -149,19 +156,25 @@ zcond_regex_match(char **a, int id) { char buf[DIGBUFSIZE]; ptr = lhstr; + leftlen = m->rm_so; offs = 0; /* Find the start offset */ - MB_METACHARINIT(); - while (ptr < lhstr + m->rm_so) { + MB_CHARINIT(); + while (leftlen) { offs++; - ptr += MB_METACHARLEN(ptr); + clen = MB_CHARLEN(ptr, leftlen); + ptr += clen; + leftlen -= clen; } convbase(buf, offs + !isset(KSHARRAYS), 10); *bptr = ztrdup(buf); /* Continue to the end offset */ - while (ptr < lhstr + m->rm_eo) { + leftlen = m->rm_eo - m->rm_so; + while (leftlen ) { offs++; - ptr += MB_METACHARLEN(ptr); + clen = MB_CHARLEN(ptr, leftlen); + ptr += clen; + leftlen -= clen; } convbase(buf, offs + !isset(KSHARRAYS) - 1, 10); *eptr = ztrdup(buf); diff --git a/Src/Zle/complist.c b/Src/Zle/complist.c index f54206619..a02a5c37b 100644 --- a/Src/Zle/complist.c +++ b/Src/Zle/complist.c @@ -728,7 +728,7 @@ clnicezputs(int do_colors, char *s, int ml) if (do_colors) initiscol(); - mb_metacharinit(); + mb_charinit(); while (umleft > 0) { size_t cnt = eol ? 
MB_INVALID : mbrtowc(&cc, uptr, umleft, &mbs); diff --git a/Src/Zle/zle_utils.c b/Src/Zle/zle_utils.c index e4ab97a54..06e458190 100644 --- a/Src/Zle/zle_utils.c +++ b/Src/Zle/zle_utils.c @@ -1288,7 +1288,7 @@ showmsg(char const *msg) p = unmetafy(umsg, &ulen); memset(&mbs, 0, sizeof mbs); - mb_metacharinit(); + mb_charinit(); while (ulen > 0) { char const *n; if (*p == '\n') { diff --git a/Src/builtin.c b/Src/builtin.c index a3d847f41..0edc07024 100644 --- a/Src/builtin.c +++ b/Src/builtin.c @@ -4582,7 +4582,7 @@ bin_print(char *name, char **args, Options ops, int func) convchar_t cc; #ifdef MULTIBYTE_SUPPORT if (isset(MULTIBYTE)) { - mb_metacharinit(); + mb_charinit(); (void)mb_metacharlenconv(metafy(curarg+1, curlen-1, META_USEHEAP), &cc); } @@ -5557,7 +5557,7 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) wint_t wi; if (isset(MULTIBYTE)) { - mb_metacharinit(); + mb_charinit(); (void)mb_metacharlenconv(delimstr, &wi); } else diff --git a/Src/glob.c b/Src/glob.c index 057d44a17..eff34a24e 100644 --- a/Src/glob.c +++ b/Src/glob.c @@ -2237,7 +2237,7 @@ xpandbraces(LinkList list, LinkNode *np) #ifdef MULTIBYTE_SUPPORT char *ncptr; int nclen; - mb_metacharinit(); + mb_charinit(); ncptr = wcs_nicechar(cend, NULL, NULL); nclen = strlen(ncptr); p = zhalloc(lenalloc + nclen); @@ -2805,7 +2805,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * ... now we know whether it's worth looking for the * shortest, which we do by brute force. */ - mb_metacharinit(); + mb_charinit(); for (t = s, umlen = 0; t < s + mlen; ) { set_pat_end(p, *t); if (pattrylen(p, s, t - s, umlen, 0)) { @@ -2831,7 +2831,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * so that match, mbegin, mend and MATCH, MBEGIN, MEND are * correct. 
*/ - mb_metacharinit(); + mb_charinit(); tmatch = NULL; for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { set_pat_start(p, t-s); @@ -2855,7 +2855,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, /* Largest possible match at tail of string: * * move forward along string until we get a match. * * Again there's no optimisation. */ - mb_metacharinit(); + mb_charinit(); for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { set_pat_start(p, t-s); if (pattrylen(p, t, s + l - t, umlen, ioff)) { @@ -2889,7 +2889,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, } ioff = 0; /* offset into string */ umlen = umltot; - mb_metacharinit(); + mb_charinit(); do { /* loop over all matches for global substitution */ matched = 0; @@ -2986,7 +2986,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, */ nmatches = 0; tmatch = NULL; - mb_metacharinit(); + mb_charinit(); for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { set_pat_start(p, t-s); if (pattrylen(p, t, s + l - t, umlen, ioff)) { @@ -3002,7 +3002,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr, * We need to find the n'th last match. 
*/ n = nmatches - n; - mb_metacharinit(); + mb_charinit(); for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) { set_pat_start(p, t-s); if (pattrylen(p, t, s + l - t, umlen, ioff) && diff --git a/Src/hist.c b/Src/hist.c index bd03c4f11..672531394 100644 --- a/Src/hist.c +++ b/Src/hist.c @@ -2000,7 +2000,7 @@ casemodify(char *str, int how) VARARR(char, mbstr, MB_CUR_MAX); mbstate_t ps; - mb_metacharinit(); + mb_charinit(); memset(&ps, 0, sizeof(ps)); while (*str) { wint_t wc; diff --git a/Src/prompt.c b/Src/prompt.c index ffc1d0df2..9e8589d5b 100644 --- a/Src/prompt.c +++ b/Src/prompt.c @@ -964,7 +964,7 @@ stradd(char *d) /* FALL THROUGH */ default: /* Take full wide character in one go */ - mb_metacharinit(); + mb_charinit(); pc = wcs_nicechar(cc, NULL, NULL); break; } diff --git a/Src/utils.c b/Src/utils.c index c33c16d5a..13fc96a16 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -82,7 +82,7 @@ set_widearray(char *mb_array, Widechar_array wca) wchar_t *wcptr = tmpwcs; wint_t wci; - mb_metacharinit(); + mb_charinit(); while (*mb_array) { int mblen = mb_metacharlenconv(mb_array, &wci); @@ -332,7 +332,7 @@ zerrmsg(FILE *file, const char *fmt, va_list ap) case 'c': num = va_arg(ap, int); #ifdef MULTIBYTE_SUPPORT - mb_metacharinit(); + mb_charinit(); zputs(wcs_nicechar(num, NULL, NULL), file); #else zputs(nicechar(num), file); @@ -461,12 +461,13 @@ static mbstate_t mb_shiftstate; /* * Initialise multibyte state: called before a sequence of - * wcs_nicechar() or mb_metacharlenconv(). + * wcs_nicechar(), mb_metacharlenconv(), or + * mb_charlenconv(). */ /**/ mod_export void -mb_metacharinit(void) +mb_charinit(void) { memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); } @@ -500,7 +501,7 @@ mb_metacharinit(void) * (but not both). (Note the complication that the wide character * part may contain metafied characters.) 
* - * The caller needs to call mb_metacharinit() before the first call, to + * The caller needs to call mb_charinit() before the first call, to * set up the multibyte shift state for a range of characters. */ @@ -3832,7 +3833,7 @@ itype_end(const char *ptr, int itype, int once) #ifdef MULTIBYTE_SUPPORT if (isset(MULTIBYTE) && (itype != IIDENT || !isset(POSIXIDENTIFIERS))) { - mb_metacharinit(); + mb_charinit(); while (*ptr) { wint_t wc; int len = mb_metacharlenconv(ptr, &wc); @@ -4972,6 +4973,65 @@ mb_metastrlenend(char *ptr, int width, char *eptr) return num + num_in_char; } +/* + * The equivalent of mb_metacharlenconv_r() for + * strings that aren't metafied and hence have + * explicit lengths. + */ + +/**/ +mod_export int +mb_charlenconv_r(const char *s, int slen, wint_t *wcp, mbstate_t *mbsp) +{ + size_t ret = MB_INVALID; + char inchar; + const char *ptr; + wchar_t wc; + + for (ptr = s; slen; ) { + inchar = *ptr; + ptr++; + slen--; + ret = mbrtowc(&wc, &inchar, 1, mbsp); + + if (ret == MB_INVALID) + break; + if (ret == MB_INCOMPLETE) + continue; + if (wcp) + *wcp = wc; + return ptr - s; + } + + if (wcp) + *wcp = WEOF; + /* No valid multibyte sequence */ + memset(mbsp, 0, sizeof(*mbsp)); + if (ptr > s) { + return 1; /* Treat as single byte character */ + } else + return 0; /* Probably shouldn't happen */ +} + +/* + * The equivalent of mb_metacharlenconv() for + * strings that aren't metafied and hence have + * explicit lengths; + */ + +/**/ +mod_export int +mb_charlenconv(const char *s, int slen, wint_t *wcp) +{ + if (!isset(MULTIBYTE)) { + if (wcp) + *wcp = (wint_t)*s; + return 1; + } + + return mb_charlenconv_r(s, slen, wcp, &mb_shiftstate); +} + /**/ #else @@ -4996,6 +5056,23 @@ metacharlenconv(const char *x, int *c) return 1; } +/* Simple replacement for mb_charlenconv */ + +/**/ +mod_export int +charlenconv(const char *x, int len, int *c) +{ + if (!len) { + if (c) + *c = '\0'; + return 0; + } + + if (c) + *c = (char)*x; + return 1; +} + /**/ #endif /* 
MULTIBYTE_SUPPORT */ diff --git a/Src/zsh.h b/Src/zsh.h index c88c2e739..fb04929d9 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -2921,8 +2921,9 @@ enum { #define AFTERTRAPHOOK (zshhooks + 2) #ifdef MULTIBYTE_SUPPORT +/* Metafied input */ #define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0) -#define MB_METACHARINIT() mb_metacharinit() +#define MB_METACHARINIT() mb_charinit() typedef wint_t convchar_t; #define MB_METACHARLENCONV(str, cp) mb_metacharlenconv((str), (cp)) #define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL) @@ -2932,6 +2933,11 @@ typedef wint_t convchar_t; #define MB_METASTRLEN2END(str, widthp, eptr) \ mb_metastrlenend(str, widthp, eptr) +/* Unmetafied input */ +#define MB_CHARINIT() mb_charinit() +#define MB_CHARLENCONV(str, len, cp) mb_charlenconv((str), (len), (cp)) +#define MB_CHARLEN(str, len) mb_charlenconv((str), (len), NULL) + /* * We replace broken implementations with one that uses Unicode * characters directly as wide characters. In principle this is only @@ -3015,6 +3021,10 @@ typedef int convchar_t; #define MB_METASTRLEN2(str, widthp) ztrlen(str) #define MB_METASTRLEN2END(str, widthp, eptr) ztrlenend(str, eptr) +#define MB_CHARINIT() +#define MB_CHARLENCONV(str, len, cp) charlenconv((str), (len), (cp)) +#define MB_CHARLEN(str, len) ((len) ? 1 : 0) + #define WCWIDTH_WINT(c) (1) /* Leave character or string as is. */ diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index c9ecb78e9..5f9e8abcf 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -484,3 +484,16 @@ # This doesn't look aligned in my editor because actually the characters # aren't quite double width, but the arithmetic is correct. # It appears just to be an effect of the font. + + if zmodload -i zsh/regex 2>/dev/null; then + [[ $'\ua0' =~ '^.$' ]] && print OK + [[ $'\ua0' =~ $'^\ua0$' ]] && print OK + [[ $'\ua0'X =~ '^X$' ]] || print OK + else + print -u$ZTST_fd "Regexp test skipped, regexp library not found." 
+ print -l OK OK OK + fi +0:Ensure no confusion on metafied input to regex module +>OK +>OK +>OK -- cgit 1.4.1