From 6157c14d0602c698aa9ebfac9a2135ef095a76b4 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Wed, 28 Jun 2006 13:12:55 +0000 Subject: 22525: lengths and cases of multibyte strings in parameters and history --- Src/hist.c | 174 ++++++++++++++++++++++++++++++++++++++++------------------ Src/jobs.c | 2 +- Src/pattern.c | 19 +++---- Src/subst.c | 67 +++++++++------------- Src/utils.c | 20 +++++-- Src/zsh.h | 13 ++++- 6 files changed, 182 insertions(+), 113 deletions(-) (limited to 'Src') diff --git a/Src/hist.c b/Src/hist.c index 0873ccce5..33c4035bf 100644 --- a/Src/hist.c +++ b/Src/hist.c @@ -635,10 +635,10 @@ histsubchar(int c) quotebreak(&sline); break; case 'l': - downcase(&sline); + sline = casemodify(sline, CASMOD_LOWER); break; case 'u': - upcase(&sline); + sline = casemodify(sline, CASMOD_UPPER); break; default: herrflush(); @@ -1503,42 +1503,130 @@ remlpaths(char **junkptr) return 0; } -/**/ -int -makeuppercase(char **junkptr) -{ - char *str = *junkptr; - - for (; *str; str++) - *str = tuupper(*str); - return 1; -} +/* + * Return modified version of str from the heap with modification + * according to one of the CASMOD_* types defined in zsh.h; CASMOD_NONE + * is not handled, for obvious reasons. + */ /**/ -int -makelowercase(char **junkptr) -{ - char *str = *junkptr; - - for (; *str; str++) - *str = tulower(*str); - return 1; -} +char * +casemodify(char *str, int how) +{ + char *str2 = zhalloc(2 * strlen(str) + 1); + char *ptr2 = str2; + int nextupper = 1; + +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE)) { + VARARR(char, mbstr, MB_CUR_MAX); + mbstate_t ps; + + mb_metacharinit(); + memset(&ps, 0, sizeof(ps)); + while (*str) { + wint_t wc; + int len = mb_metacharlenconv(str, &wc), mod = 0, len2; + /* + * wc is set to WEOF if the start of str couldn't be + * converted. Presumably WEOF doesn't match iswlower(), but + * better be safe. 
+ */ + if (wc == WEOF) { + while (len--) + *ptr2++ = *str++; + /* not alphanumeric */ + nextupper = 1; + continue; + } + switch (how) { + case CASMOD_LOWER: + if (iswupper(wc)) { + wc = towlower(wc); + mod = 1; + } + break; -/**/ -int -makecapitals(char **junkptr) -{ - char *str = *junkptr; + case CASMOD_UPPER: + if (iswlower(wc)) { + wc = towupper(wc); + mod = 1; + } + break; - for (; *str;) { - for (; *str && !ialnum(*str); str++); - if (*str) - *str = tuupper(*str), str++; - for (; *str && ialnum(*str); str++) - *str = tulower(*str); + case CASMOD_CAPS: + default: /* shuts up compiler */ + if (!iswalnum(wc)) + nextupper = 1; + else if (nextupper) { + if (iswlower(wc)) { + wc = towupper(wc); + mod = 1; + } + nextupper = 0; + } else if (iswupper(wc)) { + wc = towlower(wc); + mod = 1; + } + break; + } + if (mod && (len2 = wcrtomb(mbstr, wc, &ps)) > 0) { + char *mbptr; + + for (mbptr = mbstr; mbptr < mbstr + len2; mbptr++) { + if (imeta(STOUC(*mbptr))) { + *ptr2++ = Meta; + *ptr2++ = *mbptr ^ 32; + } else + *ptr2++ = *mbptr; + } + str += len; + } else { + while (len--) + *ptr2++ = *str++; + } + } } - return 1; + else +#endif + while (*str) { + int c; + if (*str == Meta) { + c = str[1] ^ 32; + str += 2; + } else + c = *str++; + switch (how) { + case CASMOD_LOWER: + if (isupper(c)) + c = tolower(c); + break; + + case CASMOD_UPPER: + if (islower(c)) + c = toupper(c); + break; + + case CASMOD_CAPS: + default: /* shuts up compiler */ + if (!ialnum(c)) + nextupper = 1; + else if (nextupper) { + if (islower(c)) + c = toupper(c); + nextupper = 0; + } else if (isupper(c)) + c = tolower(c); + break; + } + if (imeta(c)) { + *ptr2++ = Meta; + *ptr2++ = c ^ 32; + } else + *ptr2++ = c; + } + *ptr2 = '\0'; + return str2; } /**/ @@ -1644,26 +1732,6 @@ getargs(Histent elist, int arg1, int arg2) return dupstrpfx(elist->node.nam + pos1, words[2*arg2+1] - pos1); } -/**/ -void -upcase(char **x) -{ - char *pp = *(char **)x; - - for (; *pp; pp++) - *pp = tuupper(*pp); -} - -/**/ -void 
-downcase(char **x) -{ - char *pp = *(char **)x; - - for (; *pp; pp++) - *pp = tulower(*pp); -} - /**/ int quote(char **tr) diff --git a/Src/jobs.c b/Src/jobs.c index cfc733ecf..509b9e843 100644 --- a/Src/jobs.c +++ b/Src/jobs.c @@ -2014,7 +2014,7 @@ bin_kill(char *nam, char **argv, UNUSED(Options ops), UNUSED(int func)) return 1; } else signame = *argv; - makeuppercase(&signame); + signame = casemodify(signame, CASMOD_UPPER); if (!strncmp(signame, "SIG", 3)) signame+=3; diff --git a/Src/pattern.c b/Src/pattern.c index a39095c37..bc9afbae3 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -1644,17 +1644,12 @@ charrefinc(char **x, char *y) } -#ifndef PARAMETER_CODE_HANDLES_MULTIBYTE /* - * TODO: We should use the other branch, but currently - * the parameter code doesn't handle multibyte input, - * so this would produce the wrong subscripts, - * so just use a raw byte difference for now. + * Counter the number of characters between two pointers, smaller first + * + * This is used when setting values in parameters, so we obey + * the MULTIBYTE option (even if it's been overridden locally). */ -/* Counter the number of characters between two pointers, smaller first */ -# define CHARSUB(x,y) ((y) - (x)) -#else -/* Counter the number of characters between two pointers, smaller first */ #define CHARSUB(x,y) charsub(x, y) static ptrdiff_t charsub(char *x, char *y) @@ -1663,6 +1658,9 @@ charsub(char *x, char *y) size_t ret; wchar_t wc; + if (!isset(MULTIBYTE)) + return y - x; + while (x < y) { ret = mbrtowc(&wc, x, y-x, &shiftstate); @@ -1674,13 +1672,12 @@ charsub(char *x, char *y) /* Treat nulls as normal characters */ if (!ret) ret = 1; - res += ret; + res++; x += ret; } return res; } -#endif #else /* no MULTIBYTE_SUPPORT */ diff --git a/Src/subst.c b/Src/subst.c index 803f8d99d..d69f34c4b 100644 --- a/Src/subst.c +++ b/Src/subst.c @@ -1019,7 +1019,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) /* (u): straightforward. 
*/ int unique = 0; /* combination of (L), (U) and (C) flags. */ - int casmod = 0; + int casmod = CASMOD_NONE; /* * quotemod says we are doing either (q) (positive), (Q) (negative) * or not (0). quotetype counts the q's for the first case. @@ -1211,13 +1211,13 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) break; case 'L': - casmod = 2; + casmod = CASMOD_LOWER; break; case 'U': - casmod = 1; + casmod = CASMOD_UPPER; break; case 'C': - casmod = 3; + casmod = CASMOD_CAPS; break; case 'o': @@ -1819,17 +1819,13 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) break; } switch (v->pm->node.flags & (PM_LOWER | PM_UPPER)) { - char *t; - case PM_LOWER: - t = val; - for (; (c = *t); t++) - *t = tulower(c); + val = casemodify(val, CASMOD_LOWER); + copied = 1; break; case PM_UPPER: - t = val; - for (; (c = *t); t++) - *t = tuupper(c); + val = casemodify(val, CASMOD_UPPER); + copied = 1; break; } } @@ -2316,14 +2312,14 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) if (isarr) { char **ctr; - int sl = sep ? ztrlen(sep) : 1; + int sl = sep ? MB_METASTRLEN(sep) : 1; if (getlen == 1) for (ctr = aval; *ctr; ctr++, len++); else if (getlen == 2) { if (*aval) for (len = -sl, ctr = aval; - len += sl + ztrlen(*ctr), *++ctr;); + len += sl + MB_METASTRLEN(*ctr), *++ctr;); } else for (ctr = aval; @@ -2331,7 +2327,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) len += wordcount(*ctr, spsep, getlen > 3), ctr++); } else { if (getlen < 3) - len = ztrlen(val); + len = MB_METASTRLEN(val); else len = wordcount(val, spsep, getlen > 3); } @@ -2387,33 +2383,19 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) /* * Perform case modififications. 
*/ - if (casmod) { + if (casmod != CASMOD_NONE) { + copied = 1; /* string is always modified by copy */ if (isarr) { - char **ap; + char **ap, **ap2; - if (!copied) - aval = arrdup(aval), copied = 1; ap = aval; + ap2 = aval = (char **) zhalloc(sizeof(char *) * (arrlen(aval)+1)); - if (casmod == 1) - for (; *ap; ap++) - makeuppercase(ap); - else if (casmod == 2) - for (; *ap; ap++) - makelowercase(ap); - else - for (; *ap; ap++) - makecapitals(ap); - + while (*ap) + *ap2++ = casemodify(*ap++, casmod); + *ap2++ = NULL; } else { - if (!copied) - val = dupstring(val), copied = 1; - if (casmod == 1) - makeuppercase(&val); - else if (casmod == 2) - makelowercase(&val); - else - makecapitals(&val); + val = casemodify(val, casmod); } } /* @@ -2975,7 +2957,8 @@ modify(char **str, char **ptr) for (t = e = *str; (tt = findword(&e, sep));) { tc = *e; *e = '\0'; - copy = dupstring(tt); + if (c != 'l' && c != 'u') + copy = dupstring(tt); *e = tc; switch (c) { case 'h': @@ -2991,10 +2974,10 @@ modify(char **str, char **ptr) remlpaths(&copy); break; case 'l': - downcase(&copy); + copy = casemodify(tt, CASMOD_LOWER); break; case 'u': - upcase(&copy); + copy = casemodify(tt, CASMOD_UPPER); break; case 's': if (hsubl && hsubr) @@ -3050,10 +3033,10 @@ modify(char **str, char **ptr) remlpaths(str); break; case 'l': - downcase(str); + *str = casemodify(*str, CASMOD_LOWER); break; case 'u': - upcase(str); + *str = casemodify(*str, CASMOD_UPPER); break; case 's': if (hsubl && hsubr) { diff --git a/Src/utils.c b/Src/utils.c index 4b2f07f19..32f6ae336 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -3687,7 +3687,7 @@ static mbstate_t mb_shiftstate; /* * Initialise multibyte state: called before a sequence of - * mb_metacharlen(). + * mb_metacharlenconv(). */ /**/ @@ -3703,18 +3703,24 @@ mb_metacharinit(void) * but character is not valid (e.g. possibly incomplete at end of string). * Returned value is guaranteed not to reach beyond the end of the * string (assuming correct metafication).
+ * + * If wcp is not NULL, the converted wide character is stored there. + * If no conversion could be done WEOF is used. */ /**/ int -mb_metacharlen(char *s) +mb_metacharlenconv(char *s, wint_t *wcp) { char inchar, *ptr; size_t ret; wchar_t wc; - if (!isset(MULTIBYTE)) + if (!isset(MULTIBYTE)) { + if (wcp) + *wcp = WEOF; return 1 + (*s == Meta); + } ret = MB_INVALID; for (ptr = s; *ptr; ) { @@ -3729,14 +3735,18 @@ mb_metacharlen(char *s) break; if (ret == MB_INCOMPLETE) continue; + if (wcp) + *wcp = wc; return ptr - s; } + if (wcp) + *wcp = WEOF; /* No valid multibyte sequence */ memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); - if (ptr > s) + if (ptr > s) { return 1 + (*s == Meta); /* Treat as single byte character */ - else + } else return 0; /* Probably shouldn't happen */ } diff --git a/Src/zsh.h b/Src/zsh.h index 31609d3c5..b0962574a 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1882,6 +1882,17 @@ struct heap { #define ZSIG_ALIAS (1<<3) /* Trap is stored under an alias */ #define ZSIG_SHIFT 4 +/************************/ +/* Flags to casemodifiy */ +/************************/ + +enum { + CASMOD_NONE, /* dummy for tests */ + CASMOD_UPPER, + CASMOD_LOWER, + CASMOD_CAPS +}; + /**********************************/ /* Flags to third argument of zle */ /**********************************/ @@ -1927,7 +1938,7 @@ typedef char *(*ZleGetLineFn) _((int *, int *)); #ifdef MULTIBYTE_SUPPORT #define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0) #define MB_METACHARINIT() mb_metacharinit() -#define MB_METACHARLEN(str) mb_metacharlen(str) +#define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL) #define MB_METASTRLEN(str) mb_metastrlen(str) #define MB_INCOMPLETE ((size_t)-2) -- cgit 1.4.1