From 05bd0b2dd1efabeda472cb2f9a675a7269d4cf6a Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Mon, 26 Jun 2006 18:17:31 +0000 Subject: 22518: Initial go at making parameter subscripts use multibyte characters. --- Src/params.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++---------- Src/utils.c | 106 +++++++++++++++++++++++++++++++++++++++++ Src/zsh.h | 6 +++ 3 files changed, 240 insertions(+), 24 deletions(-) (limited to 'Src') diff --git a/Src/params.c b/Src/params.c index 7deee4288..c5bfc79f5 100644 --- a/Src/params.c +++ b/Src/params.c @@ -918,9 +918,33 @@ isident(char *s) return !ss[1]; } +/* + * Parse a single argument to a parameter subscript. + * The subscript starts at *str; *str is updated (input/output) + * + * *inv is set to indicate if the subscript is reversed (output) + * v is the Value for the parameter being accessed (input; note + * v->isarr may be modified, and if v is a hash the parameter will + * be updated to the element of the hash) + * a2 is 1 if this is the second subscript of a range (input) + * *w is only set if we need to find the end of a word (input; should + * be set to 0 by the caller). + * + * The final two arguments are to support multibyte characters. + * If supplied they are set to the length of the character before + * the index position and the one at the index position. If + * multibyte characters are not in use they are set to 1 for + * consistency. + * + * Returns a raw offset into the value from the start or end (i.e. + * after the arithmetic for Meta and possible multibyte characters has + * been taken into account). 
+ */ + /**/ static zlong -getarg(char **str, int *inv, Value v, int a2, zlong *w) +getarg(char **str, int *inv, Value v, int a2, zlong *w, + int *prevcharlen, int *nextcharlen) { int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash; int keymatch = 0, needtok = 0; @@ -929,6 +953,10 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w) Patprog pprog = NULL; ishash = (v->pm && PM_TYPE(v->pm->node.flags) == PM_HASHED); + if (prevcharlen) + *prevcharlen = 1; + if (nextcharlen) + *nextcharlen = 1; /* first parse any subscription flags */ if (v->pm && (*s == '(' || *s == Inpar)) { @@ -1133,17 +1161,43 @@ getarg(char **str, int *inv, Value v, int a2, zlong *w) return (a2 ? s : d + 1) - t; } else if (!v->isarr && !word) { + int lastcharlen = 1; s = getstrvalue(v); + /* + * Note for the confused (= pws): the index r we + * have so far is that specified by the user. The value + * passed back is an offset from the start or end of + * the string. Hence it needs correcting at least + * for Meta characters and maybe for multibyte characters. 
+ */ if (r > 0) { - for (t = s + r - 1; *s && s < t;) - if (*s++ == Meta) - s++, t++, r++; + zlong nchars = r; + + MB_METACHARINIT(); + for (t = s; nchars && *t; nchars--) + t += (lastcharlen = MB_METACHARLEN(t)); + /* for consistency, keep any remainder off the end */ + r = (zlong)(t - s) + nchars; + if (prevcharlen) + *prevcharlen = lastcharlen; + if (nextcharlen && *t) + *nextcharlen = MB_METACHARLEN(t); } else { - r += ztrlen(s); - for (t = s + r; *s && s < t; r--) - if (*s++ == Meta) - t++, r++; - r -= strlen(s); + zlong nchars = (zlong)MB_METASTRLEN(s) + r; + + if (nchars < 0) { + /* invalid but keep index anyway */ + r = nchars; + } else { + MB_METACHARINIT(); + for (t = s; nchars && *t; nchars--) + t += (lastcharlen = MB_METACHARLEN(t)); + r = - (zlong)strlen(t); /* keep negative */ + if (prevcharlen) + *prevcharlen = lastcharlen; + if (nextcharlen && *t) + *nextcharlen = MB_METACHARLEN(t); + } } } } else { @@ -1338,19 +1392,57 @@ getindex(char **pptr, Value v, int dq) s += 2; } else { zlong we = 0, dummy; + int startprevlen, startnextlen; - start = getarg(&s, &inv, v, 0, &we); + start = getarg(&s, &inv, v, 0, &we, &startprevlen, &startnextlen); if (inv) { if (!v->isarr && start != 0) { char *t, *p; t = getstrvalue(v); + /* + * Note for the confused (= pws): this is an inverse + * offset so at this stage we need to convert from + * the immediate offset into the value that we have + * into a logical character position. 
+ */ if (start > 0) { - for (p = t + start - 1; p-- > t; ) - if (*p == Meta) - start--; - } else - start = -ztrlen(t + start + strlen(t)); + int nstart = 0; + char *target = t + start - startprevlen; + + p = t; + MB_METACHARINIT(); + while (*p) { + /* + * move up characters, counting how many we + * found + */ + p += MB_METACHARLEN(p); + if (p < target) + nstart++; + else { + if (p == target) + nstart++; + else + p = target; /* pretend we hit exactly */ + break; + } + } + /* if start was too big, keep the difference */ + start = nstart + (target - p) + startprevlen; + } else { + zlong startoff = start + strlen(t); + if (startoff < 0) { + /* invalid: keep index but don't dereference */ + start = startoff; + } else { + /* find start in full characters */ + MB_METACHARINIT(); + for (p = t; p < t + startoff;) + p += MB_METACHARLEN(p); + start = - MB_METASTRLEN(p); + } + } } if (start > 0 && (isset(KSHARRAYS) || (v->pm->node.flags & PM_HASHED))) start--; @@ -1373,15 +1465,21 @@ getindex(char **pptr, Value v, int dq) if ((com = (*s == ','))) { s++; - end = getarg(&s, &inv, v, 1, &dummy); + end = getarg(&s, &inv, v, 1, &dummy, NULL, NULL); } else { end = we ? we : start; } - if (start != end) com = 1; + if (start != end) + com = 1; + /* + * Somehow the logic sometimes forces us to use the previous + * or next character to what we would expect, which is + * why we had to calculate them in getarg(). + */ if (start > 0) - start--; + start -= startprevlen; else if (start == 0 && end == 0) - end++; + end = startnextlen; if (s == tbrack) { s++; if (v->isarr && !com && @@ -1578,13 +1676,19 @@ getstrvalue(Value v) if (v->start < 0) v->start = 0; } - if (v->end < 0) - v->end += strlen(s) + 1; + if (v->end < 0) { + v->end += strlen(s); + if (v->end >= 0) { + char *eptr = s + v->end; + if (*eptr) + v->end += MB_METACHARLEN(eptr); + } + } s = (v->start > (int)strlen(s)) ? 
dupstring("") : dupstring(s + v->start); if (v->end <= v->start) s[0] = '\0'; else if (v->end - v->start <= (int)strlen(s)) - s[v->end - v->start + (s[v->end - v->start - 1] == Meta)] = '\0'; + s[v->end - v->start] = '\0'; return s; } @@ -2791,7 +2895,7 @@ char * tiedarrgetfn(Param pm) { struct tieddata *dptr = (struct tieddata *)pm->u.data; - return *dptr->arrptr ? zjoin(*dptr->arrptr, dptr->joinchar, 1) : ""; + return *dptr->arrptr ? zjoin(*dptr->arrptr, STOUC(dptr->joinchar), 1) : ""; } /**/ @@ -3463,7 +3567,7 @@ arrfixenv(char *s, char **t) return; if (pm->node.flags & PM_TIED) - joinchar = ((struct tieddata *)pm->u.data)->joinchar; + joinchar = STOUC(((struct tieddata *)pm->u.data)->joinchar); else joinchar = ':'; diff --git a/Src/utils.c b/Src/utils.c index 2d7bf2ec6..583945ac5 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -3683,6 +3683,112 @@ mb_width(const char *s) return width; } +static mbstate_t mb_shiftstate; + +/* + * Initialise multibyte state: called before a sequence of + * mb_metacharlen(). + */ + +/**/ +void +mb_metacharinit(void) +{ + memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); +} + +/* + * Length of metafied string s which contains the next multibyte + * character; single (possibly metafied) character if string is not null + * but character is not valid (e.g. possibly incomplete at end of string). + * Returned value is guaranteed not to reach beyond the end of the + * string (assuming correct metafication). 
+ */ + +/**/ +int +mb_metacharlen(char *s) +{ + char inchar, *ptr; + size_t ret; + wchar_t wc; + + if (!isset(MULTIBYTE)) + return 1 + (*s == Meta); + + ret = MB_INVALID; + for (ptr = s; *ptr; ) { + if (*ptr == Meta) + inchar = *++ptr ^ 32; + else + inchar = *ptr; + ptr++; + ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate); + + if (ret == MB_INVALID) + break; + if (ret == MB_INCOMPLETE) + continue; + return ptr - s; + } + + /* No valid multibyte sequence */ + memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); + if (ptr > s) + return 1 + (*s == Meta); /* Treat as single byte character */ + else + return 0; /* Probably shouldn't happen */ +} + +/* + * Total number of multibyte characters in metafied string s. + * Same answer as iterating mb_metacharlen() and counting calls + * until end of string. + */ + +/**/ +int +mb_metastrlen(char *ptr) +{ + char inchar, *laststart; + size_t ret; + wchar_t wc; + int num, num_in_char; + + if (!isset(MULTIBYTE)) + return ztrlen(ptr); + + laststart = ptr; + ret = MB_INVALID; + num = num_in_char = 0; + + memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); + while (*ptr) { + if (*ptr == Meta) + inchar = *++ptr ^ 32; + else + inchar = *ptr; + ptr++; + ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate); + + if (ret == MB_INCOMPLETE) { + num_in_char++; + } else { + if (ret == MB_INVALID) { + /* Reset; count the first (possibly metafied) byte alone and rescan after it */ + memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); + ptr = laststart + (*laststart == Meta) + 1; + } + laststart = ptr; /* always advance: otherwise repeated invalid bytes rewind to the same spot forever */ + num++; + num_in_char = 0; + } + } + + /* If incomplete, treat remainder as trailing single bytes */ + return num + num_in_char; +} + /**/ #endif /* MULTIBYTE_SUPPORT */ diff --git a/Src/zsh.h b/Src/zsh.h index 092e05c0c..31609d3c5 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1926,6 +1926,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *)); #ifdef MULTIBYTE_SUPPORT #define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0) +#define MB_METACHARINIT() mb_metacharinit() +#define 
MB_METACHARLEN(str) mb_metacharlen(str) +#define MB_METASTRLEN(str) mb_metastrlen(str) #define MB_INCOMPLETE ((size_t)-2) #define MB_INVALID ((size_t)-1) @@ -1946,6 +1949,9 @@ typedef char *(*ZleGetLineFn) _((int *, int *)); #define ZWS(s) L ## s #else +#define MB_METACHARINIT() +#define MB_METACHARLEN(str) (*(str) == Meta ? 2 : 1) +#define MB_METASTRLEN(str) ztrlen(str) /* Leave character or string as is. */ #define ZWC(c) c -- cgit 1.4.1