From a242b1eb35863b73cbc63699fafe920e8b92c858 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Wed, 13 Sep 2006 20:55:29 +0000 Subject: 22705: make ${(l...)...} and ${(r...)...} handle multibyte characters --- ChangeLog | 7 + Doc/Zsh/expn.yo | 16 +- Src/prompt.c | 18 +-- Src/subst.c | 413 ++++++++++++++++++++++++++++++++++++++++++------- Src/utils.c | 88 ++++------- Src/zsh.h | 2 +- Test/D04parameter.ztst | 12 ++ Test/D07multibyte.ztst | 14 ++ 8 files changed, 440 insertions(+), 130 deletions(-) diff --git a/ChangeLog b/ChangeLog index efb8f978f..3faff76fa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2006-09-13 Peter Stephenson + + * 22705: Doc/Zsh/expn.yo, Src/prompt.c, Src/subst.c, Src/utils.c, + Src/zsh.h, Test/D04parameter.ztst, Test/D07multibyte.ztst: + make ${(l...)...} and ${(r...)...} padding handle multibyte + characters including those wider than 1 unit. + 2006-09-13 Peter Stephenson * 22704: Completion/Unix/Command/_todo.sh: new completion. diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo index fe16af735..813949222 100644 --- a/Doc/Zsh/expn.yo +++ b/Doc/Zsh/expn.yo @@ -857,12 +857,22 @@ Pad the resulting words on the left. Each word will be truncated if required and placed in a field var(expr) characters wide. The space to the left will be filled with var(string1) (concatenated as often as needed) or spaces if var(string1) is not given. If both -var(string1) and var(string2) are given, this string is inserted -once directly to the left of each word, before padding. +var(string1) and var(string2) are given, tt(string2) is inserted +once directly to the left of each word, truncated if necessary, before +var(string1) is used to produce any remaining padding. + +If the tt(MULTIBYTE) option is in effect, screen character widths will +be used for the calculation of padding; otherwise individual bytes are +treated as occupying one unit of width. 
) item(tt(r:)var(expr)tt(::)var(string1)tt(::)var(string2)tt(:))( As tt(l), but pad the words on the right and insert var(string2) -on the right. +immediately to the right of the string to be padded. + +Left and right padding may be used together. In this case the strategy +is to apply left padding to the first half width of each of the resulting +words, and right padding to the second half. If the string to be +padded has odd width the extra padding is applied on the left. ) item(tt(s:)var(string)tt(:))( Force field splitting at the diff --git a/Src/prompt.c b/Src/prompt.c index 21dff16e0..974f70e40 100644 --- a/Src/prompt.c +++ b/Src/prompt.c @@ -1058,12 +1058,7 @@ prompttrunc(int arg, int truncchar, int doprint, int endchar) int twidth, maxwidth; int ntrunc = strlen(t); -#ifdef MULTIBYTE_SUPPORT - /* Use screen width of string */ - twidth = mb_width(t); -#else - twidth = ztrlen(t); -#endif + twidth = MB_METASTRWIDTH(t); if (twidth < truncwidth) { maxwidth = truncwidth - twidth; /* @@ -1130,7 +1125,7 @@ prompttrunc(int arg, int truncchar, int doprint, int endchar) * Normal text: build up a multibyte character. 
*/ char inchar; - wchar_t cc; + wchar_t cc, wcw; /* * careful: string is still metafied (we @@ -1156,7 +1151,9 @@ prompttrunc(int arg, int truncchar, int doprint, int endchar) remw--; break; default: - remw -= wcwidth(cc); + wcw = wcwidth(cc); + if (wcw > 0) + remw -= wcw; break; } #else @@ -1197,6 +1194,7 @@ prompttrunc(int arg, int truncchar, int doprint, int endchar) #ifdef MULTIBYTE_SUPPORT char inchar; wchar_t cc; + int wcw; if (*skiptext == Meta) inchar = *++skiptext ^ 32; @@ -1216,7 +1214,9 @@ prompttrunc(int arg, int truncchar, int doprint, int endchar) maxwidth--; break; default: - maxwidth -= wcwidth(cc); + wcw = wcwidth(cc); + if (wcw > 0) + maxwidth -= wcw; break; } #else diff --git a/Src/subst.c b/Src/subst.c index 3a2c3e111..2be854524 100644 --- a/Src/subst.c +++ b/Src/subst.c @@ -718,12 +718,34 @@ invinstrpcmp(const void *a, const void *b) return -instrpcmp(a, b); } +/* + * Pad the string str, returning a result from the heap (or str itself, + * if it didn't need padding). If str is too large, it will be truncated. + * Calculations are in terms of width if MULTIBYTE is in effect, else + * characters. + * + * prenum and postnum are the widths to which the string needs padding + * on the left and right. + * + * preone and postone are strings to insert once only before and after + * str. They will be truncated on the left or right, respectively, + * if necessary to fit the width. Either or both may be NULL in which + * case they will not be used. + * + * premul and postmul are the padding strings to be repeated + * on the left (if prenum is non-zero) and right (if postnum is non-zero). If + * NULL the first character of IFS (typically but not necessarily a space) + * will be used. 
+ */ + /**/ static char * -dopadding(char *str, int prenum, int postnum, char *preone, char *postone, char *premul, char *postmul) +dopadding(char *str, int prenum, int postnum, char *preone, char *postone, + char *premul, char *postmul) { char *def, *ret, *t, *r; - int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc; + int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc, cl; + convchar_t cchar; MB_METACHARINIT(); if (*ifs) @@ -739,89 +761,357 @@ dopadding(char *str, int prenum, int postnum, char *preone, char *postone, char if (!postmul || !*postmul) postmul = def; - ls = strlen(str); - lpreone = preone ? strlen(preone) : 0; - lpostone = postone ? strlen(postone) : 0; - lpremul = strlen(premul); - lpostmul = strlen(postmul); + ls = MB_METASTRWIDTH(str); + lpreone = preone ? MB_METASTRWIDTH(preone) : 0; + lpostone = postone ? MB_METASTRWIDTH(postone) : 0; + lpremul = MB_METASTRWIDTH(premul); + lpostmul = MB_METASTRWIDTH(postmul); - lr = prenum + postnum; - - if (lr == ls) + if (prenum + postnum == ls) return str; + /* + * Try to be careful with allocated lengths. The following + * is a maximum, in case we need the entire repeated string + * for each repetition. We probably don't, but in case the user + * has given us something pathological which doesn't convert + * easily into a width we'd better be safe. + */ + lr = strlen(str) + strlen(premul) * prenum + strlen(postmul) * postnum; + /* + * Same logic for preone and postone, except those may be NULL. + */ + if (preone) + lr += strlen(preone); + if (postone) + lr += strlen(postone); r = ret = (char *)zhalloc(lr + 1); if (prenum) { + /* + * Pad on the left. + */ if (postnum) { + /* + * Pad on both right and left. + * The strategy is to divide the string into two halves. + * The first half is dealt with by the left hand padding + * code, the second by the right hand. + */ ls2 = ls / 2; + /* The width left to pad for the first half. 
*/ f = prenum - ls2; - if (f <= 0) - for (str -= f, c = prenum; c--; *r++ = *str++); - else { - if (f <= lpreone) - for (c = f, t = preone + lpreone - f; c--; *r++ = *t++); - else { + if (f <= 0) { + /* First half doesn't fit. Skip the first -f width. */ + f = -f; + MB_METACHARINIT(); + while (f > 0) { + str += MB_METACHARLENCONV(str, &cchar); + f -= WCWIDTH(cchar); + } + /* Now finish the first half. */ + for (c = prenum; c > 0; ) { + cl = MB_METACHARLENCONV(str, &cchar); + while (cl--) + *r++ = *str++; + c -= WCWIDTH(cchar); + } + } else { + if (f <= lpreone) { + if (preone) { + /* + * The unrepeated string doesn't fit. + */ + MB_METACHARINIT(); + /* The width we need to skip */ + f = lpreone - f; + /* So skip. */ + for (t = preone; f > 0; ) { + t += MB_METACHARLENCONV(t, &cchar); + f -= WCWIDTH(cchar); + } + /* Then copy the entire remainder. */ + while (*t) + *r++ = *t++; + } + } else { f -= lpreone; - if ((m = f % lpremul)) - for (c = m, t = premul + lpremul - m; c--; *r++ = *t++); - for (cc = f / lpremul; cc--;) - for (c = lpremul, t = premul; c--; *r++ = *t++); - for (c = lpreone; c--; *r++ = *preone++); + if ((m = f % lpremul)) { + /* + * Left over fraction of repeated string. + */ + MB_METACHARINIT(); + /* Skip this much. */ + m = lpremul - m; + for (t = premul; m > 0; ) { + t += MB_METACHARLENCONV(t, &cchar); + m -= WCWIDTH(cchar); + } + /* Output the rest. */ + while (*t) + *r++ = *t++; + } + for (cc = f / lpremul; cc--;) { + /* Repeat the repeated string */ + MB_METACHARINIT(); + for (c = lpremul, t = premul; c > 0; ) { + cl = MB_METACHARLENCONV(t, &cchar); + while (cl--) + *r++ = *t++; + c -= WCWIDTH(cchar); + } + } + if (preone) { + /* Output the full unrepeated string */ + while (*preone) + *r++ = *preone++; + } + } + /* Output the first half width of the original string. 
*/ + for (c = ls2; c > 0; ) { + cl = MB_METACHARLENCONV(str, &cchar); + c -= WCWIDTH(cchar); + while (cl--) + *r++ = *str++; } - for (c = ls2; c--; *r++ = *str++); } + /* Other half. In case the string had an odd length... */ ls2 = ls - ls2; + /* Width that needs padding... */ f = postnum - ls2; - if (f <= 0) - for (c = postnum; c--; *r++ = *str++); - else { - for (c = ls2; c--; *r++ = *str++); - if (f <= lpostone) - for (c = f; c--; *r++ = *postone++); - else { - f -= lpostone; - for (c = lpostone; c--; *r++ = *postone++); - for (cc = f / lpostmul; cc--;) - for (c = lpostmul, t = postmul; c--; *r++ = *t++); - if ((m = f % lpostmul)) - for (; m--; *r++ = *postmul++); + if (f <= 0) { + /* ...is negative, truncate original string */ + MB_METACHARINIT(); + for (c = postnum; c > 0; ) { + cl = MB_METACHARLENCONV(str, &cchar); + c -= WCWIDTH(cchar); + while (cl--) + *r++ = *str++; + } + } else { + /* Rest of original string fits, output it complete */ + while (*str) + *r++ = *str++; + if (f <= lpostone) { + if (postone) { + /* Can't fit unrepeated string, truncate it */ + for (c = f; c > 0; ) { + cl = MB_METACHARLENCONV(postone, &cchar); + c -= WCWIDTH(cchar); + while (cl--) + *r++ = *postone++; + } + } + } else { + if (postone) { + f -= lpostone; + /* Output entire unrepeated string */ + while (*postone) + *r++ = *postone++; + } + for (cc = f / lpostmul; cc--;) { + /* Begin the beguine */ + for (t = postmul; *t; ) + *r++ = *t++; + } + if ((m = f % lpostmul)) { + /* Fill leftovers with chunk of repeated string */ + MB_METACHARINIT(); + while (m > 0) { + cl = MB_METACHARLENCONV(postmul, &cchar); + m -= WCWIDTH(cchar); + while (cl--) + *r++ = *postmul++; + } + } } } } else { + /* + * Pad only on the left. + */ f = prenum - ls; - if (f <= 0) - for (c = prenum, str -= f; c--; *r++ = *str++); - else { - if (f <= lpreone) - for (c = f, t = preone + lpreone - f; c--; *r++ = *t++); - else { + if (f <= 0) { + /* + * Original string is at least as wide as padding. 
+ * Truncate original string to width. + * Truncate on left, so skip the characters we + * don't need. + */ + f = -f; + MB_METACHARINIT(); + while (f > 0) { + str += MB_METACHARLENCONV(str, &cchar); + f -= WCWIDTH(cchar); + } + /* Copy the rest of the original string */ + for (c = prenum; c > 0; ) { + cl = MB_METACHARLENCONV(str, &cchar); + while (cl--) + *r++ = *str++; + c -= WCWIDTH(cchar); + } + } else { + /* + * We can fit the entire string... + */ + if (f <= lpreone) { + if (preone) { + /* + * ...with some fraction of the unrepeated string. + */ + /* We need this width of characters. */ + c = f; + /* + * We therefore need to skip this width of + * characters. + */ + f = lpreone - f; + MB_METACHARINIT(); + for (t = preone; f > 0; ) { + t += MB_METACHARLENCONV(t, &cchar); + f -= WCWIDTH(cchar); + } + /* Copy the rest of preone */ + while (*t) + *r++ = *t++; + } + } else { + /* + * We can fit the whole of preone, needing this width + * first + */ f -= lpreone; - if ((m = f % lpremul)) - for (c = m, t = premul + lpremul - m; c--; *r++ = *t++); - for (cc = f / lpremul; cc--;) - for (c = lpremul, t = premul; c--; *r++ = *t++); - for (c = lpreone; c--; *r++ = *preone++); + if ((m = f % lpremul)) { + /* + * Some fraction of the repeated string needed. + */ + /* Need this much... */ + c = m; + /* ...skipping this much first. */ + m = lpremul - m; + MB_METACHARINIT(); + for (t = premul; m > 0; ) { + t += MB_METACHARLENCONV(t, &cchar); + m -= WCWIDTH(cchar); + } + /* Now the rest of the repeated string. */ + while (c > 0) { + cl = MB_METACHARLENCONV(t, &cchar); + while (cl--) + *r++ = *t++; + c -= WCWIDTH(cchar); + } + } + for (cc = f / lpremul; cc--;) { + /* + * Repeat the repeated string. + */ + MB_METACHARINIT(); + for (c = lpremul, t = premul; c > 0; ) { + cl = MB_METACHARLENCONV(t, &cchar); + while (cl--) + *r++ = *t++; + c -= WCWIDTH(cchar); + } + } + if (preone) { + /* + * Now the entire unrepeated string. Don't + * count the width, just dump it. 
This is + * significant if there are special characters + * in this string. It's sort of a historical + * accident that this worked, but there's nothing + * to stop us just dumping the thing out and assuming + * the user knows what they're doing. + */ + while (*preone) + *r++ = *preone++; + } } - for (c = ls; c--; *r++ = *str++); + /* Now the string being padded */ + while (*str) + *r++ = *str++; } } } else if (postnum) { + /* + * Pad on the right. + */ f = postnum - ls; - if (f <= 0) - for (c = postnum; c--; *r++ = *str++); - else { - for (c = ls; c--; *r++ = *str++); - if (f <= lpostone) - for (c = f; c--; *r++ = *postone++); - else { - f -= lpostone; - for (c = lpostone; c--; *r++ = *postone++); - for (cc = f / lpostmul; cc--;) - for (c = lpostmul, t = postmul; c--; *r++ = *t++); - if ((m = f % lpostmul)) - for (; m--; *r++ = *postmul++); + MB_METACHARINIT(); + if (f <= 0) { + /* + * Original string is at least as wide as padding. + * Truncate original string to width. + */ + for (c = postnum; c > 0; ) { + cl = MB_METACHARLENCONV(str, &cchar); + while (cl--) + *r++ = *str++; + c -= WCWIDTH(cchar); + } + } else { + /* + * There's some space to fill. First copy the original + * string, counting the width. Make sure we copy the + * entire string. + */ + for (c = ls; *str; ) { + cl = MB_METACHARLENCONV(str, &cchar); + while (cl--) + *r++ = *str++; + c -= WCWIDTH(cchar); + } + MB_METACHARINIT(); + if (f <= lpostone) { + if (postone) { + /* + * Not enough or only just enough space to fit + * the unrepeated string. Truncate as necessary. 
+ */ + for (c = f; c > 0; ) { + cl = MB_METACHARLENCONV(postone, &cchar); + while (cl--) + *r++ = *postone++; + c -= WCWIDTH(cchar); + } + } + } else { + if (postone) { + f -= lpostone; + /* Copy the entire unrepeated string */ + for (c = lpostone; *postone; ) { + cl = MB_METACHARLENCONV(postone, &cchar); + while (cl--) + *r++ = *postone++; + c -= WCWIDTH(cchar); + } + } + /* Repeat the repeated string */ + for (cc = f / lpostmul; cc--;) { + MB_METACHARINIT(); + for (c = lpostmul, t = postmul; *t; ) { + cl = MB_METACHARLENCONV(t, &cchar); + while (cl--) + *r++ = *t++; + c -= WCWIDTH(cchar); + } + } + /* + * See if there's any fraction of the repeated + * string needed to fill up the remaining space. + */ + if ((m = f % lpostmul)) { + MB_METACHARINIT(); + while (m > 0) { + cl = MB_METACHARLENCONV(postmul, &cchar); + while (cl--) + *r++ = *postmul++; + m -= WCWIDTH(cchar); + } + } } } } @@ -1779,6 +2069,9 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) * by flags. TODO: maybe therefore this would * be more consistent if moved into getstrvalue()? * Bet that's easier said than done. + * + * TODO: use string widths. In fact, shouldn't the + * strlen()s be ztrlen()s anyway? */ val = getstrvalue(v); fwidth = v->pm->width ? 
v->pm->width : (int)strlen(val); diff --git a/Src/utils.c b/Src/utils.c index a72ddfcc5..37017bdc7 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -524,8 +524,12 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep) return buf; } - if (widthp) - *widthp = (s - buf) + wcwidth(c); + if (widthp) { + int wcw = wcwidth(c); + *widthp = (s - buf); + if (wcw > 0) + *widthp += wcw; + } if (swidep) *swidep = s; for (mbptr = mbstr; ret; s++, mbptr++, ret--) { @@ -539,6 +543,22 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep) *s = 0; return buf; } + +/**/ +mod_export int +zwcwidth(wint_t wc) +{ + int wcw; + /* assume a single-byte character if not valid */ + if (wc == WEOF) + return 1; + wcw = wcwidth(wc); + /* if not printable, assume zero width */ + if (wcw <= 0) + return 0; + return wcw; +} + /**/ #endif /* MULTIBYTE_SUPPORT */ @@ -3953,58 +3973,6 @@ nicedup(const char *s, int heap) return retstr; } -/* - * Return the screen width of a multibyte string. The input - * string is metafied. - */ -/**/ -mod_export int -mb_width(const char *s) -{ - char *ums = ztrdup(s), *umptr; - int umlen, eol = 0; - int width = 0; - mbstate_t mbs; - - memset(&mbs, 0, sizeof mbs); - umptr = unmetafy(ums, &umlen); - /* - * Convert one wide character at a time. We could convet - * the entire string using mbsrtowcs(), but that terminates on - * a NUL and we might have embedded NULs. - */ - while (umlen > 0) { - int wret; - wchar_t cc; - size_t cnt = eol ? MB_INVALID : mbrtowc(&cc, umptr, umlen, &mbs); - - switch (cnt) { - case MB_INCOMPLETE: - eol = 1; - /* FALL THROUGH */ - case MB_INVALID: - memset(&mbs, 0, sizeof mbs); - /* FALL THROUGH */ - case 0: - /* Assume a single-width character. 
*/ - width++; - cnt = 1; - break; - default: - wret = wcwidth(cc); - if (wret > 0) - width += wret; - break; - } - - umlen -= cnt; - umptr += cnt; - } - - free(ums); - - return width; -} /* * Length of metafied string s which contains the next multibyte @@ -4107,9 +4075,15 @@ mb_metastrlen(char *ptr, int width) memset(&mb_shiftstate, 0, sizeof(mb_shiftstate)); ptr = laststart + (*laststart == Meta) + 1; num++; - } else if (width) - num += wcwidth(wc); - else + } else if (width) { + /* + * Returns -1 if not a printable character; best + * just to ignore these. + */ + int wcw = wcwidth(wc); + if (wcw > 0) + num += wcw; + } else num++; laststart = ptr; num_in_char = 0; diff --git a/Src/zsh.h b/Src/zsh.h index 3cb006cbf..27bb96493 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -2012,7 +2012,7 @@ typedef wint_t convchar_t; * It's written to use the wint_t from mb_metacharlenconv() without * further tests. */ -#define WCWIDTH(wc) ((wc == WEOF) ? 1 : wcwidth(wc)) +#define WCWIDTH(wc) zwcwidth(wc) #define MB_INCOMPLETE ((size_t)-2) #define MB_INVALID ((size_t)-1) diff --git a/Test/D04parameter.ztst b/Test/D04parameter.ztst index 7c0b28878..57147d53e 100644 --- a/Test/D04parameter.ztst +++ b/Test/D04parameter.ztst @@ -366,6 +366,18 @@ 0:${(pl...)...} >Xresulting """"Xwords roariously """Xpadded + print ${(l.5..X.r.5..Y.)foo} + print ${(l.6..X.r.4..Y.)foo} + print ${(l.7..X.r.3..Y.)foo} + print ${(l.6..X..A.r.6..Y..B.)foo} + print ${(l.6..X..AROOGA.r.6..Y..BARSOOM.)foo} +0:simultaneous left and right padding +>Xresulting XXXwordsYY proariousl XXpaddedYY +>XXresultin XXXXwordsY uproarious XXXpaddedY +>XXXresulti XXXXXwords Xuproariou XXXXpadded +>XAresultingB XXXAwordsBYY uproariously XXApaddedBYY +>GAresultingB OOGAwordsBAR uproariously OGApaddedBAR + foo=(why in goodness name am I doing this) print ${(r.5..!..?.)foo} 0:${(r...)...} diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index fe20ebb73..8b17a7294 100644 --- a/Test/D07multibyte.ztst +++ 
b/Test/D07multibyte.ztst @@ -283,3 +283,17 @@ >Ἐν ἀρχῇ ἦν ὁ >Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος >Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ + + foo=(κατέβην χθὲς εἰς Πειραιᾶ) + print ${(l.3..¥.r.3..£.)foo} + print ${(l.4..¥.r.2..£.)foo} + print ${(l.5..¥.r.1..£.)foo} + print ${(l.4..¥..«.r.4..£..».)foo} + print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo} +0:simultaneous left and right padding +>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι +>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα +>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ +>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ +>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ +# er... yeah, that looks right... -- cgit 1.4.1