diff options
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | Src/sort.c | 48 | ||||
-rw-r--r-- | Test/B03print.ztst | 7 | ||||
-rw-r--r-- | Test/D07multibyte.ztst | 11 |
4 files changed, 67 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog index 8bcc4a322..a747e8aa2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2007-01-22 Peter Stephenson <pws@csr.com> + + * 23119: Src/sort.c, Test/B03print.ztst, Test/D07multibyte.ztst: + do lowering of multibyte character case in sorting properly. + 2007-01-21 Peter Stephenson <p.w.stephenson@ntlworld.com> * 23118: Doc/Zsh/expn.yo, Src/builtin.c, Src/glob.c, Src/jobs.c, diff --git a/Src/sort.c b/Src/sort.c index 2fdb77931..1b8507342 100644 --- a/Src/sort.c +++ b/Src/sort.c @@ -248,7 +248,8 @@ strmetasort(char **array, int sortwhat, int *unmetalenp) || *metaptr == Meta) { char *s, *t, *src = *arrptr, *dst; int len; - sortarrptr->cmp = dst = (char *)zhalloc(strlen(src) + 1); + sortarrptr->cmp = dst = + (char *)zhalloc(((sortwhat & SORTIT_IGNORING_CASE)?2:1)*strlen(src)+1); if (unmetalenp) { /* Already unmetafied and we have the length. */ @@ -283,8 +284,49 @@ strmetasort(char **array, int sortwhat, int *unmetalenp) len = metaptr - src; } if (sortwhat & SORTIT_IGNORING_CASE) { - for (s = src, t = dst; s - src != len; ) - *t++ = tulower(*s++); + char *send = src + len; +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE)) { + /* + * Lower the case the hard way. Convert to a wide + * character, process that, and convert back. We + * don't assume the characters have the same + * multibyte length. We can't use casemodify() + * because we have unmetafied data, which may have + * been passed down to us. 
+ */ + mbstate_t mbsin, mbsout; + int clen; + wchar_t wc; + memset(&mbsin, 0, sizeof(mbstate_t)); + memset(&mbsout, 0, sizeof(mbstate_t)); + + for (s = src, t = dst; s < send; ) { + clen = mbrtowc(&wc, s, send-s, &mbsin); + if (clen < 0) { + /* invalid or unfinished: treat as single bytes */ + while (s < send) + *t++ = tulower(*s++); + break; + } + if (clen == 0) { + /* embedded null */ + *t++ = '\0'; + s++; + continue; + } + s += clen; + wc = towlower(wc); + clen = wcrtomb(t, wc, &mbsout); + t += clen; + DPUTS(clen < 0, "Bad conversion when lowering case"); + } + *t = '\0'; + len = t - dst; + } else +#endif + for (s = src, t = dst; s < send; ) + *t++ = tulower(*s++); src = dst; } if (sortwhat & SORTIT_IGNORING_BACKSLASHES) { diff --git a/Test/B03print.ztst b/Test/B03print.ztst index c3ba42b18..92a24d6b6 100644 --- a/Test/B03print.ztst +++ b/Test/B03print.ztst @@ -34,7 +34,12 @@ >baz >bar - print -io a B c +# some locales force case-insensitive sorting + (LC_ALL=C; print -o a B c) +0:case-sensitive argument sorting +>B a c + + (LC_ALL=C; print -io a B c) 0:case-insensitive argument sorting >a B c diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index ecac737a1..c3a24c067 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -2,6 +2,8 @@ # Find a UTF-8 locale. setopt multibyte +# Don't let LC_* override our choice of locale. + unset -m LC_\* mb_ok= langs=(en_US.UTF-8 en_GB.UTF-8 en.UTF-8 $(locale -a 2>/dev/null | sed -e 's/utf8/UTF-8/' | grep UTF-8)) @@ -315,3 +317,12 @@ printf "%4.3s\n" főobar 0:Multibyte characters in printf widths > főo + +# We ask for case-insensitive sorting here (and supply upper case +# characters) so that we exercise the logic in the shell that lowers the +# case of the string for case-insensitive sorting. + print -oi HAH HUH HEH HÉH HÈH + (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH) +0:Multibyte characters in print sorting +>HAH HEH HÉH HÈH HUH +>HAH HEH HUH HÈH HÉH |