about summary refs log tree commit diff
diff options
context:
space:
mode:
author Peter Stephenson <pws@users.sourceforge.net> 2007-01-22 14:35:12 +0000
committer Peter Stephenson <pws@users.sourceforge.net> 2007-01-22 14:35:12 +0000
commit e375d5ee8817e7f98d0a2f37cfb7566b8572d0e0 (patch)
tree 4fcda72583f7a2f899be950db94b24017aa3e0ce
parent c53aa4adee9236a08d2d88c6e753588760b88f0e (diff)
download zsh-e375d5ee8817e7f98d0a2f37cfb7566b8572d0e0.tar.gz
zsh-e375d5ee8817e7f98d0a2f37cfb7566b8572d0e0.tar.xz
zsh-e375d5ee8817e7f98d0a2f37cfb7566b8572d0e0.zip
23119: lower case in sorting properly
-rw-r--r-- ChangeLog 5
-rw-r--r-- Src/sort.c 48
-rw-r--r-- Test/B03print.ztst 7
-rw-r--r-- Test/D07multibyte.ztst 11
4 files changed, 67 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index 8bcc4a322..a747e8aa2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2007-01-22  Peter Stephenson  <pws@csr.com>
+
+	* 23119: Src/sort.c, Test/B03print.ztst, Test/D07multibyte.ztst:
+	do lowering of multibyte character case in sorting properly.
+
 2007-01-21  Peter Stephenson  <p.w.stephenson@ntlworld.com>
 
 	* 23118: Doc/Zsh/expn.yo, Src/builtin.c, Src/glob.c, Src/jobs.c,
diff --git a/Src/sort.c b/Src/sort.c
index 2fdb77931..1b8507342 100644
--- a/Src/sort.c
+++ b/Src/sort.c
@@ -248,7 +248,8 @@ strmetasort(char **array, int sortwhat, int *unmetalenp)
 	    || *metaptr == Meta) {
 	    char *s, *t, *src = *arrptr, *dst;
 	    int len;
-	    sortarrptr->cmp = dst = (char *)zhalloc(strlen(src) + 1);
+	    sortarrptr->cmp = dst =
+		(char *)zhalloc(((sortwhat & SORTIT_IGNORING_CASE)?2:1)*strlen(src)+1);
 
 	    if (unmetalenp) {
 		/* Already unmetafied and we have the length. */
@@ -283,8 +284,49 @@ strmetasort(char **array, int sortwhat, int *unmetalenp)
 		len = metaptr - src;
 	    }
 	    if (sortwhat & SORTIT_IGNORING_CASE) {
-		for (s = src, t = dst; s - src != len; )
-		    *t++ = tulower(*s++);
+		char *send = src + len;
+#ifdef MULTIBYTE_SUPPORT
+		if (isset(MULTIBYTE)) {
+		    /*
+		     * Lower the case the hard way.  Convert to a wide
+		     * character, process that, and convert back.  We
+		     * don't assume the characters have the same
+		     * multibyte length.  We can't use casemodify()
+		     * because we have unmetafied data, which may have
+		     * been passed down to us.
+		     */
+		    mbstate_t mbsin, mbsout;
+		    int clen;
+		    wchar_t wc;
+		    memset(&mbsin, 0, sizeof(mbstate_t));
+		    memset(&mbsout, 0, sizeof(mbstate_t));
+
+		    for (s = src, t = dst; s < send; ) {
+			clen = mbrtowc(&wc, s, send-s, &mbsin);
+			if (clen < 0) {
+			    /* invalid or unfinished: treat as single bytes */
+			    while (s < send)
+				*t++ = tulower(*s++);
+			    break;
+			}
+			if (clen == 0) {
+			    /* embedded null */
+			    *t++ = '\0';
+			    s++;
+			    continue;
+			}
+			s += clen;
+			wc = towlower(wc);
+			clen = wcrtomb(t, wc, &mbsout);
+			t += clen;
+			DPUTS(clen < 0, "Bad conversion when lowering case");
+		    }
+		    *t = '\0';
+		    len = t - dst;
+		} else
+#endif
+		    for (s = src, t = dst; s < send; )
+			*t++ = tulower(*s++);
 		src = dst;
 	    }
 	    if (sortwhat & SORTIT_IGNORING_BACKSLASHES) {
diff --git a/Test/B03print.ztst b/Test/B03print.ztst
index c3ba42b18..92a24d6b6 100644
--- a/Test/B03print.ztst
+++ b/Test/B03print.ztst
@@ -34,7 +34,12 @@
 >baz
 >bar
 
- print -io a B c
+# some locales force case-insensitive sorting
+ (LC_ALL=C; print -o a B c)
+0:case-sensitive argument sorting
+>B a c
+
+ (LC_ALL=C; print -io a B c)
 0:case-insensitive argument sorting
 >a B c
 
diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst
index ecac737a1..c3a24c067 100644
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@@ -2,6 +2,8 @@
 
 # Find a UTF-8 locale.
   setopt multibyte
+# Don't let LC_* override our choice of locale.
+  unset -m LC_\*
   mb_ok=
   langs=(en_US.UTF-8 en_GB.UTF-8 en.UTF-8
 	 $(locale -a 2>/dev/null | sed -e 's/utf8/UTF-8/' | grep UTF-8))
@@ -315,3 +317,12 @@
   printf "%4.3s\n" főobar
 0:Multibyte characters in printf widths
 > főo
+
+# We ask for case-insensitive sorting here (and supply upper case
+# characters) so that we exercise the logic in the shell that lowers the
+# case of the string for case-insensitive sorting.
+  print -oi HAH HUH HEH HÉH HÈH
+  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
+0:Multibyte characters in print sorting
+>HAH HEH HÉH HÈH HUH
+>HAH HEH HUH HÈH HÉH