From da74eb63387aa8560eab844e8315f0d135c1c965 Mon Sep 17 00:00:00 2001
From: Oliver Kiddle <opk@users.sourceforge.net>
Date: Tue, 13 May 2003 12:50:26 +0000
Subject: 18525: add manual UTF-8 conversion so \u and \U should work on more
 systems

---
 ChangeLog   |   5 +++
 Src/utils.c | 143 ++++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 19940b460..74d1e73b5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2003-05-13  Oliver Kiddle  <opk@zsh.org>
+
+	* 18525: Src/utils.c: add manual UTF-8 conversion as extra
+	implementation of \u and \U so it should work on more systems
+
 2003-05-12  Peter Stephenson  <pws@csr.com>
 
 	* 18524: Src/utils.c: sepjoin didn't respect !heap for an empty
diff --git a/Src/utils.c b/Src/utils.c
index 05ef32844..dd8794ca6 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -30,13 +30,15 @@
 #include "zsh.mdh"
 #include "utils.pro"
 
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-#include <wchar.h>
-#  ifndef __STDC_ISO_10646__
-#    if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
-#      include <iconv.h>
-#    endif
-#  endif
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined (__STDC_ISO_10646__)
+# include <wchar.h>
+#else
+# ifdef HAVE_LANGINFO_H 			       
+#   include <langinfo.h>			       
+#   if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)   
+#     include <iconv.h> 			       
+#   endif					       
+# endif 					       
 #endif
 
 /* name of script being sourced */
@@ -3271,6 +3273,42 @@ dquotedzputs(char const *s, FILE *stream)
 }
 #endif
 
+#if defined(HAVE_NL_LANGINFO) && defined(CODESET) && \
+    !(defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__))
+/*
+ * Convert the UCS-4 value wval to UTF-8: store 1 to 6 bytes at dest (no
+ * NUL is appended) and return how many.  The guard above mirrors the
+ * branch of getkeystring() that calls this function.
+ */
+
+size_t
+ucs4toutf8(char *dest, unsigned int wval)
+{
+    size_t len;
+
+    /* the range wval falls in fixes the length of the sequence */
+    if (wval < 0x80) len = 1;
+    else if (wval < 0x800) len = 2;
+    else if (wval < 0x10000) len = 3;
+    else if (wval < 0x200000) len = 4;
+    else if (wval < 0x4000000) len = 5;
+    else len = 6;
+
+    switch (len) { /* falls through except to the last case */
+    case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6;
+    case 5: dest[4] = (wval & 0x3f) | 0x80; wval >>= 6;
+    case 4: dest[3] = (wval & 0x3f) | 0x80; wval >>= 6;
+    case 3: dest[2] = (wval & 0x3f) | 0x80; wval >>= 6;
+    case 2: dest[1] = (wval & 0x3f) | 0x80; wval >>= 6;
+	/* lead byte: top `len' bits set, leftover bits of wval below */
+	*dest = wval | ((0xfc << (6 - len)) & 0xfc);
+	break;
+    case 1: *dest = wval;
+    }
+    return len;
+}
+#endif
+
 /*
  * Decode a key string, turning it into the literal characters.
  * The length is returned in len.
@@ -3299,18 +3337,18 @@ getkeystring(char *s, int *len, int fromwhere, int *misc)
     char svchar = '\0';
     int meta = 0, control = 0;
     int i;
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-#  ifdef __STDC_ISO_10646__
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
     wint_t wval;
-#  elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    size_t count;
+#else
     unsigned int wval;
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && (defined(HAVE_ICONV) || defined(HAVE_LIBICONV))
     iconv_t cd;
     char inbuf[4];
-    wchar_t outbuf[1];
     size_t inbytes, outbytes;
-    char *inptr, *outptr;
-#  endif
+    char *inptr;
     size_t count;
+# endif
 #endif
 
     if (fromwhere == 6)
@@ -3387,8 +3425,6 @@ getkeystring(char *s, int *len, int fromwhere, int *misc)
 		    *misc = 1;
 		    break;
 		}
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
 	    case 'u':
 	    case 'U':
 	    	wval = 0;
@@ -3407,21 +3443,10 @@ getkeystring(char *s, int *len, int fromwhere, int *misc)
 		    *misc = wval;
 		    return s+1;
 		}
-#ifdef __STDC_ISO_10646__
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
 		count = wctomb(t, (wchar_t)wval);
-#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
-    	    	inbytes = outbytes = 4;
-    	    	inptr = inbuf;
-    	    	outptr = (char *)outbuf;
-		/* assume big endian convention for UCS-4 */
-		for (i=3;i>=0;i--) {
-		    inbuf[i] = wval & 0xff;
-		    wval >>= 8;
-		}
-    	    	
-    	    	cd = iconv_open("WCHAR_T", "ISO-10646");
-		if (cd == (iconv_t)-1) {
-		    zerr("cannot do charset conversion", NULL, 0);
+		if (count == (size_t)-1) {
+		    zerr("character not in range", NULL, 0);
 		    if (fromwhere == 4) {
 			for (u = t; (*u++ = *++s););
 			return t;
@@ -3430,24 +3455,58 @@ getkeystring(char *s, int *len, int fromwhere, int *misc)
 		    *len = t - buf;
 		    return buf;
 		}
-                iconv(cd, (const char **)&inptr, &inbytes, &outptr, &outbytes);
-		iconv_close(cd);
-		count = wctomb(t, *outbuf);
-#endif
-		if (count == (size_t)-1) {
-		    zerr("character not in range", NULL, 0);
-		    if (fromwhere == 4) {
-			for (u = t; (*u++ = *++s););
-			return t;
+		t += count;  
+		continue;
+# else
+#  if defined(HAVE_NL_LANGINFO) && defined(CODESET)
+		if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
+		    t += ucs4toutf8(t, wval);
+		    continue;
+		} else {
+#   if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+    	    	    inbytes = 4;
+		    outbytes = 6;
+    	    	    inptr = inbuf;
+		    /* assume big endian convention for UCS-4 */
+		    for (i=3;i>=0;i--) {
+			inbuf[i] = wval & 0xff;
+			wval >>= 8;
+		    }
+
+    	    	    cd = iconv_open(nl_langinfo(CODESET), "ISO-10646");
+		    if (cd == (iconv_t)-1) {
+			zerr("cannot do charset conversion", NULL, 0);
+			if (fromwhere == 4) {
+			    for (u = t; (*u++ = *++s););
+			    return t;
+			}
+			*t = '\0';
+			*len = t - buf;
+			return buf;
+		    }
+                    count = iconv(cd, (char **)&inptr, &inbytes, &t, &outbytes);
+		    iconv_close(cd);
+		    if (count == (size_t)-1) {
+                        zerr("cannot do charset conversion", NULL, 0);
+		        *t = '\0';
+			*len = t - buf;
+			return buf;
 		    }
+		    continue;
+#   else
+                    zerr("cannot do charset conversion", NULL, 0);
 		    *t = '\0';
 		    *len = t - buf;
 		    return buf;
+#   endif
 		}
-		t += count;  
-		continue;
-#endif
-#endif
+#  else
+                zerr("cannot do charset conversion", NULL, 0);
+		*t = '\0';
+		*len = t - buf;
+		return buf;
+#  endif
+# endif
 	    default:
 	    def:
 		if ((idigit(*s) && *s < '8') || *s == 'x') {
-- 
cgit 1.4.1