/*
 * Convert a single character from UCS-4 to UTF-8.
 *
 * dest: output buffer; must have room for up to 6 bytes.  No terminating
 *       NUL is written.
 * wval: the UCS-4 code value to encode.
 *
 * Returns the number of bytes stored in dest (1 to 6).  The original
 * 1-to-6 byte UTF-8 scheme is used, so values of 0x200000 and above are
 * encoded as 5 or 6 byte sequences.
 *
 * NOTE(review): values above 0x7fffffff do not fit the 6-byte form; the
 * extra high bits are OR-ed into the lead byte, giving 0xfe or 0xff.
 * Callers that can receive such values should range-check first.
 */

size_t
ucs4toutf8(char *dest, unsigned int wval)
{
    /* Work on unsigned bytes so that storing values >= 0x80 is well defined */
    unsigned char *out = (unsigned char *)dest;
    size_t len, pos;

    if (wval < 0x80) {
	/* Plain ASCII is stored unchanged */
	out[0] = (unsigned char)wval;
	return 1;
    }

    if (wval < 0x800)
	len = 2;
    else if (wval < 0x10000)
	len = 3;
    else if (wval < 0x200000)
	len = 4;
    else if (wval < 0x4000000)
	len = 5;
    else
	len = 6;

    /* Continuation bytes, last one first: 10xxxxxx, six payload bits each */
    for (pos = len - 1; pos > 0; pos--) {
	out[pos] = (unsigned char)((wval & 0x3f) | 0x80);
	wval >>= 6;
    }

    /*
     * Lead byte: `len' high bits set followed by a zero bit, then the
     * remaining payload bits.  The marker is 0xc0, 0xe0, 0xf0, 0xf8 or
     * 0xfc for len 2..6; the parentheses make the intended grouping
     * explicit instead of relying on `&' binding tighter than `|'.
     */
    out[0] = (unsigned char)(wval | ((0xfcU << (unsigned int)(6 - len)) & 0xfcU));

    return len;
}
(fromwhere == 4) { for (u = t; (*u++ = *++s);); return t; @@ -3430,24 +3455,58 @@ getkeystring(char *s, int *len, int fromwhere, int *misc) *len = t - buf; return buf; } - iconv(cd, (const char **)&inptr, &inbytes, &outptr, &outbytes); - iconv_close(cd); - count = wctomb(t, *outbuf); -#endif - if (count == (size_t)-1) { - zerr("character not in range", NULL, 0); - if (fromwhere == 4) { - for (u = t; (*u++ = *++s);); - return t; + t += count; + continue; +# else +# if defined(HAVE_NL_LANGINFO) && defined(CODESET) + if (!strcmp(nl_langinfo(CODESET), "UTF-8")) { + t += ucs4toutf8(t, wval); + continue; + } else { +# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + inbytes = 4; + outbytes = 6; + inptr = inbuf; + /* assume big endian convention for UCS-4 */ + for (i=3;i>=0;i--) { + inbuf[i] = wval & 0xff; + wval >>= 8; + } + + cd = iconv_open(nl_langinfo(CODESET), "ISO-10646"); + if (cd == (iconv_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + count = iconv(cd, (char **)&inptr, &inbytes, &t, &outbytes); + iconv_close(cd); + if (count == (size_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + *t = '\0'; + *len = t - buf; + return buf; } + continue; +# else + zerr("cannot do charset conversion", NULL, 0); *t = '\0'; *len = t - buf; return buf; +# endif } - t += count; - continue; -#endif -#endif +# else + zerr("cannot do charset conversion", NULL, 0); + *t = '\0'; + *len = t - buf; + return buf; +# endif +# endif default: def: if ((idigit(*s) && *s < '8') || *s == 'x') { -- cgit 1.4.1