From d8fac9fb2315c3edf5f51d0df81c8d99b7018662 Mon Sep 17 00:00:00 2001 From: Oliver Kiddle Date: Fri, 14 Mar 2003 13:36:07 +0000 Subject: 18343, 18348: handle \u and \U escapes for specifying unicode characters --- ChangeLog | 5 +++ Doc/Zsh/builtins.yo | 2 ++ Src/utils.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++-- zshconfig.ac | 7 ++-- 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index dd1b20142..e4e024b18 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2003-03-14 Oliver Kiddle + + * 18343, 18348: zshconfig.ac, Doc/Zsh/builtins.yo, Src/utils.c: + handle \u and \U escapes for specifying unicode characters + 2003-03-15 Doug Kearns * 18351: Completion/Unix/Command/_antiword: new completion for diff --git a/Doc/Zsh/builtins.yo b/Doc/Zsh/builtins.yo index 899bc96d8..67d2c11e4 100644 --- a/Doc/Zsh/builtins.yo +++ b/Doc/Zsh/builtins.yo @@ -278,6 +278,8 @@ sitem(tt(\v))(vertical tab) sitem(tt(\\))(backslash) sitem(tt(\0)var(NNN))(character code in octal) sitem(tt(\x)var(NN))(character code in hexadecimal) +sitem(tt(\u)var(NNNN))(unicode character code in hexadecimal) +sitem(tt(\U)var(NNNNNNNN))(unicode character code in hexadecimal) endsitem() pindex(BSD_ECHO, use of) diff --git a/Src/utils.c b/Src/utils.c index 64a6a722e..2b0e7faea 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -30,6 +30,15 @@ #include "zsh.mdh" #include "utils.pro" +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +#include <wchar.h> +# ifndef __STDC_ISO_10646__ +# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV) +# include <iconv.h> +# endif +# endif +#endif + /* name of script being sourced */ /**/ @@ -3274,7 +3283,8 @@ dquotedzputs(char const *s, FILE *stream) * for no newlines. * 3: As 1, but don't handle \c. * 4: Do $'...' quoting. Overwrites the existing string instead of - * zhalloc'ing + * zhalloc'ing. If \uNNNN ever generates multi-byte chars longer + * than 6 bytes, will need to adjust this to re-allocate memory. * 5: As 2, but \- is special. 
Expects misc to be defined. * 6: As 2, but parses only one character and returns end-pointer * and parsed character in *misc @@ -3288,11 +3298,28 @@ getkeystring(char *s, int *len, int fromwhere, int *misc) char *t, *u = NULL; char svchar = '\0'; int meta = 0, control = 0; + int i; +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +# ifdef __STDC_ISO_10646__ + wint_t wval; +# elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + unsigned int wval; + iconv_t cd; + char inbuf[4]; + wchar_t outbuf[1]; + size_t inbytes, outbytes; + char *inptr, *outptr; +# endif + size_t count; + size_t buflen = MB_LEN_MAX * (strlen(s) / 6) + (strlen(s) % 6) + 1; +#else + size_t buflen = strlen(s) + 1; +#endif if (fromwhere == 6) t = buf = tmp; else if (fromwhere != 4) - t = buf = zhalloc(strlen(s) + 1); + t = buf = zhalloc(buflen); else { t = buf = s; s += 2; @@ -3363,6 +3390,67 @@ getkeystring(char *s, int *len, int fromwhere, int *misc) *misc = 1; break; } +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) +#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + case 'u': + case 'U': + wval = 0; + for (i=(*s == 'u' ? 
4 : 8); i>0; i--) { + if (*++s && idigit(*s)) + wval = wval * 16 + (*s - '0'); + else if (*s && (*s >= 'a' && *s <= 'f') || + (*s >= 'A' && *s <= 'F')) + wval = wval * 16 + (*s & 0x1f) + 9; + else { + s--; + break; + } + } + if (fromwhere == 6) { + *misc = wval; + return s+1; + } +#ifdef __STDC_ISO_10646__ + count = wctomb(t, (wchar_t)wval); +#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV) + inbytes = outbytes = 4; + inptr = inbuf; + outptr = (char *)outbuf; + /* assume big endian convention for UCS-4 */ + for (i=3;i>=0;i--) { + inbuf[i] = wval & 0xff; + wval >>= 8; + } + + cd = iconv_open("WCHAR_T", "ISO-10646"); + if (cd == (iconv_t)-1) { + zerr("cannot do charset conversion", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + iconv(cd, &inptr, &inbytes, &outptr, &outbytes); + iconv_close(cd); + count = wctomb(t, *outbuf); +#endif + if (count == (size_t)-1) { + zerr("character not in range", NULL, 0); + if (fromwhere == 4) { + for (u = t; (*u++ = *++s);); + return t; + } + *t = '\0'; + *len = t - buf; + return buf; + } + t += count; + continue; +#endif +#endif default: def: if ((idigit(*s) && *s < '8') || *s == 'x') { diff --git a/zshconfig.ac b/zshconfig.ac index 33ca83f78..fd578c028 100644 --- a/zshconfig.ac +++ b/zshconfig.ac @@ -494,7 +494,7 @@ AC_CHECK_HEADERS(sys/time.h sys/times.h sys/select.h termcap.h termio.h \ limits.h fcntl.h libc.h sys/utsname.h sys/resource.h \ locale.h errno.h stdio.h stdlib.h unistd.h sys/capability.h \ utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \ - netinet/in_systm.h pcre.h langinfo.h) + netinet/in_systm.h pcre.h langinfo.h wchar.h) if test $dynamic = yes; then AC_CHECK_HEADERS(dlfcn.h) AC_CHECK_HEADERS(dl.h) @@ -663,6 +663,8 @@ AC_CHECK_LIB(cap, cap_get_proc) AC_CHECK_LIB(socket, socket) +AC_CHECK_LIB(iconv, iconv) + dnl pcre-config should probably be employed here AC_SEARCH_LIBS(pcre_compile, pcre) @@ -959,7 +961,8 @@ 
AC_CHECK_FUNCS(strftime difftime gettimeofday \ tgetent tigetflag tigetnum tigetstr setupterm \ pcre_compile pcre_study pcre_exec \ nl_langinfo \ - erand48 open_memstream) + erand48 open_memstream \ + wctomb iconv) AC_FUNC_STRCOLL dnl Check if tgetent accepts NULL (and will allocate its own termcap buffer) -- cgit 1.4.1