diff options
author | Jun-ichi Takimoto <takimoto-j@kba.biglobe.ne.jp> | 2023-09-27 01:56:47 +0900 |
---|---|---|
committer | Jun-ichi Takimoto <takimoto-j@kba.biglobe.ne.jp> | 2023-09-27 01:56:47 +0900 |
commit | 02e33c54d85208c7d9b96d91a26d04069ff19ed2 (patch) | |
tree | 2d4feecc28498ffd5cac25dbec09a319de380109 /Src/utils.c | |
parent | e4e9afe373479076ee448b16944a421836ba5a40 (diff) | |
download | zsh-02e33c54d85208c7d9b96d91a26d04069ff19ed2.tar.gz zsh-02e33c54d85208c7d9b96d91a26d04069ff19ed2.tar.xz zsh-02e33c54d85208c7d9b96d91a26d04069ff19ed2.zip |
52169: a few more improvements of (#) flag
Fix (#X) in the C locale on FreeBSD, DragonFly and NetBSD. Negative values such as ${(#X):--1} are now an error. UCS4 is limited to < 0x8000_0000 (on OSes without __STDC_ISO_10646__).
Diffstat (limited to 'Src/utils.c')
-rw-r--r-- | Src/utils.c | 230 |
1 files changed, 114 insertions, 116 deletions
diff --git a/Src/utils.c b/Src/utils.c index 7040d0954..7028c155f 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -6672,11 +6672,14 @@ dquotedzputs(char const *s, FILE *stream) # if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__) /* Convert a character from UCS4 encoding to UTF-8 */ -static size_t +static int ucs4toutf8(char *dest, unsigned int wval) { - size_t len; + int len; + /* UCS4 is now equvalent to UTF-32 and limited to 0 - 0x10_FFFF. + * This function accepts 0 - 0x7FFF_FFFF (old range of UCS4) to be + * compatible with wctomb(3) (in UTF-8 locale) on Linux. */ if (wval < 0x80) len = 1; else if (wval < 0x800) @@ -6687,8 +6690,12 @@ ucs4toutf8(char *dest, unsigned int wval) len = 4; else if (wval < 0x4000000) len = 5; - else + else if (wval < 0x80000000) len = 6; + else { + zerr("character not in range"); + return -1; + } switch (len) { /* falls through except to the last case */ case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6; @@ -6705,30 +6712,89 @@ ucs4toutf8(char *dest, unsigned int wval) } #endif +/* Convert UCS4 to a multibyte character in current locale. + * Result is saved in buf (must be at least MB_CUR_MAX bytes long). + * Returns the number of bytes saved in buf, or -1 if conversion fails. */ -/* - * The following only occurs once or twice in the code, but in different - * places depending how character set conversion is implemented. 
- */ -#define CHARSET_FAILED() \ - if (how & GETKEY_DOLLAR_QUOTE) { \ - while ((*tdest++ = *++s)) { \ - if (how & GETKEY_UPDATE_OFFSET) { \ - if (s - sstart > *misc) \ - (*misc)++; \ - } \ - if (*s == Snull) { \ - *len = (s - sstart) + 1; \ - *tdest = '\0'; \ - return buf; \ - } \ - } \ - *len = tdest - buf; \ - return buf; \ - } \ - *t = '\0'; \ - *len = t - buf; \ - return buf +/**/ +int +ucs4tomb(unsigned int wval, char *buf) +{ +#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__) + int count = wctomb(buf, (wchar_t)wval); + if (count == -1) + zerr("character not in range"); + return count; +#else /* !(HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__) */ +# if defined(HAVE_NL_LANGINFO) && defined(CODESET) + if (!strcmp(nl_langinfo(CODESET), "UTF-8")) { + return ucs4toutf8(buf, wval); + } else { +# ifdef HAVE_ICONV + iconv_t cd; + char inbuf[4], *bsave = buf; + ICONV_CONST char *inptr = inbuf; + size_t inbytes = 4, outbytes = 6; + const char *codesetstr = nl_langinfo(CODESET); + size_t count; + int i; + + /* + * If the code set isn't handled, we'd better assume it's US-ASCII + * rather than just failing hopelessly. Solaris has a weird habit + * of returning 646. This is handled by the native iconv(), but + * not by GNU iconv; what's more, some versions of the native iconv + * don't handle standard names like ASCII. + * + * This should only be a problem if there's a mismatch between the + * NLS and the iconv in use, which probably only means if libiconv + * is in use. We checked at configure time if our libraries pulled + * in _libiconv_version, which should be a good test. + * + * It shouldn't ever be NULL, but while we're being paranoid... 
+ */ +# ifdef ICONV_FROM_LIBICONV + if (!codesetstr || !*codesetstr) + codesetstr = "US-ASCII"; +# endif + cd = iconv_open(codesetstr, "UCS-4BE"); +# ifdef ICONV_FROM_LIBICONV + if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) { + codesetstr = "US-ASCII"; + cd = iconv_open(codesetstr, "UCS-4BE"); + } +# endif + if (cd == (iconv_t)-1) { + zerr("cannot do charset conversion (iconv failed)"); + return -1; + } + + /* store value in big endian form */ + for (i=3; i>=0; i--) { + inbuf[i] = wval & 0xff; + wval >>= 8; + } + count = iconv(cd, &inptr, &inbytes, &buf, &outbytes); + iconv_close(cd); + if (count) { + /* -1 indicates error. Positive value means number of "invalid" + * (or "non-reversible") conversions, which we consider as + * "out-of-range" characters. */ + zerr("character not in range"); + return -1; + } + return buf - bsave; +# else /* !HAVE_ICONV */ + zerr("cannot do charset conversion (iconv not available)"); + return -1; +# endif /* HAVE_ICONV */ + } +# else /* !(HAVE_NL_LANGINFO && CODESET) */ + zerr("cannot do charset conversion (NLS not supported)"); + return -1; +# endif /* HAVE_NL_LANGINFO && CODESET */ +#endif /* HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__ */ +} /* * Decode a key string, turning it into the literal characters. 
@@ -6785,21 +6851,6 @@ getkeystring(char *s, int *len, int how, int *misc) char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf = NULL; char svchar = '\0'; int meta = 0, control = 0, ignoring = 0; - int i; -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__) - wint_t wval; - int count; -#else - unsigned int wval; -# if defined(HAVE_NL_LANGINFO) && defined(CODESET) -# if defined(HAVE_ICONV) - iconv_t cd; - char inbuf[4]; - size_t inbytes, outbytes; -# endif - size_t count; -# endif -#endif DPUTS((how & GETKEY_UPDATE_OFFSET) && (how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)), @@ -6864,7 +6915,8 @@ getkeystring(char *s, int *len, int how, int *misc) } for (; *s; s++) { if (*s == '\\' && s[1]) { - int miscadded; + int miscadded, count, i; + unsigned int wval; if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) { (*misc)--; miscadded = 1; @@ -6979,86 +7031,32 @@ getkeystring(char *s, int *len, int how, int *misc) *misc = wval; return s+1; } -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__) - count = wctomb(t, (wchar_t)wval); + count = ucs4tomb(wval, t); if (count == -1) { - zerr("character not in range"); - CHARSET_FAILED(); + if (how & GETKEY_DOLLAR_QUOTE) { + while ((*tdest++ = *++s)) { + if (how & GETKEY_UPDATE_OFFSET) { + if (s - sstart > *misc) + (*misc)++; + } + if (*s == Snull) { + *len = (s - sstart) + 1; + *tdest = '\0'; + return buf; + } + } + *len = tdest - buf; + } + else { + *t = '\0'; + *len = t - buf; + } + return buf; } if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) (*misc) += count; t += count; -# else -# if defined(HAVE_NL_LANGINFO) && defined(CODESET) - if (!strcmp(nl_langinfo(CODESET), "UTF-8")) { - count = ucs4toutf8(t, wval); - t += count; - if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) - (*misc) += count; - } else { -# ifdef HAVE_ICONV - ICONV_CONST char *inptr = inbuf; - const char *codesetstr = nl_langinfo(CODESET); - inbytes = 4; - outbytes = 6; - /* 
store value in big endian form */ - for (i=3;i>=0;i--) { - inbuf[i] = wval & 0xff; - wval >>= 8; - } - /* - * If the code set isn't handled, we'd better - * assume it's US-ASCII rather than just failing - * hopelessly. Solaris has a weird habit of - * returning 646. This is handled by the - * native iconv(), but not by GNU iconv; what's - * more, some versions of the native iconv don't - * handle standard names like ASCII. - * - * This should only be a problem if there's a - * mismatch between the NLS and the iconv in use, - * which probably only means if libiconv is in use. - * We checked at configure time if our libraries - * pulled in _libiconv_version, which should be - * a good test. - * - * It shouldn't ever be NULL, but while we're - * being paranoid... - */ -#ifdef ICONV_FROM_LIBICONV - if (!codesetstr || !*codesetstr) - codesetstr = "US-ASCII"; -#endif - cd = iconv_open(codesetstr, "UCS-4BE"); -#ifdef ICONV_FROM_LIBICONV - if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) { - codesetstr = "US-ASCII"; - cd = iconv_open(codesetstr, "UCS-4BE"); - } -#endif - if (cd == (iconv_t)-1) { - zerr("cannot do charset conversion (iconv failed)"); - CHARSET_FAILED(); - } - count = iconv(cd, &inptr, &inbytes, &t, &outbytes); - iconv_close(cd); - if (count == (size_t)-1) { - zerr("character not in range"); - CHARSET_FAILED(); - } - if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) - (*misc) += count; -# else - zerr("cannot do charset conversion (iconv not available)"); - CHARSET_FAILED(); -# endif - } -# else - zerr("cannot do charset conversion (NLS not supported)"); - CHARSET_FAILED(); -# endif -# endif if (how & GETKEY_DOLLAR_QUOTE) { char *t2; for (t2 = tbuf; t2 < t; t2++) { |