From 4a67f2479892fda348546404216270aaaff523ea Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Mon, 10 Jul 2006 13:08:22 +0000 Subject: 22544: Improve use of ztype tests for multibyte characters. Add POSIX_IDENTIFIERS option to control allowability of multibyte alphanumeric characters in parameter and module names. --- Src/Zle/compcore.c | 16 +++++--- Src/Zle/zle_tricky.c | 104 ++++++++++++++++++++++++++++++++++----------------- Src/builtin.c | 4 +- Src/glob.c | 4 +- Src/lex.c | 7 +++- Src/math.c | 9 +++-- Src/module.c | 8 +--- Src/options.c | 1 + Src/params.c | 13 +++---- Src/parse.c | 5 +-- Src/subst.c | 20 +++++----- Src/utils.c | 94 +++++++++++++++++++++++++++++++++++++++++----- Src/zsh.h | 1 + Src/ztype.h | 32 ++++++++-------- 14 files changed, 213 insertions(+), 105 deletions(-) (limited to 'Src') diff --git a/Src/Zle/compcore.c b/Src/Zle/compcore.c index 008f49185..38b1934e2 100644 --- a/Src/Zle/compcore.c +++ b/Src/Zle/compcore.c @@ -1081,7 +1081,7 @@ check_param(char *s, int set, int test) } if ((*p == String || *p == Qstring) && p[1] != Inpar && p[1] != Inbrack) { /* This is really a parameter expression (not $(...) or $[...]). */ - char *b = p + 1, *e = b; + char *b = p + 1, *e = b, *ie; int n = 0, br = 1, nest = 0; if (*b == Inbrace) { @@ -1124,10 +1124,16 @@ check_param(char *s, int set, int test) else if (idigit(*e)) while (idigit(*e)) e++; - else if (iident(*e)) - while (iident(*e) || - (comppatmatch && *comppatmatch && (*e == Star || *e == Quest))) - e++; + else if ((ie = itype_end(e, IIDENT, 0)) != e) { + do { + e = ie; + if (comppatmatch && *comppatmatch && + (*e == Star || *e == Quest)) + ie = e + 1; + else + ie = itype_end(e, IIDENT, 0); + } while (ie != e); + } /* Now make sure that the cursor is inside the name. */ if (offs <= e - s && offs >= b - s && n <= 0) { diff --git a/Src/Zle/zle_tricky.c b/Src/Zle/zle_tricky.c index 250804648..28857b03e 100644 --- a/Src/Zle/zle_tricky.c +++ b/Src/Zle/zle_tricky.c @@ -551,9 +551,8 @@ parambeg(char *s) else if (idigit(*e)) while (idigit(*e)) e++; - else if (iident(*e)) - while (iident(*e)) - e++; + else + e = itype_end(e, IIDENT, 0); /* Now make sure that the cursor is inside the name. */ if (offs <= e - s && offs >= b - s && n <= 0) { @@ -740,8 +739,7 @@ docomplete(int lst) else if (idigit(*q)) do q++; while (idigit(*q)); else - while (iident(*q)) - q++; + q = itype_end(q, IIDENT, 0); sav = *q; *q = '\0'; if (zlemetacs - wb == q - s && @@ -1293,7 +1291,7 @@ get_comp_string(void) if (varq) tt = clwords[clwpos]; - for (s = tt; iident(*s); s++); + s = itype_end(tt, IIDENT, 0); sav = *s; *s = '\0'; zsfree(varname); @@ -1360,17 +1358,29 @@ get_comp_string(void) * as being in math. */ if (inwhat != IN_MATH) { int i = 0; - char *nnb = (iident(*s) ? s : s + 1), *nb = NULL, *ne = NULL; - - for (tt = s; ++tt < s + zlemetacs - wb;) + char *nnb, *nb = NULL, *ne = NULL; + + MB_METACHARINIT(); + if (itype_end(s, IIDENT, 1) == s) + nnb = s + MB_METACHARLEN(s); + else + nnb = s; + for (tt = s; tt < s + zlemetacs - wb;) { if (*tt == Inbrack) { i++; nb = nnb; ne = tt; - } else if (i && *tt == Outbrack) + tt++; + } else if (i && *tt == Outbrack) { i--; - else if (!iident(*tt)) - nnb = tt + 1; + tt++; + } else { + int nclen = MB_METACHARLEN(tt); + if (itype_end(tt, IIDENT, 1) == tt) + nnb = tt + nclen; + tt += nclen; + } + } if (i) { inwhat = IN_MATH; insubscr = 1; @@ -1415,33 +1425,59 @@ get_comp_string(void) /* In mathematical expression, we complete parameter names * * (even if they don't have a `$' in front of them). So we * * have to find that name. */ - for (we = zlemetacs; iident(zlemetaline[we]); we++); - for (wb = zlemetacs; --wb >= 0 && iident(zlemetaline[wb]);); - wb++; + char *cspos = zlemetaline + zlemetacs, *wptr, *cptr; + we = itype_end(cspos, IIDENT, 0) - cspos; + + /* + * With multibyte characters we need to go forwards, + * so start at the beginning of the line and continue + * until cspos. + */ + wptr = cptr = zlemetaline; + for (;;) { + cptr = itype_end(wptr, IIDENT, 0); + if (cptr == wptr) { + /* not an ident character */ + wptr = (cptr += MB_METACHARLEN(cptr)); + } + if (cptr >= cspos) { + wb = wptr - zlemetaline; + break; + } + } } zsfree(s); s = zalloc(we - wb + 1); strncpy(s, zlemetaline + wb, we - wb); s[we - wb] = '\0'; - if (wb > 2 && zlemetaline[wb - 1] == '[' && - iident(zlemetaline[wb - 2])) { - int i = wb - 3; - char sav = zlemetaline[wb - 1]; - while (i >= 0 && iident(zlemetaline[i])) - i--; + if (wb > 2 && zlemetaline[wb - 1] == '[') { + char *sqbr = zlemetaline + wb - 1, *cptr, *wptr; - zlemetaline[wb - 1] = '\0'; - zsfree(varname); - varname = ztrdup(zlemetaline + i + 1); - zlemetaline[wb - 1] = sav; - if ((keypm = (Param) paramtab->getnode(paramtab, varname)) && - (keypm->node.flags & PM_HASHED)) { - if (insubscr != 3) - insubscr = 2; - } else - insubscr = 1; + /* Need to search forward for word characters */ + cptr = wptr = zlemetaline; + for (;;) { + cptr = itype_end(wptr, IIDENT, 0); + if (cptr == wptr) { + /* not an ident character */ + wptr = (cptr += MB_METACHARLEN(cptr)); + } + if (cptr >= sqbr) + break; + } + + if (wptr < sqbr) { + zsfree(varname); + varname = ztrduppfx(wptr, sqbr - wptr); + if ((keypm = (Param) paramtab->getnode(paramtab, varname)) && + (keypm->node.flags & PM_HASHED)) { + if (insubscr != 3) + insubscr = 2; + } else + insubscr = 1; + } } + parse_subst_string(s); } /* This variable will hold the current word in quoted form. */ @@ -1562,12 +1598,12 @@ get_comp_string(void) *tp == '@') p++, i++; else { + char *ie; if (idigit(*tp)) while (idigit(*tp)) tp++; - else if (iident(*tp)) - while (iident(*tp)) - tp++; + else if ((ie = itype_end(tp, IIDENT, 0)) != tp) + tp = ie; else { tt = NULL; break; diff --git a/Src/builtin.c b/Src/builtin.c index ff396fb47..71dcbffc3 100644 --- a/Src/builtin.c +++ b/Src/builtin.c @@ -2629,9 +2629,7 @@ bin_functions(char *name, char **argv, Options ops, int func) char *modname = NULL; char *ptr; - for (ptr = funcname; *ptr; ptr++) - if (!iident(*ptr)) - break; + ptr = itype_end(funcname, IIDENT, 0); if (idigit(*funcname) || funcname == ptr || *ptr) { zwarnnam(name, "-M %s: bad math function name", funcname); return 1; diff --git a/Src/glob.c b/Src/glob.c index 3a1b28784..26b288efc 100644 --- a/Src/glob.c +++ b/Src/glob.c @@ -1443,9 +1443,7 @@ zglob(LinkList list, LinkNode np, int nountok) if (s[-1] == '+') { plus = 0; - tt = s; - while (iident(*tt)) - tt++; + tt = itype_end(s, IIDENT, 0); if (tt == s) { zerr("missing identifier after `+'"); diff --git a/Src/lex.c b/Src/lex.c index 635e847d2..57b752309 100644 --- a/Src/lex.c +++ b/Src/lex.c @@ -1135,10 +1135,13 @@ gettokstr(int c, int sub) if (idigit(*t)) while (++t < bptr && idigit(*t)); else { - while (iident(*t) && ++t < bptr); + int sav = *bptr; + *bptr = '\0'; + t = itype_end(t, IIDENT, 0); if (t < bptr) { - *bptr = '\0'; skipparens(Inbrack, Outbrack, &t); + } else { + *bptr = sav; } } if (*t == '+') diff --git a/Src/math.c b/Src/math.c index bd48288ec..fe5f7b74b 100644 --- a/Src/math.c +++ b/Src/math.c @@ -265,11 +265,12 @@ zzlex(void) { int cct = 0; yyval.type = MN_INTEGER; + char *ie; for (;; cct = 0) switch (*ptr++) { case '+': - if (*ptr == '+' && (unary || !ialnum(*ptr))) { + if (*ptr == '+') { ptr++; return (unary) ? PREPLUS : POSTPLUS; } @@ -279,7 +280,7 @@ zzlex(void) } return (unary) ? UPLUS : PLUS; case '-': - if (*ptr == '-' && (unary || !ialnum(*ptr))) { + if (*ptr == '-') { ptr++; return (unary) ? PREMINUS : POSTMINUS; } @@ -469,12 +470,12 @@ zzlex(void) } cct = 1; } - if (iident(*ptr)) { + if ((ie = itype_end(ptr, IIDENT, 0)) != ptr) { int func = 0; char *p; p = ptr; - while (iident(*++ptr)); + ptr = ie; if (*ptr == '[' || (!cct && *ptr == '(')) { char op = *ptr, cp = ((*ptr == '[') ? ']' : ')'); int l; diff --git a/Src/module.c b/Src/module.c index fde78ac1d..398958cce 100644 --- a/Src/module.c +++ b/Src/module.c @@ -734,12 +734,8 @@ static int modname_ok(char const *p) { do { - if(*p != '_' && !ialnum(*p)) - return 0; - do { - p++; - } while(*p == '_' || ialnum(*p)); - if(!*p) + p = itype_end(p, IIDENT, 0); + if (!*p) return 1; } while(*p++ == '/'); return 0; diff --git a/Src/options.c b/Src/options.c index 6c3fb26d1..307bd5430 100644 --- a/Src/options.c +++ b/Src/options.c @@ -176,6 +176,7 @@ static struct optname optns[] = { {{NULL, "overstrike", 0}, OVERSTRIKE}, {{NULL, "pathdirs", OPT_EMULATE}, PATHDIRS}, {{NULL, "posixbuiltins", OPT_EMULATE|OPT_BOURNE}, POSIXBUILTINS}, +{{NULL, "posixidentifiers", OPT_EMULATE|OPT_BOURNE}, POSIXIDENTIFIERS}, {{NULL, "printeightbit", 0}, PRINTEIGHTBIT}, {{NULL, "printexitvalue", 0}, PRINTEXITVALUE}, {{NULL, "privileged", OPT_SPECIAL}, PRIVILEGED}, diff --git a/Src/params.c b/Src/params.c index f589a740e..17ce2c54d 100644 --- a/Src/params.c +++ b/Src/params.c @@ -899,9 +899,7 @@ isident(char *s) break; } else { /* Find the first character in `s' not in the iident type table */ - for (ss = s; *ss; ss++) - if (!iident(*ss)) - break; + ss = itype_end(s, IIDENT, 0); } /* If the next character is not [, then it is * @@ -1653,7 +1651,7 @@ getvalue(Value v, char **pptr, int bracks) mod_export Value fetchvalue(Value v, char **pptr, int bracks, int flags) { - char *s, *t; + char *s, *t, *ie; char sav, c; int ppar = 0; @@ -1665,9 +1663,8 @@ fetchvalue(Value v, char **pptr, int bracks, int flags) else ppar = *s++ - '0'; } - else if (iident(c)) - while (iident(*s)) - s++; + else if ((ie = itype_end(s, IIDENT, 0)) != s) + s = ie; else if (c == Quest) *s++ = '?'; else if (c == Pound) @@ -1732,7 +1729,7 @@ fetchvalue(Value v, char **pptr, int bracks, int flags) return v; } } else if (!(flags & SCANPM_ASSIGNING) && v->isarr && - iident(*t) && isset(KSHARRAYS)) + itype_end(t, IIDENT, 1) != t && isset(KSHARRAYS)) v->end = 1, v->isarr = 0; } if (!bracks && *s) diff --git a/Src/parse.c b/Src/parse.c index 61ef9dcdc..17a24f2e9 100644 --- a/Src/parse.c +++ b/Src/parse.c @@ -1603,10 +1603,7 @@ par_simple(int *complex, int nr) if (*ptr == Outbrace && ptr > tokstr + 1) { - while (--ptr > tokstr) - if (!iident(*ptr)) - break; - if (ptr == tokstr) + if (itype_end(tokstr, IIDENT, 0) >= ptr - 1) { char *toksave = tokstr; char *idstring = dupstrpfx(tokstr+1, eptr-tokstr-1); diff --git a/Src/subst.c b/Src/subst.c index 12df115a0..821c1c79a 100644 --- a/Src/subst.c +++ b/Src/subst.c @@ -475,15 +475,14 @@ filesubstr(char **namptr, int assign) return 0; *namptr = dyncat(ds, ptr); return 1; - } else if (iuser(str[1])) { /* ~foo */ - char *ptr, *hom, save; + } else if ((ptr = itype_end(str+1, IUSER, 0)) != str+1) { /* ~foo */ + char *hom, save; - for (ptr = ++str; *ptr && iuser(*ptr); ptr++); save = *ptr; if (!isend(save)) return 0; *ptr = 0; - if (!(hom = getnameddir(str))) { + if (!(hom = getnameddir(++str))) { if (isset(NOMATCH)) zerr("no such user or named directory: %s", str); *ptr = save; @@ -1146,9 +1145,10 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) * Shouldn't this be a table or something? We test for all * these later on, too. */ - if (!ialnum(c = *s) && c != '#' && c != Pound && c != '-' && - c != '!' && c != '$' && c != String && c != Qstring && - c != '?' && c != Quest && c != '_' && + c = *s; + if (itype_end(s, IIDENT, 1) == s && *s != '#' && c != Pound && + c != '-' && c != '!' && c != '$' && c != String && c != Qstring && + c != '?' && c != Quest && c != '*' && c != Star && c != '@' && c != '{' && c != Inbrace && c != '=' && c != Equals && c != Hat && c != '^' && c != '~' && c != Tilde && c != '+') { @@ -1446,8 +1446,8 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) } else spbreak = 2; } else if ((c == '#' || c == Pound) && - (iident(cc = s[1]) - || cc == '*' || cc == Star || cc == '@' + (itype_end(s+1, IIDENT, 0) != s + 1 + || (cc = s[1]) == '*' || cc == Star || cc == '@' || cc == '-' || (cc == ':' && s[2] == '-') || (isstring(cc) && (s[2] == Inbrace || s[2] == Inpar)))) { getlen = 1 + whichlen, s++; @@ -1471,7 +1471,7 @@ paramsubst(LinkList l, LinkNode n, char **str, int qt, int ssub) * Try to handle this when parameter is named * by (P) (second part of test). */ - if (iident(s[1]) || (aspar && isstring(s[1]) && + if (itype_end(s+1, IIDENT, 0) != s+1 || (aspar && isstring(s[1]) && (s[2] == Inbrace || s[2] == Inpar))) chkset = 1, s++; else if (!inbrace) { diff --git a/Src/utils.c b/Src/utils.c index 75a736596..0d6cd8866 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -1921,7 +1921,7 @@ spckword(char **s, int hist, int cmd, int ask) return; if (**s == String && !*t) { guess = *s + 1; - if (*t || !ialpha(*guess)) + if (itype_end(guess, IIDENT, 1) == guess) return; ic = String; d = 100; @@ -2750,11 +2750,8 @@ wcsiword(wchar_t c) * iident() macro extended to support wide characters. * * The macro is intended to test if a character is allowed in an - * internal zsh identifier. Until the main shell handles multibyte - * characters it's not a good idea to allow characters other than - * ASCII characters; it would cause zle to allow characters that - * the main shell would reject. Eventually we should be able - * to allow all alphanumerics. + * internal zsh identifier. We allow all alphanumerics outside + * the ASCII range unless POSIXIDENTIFIERS is set. * * Otherwise similar to wcsiword. */ @@ -2774,14 +2771,90 @@ wcsiident(wchar_t c) } else if (len == 1 && iascii(*outstr)) { return iident(*outstr); } else { - /* TODO: not currently allowed, see above */ - return 0; + return !isset(POSIXIDENTIFIERS) && iswalnum(c); } } /**/ #endif +/* + * Find the end of a set of characters in the set specified by itype; + * one of IALNUM, IIDENT, IWORD or IUSER. For non-ASCII characters, we assume + * alphanumerics are part of the set, with the exception that + * identifiers are not treated that way if POSIXIDENTIFIERS is set. + * + * See notes above for identifiers. + * Returns the same pointer as passed if not on an identifier character. + * If "once" is set, just test the first character, i.e. (outptr != + * inptr) tests whether the first character is valid in an identifier. + * + * Currently this is only called with itype IIDENT or IUSER. + */ + +/**/ +mod_export char * +itype_end(const char *ptr, int itype, int once) +{ +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE) && + (itype != IIDENT || !isset(POSIXIDENTIFIERS))) { + mb_metacharinit(); + while (*ptr) { + wint_t wc; + int len = mb_metacharlenconv(ptr, &wc); + + if (!len) + break; + + if (wc == WEOF) { + /* invalid, treat as single character */ + int chr = STOUC(*ptr == Meta ? ptr[1] ^ 32 : *ptr); + /* in this case non-ASCII characters can't match */ + if (chr > 127 || !zistype(chr,itype)) + break; + } else if (len == 1 && iascii(*ptr)) { + /* ASCII: can't be metafied, use standard test */ + if (!zistype(*ptr,itype)) + break; + } else { + /* + * Valid non-ASCII character. Allow all alphanumerics; + * if testing for words, allow all wordchars. + */ + if (!(iswalnum(wc) || + (itype == IWORD && wcschr(wordchars_wide, wc)))) + break; + } + ptr += len; + + if (once) + break; + } + } else +#endif + for (;;) { + int chr = STOUC(*ptr == Meta ? ptr[1] ^ 32 : *ptr); + if (!zistype(chr,itype)) + break; + ptr += (*ptr == Meta) ? 2 : 1; + + if (once) + break; + } + + /* + * Nasty. The first argument is const char * because we + * don't modify it here. However, we really want to pass + * back the same type as was passed down, to allow idioms like + * p = itype_end(p, IIDENT, 0); + * So returning a const char * isn't really the right thing to do. + * Without having two different functions the following seems + * to be the best we can do. + */ + return (char *)ptr; +} + /**/ mod_export char ** arrdup(char **s) @@ -3710,9 +3783,10 @@ mb_metacharinit(void) /**/ int -mb_metacharlenconv(char *s, wint_t *wcp) +mb_metacharlenconv(const char *s, wint_t *wcp) { - char inchar, *ptr; + char inchar; + const char *ptr; size_t ret; wchar_t wc; diff --git a/Src/zsh.h b/Src/zsh.h index 8554b5c96..b5f675db5 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1610,6 +1610,7 @@ enum { OVERSTRIKE, PATHDIRS, POSIXBUILTINS, + POSIXIDENTIFIERS, PRINTEIGHTBIT, PRINTEXITVALUE, PRIVILEGED, diff --git a/Src/ztype.h b/Src/ztype.h index b8bc584db..7b7973602 100644 --- a/Src/ztype.h +++ b/Src/ztype.h @@ -42,22 +42,22 @@ #define IMETA (1 << 12) #define IWSEP (1 << 13) #define INULL (1 << 14) -#define _icom(X,Y) (typtab[STOUC(X)] & Y) -#define idigit(X) _icom(X,IDIGIT) -#define ialnum(X) _icom(X,IALNUM) -#define iblank(X) _icom(X,IBLANK) /* blank, not including \n */ -#define inblank(X) _icom(X,INBLANK) /* blank or \n */ -#define itok(X) _icom(X,ITOK) -#define isep(X) _icom(X,ISEP) -#define ialpha(X) _icom(X,IALPHA) -#define iident(X) _icom(X,IIDENT) -#define iuser(X) _icom(X,IUSER) /* username char */ -#define icntrl(X) _icom(X,ICNTRL) -#define iword(X) _icom(X,IWORD) -#define ispecial(X) _icom(X,ISPECIAL) -#define imeta(X) _icom(X,IMETA) -#define iwsep(X) _icom(X,IWSEP) -#define inull(X) _icom(X,INULL) +#define zistype(X,Y) (typtab[STOUC(X)] & Y) +#define idigit(X) zistype(X,IDIGIT) +#define ialnum(X) zistype(X,IALNUM) +#define iblank(X) zistype(X,IBLANK) /* blank, not including \n */ +#define inblank(X) zistype(X,INBLANK) /* blank or \n */ +#define itok(X) zistype(X,ITOK) +#define isep(X) zistype(X,ISEP) +#define ialpha(X) zistype(X,IALPHA) +#define iident(X) zistype(X,IIDENT) +#define iuser(X) zistype(X,IUSER) /* username char */ +#define icntrl(X) zistype(X,ICNTRL) +#define iword(X) zistype(X,IWORD) +#define ispecial(X) zistype(X,ISPECIAL) +#define imeta(X) zistype(X,IMETA) +#define iwsep(X) zistype(X,IWSEP) +#define inull(X) zistype(X,INULL) #define iascii(X) isascii(STOUC(X)) #define ilower(X) islower(STOUC(X)) -- cgit 1.4.1