From 4a67f2479892fda348546404216270aaaff523ea Mon Sep 17 00:00:00 2001
From: Peter Stephenson <pws@users.sourceforge.net>
Date: Mon, 10 Jul 2006 13:08:22 +0000
Subject: 22544: Improve use of ztype tests for multibyte characters.  Add
 POSIX_IDENTIFIERS option to control allowability of multibyte alphanumeric
 characters in parameter and module names.

---
 Src/utils.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 84 insertions(+), 10 deletions(-)

(limited to 'Src/utils.c')

diff --git a/Src/utils.c b/Src/utils.c
index 75a736596..0d6cd8866 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -1921,7 +1921,7 @@ spckword(char **s, int hist, int cmd, int ask)
 	return;
     if (**s == String && !*t) {
 	guess = *s + 1;
-	if (*t || !ialpha(*guess))
+	if (itype_end(guess, IIDENT, 1) == guess)
 	    return;
 	ic = String;
 	d = 100;
@@ -2750,11 +2750,8 @@ wcsiword(wchar_t c)
  * iident() macro extended to support wide characters.
  *
  * The macro is intended to test if a character is allowed in an
- * internal zsh identifier.  Until the main shell handles multibyte
- * characters it's not a good idea to allow characters other than
- * ASCII characters; it would cause zle to allow characters that
- * the main shell would reject.  Eventually we should be able
- * to allow all alphanumerics.
+ * internal zsh identifier.  We allow all alphanumerics outside
+ * the ASCII range unless POSIXIDENTIFIERS is set.
  *
  * Otherwise similar to wcsiword.
  */
@@ -2774,14 +2771,90 @@ wcsiident(wchar_t c)
     } else if (len == 1 && iascii(*outstr)) {
 	return iident(*outstr);
     } else {
-	/* TODO: not currently allowed, see above */
-	return 0;
+	return !isset(POSIXIDENTIFIERS) && iswalnum(c);
     }
 }
 /**/
 #endif
 
 
+/*
+ * Find the end of a set of characters in the set specified by itype;
+ * one of IALNUM, IIDENT, IWORD or IUSER.  For non-ASCII characters, we assume
+ * alphanumerics are part of the set, with the exception that
+ * identifiers are not treated that way if POSIXIDENTIFIERS is set.
+ *
+ * See notes above for identifiers.
+ * Returns the same pointer as passed if not on an identifier character.
+ * If "once" is set, just test the first character, i.e. (outptr !=
+ * inptr) tests whether the first character is valid in an identifier.
+ *
+ * Currently this is only called with itype IIDENT or IUSER.
+ */
+
+/**/
+mod_export char *
+itype_end(const char *ptr, int itype, int once)
+{
+#ifdef MULTIBYTE_SUPPORT
+    if (isset(MULTIBYTE) &&
+	(itype != IIDENT || !isset(POSIXIDENTIFIERS))) {
+	mb_metacharinit();
+	while (*ptr) {
+	    wint_t wc;
+	    int len = mb_metacharlenconv(ptr, &wc);
+
+	    if (!len)
+		break;
+
+	    if (wc == WEOF) {
+		/* invalid, treat as single character */
+		int chr = STOUC(*ptr == Meta ? ptr[1] ^ 32 : *ptr);
+		/* in this case non-ASCII characters can't match */
+		if (chr > 127 || !zistype(chr,itype))
+		    break;
+	    } else if (len == 1 && iascii(*ptr)) {
+		/* ASCII: can't be metafied, use standard test */
+		if (!zistype(*ptr,itype))
+		    break;
+	    } else {
+		/*
+		 * Valid non-ASCII character.  Allow all alphanumerics;
+		 * if testing for words, allow all wordchars.
+		 */
+		if (!(iswalnum(wc) ||
+		      (itype == IWORD && wcschr(wordchars_wide, wc))))
+		    break;
+	    }
+	    ptr += len;
+
+	    if (once)
+		break;
+	}
+    } else
+#endif
+	for (;;) {
+	    int chr = STOUC(*ptr == Meta ? ptr[1] ^ 32 : *ptr);
+	    if (!zistype(chr,itype))
+		break;
+	    ptr += (*ptr == Meta) ? 2 : 1;
+
+	    if (once)
+		break;
+	}
+
+    /*
+     * Nasty.  The first argument is const char * because we
+     * don't modify it here.  However, we really want to pass
+     * back the same type as was passed down, to allow idioms like
+     *   p = itype_end(p, IIDENT, 0);
+     * So returning a const char * isn't really the right thing to do.
+     * Without having two different functions the following seems
+     * to be the best we can do.
+     */
+    return (char *)ptr;
+}
+
 /**/
 mod_export char **
 arrdup(char **s)
@@ -3710,9 +3783,10 @@ mb_metacharinit(void)
 
 /**/
 int
-mb_metacharlenconv(char *s, wint_t *wcp)
+mb_metacharlenconv(const char *s, wint_t *wcp)
 {
-    char inchar, *ptr;
+    char inchar;
+    const char *ptr;
     size_t ret;
     wchar_t wc;
 
-- 
cgit 1.4.1