21736: improve tests for word and identifier characters with multibyte input

author: Peter Stephenson <pws@users.sourceforge.net> 2005-09-20 15:10:26 +0000
committer: Peter Stephenson <pws@users.sourceforge.net> 2005-09-20 15:10:26 +0000
commit: 409296e22fb1cef515ccfff507c265a5fee0ab28 (patch)
tree: 7f15b9e7e175dd6302bf4c434886b263ceac84d8 /Src
parent: ce43e4a22c250fbaee14580b167986615455f78f (diff)
download: zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.tar.gz
zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.tar.xz
zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.zip
6 files changed, 97 insertions, 35 deletions
diff --git a/Src/Zle/zle.h b/Src/Zle/zle.h
index 4b3f3f75a..fbfc02265 100644
--- a/Src/Zle/zle.h
+++ b/Src/Zle/zle.h
@@ -66,12 +66,7 @@ typedef wint_t   ZLE_INT_T;
 
 #define ZC_iblank iswspace
 #define ZC_icntrl iswcntrl
-/*
- * TODO: doesn't work on arguments with side effects.
- * Also YUK.  Not even sure this is guaranteed to work.
- * Should be easy to do along the lines of wcsiword.
- */
-#define ZC_iident(x)	(x < 256 && iident((int)x))
+#define ZC_iident wcsiident
 
 #define ZC_tolower towlower
 #define ZC_toupper towupper
diff --git a/Src/Zle/zle_main.c b/Src/Zle/zle_main.c
index 1b62ff027..923145710 100644
--- a/Src/Zle/zle_main.c
+++ b/Src/Zle/zle_main.c
@@ -106,11 +106,6 @@ mod_export ZLE_INT_T lastchar_wide;
 /**/
 mod_export int
 lastchar_wide_valid;
-
-/**/
-mod_export ZLE_STRING_T zle_wordchars;
-#else
-# define zle_wordchars wordchars;
 #endif
 
 /* the bindings for the previous and for this key */
@@ -1558,17 +1553,6 @@ trashzle(void)
 	kungetct = 0;
 }
 
-/**/
-mod_export void
-wordcharstrigger(void)
-{
-#ifdef ZLE_UNICODE_SUPPORT
-    zrealloc(zle_wordchars, strlen(wordchars)*MB_CUR_MAX);
-    mbsrtowcs(zle_wordchars, (const char **)&wordchars,
-	      strlen(wordchars), NULL);
-    /* TODO: error handling here */
-#endif
-}
 
 /* Hook functions. Used to allow access to zle parameters if zle is
  * active. */
@@ -1636,8 +1620,6 @@ setup_(UNUSED(Module m))
     kungetbuf = (char *) zalloc(kungetsz = 32);
     comprecursive = 0;
     rdstrs = NULL;
-    wordcharstriggerptr = wordcharstrigger;
-    wordcharstrigger();
 
     /* initialise the keymap system */
     init_keymaps();
@@ -1712,7 +1694,6 @@ finish_(UNUSED(Module m))
     zlegetlineptr = NULL;
     zlereadptr = fallback_zleread;
     zlesetkeymapptr= noop_function_int;
-    wordcharstriggerptr = noop_function;
 
     getkeyptr = NULL;
 
diff --git a/Src/init.c b/Src/init.c
index de6d4efcb..716898e28 100644
--- a/Src/init.c
+++ b/Src/init.c
@@ -1180,9 +1180,6 @@ mod_export ZleVoidIntFn zlesetkeymapptr = noop_function_int;
 #endif /* !LINKED_XMOD_zshQszle */
 
 /**/
-mod_export ZleVoidFn wordcharstriggerptr = noop_function;
-
-/**/
 unsigned char *
 autoload_zleread(char **lp, char **rp, int ha, int con)
 {
diff --git a/Src/params.c b/Src/params.c
index 89d25afee..218744000 100644
--- a/Src/params.c
+++ b/Src/params.c
@@ -3346,7 +3346,6 @@ wordcharssetfn(UNUSED(Param pm), char *x)
     zsfree(wordchars);
     wordchars = x;
     inittyptab();
-    wordcharstriggerptr();
 }
 
 /* Function to get value for special parameter `_' */
diff --git a/Src/pattern.c b/Src/pattern.c
index 393d9bf41..36578226c 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -2749,6 +2749,10 @@ patmatchrange(char *range, int ch)
 		    return 1;
 		break;
 	    case PP_WORD:
+		/*
+		 * HERE: when we support multibyte characters,
+		 * this test needs to be wcsiword().
+		 */
 		if (iword(ch))
 		    return 1;
 		break;
diff --git a/Src/utils.c b/Src/utils.c
index 71af531c3..dce10beee 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -35,6 +35,16 @@
 /**/
 char *scriptname;
 
+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * The wordchars variable turned into a wide character array.
+ * This is much more convenient for testing.
+ */
+
+/**/
+mod_export wchar_t *wordchars_wide;
+#endif
+
 /* Print an error */
  
 /**/
@@ -2456,8 +2466,18 @@ inittyptab(void)
 	typtab[t0] = IDIGIT | IALNUM | IWORD | IIDENT | IUSER;
     for (t0 = 'a'; t0 <= 'z'; t0++)
 	typtab[t0] = typtab[t0 - 'a' + 'A'] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#ifndef ZLE_UNICODE_SUPPORT
+    /*
+     * This really doesn't seem to me the right thing to do when
+     * we have multibyte character support...  it was a hack to assume
+     * eight bit characters `worked' for some values of work before
+     * we could test for them properly.  I'm not 100% convinced
+     * having IIDENT here is a good idea at all, but this code
+     * should disappear into history...
+     */
     for (t0 = 0240; t0 != 0400; t0++)
 	typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#endif
     typtab['_'] = IIDENT | IUSER;
     typtab['-'] = IUSER;
     typtab[' '] |= IBLANK | INBLANK;
@@ -2477,8 +2497,44 @@ inittyptab(void)
 	}
 	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP;
     }
-    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++)
-	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= IWORD;
+#ifdef ZLE_UNICODE_SUPPORT
+    if (wordchars) {
+	const char *wordchars_ptr = wordchars;
+	mbstate_t mbs;
+	int nchars;
+
+	memset(&mbs, 0, sizeof(mbs));
+	wordchars_wide = (wchar_t *)
+	    zrealloc(wordchars_wide, (strlen(wordchars)+1)*sizeof(wchar_t));
+	nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, strlen(wordchars),
+			   &mbs);
+	if (nchars == -1) {
+	    /* Conversion state is undefined: better just set to null */
+	    *wordchars_wide = L'\0';
+	} else {
+	    wordchars_wide[nchars] = L'\0';
+	}
+    } else {
+	wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t));
+	*wordchars_wide = L'\0';
+    }
+#endif
+    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
+	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
+#ifdef ZLE_UNICODE_SUPPORT
+	if (!isascii(c)) {
+	    /*
+	     * If we have support for multibyte characters, we don't
+	     * handle non-ASCII characters here; instead, we turn
+	     * wordchars into a wide character array.
+	     * (We may actually have a single-byte 8-bit character set,
+	     * but it works the same way.)
+	     */
+	    continue;
+	}
+#endif
+	typtab[c] |= IWORD;
+    }
     for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
     if (isset(BANGHIST) && bangchar && interact && isset(SHINSTDIN))
@@ -2503,9 +2559,6 @@ wcsiword(wchar_t c)
      * produces an ASCII character.  If it does, use iword on that.
      * If it doesn't, use iswalnum on the original character.  This
      * is pretty good most of the time.
-     *
-     * TODO: extend WORDCHARS to handle multibyte chars by some kind
-     * of hierarchical list or hash table.
      */
     len = wctomb(outstr, c);
 
@@ -2515,7 +2568,40 @@ wcsiword(wchar_t c)
     } else if (len == 1 && isascii(*outstr)) {
 	return iword(*outstr);
     } else {
-	return iswalnum(c);
+	return iswalnum(c) || wcschr(wordchars_wide, c);
+    }
+}
+
+/*
+ * iident() macro extended to support wide characters.
+ *
+ * The macro is intended to test if a character is allowed in an
+ * internal zsh identifier.  Until the main shell handles multibyte
+ * characters it's not a good idea to allow characters other than
+ * ASCII characters; it would cause zle to allow characters that
+ * the main shell would reject.  Eventually we should be able
+ * to allow all alphanumerics.
+ *
+ * Otherwise similar to wcsiword.
+ */
+
+/**/
+mod_export int
+wcsiident(wchar_t c)
+{
+    int len;
+    VARARR(char, outstr, MB_CUR_MAX);
+
+    len = wctomb(outstr, c);
+
+    if (len == 0) {
+	/* NULL is special */
+	return 0;
+    } else if (len == 1 && isascii(*outstr)) {
+	return iword(*outstr);
+    } else {
+	/* not currently allowed, see above */
+	return 0;
     }
 }
 #endif
author	Peter Stephenson <pws@users.sourceforge.net>	2005-09-20 15:10:26 +0000
committer	Peter Stephenson <pws@users.sourceforge.net>	2005-09-20 15:10:26 +0000
commit	409296e22fb1cef515ccfff507c265a5fee0ab28 (patch)
tree	7f15b9e7e175dd6302bf4c434886b263ceac84d8 /Src
parent	ce43e4a22c250fbaee14580b167986615455f78f (diff)
download	zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.tar.gz zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.tar.xz zsh-409296e22fb1cef515ccfff507c265a5fee0ab28.zip