about summary refs log tree commit diff
path: root/Src
diff options
context:
space:
mode:
authorPeter Stephenson <pws@zsh.org>2015-09-04 10:07:51 +0100
committerPeter Stephenson <pws@zsh.org>2015-09-04 10:07:51 +0100
commitf52795ea3e637396ac2bbdef8fa1b1b6f2088db6 (patch)
tree90e0760b1ad8942da1b255a8f4be8588c3c2cde3 /Src
parent32f5d3d8c16b4f3a11fa39c0ee378d72336ba853 (diff)
downloadzsh-f52795ea3e637396ac2bbdef8fa1b1b6f2088db6.tar.gz
zsh-f52795ea3e637396ac2bbdef8fa1b1b6f2088db6.tar.xz
zsh-f52795ea3e637396ac2bbdef8fa1b1b6f2088db6.zip
36415: remap bytes from invalid multibyte characters.
These now go to 0xdc00 + index.  If wchar_t is a Unicode code point,
this is by construction an invalid character within the Unicode range.
If it isn't, we would hope the result was no worse than the current
fudge.
Diffstat (limited to 'Src')
-rw-r--r--Src/pattern.c28
1 files changed, 22 insertions, 6 deletions
diff --git a/Src/pattern.c b/Src/pattern.c
index 7d38988a0..7457cbd23 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -224,6 +224,22 @@ typedef zlong zrange_t;
 typedef unsigned long zrange_t;
 #endif
 
+#ifdef MULTIBYTE_SUPPORT
+/*
+ * Handle a byte that's not part of a valid character.
+ *
+ * This range in Unicode is recommended for purposes of this
+ * kind as it corresponds to invalid characters.
+ *
+ * Note that this strictly only works if wchar_t represents
+ * Unicode code points, which isn't necessarily true; however,
+ * converting an invalid character into an unknown format is
+ * a bit tricky...
+ */
+#define WCHAR_INVALID(ch)			\
+    ((wchar_t) (0xDC00 + STOUC(ch)))
+#endif /* MULTIBYTE_SUPPORT */
+
 /*
  * Array of characters corresponding to zpc_chars enum, which it must match.
  */
@@ -353,10 +369,10 @@ metacharinc(char **x)
 	return wc;
     }
 
-    /* Error.  Treat as single byte. */
+    /* Error. */
     /* Reset the shift state for next time. */
     memset(&shiftstate, 0, sizeof(shiftstate));
-    return (wchar_t) STOUC(*(*x)++);
+    return WCHAR_INVALID(*(*x)++);
 }
 
 #else
@@ -1867,10 +1883,10 @@ charref(char *x, char *y)
     ret = mbrtowc(&wc, x, y-x, &shiftstate);
 
     if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
-	/* Error.  Treat as single byte. */
+	/* Error. */
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) STOUC(*x);
+	return WCHAR_INVALID(*x);
     }
 
     return wc;
@@ -1913,7 +1929,7 @@ charrefinc(char **x, char *y, int *z)
     size_t ret;
 
     if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
-	return (wchar_t) STOUC(*(*x)++);
+	return WCHAR_INVALID(*(*x)++);
 
     ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
 
@@ -1922,7 +1938,7 @@ charrefinc(char **x, char *y, int *z)
 	*z = 1;
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) STOUC(*(*x)++);
+	return WCHAR_INVALID(*(*x)++);
     }
 
     /* Nulls here are normal characters */