about summary refs log tree commit diff
path: root/u8decode.h
diff options
context:
space:
mode:
author Leah Neukirchen <leah@vuxu.org> 2017-11-23 16:22:06 +0100
committer Leah Neukirchen <leah@vuxu.org> 2017-11-23 16:22:06 +0100
commit 3e6f8047a6275d486139136153a08d51b18d3abd (patch)
tree 69f7a5912f504168efdf241f484779b031767320 /u8decode.h
parent 73641367db67f47c55d28d98b2d41467a8c87211 (diff)
download mblaze-3e6f8047a6275d486139136153a08d51b18d3abd.tar.gz
mblaze-3e6f8047a6275d486139136153a08d51b18d3abd.tar.xz
mblaze-3e6f8047a6275d486139136153a08d51b18d3abd.zip
u8decode: detect invalid encodings
Diffstat (limited to 'u8decode.h')
-rw-r--r-- u8decode.h 16
1 files changed, 12 insertions, 4 deletions
diff --git a/u8decode.h b/u8decode.h
index 7599377..76ae43d 100644
--- a/u8decode.h
+++ b/u8decode.h
@@ -2,7 +2,7 @@
 
 // Decode one UTF-8 codepoint into cp, return number of bytes to next one.
 // On invalid UTF-8, return -1, and do not change cp.
-// Overlong sequences, surrogates and invalid codepoints are not checked.
+// Invalid codepoints are not checked.
 //
 // This code is meant to be inlined, if cp is unused it can be optimized away.
 static int
@@ -12,10 +12,18 @@ u8decode(const char *cs, uint32_t *cp)
 
 	if (*s == 0)   { *cp = 0; return 0; }
 	if (*s < 0x80) { *cp = *s; return 1; }
-	if (*s < 0xc0) { return -1; }
+	if (*s < 0xc2) { return -1; }  //cont+overlong
 	if (*s < 0xe0) { *cp = *s & 0x1f; goto u2; }
-	if (*s < 0xf0) { *cp = *s & 0x0f; goto u3; }
-	if (*s < 0xf8) { *cp = *s & 0x07; goto u4; }
+	if (*s < 0xf0) {
+		if (*s == 0xe0 && (s[1] & 0xe0) == 0x80) return -1; //overlong
+		if (*s == 0xed && (s[1] & 0xe0) == 0xa0) return -1; //surrogate
+		*cp = *s & 0x0f; goto u3;
+	}
+	if (*s < 0xf5) {
+		if (*s == 0xf0 && (s[1] & 0xf0) == 0x80) return -1; //overlong
+		if (*s == 0xf4 && (s[1] > 0x8f)) return -1; //too high
+		*cp = *s & 0x07; goto u4;
+	}
 	return -1;
 
 u4:	if ((*++s & 0xc0) != 0x80) return -1;  *cp = (*cp << 6) | (*s & 0x3f);