about summary refs log tree commit diff
path: root/u8decode.h
diff options
context:
space:
mode:
author Leah Neukirchen <leah@vuxu.org> 2017-11-22 23:48:00 +0100
committer Leah Neukirchen <leah@vuxu.org> 2017-11-22 23:48:32 +0100
commit 73641367db67f47c55d28d98b2d41467a8c87211 (patch)
tree af02907cb8bbbc9615d1bab2c7ade01c49b3a3e3 /u8decode.h
parent 8b50a794b03244b5de8680bc35b40dec92e4131e (diff)
download mblaze-73641367db67f47c55d28d98b2d41467a8c87211.tar.gz
mblaze-73641367db67f47c55d28d98b2d41467a8c87211.tar.xz
mblaze-73641367db67f47c55d28d98b2d41467a8c87211.zip
revamp utf-8 handling code
Diffstat (limited to 'u8decode.h')
-rw-r--r-- u8decode.h 25
1 files changed, 25 insertions, 0 deletions
diff --git a/u8decode.h b/u8decode.h
new file mode 100644
index 0000000..7599377
--- /dev/null
+++ b/u8decode.h
@@ -0,0 +1,25 @@
+#include <stdint.h>
+
+// Decode one UTF-8 codepoint into cp, return number of bytes to next one.
+// At the terminating NUL, set cp to 0 and return 0.  On invalid UTF-8,
+// return -1, and do not change cp.  Never reads past a NUL byte.
+// Overlong sequences, surrogates and invalid codepoints are not checked.
+// This code is meant to be inlined; if cp is unused it can be optimized away.
+static int
+u8decode(const char *cs, uint32_t *cp)
+{
+	const uint8_t *s = (const uint8_t *)cs;
+	uint32_t c = *s;  // decode into a local so *cp is only written on success
+
+	if (c < 0x80) { *cp = c; return c != 0; }  // ASCII: 1 byte, or 0 at NUL
+	if (c < 0xc0) { return -1; }               // stray continuation byte
+	if (c < 0xe0) { c &= 0x1f; goto u2; }      // 110xxxxx: 2-byte sequence
+	if (c < 0xf0) { c &= 0x0f; goto u3; }      // 1110xxxx: 3-byte sequence
+	if (c < 0xf8) { c &= 0x07; goto u4; }      // 11110xxx: 4-byte sequence
+	return -1;                                 // 0xf8..0xff never start one
+u4:	if ((*++s & 0xc0) != 0x80) return -1; else c = (c << 6) | (*s & 0x3f);
+u3:	if ((*++s & 0xc0) != 0x80) return -1; else c = (c << 6) | (*s & 0x3f);
+u2:	if ((*++s & 0xc0) != 0x80) return -1; else c = (c << 6) | (*s & 0x3f);
+	*cp = c;
+	return (int)(s - (const uint8_t *)cs) + 1;
+}