diff options
author | Leah Neukirchen <leah@vuxu.org> | 2017-11-22 23:48:00 +0100 |
---|---|---|
committer | Leah Neukirchen <leah@vuxu.org> | 2017-11-22 23:48:32 +0100 |
commit | 73641367db67f47c55d28d98b2d41467a8c87211 (patch) | |
tree | af02907cb8bbbc9615d1bab2c7ade01c49b3a3e3 /safe_u8putstr.c | |
parent | 8b50a794b03244b5de8680bc35b40dec92e4131e (diff) | |
download | mblaze-73641367db67f47c55d28d98b2d41467a8c87211.tar.gz mblaze-73641367db67f47c55d28d98b2d41467a8c87211.tar.xz mblaze-73641367db67f47c55d28d98b2d41467a8c87211.zip |
revamp utf-8 handling code
Diffstat (limited to 'safe_u8putstr.c')
-rw-r--r-- | safe_u8putstr.c | 73 |
1 files changed, 27 insertions, 46 deletions
diff --git a/safe_u8putstr.c b/safe_u8putstr.c index e198b54..31cdb0b 100644 --- a/safe_u8putstr.c +++ b/safe_u8putstr.c @@ -1,76 +1,57 @@ #include <stdint.h> #include <stdio.h> +#include "u8decode.h" + void safe_u8putstr(char *s0, size_t l, FILE *stream) { // tty-safe output of s, with relaxed utf-8 semantics: // - C0 and C1 are displayed as escape sequences - // - valid utf8 is printed as is - // - rest is assumed to be latin1, and translated into utf8 + // - valid utf-8 is printed as is + // - rest is assumed to be latin-1, and translated into utf-8 // - translate CRLF to CR unsigned char *s = (unsigned char *)s0; unsigned char *e = s + l; + uint32_t c; while (s < e) { - if ((*s & 0x80) == 0) { - if (*s < 32 && - *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') { - // C0 + int l = u8decode((char *)s, &c); + if (l == -1) { + l = 1; + if (*s <= 0x9fu) { + // C1 fputc(0xe2, stream); fputc(0x90, stream); - fputc(0x80+*s, stream); - } else if (*s == 127) { - // DEL + fputc(0x80+0x1b, stream); + fputc(0xe2, stream); fputc(0x90, stream); - fputc(0xa1, stream); - } else if (*s == '\r') { - if (e - s > 1 && s[1] == '\n') - s++; fputc(*s, stream); } else { - // safe ASCII - fputc(*s, stream); + /* invalid utf-8, assume it was latin-1 */ + fputc(0xc0 | (*s >> 6), stream); + fputc(0x80 | (*s & 0x3f), stream); } - } else if ((*s & 0xc0) == 0x80) { - if (*s >= 0xa0) - goto latin1; - - // C1 + } else if (c < 32 && + *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') { + // C0 fputc(0xe2, stream); fputc(0x90, stream); - fputc(0x80+0x1b, stream); - + fputc(0x80+*s, stream); + } else if (c == 127) { + // DEL fputc(0xe2, stream); fputc(0x90, stream); + fputc(0xa1, stream); + } else if (c == '\r') { + if (e - s > 1 && s[1] == '\n') + s++; fputc(*s, stream); } else { - uint32_t f = 0; - if (e - s >= 4) - f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8) | s[3]; - else if (e - s == 3) - f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8); - else if (e - s == 2) - f = (s[0]<<24) | (s[1]<<16); - else if (e - s == 1) - f = (s[0]<<24); - - if ((f & 0xe0c00000) == 0xc0800000) goto u2; - else if ((f & 0xf0c0c000) == 0xe0808000) goto u3; - else if ((f & 0xf8c0c0c0) == 0xf0808080) { - fputc(*s++, stream); -u3: fputc(*s++, stream); -u2: fputc(*s++, stream); - fputc(*s, stream); - } else { -latin1: - /* invalid utf8, assume it was latin1 */ - fputc(0xc0 | (*s >> 6), stream); - fputc(0x80 | (*s & 0x3f), stream); - } + fwrite(s, 1, l, stream); } - s++; + s += l; } } |