From 73641367db67f47c55d28d98b2d41467a8c87211 Mon Sep 17 00:00:00 2001
From: Leah Neukirchen
Date: Wed, 22 Nov 2017 23:48:00 +0100
Subject: revamp utf-8 handling code

---
 safe_u8putstr.c | 73 +++++++++++++++++++++------------------------------------
 1 file changed, 27 insertions(+), 46 deletions(-)

(limited to 'safe_u8putstr.c')

diff --git a/safe_u8putstr.c b/safe_u8putstr.c
index e198b54..31cdb0b 100644
--- a/safe_u8putstr.c
+++ b/safe_u8putstr.c
@@ -1,76 +1,57 @@
 #include <stdint.h>
 #include <stdio.h>
 
+#include "u8decode.h"
+
 void
 safe_u8putstr(char *s0, size_t l, FILE *stream)
 {
 	// tty-safe output of s, with relaxed utf-8 semantics:
 	// - C0 and C1 are displayed as escape sequences
-	// - valid utf8 is printed as is
-	// - rest is assumed to be latin1, and translated into utf8
+	// - valid utf-8 is printed as is
+	// - rest is assumed to be latin-1, and translated into utf-8
 	// - translate CRLF to CR
 
 	unsigned char *s = (unsigned char *)s0;
 	unsigned char *e = s + l;
+	uint32_t c;
 
 	while (s < e) {
-		if ((*s & 0x80) == 0) {
-			if (*s < 32 &&
-			    *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
-				// C0
+		int l = u8decode((char *)s, &c);
+		if (l == -1) {
+			l = 1;
+			if (*s <= 0x9fu) {
+				// C1
 				fputc(0xe2, stream);
 				fputc(0x90, stream);
-				fputc(0x80+*s, stream);
-			} else if (*s == 127) {
-				// DEL
+				fputc(0x80+0x1b, stream);
+
 				fputc(0xe2, stream);
 				fputc(0x90, stream);
-				fputc(0xa1, stream);
-			} else if (*s == '\r') {
-				if (e - s > 1 && s[1] == '\n')
-					s++;
 				fputc(*s, stream);
 			} else {
-				// safe ASCII
-				fputc(*s, stream);
+				/* invalid utf-8, assume it was latin-1 */
+				fputc(0xc0 | (*s >> 6), stream);
+				fputc(0x80 | (*s & 0x3f), stream);
 			}
-		} else if ((*s & 0xc0) == 0x80) {
-			if (*s >= 0xa0)
-				goto latin1;
-
-			// C1
+		} else if (c < 32 &&
+		    *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r') {
+			// C0
 			fputc(0xe2, stream);
 			fputc(0x90, stream);
-			fputc(0x80+0x1b, stream);
-
+			fputc(0x80+*s, stream);
+		} else if (c == 127) {
+			// DEL
 			fputc(0xe2, stream);
 			fputc(0x90, stream);
+			fputc(0xa1, stream);
+		} else if (c == '\r') {
+			if (e - s > 1 && s[1] == '\n')
+				s++;
 			fputc(*s, stream);
 		} else {
-			uint32_t f = 0;
-			if (e - s >= 4)
-				f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8) | s[3];
-			else if (e - s == 3)
-				f = (s[0]<<24) | (s[1]<<16) | (s[2]<<8);
-			else if (e - s == 2)
-				f = (s[0]<<24) | (s[1]<<16);
-			else if (e - s == 1)
-				f = (s[0]<<24);
-
-			if ((f & 0xe0c00000) == 0xc0800000) goto u2;
-			else if ((f & 0xf0c0c000) == 0xe0808000) goto u3;
-			else if ((f & 0xf8c0c0c0) == 0xf0808080) {
-				fputc(*s++, stream);
-u3:				fputc(*s++, stream);
-u2:				fputc(*s++, stream);
-				fputc(*s, stream);
-			} else {
-latin1:
-				/* invalid utf8, assume it was latin1 */
-				fputc(0xc0 | (*s >> 6), stream);
-				fputc(0x80 | (*s & 0x3f), stream);
-			}
+			fwrite(s, 1, l, stream);
 		}
-		s++;
+		s += l;
 	}
 }
-- 
cgit 1.4.1