about summary refs log tree commit diff
path: root/safe_u8putstr.c
diff options
context:
space:
mode:
author Leah Neukirchen <leah@vuxu.org> 2017-03-13 17:32:13 +0100
committer Leah Neukirchen <leah@vuxu.org> 2017-03-13 17:32:13 +0100
commit d21ab392e99ef0a5a1b3382136fe94c784adf5b3 (patch)
tree 770521c7db049230215107fcb63492debd3d2dc5 /safe_u8putstr.c
parent 53c6a3b37ee28dd11698ec414939aad18244db68 (diff)
download mblaze-d21ab392e99ef0a5a1b3382136fe94c784adf5b3.tar.gz
mblaze-d21ab392e99ef0a5a1b3382136fe94c784adf5b3.tar.xz
mblaze-d21ab392e99ef0a5a1b3382136fe94c784adf5b3.zip
safe_u8putstr: actually transcode assumed latin1 into valid utf8
Diffstat (limited to 'safe_u8putstr.c')
-rw-r--r-- safe_u8putstr.c 18
1 files changed, 13 insertions, 5 deletions
diff --git a/safe_u8putstr.c b/safe_u8putstr.c
index 758c5be..28c541e 100644
--- a/safe_u8putstr.c
+++ b/safe_u8putstr.c
@@ -7,7 +7,7 @@ safe_u8putstr(char *s0, size_t l, FILE *stream)
 	// tty-safe output of s, with relaxed utf-8 semantics:
 	// - C0 and C1 are displayed as escape sequences
 	// - valid utf8 is printed as is
-	// - rest is printed bytewise as is (probably latin1)
+	// - rest is assumed to be latin1, and translated into utf8
 	// - translate CRLF to CR
 
 	unsigned char *s = (unsigned char* )s0;
@@ -35,6 +35,9 @@ safe_u8putstr(char *s0, size_t l, FILE *stream)
 				fputc(*s, stream);
 			}
 		} else if ((*s & 0xc0) == 0x80) {
+			if (*s >= 0xa0)
+				goto latin1;
+
 			// C1
 			fputc(0xe2, stream);
 			fputc(0x90, stream);
@@ -56,12 +59,17 @@ safe_u8putstr(char *s0, size_t l, FILE *stream)
 			
 			if      ((f & 0xe0c00000) == 0xc0800000) goto u2;
 			else if ((f & 0xf0c0c000) == 0xe0808000) goto u3;
-			else if ((f & 0xf8c0c0c0) == 0xf0808080) goto u4;
-			else    /* invalid utf8 */               goto u1;
-u4:				fputc(*s++, stream);
+			else if ((f & 0xf8c0c0c0) == 0xf0808080) {
+				fputc(*s++, stream);
 u3:				fputc(*s++, stream);
 u2:				fputc(*s++, stream);
-u1:				fputc(*s, stream);
+				fputc(*s, stream);
+			} else {
+latin1:
+				/* invalid utf8, assume it was latin1 */
+				fputc(0xc0 | (*s >> 6), stream);
+				fputc(0x80 | (*s & 0x3f), stream);
+			}
 		}
 		s++;
 	}