make fgetwc handling of encoding errors consistent with/without buffer

previously, fgetwc left all but the first byte of an illegal sequence unread (available for subsequent calls) when reading out of the FILE buffer, but dropped all bytes contributing to the error when falling back to reading a byte at a time. neither behavior was ideal. in the buffered case, each malformed character produced one error per byte, rather than one per character. in the unbuffered case, consuming the last byte that caused the transition from "incomplete" to "invalid" state potentially dropped (and produced additional spurious encoding errors for) the next valid character. to handle both cases uniformly without duplicate code, revise the buffered case to only cover situations where a complete and valid character is present in the buffer, and fall back to byte-at-a-time for all other cases. this allows using mbtowc (stateless) instead of mbrtowc, which may slightly improve performance too. when an encoding error has been hit in the byte-at-a-time case, leave the final byte that produced the error unread (via ungetc) except in the case of single-byte errors (for UTF-8, bytes c0, c1, f5-ff, and continuation bytes with no lead byte). single-byte errors are fully consumed so as not to leave the caller in an infinite loop repeating the same error. none of these changes are distinguished from a conformance standpoint, since the file position is unspecified after encoding errors. they are intended merely as QoI/consistency improvements.
author: Rich Felker <dalias@aerifal.cx> 2017-11-20 16:25:54 -0500
committer: Rich Felker <dalias@aerifal.cx> 2017-11-20 16:25:54 -0500
commit: 4000b0107ddd7fe733fa31d4f078c6fcd35851d6 (patch)
tree: ed33813b0dbc943915c1a6f0458075e1742b6fd3 /src/stdio
parent: a90d9da1d1b14d81c4f93e1a6d1a686c3312e4ba (diff)
download: musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.tar.gz
musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.tar.xz
musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.zip
1 files changed, 14 insertions, 14 deletions
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index a00c1a86..07fb6d7c 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -5,36 +5,36 @@
 
 static wint_t __fgetwc_unlocked_internal(FILE *f)
 {
-	mbstate_t st = { 0 };
 	wchar_t wc;
 	int c;
-	unsigned char b;
 	size_t l;
 
 	/* Convert character from buffer if possible */
 	if (f->rpos < f->rend) {
-		l = mbrtowc(&wc, (void *)f->rpos, f->rend - f->rpos, &st);
-		if (l+2 >= 2) {
+		l = mbtowc(&wc, (void *)f->rpos, f->rend - f->rpos);
+		if (l+1 >= 1) {
 			f->rpos += l + !l; /* l==0 means 1 byte, null */
 			return wc;
 		}
-		if (l == -1) {
-			f->rpos++;
-			return WEOF;
-		}
-		f->rpos = f->rend;
-	} else l = -2;
+	}
 
 	/* Convert character byte-by-byte */
-	while (l == -2) {
+	mbstate_t st = { 0 };
+	unsigned char b;
+	int first = 1;
+	do {
 		b = c = getc_unlocked(f);
 		if (c < 0) {
-			if (!mbsinit(&st)) errno = EILSEQ;
+			if (!first) errno = EILSEQ;
 			return WEOF;
 		}
 		l = mbrtowc(&wc, (void *)&b, 1, &st);
-		if (l == -1) return WEOF;
-	}
+		if (l == -1) {
+			if (!first) ungetc(b, f);
+			return WEOF;
+		}
+		first = 0;
+	} while (l == -2);
 
 	return wc;
 }
author	Rich Felker <dalias@aerifal.cx>	2017-11-20 16:25:54 -0500
committer	Rich Felker <dalias@aerifal.cx>	2017-11-20 16:25:54 -0500
commit	4000b0107ddd7fe733fa31d4f078c6fcd35851d6 (patch)
tree	ed33813b0dbc943915c1a6f0458075e1742b6fd3 /src/stdio
parent	a90d9da1d1b14d81c4f93e1a6d1a686c3312e4ba (diff)
download	musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.tar.gz musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.tar.xz musl-4000b0107ddd7fe733fa31d4f078c6fcd35851d6.zip