about summary refs log tree commit diff
path: root/src/locale/iconv.c
diff options
context:
space:
mode:
authorRich Felker <dalias@aerifal.cx>2017-11-13 18:34:27 -0500
committerRich Felker <dalias@aerifal.cx>2017-11-13 18:34:27 -0500
commit105eff9dec51bc4898a74af2854ab71f927a5c3b (patch)
tree5ee1447893b7b760c599afb19341b5f8f2eb8ee3 /src/locale/iconv.c
parenta71b46cfd289aa0ff829fc9a436c59c398f8326d (diff)
downloadmusl-105eff9dec51bc4898a74af2854ab71f927a5c3b.tar.gz
musl-105eff9dec51bc4898a74af2854ab71f927a5c3b.tar.xz
musl-105eff9dec51bc4898a74af2854ab71f927a5c3b.zip
generalize iconv framework for 8-bit codepages
previously, 8-bit codepages could only remap the high 128 bytes; the
low range was assumed/forced to agree with ascii. interpretation of
codepage table headers has been changed so that it's possible to
represent mappings for up to 256 slots (fewer if the initial portion
of the map is elided because it coincides with unicode codepoints).
this requires consuming a bit more of the 10-bit space of characters
that can be represented in 8-bit codepages, but there's still a plenty
left. the size of the legacy_chars table is actually reduced now by
eliding the first 256 entries and considering them to map implicitly
via the identity map.

before these changes, there seem to have been minor bugs/omissions in
codepage table generation, so it's likely that some actual bug fixes
are silently included in this commit. round-trip testing of a few
codepages was performed on the new version of the code, but no
differential testing against the old version was done.
Diffstat (limited to 'src/locale/iconv.c')
-rw-r--r--src/locale/iconv.c27
1 files changed, 16 insertions, 11 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 2107b055..01f17521 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -27,8 +27,10 @@
 
 /* Definitions of charmaps. Each charmap consists of:
  * 1. Empty-string-terminated list of null-terminated aliases.
- * 2. Special type code or number of elided entries.
- * 3. Character table (size determined by field 2). */
+ * 2. Special type code or number of elided quads of entries.
+ * 3. Character table (size determined by field 2), consisting
+ *    of 5 bytes for every 4 characters, interpreted as 10-bit
+ *    indices into the legacy_chars table. */
 
 static const unsigned char charmaps[] =
 "utf8\0char\0\0\310"
@@ -51,6 +53,9 @@ static const unsigned char charmaps[] =
 #include "codepages.h"
 ;
 
+/* Table of characters that appear in legacy 8-bit codepages,
+ * limited to 1024 slots (10 bit indices). The first 256 entries
+ * are elided since those characters are obviously all included. */
 static const unsigned short legacy_chars[] = {
 #include "legacychars.h"
 };
@@ -96,7 +101,7 @@ static size_t find_charmap(const void *name)
 		s += strlen((void *)s)+1;
 		if (!*s) {
 			if (s[1] > 0200) s+=2;
-			else s+=2+(128U-s[1])/4*5;
+			else s+=2+(64U-s[1])*5;
 		}
 	}
 	return -1;
@@ -181,10 +186,10 @@ static void put_32(unsigned char *s, unsigned c, int e)
 
 static unsigned legacy_map(const unsigned char *map, unsigned c)
 {
-	unsigned x = c - 128 - map[-1];
-	x = legacy_chars[ map[x*5/4]>>2*x%8 |
-		map[x*5/4+1]<<8-2*x%8 & 1023 ];
-	return x ? x : c;
+	if (c < 4*map[-1]) return c;
+	unsigned x = c - 4*map[-1];
+	x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
+	return x < 256 ? x : legacy_chars[x-256];
 }
 
 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
@@ -449,9 +454,9 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (!c) goto ilseq;
 			break;
 		default:
-			if (c < 128+type) break;
+			if (!c) break;
 			c = legacy_map(map, c);
-			if (c==1) goto ilseq;
+			if (!c) goto ilseq;
 		}
 
 		switch (totype) {
@@ -475,14 +480,14 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			if (c > 0x7f) subst: x++, c='*';
 		default:
 			if (*outb < 1) goto toobig;
-			if (c < 128+totype || (c<256 && c==legacy_map(tomap, c))) {
+			if (c<256 && c==legacy_map(tomap, c)) {
 			revout:
 				*(*out)++ = c;
 				*outb -= 1;
 				break;
 			}
 			d = c;
-			for (c=128+totype; c<256; c++) {
+			for (c=4*totype; c<256; c++) {
 				if (d == legacy_map(tomap, c)) {
 					goto revout;
 				}