about summary refs log tree commit diff
path: root/src/locale
diff options
context:
space:
mode:
author: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:08:54 -0500
committer: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:31:18 -0500
commit: 95c6044e2ae85846330814c4ac5ebf4102dbe02c (patch)
tree: af2c6f65ebcb37cca79713f72a678faf54117e9f /src/locale
parent: 9d4d0ee41b06acf68dac40332f53be7bfbde7404 (diff)
download: musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.gz
musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.xz
musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.zip
add support for BOM-determined-endian UCS2, UTF-16, and UTF-32 to iconv

previously, the charset names without endianness specified were always
interpreted as big endian. Unicode specifies that UTF-16 and UTF-32
have BOM-determined endianness if a BOM is present, and are otherwise
big endian. since commit 5b546faa67544af395d6407553762b37e9711157
added support for stateful encodings, it is now possible to implement
BOM support via the conversion descriptor state.

for conversions to these charsets, the output is always big endian and
does not have a BOM.
Diffstat (limited to 'src/locale')
-rw-r--r-- src/locale/iconv.c 43
1 files changed, 40 insertions, 3 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 1784dc9d..c5dd122f 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -16,6 +16,9 @@
 #define WCHAR_T     0306
 #define US_ASCII    0307
 #define UTF_8       0310
+#define UTF_16      0312
+#define UTF_32      0313
+#define UCS2        0314
 #define EUC_JP      0320
 #define SHIFT_JIS   0321
 #define ISO2022_JP  0322
@@ -35,13 +38,16 @@
 static const unsigned char charmaps[] =
 "utf8\0char\0\0\310"
 "wchart\0\0\306"
-"ucs2\0ucs2be\0\0\304"
+"ucs2be\0\0\304"
 "ucs2le\0\0\305"
-"utf16\0utf16be\0\0\302"
+"utf16be\0\0\302"
 "utf16le\0\0\301"
-"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
+"ucs4be\0utf32be\0\0\300"
 "ucs4le\0utf32le\0\0\303"
 "ascii\0usascii\0iso646\0iso646us\0\0\307"
+"utf16\0\0\312"
+"ucs4\0utf32\0\0\313"
+"ucs2\0\0\314"
 "eucjp\0\0\320"
 "shiftjis\0sjis\0\0\321"
 "iso2022jp\0\0\322"
@@ -145,6 +151,9 @@ iconv_t iconv_open(const char *to, const char *from)
 	iconv_t cd = combine_to_from(t, f);
 
 	switch (charmaps[f]) {
+	case UTF_16:
+	case UTF_32:
+	case UCS2:
 	case ISO2022_JP:
 		scd = malloc(sizeof *scd);
 		if (!scd) return (iconv_t)-1;
@@ -285,6 +294,31 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				c = ((c-0xd7c0)<<10) + (d-0xdc00);
 			}
 			break;
+		case UCS2:
+		case UTF_16:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 2) goto starved;
+				c = get_16((void *)*in, 0);
+				scd->state = type==UCS2
+					? c==0xfffe ? UCS2LE : UCS2BE
+					: c==0xfffe ? UTF_16LE : UTF_16BE;
+				if (c == 0xfffe || c == 0xfeff)
+					l = 2;
+			}
+			type = scd->state;
+			continue;
+		case UTF_32:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 4) goto starved;
+				c = get_32((void *)*in, 0);
+				scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
+				if (c == 0xfffe0000 || c == 0xfeff)
+					l = 4;
+			}
+			type = scd->state;
+			continue;
 		case SHIFT_JIS:
 			if (c < 128) break;
 			if (c-0xa1 <= 0xdf-0xa1) {
@@ -589,8 +623,11 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			*(*out)++ = 'B';
 			*outb -= 8;
 			break;
+		case UCS2:
+			totype = UCS2BE;
 		case UCS2BE:
 		case UCS2LE:
+		case UTF_16:
 		case UTF_16BE:
 		case UTF_16LE:
 			if (c < 0x10000 || type-UCS2BE < 2U) {