about summary refs log tree commit diff
path: root/src/ctype/towctrans.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/ctype/towctrans.c')
-rw-r--r-- src/ctype/towctrans.c 345
1 files changed, 55 insertions, 290 deletions
diff --git a/src/ctype/towctrans.c b/src/ctype/towctrans.c
index 9b91b2de..76d13769 100644
--- a/src/ctype/towctrans.c
+++ b/src/ctype/towctrans.c
@@ -1,308 +1,73 @@
-#include <ctype.h>
-#include <stddef.h>
 #include <wctype.h>
 
-#define CASEMAP(u1,u2,l) { (u1), (l)-(u1), (u2)-(u1)+1 }
-#define CASELACE(u1,u2) CASEMAP((u1),(u2),(u1)+1)
+static const unsigned char tab[];
 
-static const struct {
-	unsigned short upper;
-	signed char lower;
-	unsigned char len;
-} casemaps[] = {
-	CASEMAP(0xc0,0xde,0xe0),
+static const unsigned char rulebases[512];
+static const int rules[];
 
-	CASELACE(0x0100,0x012e),
-	CASELACE(0x0132,0x0136),
-	CASELACE(0x0139,0x0147),
-	CASELACE(0x014a,0x0176),
-	CASELACE(0x0179,0x017d),
+static const unsigned char exceptions[][2];
 
-	CASELACE(0x370,0x372),
-	CASEMAP(0x391,0x3a1,0x3b1),
-	CASEMAP(0x3a3,0x3ab,0x3c3),
-	CASEMAP(0x400,0x40f,0x450),
-	CASEMAP(0x410,0x42f,0x430),
+#include "casemap.h"
 
-	CASELACE(0x460,0x480),
-	CASELACE(0x48a,0x4be),
-	CASELACE(0x4c1,0x4cd),
-	CASELACE(0x4d0,0x50e),
-
-	CASELACE(0x514,0x52e),
-	CASEMAP(0x531,0x556,0x561),
-
-	CASELACE(0x01a0,0x01a4),
-	CASELACE(0x01b3,0x01b5),
-	CASELACE(0x01cd,0x01db),
-	CASELACE(0x01de,0x01ee),
-	CASELACE(0x01f8,0x021e),
-	CASELACE(0x0222,0x0232),
-	CASELACE(0x03d8,0x03ee),
-
-	CASELACE(0x1e00,0x1e94),
-	CASELACE(0x1ea0,0x1efe),
-
-	CASEMAP(0x1f08,0x1f0f,0x1f00),
-	CASEMAP(0x1f18,0x1f1d,0x1f10),
-	CASEMAP(0x1f28,0x1f2f,0x1f20),
-	CASEMAP(0x1f38,0x1f3f,0x1f30),
-	CASEMAP(0x1f48,0x1f4d,0x1f40),
-
-	CASEMAP(0x1f68,0x1f6f,0x1f60),
-	CASEMAP(0x1f88,0x1f8f,0x1f80),
-	CASEMAP(0x1f98,0x1f9f,0x1f90),
-	CASEMAP(0x1fa8,0x1faf,0x1fa0),
-	CASEMAP(0x1fb8,0x1fb9,0x1fb0),
-	CASEMAP(0x1fba,0x1fbb,0x1f70),
-	CASEMAP(0x1fc8,0x1fcb,0x1f72),
-	CASEMAP(0x1fd8,0x1fd9,0x1fd0),
-	CASEMAP(0x1fda,0x1fdb,0x1f76),
-	CASEMAP(0x1fe8,0x1fe9,0x1fe0),
-	CASEMAP(0x1fea,0x1feb,0x1f7a),
-	CASEMAP(0x1ff8,0x1ff9,0x1f78),
-	CASEMAP(0x1ffa,0x1ffb,0x1f7c),
-
-	CASEMAP(0x13f0,0x13f5,0x13f8),
-	CASELACE(0xa698,0xa69a),
-	CASELACE(0xa796,0xa79e),
-
-	CASELACE(0x246,0x24e),
-	CASELACE(0x510,0x512),
-	CASEMAP(0x2160,0x216f,0x2170),
-	CASEMAP(0x2c00,0x2c2e,0x2c30),
-	CASELACE(0x2c67,0x2c6b),
-	CASELACE(0x2c80,0x2ce2),
-	CASELACE(0x2ceb,0x2ced),
-
-	CASELACE(0xa640,0xa66c),
-	CASELACE(0xa680,0xa696),
-
-	CASELACE(0xa722,0xa72e),
-	CASELACE(0xa732,0xa76e),
-	CASELACE(0xa779,0xa77b),
-	CASELACE(0xa77e,0xa786),
-
-	CASELACE(0xa790,0xa792),
-	CASELACE(0xa7a0,0xa7a8),
-
-	CASELACE(0xa7b4,0xa7b6),
-
-	CASEMAP(0xff21,0xff3a,0xff41),
-	{ 0,0,0 }
-};
-
-static const unsigned short pairs[][2] = {
-	{ 'I',    0x0131 },
-	{ 'S',    0x017f },
-	{ 0x0130, 'i'    },
-	{ 0x0178, 0x00ff },
-	{ 0x0181, 0x0253 },
-	{ 0x0182, 0x0183 },
-	{ 0x0184, 0x0185 },
-	{ 0x0186, 0x0254 },
-	{ 0x0187, 0x0188 },
-	{ 0x0189, 0x0256 },
-	{ 0x018a, 0x0257 },
-	{ 0x018b, 0x018c },
-	{ 0x018e, 0x01dd },
-	{ 0x018f, 0x0259 },
-	{ 0x0190, 0x025b },
-	{ 0x0191, 0x0192 },
-	{ 0x0193, 0x0260 },
-	{ 0x0194, 0x0263 },
-	{ 0x0196, 0x0269 },
-	{ 0x0197, 0x0268 },
-	{ 0x0198, 0x0199 },
-	{ 0x019c, 0x026f },
-	{ 0x019d, 0x0272 },
-	{ 0x019f, 0x0275 },
-	{ 0x01a6, 0x0280 },
-	{ 0x01a7, 0x01a8 },
-	{ 0x01a9, 0x0283 },
-	{ 0x01ac, 0x01ad },
-	{ 0x01ae, 0x0288 },
-	{ 0x01af, 0x01b0 },
-	{ 0x01b1, 0x028a },
-	{ 0x01b2, 0x028b },
-	{ 0x01b7, 0x0292 },
-	{ 0x01b8, 0x01b9 },
-	{ 0x01bc, 0x01bd },
-	{ 0x01c4, 0x01c6 },
-	{ 0x01c4, 0x01c5 },
-	{ 0x01c5, 0x01c6 },
-	{ 0x01c7, 0x01c9 },
-	{ 0x01c7, 0x01c8 },
-	{ 0x01c8, 0x01c9 },
-	{ 0x01ca, 0x01cc },
-	{ 0x01ca, 0x01cb },
-	{ 0x01cb, 0x01cc },
-	{ 0x01f1, 0x01f3 },
-	{ 0x01f1, 0x01f2 },
-	{ 0x01f2, 0x01f3 },
-	{ 0x01f4, 0x01f5 },
-	{ 0x01f6, 0x0195 },
-	{ 0x01f7, 0x01bf },
-	{ 0x0220, 0x019e },
-	{ 0x0386, 0x03ac },
-	{ 0x0388, 0x03ad },
-	{ 0x0389, 0x03ae },
-	{ 0x038a, 0x03af },
-	{ 0x038c, 0x03cc },
-	{ 0x038e, 0x03cd },
-	{ 0x038f, 0x03ce },
-	{ 0x0399, 0x0345 },
-	{ 0x0399, 0x1fbe },
-	{ 0x03a3, 0x03c2 },
-	{ 0x03f7, 0x03f8 },
-	{ 0x03fa, 0x03fb },
-	{ 0x1e60, 0x1e9b },
-	{ 0x1e9e, 0xdf },
-
-	{ 0x1f59, 0x1f51 },
-	{ 0x1f5b, 0x1f53 },
-	{ 0x1f5d, 0x1f55 },
-	{ 0x1f5f, 0x1f57 },
-	{ 0x1fbc, 0x1fb3 },
-	{ 0x1fcc, 0x1fc3 },
-	{ 0x1fec, 0x1fe5 },
-	{ 0x1ffc, 0x1ff3 },
-
-	{ 0x23a, 0x2c65 },
-	{ 0x23b, 0x23c },
-	{ 0x23d, 0x19a },
-	{ 0x23e, 0x2c66 },
-	{ 0x241, 0x242 },
-	{ 0x243, 0x180 },
-	{ 0x244, 0x289 },
-	{ 0x245, 0x28c },
-	{ 0x37f, 0x3f3 },
-	{ 0x3f4, 0x3b8 },
-	{ 0x3f9, 0x3f2 },
-	{ 0x3fd, 0x37b },
-	{ 0x3fe, 0x37c },
-	{ 0x3ff, 0x37d },
-	{ 0x4c0, 0x4cf },
-
-	{ 0x2126, 0x3c9 },
-	{ 0x212a, 'k' },
-	{ 0x212b, 0xe5 },
-	{ 0x2132, 0x214e },
-	{ 0x2183, 0x2184 },
-	{ 0x2c60, 0x2c61 },
-	{ 0x2c62, 0x26b },
-	{ 0x2c63, 0x1d7d },
-	{ 0x2c64, 0x27d },
-	{ 0x2c6d, 0x251 },
-	{ 0x2c6e, 0x271 },
-	{ 0x2c6f, 0x250 },
-	{ 0x2c70, 0x252 },
-	{ 0x2c72, 0x2c73 },
-	{ 0x2c75, 0x2c76 },
-	{ 0x2c7e, 0x23f },
-	{ 0x2c7f, 0x240 },
-	{ 0x2cf2, 0x2cf3 },
-
-	{ 0xa77d, 0x1d79 },
-	{ 0xa78b, 0xa78c },
-	{ 0xa78d, 0x265 },
-	{ 0xa7aa, 0x266 },
-
-	{ 0x10c7, 0x2d27 },
-	{ 0x10cd, 0x2d2d },
-
-	/* bogus greek 'symbol' letters */
-	{ 0x376, 0x377 },
-	{ 0x39c, 0xb5 },
-	{ 0x392, 0x3d0 },
-	{ 0x398, 0x3d1 },
-	{ 0x3a6, 0x3d5 },
-	{ 0x3a0, 0x3d6 },
-	{ 0x39a, 0x3f0 },
-	{ 0x3a1, 0x3f1 },
-	{ 0x395, 0x3f5 },
-	{ 0x3cf, 0x3d7 },
-
-	{ 0xa7ab, 0x25c },
-	{ 0xa7ac, 0x261 },
-	{ 0xa7ad, 0x26c },
-	{ 0xa7ae, 0x26a },
-	{ 0xa7b0, 0x29e },
-	{ 0xa7b1, 0x287 },
-	{ 0xa7b2, 0x29d },
-	{ 0xa7b3, 0xab53 },
-
-	/* special cyrillic lowercase forms */
-	{ 0x412, 0x1c80 },
-	{ 0x414, 0x1c81 },
-	{ 0x41e, 0x1c82 },
-	{ 0x421, 0x1c83 },
-	{ 0x422, 0x1c84 },
-	{ 0x422, 0x1c85 },
-	{ 0x42a, 0x1c86 },
-	{ 0x462, 0x1c87 },
-	{ 0xa64a, 0x1c88 },
-
-	{ 0,0 }
-};
-
-
-static wchar_t __towcase(wchar_t wc, int lower)
+static int casemap(unsigned c, int dir)
 {
-	int i;
-	int lmul = 2*lower-1;
-	int lmask = lower-1;
-	/* no letters with case in these large ranges */
-	if (!iswalpha(wc)
-	 || (unsigned)wc - 0x0600 <= 0x0fff-0x0600
-	 || (unsigned)wc - 0x2e00 <= 0xa63f-0x2e00
-	 || (unsigned)wc - 0xa800 <= 0xab52-0xa800
-	 || (unsigned)wc - 0xabc0 <= 0xfeff-0xabc0)
-		return wc;
-	/* special case because the diff between upper/lower is too big */
-	if (lower && (unsigned)wc - 0x10a0 < 0x2e)
-		if (wc>0x10c5 && wc != 0x10c7 && wc != 0x10cd) return wc;
-		else return wc + 0x2d00 - 0x10a0;
-	if (!lower && (unsigned)wc - 0x2d00 < 0x26)
-		if (wc>0x2d25 && wc != 0x2d27 && wc != 0x2d2d) return wc;
-		else return wc + 0x10a0 - 0x2d00;
-	if (lower && (unsigned)wc - 0x13a0 < 0x50)
-		return wc + 0xab70 - 0x13a0;
-	if (!lower && (unsigned)wc - 0xab70 < 0x50)
-		return wc + 0x13a0 - 0xab70;
-	for (i=0; casemaps[i].len; i++) {
-		int base = casemaps[i].upper + (lmask & casemaps[i].lower);
-		if ((unsigned)wc-base < casemaps[i].len) {
-			if (casemaps[i].lower == 1)
-				return wc + lower - ((wc-casemaps[i].upper)&1);
-			return wc + lmul*casemaps[i].lower;
+	unsigned b, x, y, v, rt, xb, xn;
+	int r, rd, c0 = c;
+
+	if (c >= 0x20000) return c;
+
+	b = c>>8;
+	c &= 255;
+	x = c/3;
+	y = c%3;
+
+	/* lookup entry in two-level base-6 table */
+	v = tab[tab[b]*86+x];
+	static const int mt[] = { 2048, 342, 57 };
+	v = (v*mt[y]>>11)%6;
+
+	/* use the bit vector out of the tables as an index into
+	 * a block-specific set of rules and decode the rule into
+	 * a type and a case-mapping delta. */
+	r = rules[rulebases[b]+v];
+	rt = r & 255;
+	rd = r >> 8;
+
+	/* rules 0/1 are simple lower/upper case with a delta.
+	 * apply according to desired mapping direction. */
+	if (rt < 2) return c0 + (rd & -(rt^dir));
+
+	/* binary search. endpoints of the binary search for
+	 * this block are stored in the rule delta field. */
+	xn = rd & 0xff;
+	xb = (unsigned)rd >> 8;
+	while (xn) {
+		unsigned try = exceptions[xb+xn/2][0];
+		if (try == c) {
+			r = rules[exceptions[xb+xn/2][1]];
+			rt = r & 255;
+			rd = r >> 8;
+			if (rt < 2) return c0 + (rd & -(rt^dir));
+			/* Hard-coded for the four exceptional titlecase */
+			return c0 + (dir ? -1 : 1);
+		} else if (try > c) {
+			xn /= 2;
+		} else {
+			xb += xn/2;
+			xn -= xn/2;
 		}
 	}
-	for (i=0; pairs[i][1-lower]; i++) {
-		if (pairs[i][1-lower] == wc)
-			return pairs[i][lower];
-	}
-	if ((unsigned)wc - (0x10428 - 0x28*lower) < 0x28)
-		return wc - 0x28 + 0x50*lower;
-	if ((unsigned)wc - (0x104d8 - 0x28*lower) < 0x24)
-		return wc - 0x28 + 0x50*lower;
-	if ((unsigned)wc - (0x10cc0 - 0x40*lower) < 0x33)
-		return wc - 0x40 + 0x80*lower;
-	if ((unsigned)wc - (0x118c0 - 0x20*lower) < 0x20)
-		return wc - 0x20 + 0x40*lower;
-	if ((unsigned)wc - (0x1e922 - 0x22*lower) < 0x22)
-		return wc - 0x22 + 0x44*lower;
-	return wc;
+	return c0;
 }
 
-wint_t towupper(wint_t wc)
+wint_t towlower(wint_t wc)
 {
-	return (unsigned)wc < 128 ? toupper(wc) : __towcase(wc, 0);
+	return casemap(wc, 0);
 }
 
-wint_t towlower(wint_t wc)
+wint_t towupper(wint_t wc)
 {
-	return (unsigned)wc < 128 ? tolower(wc) : __towcase(wc, 1);
+	return casemap(wc, 1);
 }
 
 wint_t __towupper_l(wint_t c, locale_t l)