From f1fa8b68f3e7623a3ef86dcd0c7d090ccf0389f5 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 15 Apr 1998 17:02:23 +0000 Subject: Update. 1998-04-15 16:41 Ulrich Drepper Don't name internal representation since it might be different from the external form (namely on little endian machines). * iconv/gconv_builtin.h: Add UCS4 support. Change references to UCS4 into references to INTERNAL. * iconv/gconv_simple.c: Implement UCS4<->INTERNAL converters. Add endianess support to UCS functions. Change references to UCS4 into references to INTERNAL. * iconv/gconv_int.h: Change references to UCS4 into references to INTERNAL. * iconv/iconv_prog.c: Don't mention INTERNAL in --list output. * iconvdata/gconv-modules: Change accordingly. * wcsmbs/wcsmbsload.c: Change names to use INTERNAL. * iconv/gconv_simple.c: Adjust input buffer pointer for output buffer overflow. * iconvdata/8bit-gap.c: Likewise. * iconvdata/8bit-generic.c: Likewise. * iconvdata/big5.c: Likewise. * iconvdata/euccn.c: Likewise. * iconvdata/eucjp.c: Likewise. * iconvdata/euckr.c: Likewise. * iconvdata/euctw.c: Likewise. * iconvdata/iso646.c: Likewise. * iconvdata/iso6937.c: Likewise. * iconvdata/iso8859-1.c: Likewise. * iconvdata/johab.c: Likewise. * iconvdata/sjis.c: Likewise. * iconvdata/t61.c: Likewise. * iconvdata/uhc.c: Likewise. * iconvdata/8bit-gap.c: Correct access to to_ucs4 array. * iconvdata/8bit-generic.c: Likewise. * iconvdata/TESTS: Add more tests. * sysdeps/i386/bits/byteswap.h: Change to use "=r" when ror is used. 1998-04-15 11:47 Ulrich Drepper * iconvdata/Makefile: Better rules to run tests. * iconvdata/testdata/ISO-8859-1..UTF8: New file. * iconvdata/testdata/ISO-8859-10: Likewise. * iconvdata/testdata/ISO-8859-10..UCS2: Likewise. * iconvdata/testdata/ISO-8859-2: Likewise. * iconvdata/testdata/ISO-8859-2..UCS4: Likewise. * iconvdata/testdata/ISO-8859-2..UTF8: Likewise. * iconvdata/testdata/ISO-8859-3: Likewise. * iconvdata/testdata/ISO-8859-4: Likewise. 
* iconvdata/testdata/ISO-8859-5: Likewise. * iconvdata/testdata/ISO-8859-6: Likewise. * iconvdata/testdata/ISO-8859-7: Likewise. * iconvdata/testdata/ISO-8859-8: Likewise. * iconvdata/testdata/ISO-8859-9: Likewise. * iconvdata/run-iconv-test.sh: Handle $from..$t file to compare intermediate result (if available). * iconvdata/Makefile: Add rules to run run-iconv-test.sh. (distribute): Add run-iconv-test.sh and testdata/*. * stdlib/testmb.c (main): Simplify mbc array handling. * iconvdata/testdata/ISO-8859-1: New file. --- iconv/gconv_builtin.h | 36 ++++--- iconv/gconv_int.h | 13 +-- iconv/gconv_simple.c | 268 +++++++++++++++++++++++++++++++++++++------------- iconv/iconv_prog.c | 15 +-- 4 files changed, 240 insertions(+), 92 deletions(-) (limited to 'iconv') diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h index 9c98c3513d..265dca1f01 100644 --- a/iconv/gconv_builtin.h +++ b/iconv/gconv_builtin.h @@ -18,27 +18,41 @@ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +BUILTIN_ALIAS ("UCS4//", "ISO-10646/UCS4/") +BUILTIN_ALIAS ("UCS-4//", "ISO-10646/UCS4/") +BUILTIN_ALIAS ("ISO-10646//", "ISO-10646/UCS4/") +BUILTIN_ALIAS ("10646-1:1993//", "ISO-10646/UCS4/") +BUILTIN_ALIAS ("10646-1:1993/UCS4/", "ISO-10646/UCS4/") + +BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, + "ISO-10646/UCS4/", 1, "=INTERNAL->ucs4", + __gconv_transform_internal_ucs4, NULL, NULL) +BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS4/", 15, + "INTERNAL", 1, "=ucs4->INTERNAL", + __gconv_transform_internal_ucs4, NULL, NULL) +/* Please note that we need only one function for both direction. 
*/ + BUILTIN_ALIAS ("UTF8//", "ISO-10646/UTF8/") BUILTIN_ALIAS ("UTF-8//", "ISO-10646/UTF8/") -BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS4/", 15, - "ISO-10646/UTF8/", 1, "=ucs4->utf8", - __gconv_transform_ucs4_utf8, NULL, NULL) +BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, + "ISO-10646/UTF8/", 1, "=INTERNAL->utf8", + __gconv_transform_internal_utf8, NULL, NULL) BUILTIN_TRANSFORMATION ("ISO-10646/UTF-?8/", "ISO-10646/UTF", 13, - "ISO-10646/UCS4/", 1, "=utf8->ucs4", - __gconv_transform_utf8_ucs4, NULL, NULL) + "INTERNAL", 1, "=utf8->INTERNAL", + __gconv_transform_utf8_internal, NULL, NULL) BUILTIN_ALIAS ("UCS2//", "ISO-10646/UCS2/") BUILTIN_ALIAS ("UCS-2//", "ISO-10646/UCS2/") -BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS2/", 15, "ISO-10646/UCS4/", - 1, "=ucs2->ucs4", - __gconv_transform_ucs2_ucs4, NULL, NULL) +BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS2/", 15, "INTERNAL", + 1, "=ucs2->INTERNAL", + __gconv_transform_ucs2_internal, NULL, NULL) -BUILTIN_TRANSFORMATION (NULL, "ISO-10646/UCS4/", 15, "ISO-10646/UCS2/", - 1, "=ucs4->ucs2", - __gconv_transform_ucs4_ucs2, NULL, NULL) +BUILTIN_TRANSFORMATION (NULL, "INTERNAL", 8, "ISO-10646/UCS2/", + 1, "=INTERNAL->ucs2", + __gconv_transform_internal_ucs2, NULL, NULL) BUILTIN_TRANSFORMATION ("(.*)", NULL, 0, "\\1", 1, "=dummy", __gconv_transform_dummy, NULL, NULL) diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h index 35ec31a7b8..a1475f8508 100644 --- a/iconv/gconv_int.h +++ b/iconv/gconv_int.h @@ -153,12 +153,13 @@ extern void __gconv_get_builtin_trans (const char *__name, int __do_flush) __BUILTIN_TRANS (__gconv_transform_dummy); -__BUILTIN_TRANS (__gconv_transform_ascii_ucs4); -__BUILTIN_TRANS (__gconv_transform_ucs4_ascii); -__BUILTIN_TRANS (__gconv_transform_ucs4_utf8); -__BUILTIN_TRANS (__gconv_transform_utf8_ucs4); -__BUILTIN_TRANS (__gconv_transform_ucs2_ucs4); -__BUILTIN_TRANS (__gconv_transform_ucs4_ucs2); +__BUILTIN_TRANS (__gconv_transform_ascii_internal); +__BUILTIN_TRANS (__gconv_transform_internal_ascii); 
+__BUILTIN_TRANS (__gconv_transform_utf8_internal); +__BUILTIN_TRANS (__gconv_transform_internal_utf8); +__BUILTIN_TRANS (__gconv_transform_ucs2_internal); +__BUILTIN_TRANS (__gconv_transform_internal_ucs2); +__BUILTIN_TRANS (__gconv_transform_internal_ucs4); # undef __BUITLIN_TRANS #endif diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c index 38b6b56adb..b72e61edcc 100644 --- a/iconv/gconv_simple.c +++ b/iconv/gconv_simple.c @@ -18,6 +18,8 @@ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include +#include #include #include #include @@ -76,16 +78,21 @@ __gconv_transform_dummy (struct gconv_step *step, struct gconv_step_data *data, } -/* Convert from ISO 646-IRV to ISO 10646/UCS4. */ +/* Transform from the internal, UCS4-like format, to UCS4. The + difference between the internal ucs4 format and the real UCS4 + format is, if any, the endianness. The Unicode/ISO 10646 standard says that + unless some higher protocol specifies it differently, the byte + order is big endian. */ int -__gconv_transform_ascii_ucs4 (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_internal_ucs4 (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; - size_t do_write; + size_t do_write = 0; int result; /* If the function is called with no input this means we have to reset @@ -95,7 +102,6 @@ __gconv_transform_ascii_ucs4 (struct gconv_step *step, { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); - do_write = 0; /* Call the steps down the chain if there are any. 
*/ if (data->is_last) @@ -114,12 +120,126 @@ __gconv_transform_ascii_ucs4 (struct gconv_step *step, else { int save_errno = errno; - do_write = 0; result = GCONV_OK; do { - const unsigned char *newinbuf = inbuf; + size_t n_convert = (MIN (*inlen, + (data->outbufsize - data->outbufavail)) + / sizeof (wchar_t)); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + /* Sigh, we have to do some real work. */ + wchar_t *outbuf = (wchar_t *) &data->outbuf[data->outbufavail]; + size_t cnt; + + for (cnt = 0; cnt < n_convert; ++cnt) + outbuf[cnt] = bswap_32 (((wchar_t *) inbuf)[cnt]); + +#elif __BYTE_ORDER == __BIG_ENDIAN + /* Simply copy the data. */ + memcpy (&data->outbuf[data->outbufsize], inbuf, + n_convert * sizeof (wchar_t)); +#else +# error "This endianess is not supported." +#endif + + *inlen -= n_convert * sizeof (wchar_t); + inbuf += n_convert * sizeof (wchar_t); + data->outbufavail += n_convert * sizeof (wchar_t); + do_write += n_convert; + + if (*inlen > 0 && *inlen < sizeof (wchar_t)) + { + /* We have an incomplete character at the end. */ + result = GCONV_INCOMPLETE_INPUT; + break; + } + + if (data->is_last) + { + /* This is the last step. */ + result = (*inlen < sizeof (wchar_t) + ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT); + break; + } + + /* Status so far. */ + result = GCONV_EMPTY_INPUT; + + if (data->outbufavail > 0) + { + /* Call the functions below in the chain. */ + size_t newavail = data->outbufavail; + + result = (*fct) (next_step, next_data, data->outbuf, &newavail, + written, 0); + + /* Correct the output buffer. */ + if (newavail != data->outbufavail && newavail > 0) + { + memmove (data->outbuf, + &data->outbuf[data->outbufavail - newavail], + newavail); + data->outbufavail = newavail; + } + } + } + while (*inlen >= sizeof (wchar_t) && result == GCONV_EMPTY_INPUT); + + __set_errno (save_errno); + } + + if (written != NULL && data->is_last) + *written = do_write; + + return result; +} + + +/* Convert from ISO 646-IRV to the internal (UCS4-like) format. 
*/ +int +__gconv_transform_ascii_internal (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) +{ + struct gconv_step *next_step = step + 1; + struct gconv_step_data *next_data = data + 1; + gconv_fct fct = next_step->fct; + size_t do_write = 0; + int result; + + /* If the function is called with no input this means we have to reset + to the initial state. The possibly partly converted input is + dropped. */ + if (do_flush) + { + /* Clear the state. */ + memset (data->statep, '\0', sizeof (mbstate_t)); + + /* Call the steps down the chain if there are any. */ + if (data->is_last) + result = GCONV_OK; + else + { + struct gconv_step *next_step = step + 1; + struct gconv_step_data *next_data = data + 1; + + result = (*fct) (next_step, next_data, NULL, 0, written, 1); + + /* Clear output buffer. */ + data->outbufavail = 0; + } + } + else + { + const unsigned char *newinbuf = inbuf; + int save_errno = errno; + + result = GCONV_OK; + do + { size_t actually = 0; size_t cnt = 0; @@ -193,9 +313,10 @@ __gconv_transform_ascii_ucs4 (struct gconv_step *step, /* Convert from ISO 10646/UCS to ISO 646-IRV. 
*/ int -__gconv_transform_ucs4_ascii (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_internal_ascii (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; @@ -228,13 +349,13 @@ __gconv_transform_ucs4_ascii (struct gconv_step *step, } else { + const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; result = GCONV_OK; do { - const wchar_t *newinbuf = (const wchar_t *) inbuf; size_t actually = 0; size_t cnt = 0; @@ -264,11 +385,18 @@ __gconv_transform_ucs4_ascii (struct gconv_step *step, if (result != GCONV_OK) break; + /* Check for incomplete input. */ + if (*inlen > 0 && *inlen < sizeof (wchar_t)) + { + /* We have an incomplete character at the end. */ + result = GCONV_INCOMPLETE_INPUT; + break; + } + if (data->is_last) { /* This is the last step. */ - result = (*inlen < sizeof (wchar_t) - ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT); + result = *inlen == 0 ? 
GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT; break; } @@ -306,9 +434,10 @@ __gconv_transform_ucs4_ascii (struct gconv_step *step, int -__gconv_transform_ucs4_utf8 (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_internal_utf8 (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; @@ -341,13 +470,13 @@ __gconv_transform_ucs4_utf8 (struct gconv_step *step, } else { + const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; result = GCONV_OK; do { - const wchar_t *newinbuf = (const wchar_t *) inbuf; size_t cnt = 0; while (data->outbufavail < data->outbufsize @@ -397,16 +526,24 @@ __gconv_transform_ucs4_utf8 (struct gconv_step *step, /* Remember how much we converted. */ do_write += cnt; *inlen -= cnt * sizeof (wchar_t); + newinbuf += cnt; /* Check whether an illegal character appeared. */ if (result != GCONV_OK) break; + /* Check for incomplete input. */ + if (*inlen > 0 && *inlen < sizeof (wchar_t)) + { + /* We have an incomplete character at the end. */ + result = GCONV_INCOMPLETE_INPUT; + break; + } + if (data->is_last) { /* This is the last step. */ - result = (*inlen < sizeof (wchar_t) - ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT); + result = *inlen == 0 ? 
GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT; break; } @@ -444,9 +581,10 @@ __gconv_transform_ucs4_utf8 (struct gconv_step *step, int -__gconv_transform_utf8_ucs4 (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_utf8_internal (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; @@ -578,6 +716,7 @@ __gconv_transform_utf8_ucs4 (struct gconv_step *step, /* Remember how much we converted. */ do_write += actually; *inlen -= cnt; + inbuf += cnt; data->outbufavail += actually * sizeof (wchar_t); @@ -588,7 +727,7 @@ __gconv_transform_utf8_ucs4 (struct gconv_step *step, break; } - if (*inlen < extra) + if (*inlen > 0 && *inlen < extra) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; @@ -637,9 +776,10 @@ __gconv_transform_utf8_ucs4 (struct gconv_step *step, int -__gconv_transform_ucs2_ucs4 (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_ucs2_internal (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; @@ -669,12 +809,12 @@ __gconv_transform_ucs2_ucs4 (struct gconv_step *step, } else { + const uint16_t *newinbuf = (const uint16_t *) inbuf; int save_errno = errno; do_write = 0; do { - const uint16_t *newinbuf = (const uint16_t *) inbuf; wchar_t *outbuf = (wchar_t *) &data->outbuf[data->outbufavail]; size_t actually = 0; @@ -683,34 +823,29 @@ __gconv_transform_ucs2_ucs4 (struct gconv_step *step, while (data->outbufavail + 4 <= data->outbufsize && *inlen >= 2) { - outbuf[actually++] = *newinbuf++; +#if __BYTE_ORDER == __LITTLE_ENDIAN + 
outbuf[actually++] = (wchar_t) bswap_16 (*newinbuf++); +#else + outbuf[actually++] = (wchar_t) *newinbuf++; +#endif data->outbufavail += 4; *inlen -= 2; } - if (*inlen != 1) - { - /* We have an incomplete input character. */ - mbstate_t *state = data->statep; - state->count = 1; - state->value = *(uint8_t *) newinbuf; - --*inlen; - } - /* Remember how much we converted. */ do_write += actually * sizeof (wchar_t); - /* Check whether an illegal character appeared. */ - if (errno != 0) + if (*inlen == 1) { - result = GCONV_ILLEGAL_INPUT; + /* We have an incomplete character at the end. */ + result = GCONV_INCOMPLETE_INPUT; break; } - if (*inlen == 0 && !__mbsinit (data->statep)) + /* Check whether an illegal character appeared. */ + if (errno != 0) { - /* We have an incomplete character at the end. */ - result = GCONV_INCOMPLETE_INPUT; + result = GCONV_ILLEGAL_INPUT; break; } @@ -756,9 +891,10 @@ __gconv_transform_ucs2_ucs4 (struct gconv_step *step, int -__gconv_transform_ucs4_ucs2 (struct gconv_step *step, - struct gconv_step_data *data, const char *inbuf, - size_t *inlen, size_t *written, int do_flush) +__gconv_transform_internal_ucs2 (struct gconv_step *step, + struct gconv_step_data *data, + const char *inbuf, size_t *inlen, + size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; @@ -791,12 +927,12 @@ __gconv_transform_ucs4_ucs2 (struct gconv_step *step, } else { + const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; do { - const wchar_t *newinbuf = (const wchar_t *) inbuf; uint16_t *outbuf = (uint16_t *) &data->outbuf[data->outbufavail]; size_t actually = 0; @@ -810,39 +946,33 @@ __gconv_transform_ucs4_ucs2 (struct gconv_step *step, __set_errno (EILSEQ); break; } - outbuf[actually++] = (wchar_t) *newinbuf; +#if __BYTE_ORDER == __LITTLE_ENDIAN + /* Please note that we use the `uint32_t' pointer as a + `uint16_t' pointer which works since we are on a + little 
endian machine. */ + outbuf[actually++] = bswap_16 (*((uint16_t *) newinbuf)); + ++newinbuf; +#else + outbuf[actually++] = *newinbuf++; +#endif *inlen -= 4; data->outbufavail += 2; } - if (*inlen < 4) - { - /* We have an incomplete input character. */ - mbstate_t *state = data->statep; - state->count = *inlen; - state->value = 0; - while (*inlen > 0) - { - state->value <<= 8; - state->value += *(uint8_t *) newinbuf; - --*inlen; - } - } - /* Remember how much we converted. */ do_write += (const char *) newinbuf - inbuf; - /* Check whether an illegal character appeared. */ - if (errno != 0) + if (*inlen > 0 && *inlen < 4) { - result = GCONV_ILLEGAL_INPUT; + /* We have an incomplete input character. */ + result = GCONV_INCOMPLETE_INPUT; break; } - if (*inlen == 0 && !__mbsinit (data->statep)) + /* Check whether an illegal character appeared. */ + if (errno != 0) { - /* We have an incomplete character at the end. */ - result = GCONV_INCOMPLETE_INPUT; + result = GCONV_ILLEGAL_INPUT; break; } diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index 0c1b9d045d..569bd3b3ec 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -509,14 +509,17 @@ print_known_names (void) { if (__gconv_modules_db[cnt]->from_pattern == NULL) { - tsearch (__gconv_modules_db[cnt]->from_constpfx, &printlist, - (__compar_fn_t) strcoll); - tsearch (__gconv_modules_db[cnt]->to_string, &printlist, - (__compar_fn_t) strcoll); + if (strcmp (__gconv_modules_db[cnt]->from_constpfx, "INTERNAL")) + tsearch (__gconv_modules_db[cnt]->from_constpfx, &printlist, + (__compar_fn_t) strcoll); + if (strcmp (__gconv_modules_db[cnt]->to_string, "INTERNAL")) + tsearch (__gconv_modules_db[cnt]->to_string, &printlist, + (__compar_fn_t) strcoll); } else - tsearch (__gconv_modules_db[cnt]->from_pattern, &printlist, - (__compar_fn_t) strcoll); + if (strcmp (__gconv_modules_db[cnt]->from_pattern, "INTERNAL")) + tsearch (__gconv_modules_db[cnt]->from_pattern, &printlist, + (__compar_fn_t) strcoll); } fputs (_("\ -- 
cgit 1.4.1