diff options
Diffstat (limited to 'wcsmbs')
-rw-r--r-- | wcsmbs/btowc.c | 10 | ||||
-rw-r--r-- | wcsmbs/mbrlen.c | 3 | ||||
-rw-r--r-- | wcsmbs/mbrtowc.c | 111 | ||||
-rw-r--r-- | wcsmbs/mbsinit.c | 18 | ||||
-rw-r--r-- | wcsmbs/mbsrtowcs.c | 114 | ||||
-rw-r--r-- | wcsmbs/wchar.h | 20 | ||||
-rw-r--r-- | wcsmbs/wcrtomb.c | 62 | ||||
-rw-r--r-- | wcsmbs/wcsrtombs.c | 95 | ||||
-rw-r--r-- | wcsmbs/wctob.c | 9 |
9 files changed, 332 insertions, 110 deletions
diff --git a/wcsmbs/btowc.c b/wcsmbs/btowc.c index 062be7ec02..2f13cc7ce4 100644 --- a/wcsmbs/btowc.c +++ b/wcsmbs/btowc.c @@ -21,16 +21,14 @@ Boston, MA 02111-1307, USA. */ #include <wchar.h> +/* We use UTF8 encoding for multibyte strings and therefore a valid + one byte multibyte string only can have a value from 0 to 0x7f. */ wint_t btowc (c) int c; { - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ - if (WEOF != (wint_t) EOF) + if (WEOF != (wint_t) EOF || c < 0 || c > 0x7f) return WEOF; else - return c; + return (wint_t) c; } diff --git a/wcsmbs/mbrlen.c b/wcsmbs/mbrlen.c index a50631e8d1..c5a27116be 100644 --- a/wcsmbs/mbrlen.c +++ b/wcsmbs/mbrlen.c @@ -26,10 +26,11 @@ static mbstate_t internal; size_t -mbrlen (s, n, ps) +__mbrlen (s, n, ps) const char *s; size_t n; mbstate_t *ps; { return mbrtowc (NULL, s, n, ps ?: &internal); } +weak_alias (__mbrlen, mbrlen) diff --git a/wcsmbs/mbrtowc.c b/wcsmbs/mbrtowc.c index 2c4b0779da..9e70a0b2c9 100644 --- a/wcsmbs/mbrtowc.c +++ b/wcsmbs/mbrtowc.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU C Library. -Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu> +Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -17,50 +17,115 @@ License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ +#include <errno.h> #include <wchar.h> +#ifndef EILSEQ +#define EILSEQ EINVAL +#endif + static mbstate_t internal; size_t -mbrtowc (pwc, s, n, ps) - wchar_t *pwc; - const char *s; - size_t n; - mbstate_t *ps; +mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { wchar_t to_wide; + size_t used = 0; if (ps == NULL) ps = &internal; - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ - if (s == NULL) { + /* See first paragraph of description in 7.16.6.3.2. */ pwc = NULL; s = ""; n = 1; } - if (n == 0) - return (size_t) -2; + if (n > 0) + { + if (ps->count == 0) + { + unsigned char byte = (unsigned char) *s++; + ++used; - /* For now. */ - to_wide = (wchar_t) *s; + /* We must look for a possible first byte of a UTF8 sequence. */ + if (byte < 0x80) + { + /* One byte sequence. */ + if (pwc != NULL) + *pwc = (wchar_t) byte; + return byte ? used : 0; + } - if (pwc != NULL) - *pwc = to_wide; + if ((byte & 0xc0) == 0x80 || (byte & 0xfe) == 0xfe) + { + /* Oh, oh. An encoding error. */ + errno = EILSEQ; + return (size_t) -1; + } - if (pwc == L'\0') - { - *ps = 0; /* This is required. */ - return 0; + if ((byte & 0xe0) == 0xc0) + { + /* We expect two bytes. */ + ps->count = 1; + ps->value = byte & 0x1f; + } + else if ((byte & 0xf0) == 0xe0) + { + /* We expect three bytes. */ + ps->count = 2; + ps->value = byte & 0x0f; + } + else if ((byte & 0xf8) == 0xf0) + { + /* We expect four bytes. */ + ps->count = 3; + ps->value = byte & 0x07; + } + else if ((byte & 0xfc) == 0xf8) + { + /* We expect five bytes. */ + ps->count = 4; + ps->value = byte & 0x03; + } + else + { + /* We expect six bytes. */ + ps->count = 5; + ps->value = byte & 0x01; + } + } + + /* We know we have to handle a multibyte character and there are + some more bytes to read. 
*/ + while (used < n) + { + /* The second to sixth bytes must be of the form 10xxxxxx. */ + unsigned char byte = (unsigned char) *s++; + ++used; + + if ((byte & 0xc0) != 0x80) + { + /* Oh, oh. An encoding error. */ + errno = EILSEQ; + return (size_t) -1; + } + + ps->value <<= 6; + ps->value |= byte & 0x3f; + + if (--ps->count == 0) + { + /* The character is finished. */ + if (pwc != NULL) + *pwc = (wchar_t) ps->value; + return ps->value ? used : 0; + } + } } - /* Return code (size_t)-1 cannot happen for now. */ - return 1; + return (size_t) -2; } diff --git a/wcsmbs/mbsinit.c b/wcsmbs/mbsinit.c index efbfd09347..f56ce20331 100644 --- a/wcsmbs/mbsinit.c +++ b/wcsmbs/mbsinit.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU C Library. -Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu> +Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -20,15 +20,17 @@ Boston, MA 02111-1307, USA. */ #include <string.h> #include <wchar.h> - +/* In GNU libc the conversion functions only can convert between the + fixed wide character representation and the multibyte + representation of the same character set. Since we use ISO 10646 + in UCS4 encoding for wide characters the best solution for + multibyte characters is the UTF8 encoding. I.e., the only state + information is a counter of the processed bytes so far and the + value collected so far. Especially, we don't have different shift + states. */ int mbsinit (ps) const mbstate_t *ps; { - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. 
*| - \*************************************************************/ - - return ps == NULL || *ps == 0; + return ps == NULL || ps->count == 0; } diff --git a/wcsmbs/mbsrtowcs.c b/wcsmbs/mbsrtowcs.c index dc026b7252..712b199271 100644 --- a/wcsmbs/mbsrtowcs.c +++ b/wcsmbs/mbsrtowcs.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU C Library. -Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu> +Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -17,9 +17,16 @@ License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <errno.h> #include <wchar.h> +#ifndef EILSEQ +#define EILSEQ EINVAL +#endif + +/* We don't need the state really because we don't have shift states + to maintain between calls to this function. */ static mbstate_t internal; size_t @@ -29,35 +36,102 @@ mbsrtowcs (dst, src, len, ps) size_t len; mbstate_t *ps; { - size_t result = 0; + size_t written = 0; + const char *run = *src; if (ps == NULL) ps = &internal; - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ + if (dst == NULL) + /* The LEN parameter has to be ignored if we don't actually write + anything. */ + len = ~0; - while (len > 0 && **src != '\0') + /* Copy all words. */ + while (written < len) { - /* For now there is no possibly illegal MB char sequence. 
*/ - if (dst != NULL) - dst[result] = (wchar_t) **src; - ++result; - ++(*src); - --len; - } + wchar_t value; + size_t count; + unsigned char byte = *run++; - if (len > 0) - { + /* We expect a start of a new multibyte character. */ + if (byte < 0x80) + { + /* One byte sequence. */ + count = 0; + value = byte; + } + else if ((byte & 0xe0) == 0xc0) + { + count = 1; + value = byte & 0x1f; + } + else if ((byte & 0xf0) == 0xe0) + { + /* We expect three bytes. */ + count = 2; + value = byte & 0x0f; + } + else if ((byte & 0xf8) == 0xf0) + { + /* We expect four bytes. */ + count = 3; + value = byte & 0x07; + } + else if ((byte & 0xfc) == 0xf8) + { + /* We expect five bytes. */ + count = 4; + value = byte & 0x03; + } + else if ((byte & 0xfe) == 0xfc) + { + /* We expect six bytes. */ + count = 5; + value = byte & 0x01; + } + else + { + /* This is an illegal encoding. */ + errno = EILSEQ; + return (size_t) -1; + } + + /* Read the possible remaining bytes. */ + while (count-- > 0) + { + byte = *run++; + + if ((byte & 0xc0) != 0x80) + { + /* This is an illegal encoding. */ + errno = EILSEQ; + return (size_t) -1; + } + + value <<= 6; + value |= byte & 0x3f; + } + + /* Store value is required. */ if (dst != NULL) + *dst++ = value; + + /* The whole sequence is read. Check whether end of string is + reached. */ + if (value == L'\0') { - dst[result] = L'\0'; - *ps = 0; + /* Found the end of the string. */ + *src = NULL; + return written; } - *src = NULL; + + /* Increment counter of produced words. */ + ++written; } - return result; + /* Store address of next byte to process. */ + *src = run; + + return written; } diff --git a/wcsmbs/wchar.h b/wcsmbs/wchar.h index cc821b8a50..806bafa655 100644 --- a/wcsmbs/wchar.h +++ b/wcsmbs/wchar.h @@ -48,7 +48,11 @@ typedef unsigned int wint_t; /* Conversion state information. */ -typedef int mbstate_t; /* FIXME */ +typedef struct +{ + int count; /* Number of bytes needed for the current character. */ + wint_t value; /* Value so far. 
*/ +} mbstate_t; #define WCHAR_MIN ((wchar_t) 0) #define WCHAR_MAX (~WCHAR_MIN) @@ -145,9 +149,6 @@ extern int wctob __P ((wint_t __c)); state. */ extern int mbsinit __P ((__const mbstate_t *__ps)); -/* Return number of bytes in multibyte character pointed to by S. */ -extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *ps)); - /* Write wide character representation of multibyte character pointed to by S to PWC. */ extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n, @@ -156,6 +157,17 @@ extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n, /* Write multibyte representation of wide character WC to S. */ extern size_t wcrtomb __P ((char *__s, wchar_t __wc, mbstate_t *__ps)); +/* Return number of bytes in multibyte character pointed to by S. */ +extern size_t __mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps)); +extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps)); + +#if defined (__OPTIMIZE__) \ + && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)) +/* Define inline function as optimization. */ +extern __inline size_t mbrlen (__const char *s, size_t n, mbstate_t *ps) +{ return ps != NULL ? mbrtowc (NULL, s, n, ps) : __mbrlen (s, n, NULL); } +#endif + /* Write wide character representation of multibyte character string SRC to DST. */ extern size_t mbsrtowcs __P ((wchar_t *__dst, __const char **__src, diff --git a/wcsmbs/wcrtomb.c b/wcsmbs/wcrtomb.c index 9069fb105c..eb007a69b9 100644 --- a/wcsmbs/wcrtomb.c +++ b/wcsmbs/wcrtomb.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU C Library. -Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu> +Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -24,46 +24,68 @@ Boston, MA 02111-1307, USA. 
*/ #define EILSEQ EINVAL #endif +static const wchar_t encoding_mask[] = +{ + ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff +}; + +static const unsigned char encoding_byte[] = +{ + 0xc0, 0xe0, 0xf0, 0xf8, 0xfc +}; +/* The state is for this UTF8 encoding not used. */ static mbstate_t internal; size_t -wcrtomb (s, wc, ps) - char *s; - wchar_t wc; - mbstate_t *ps; +wcrtomb (char *s, wchar_t wc, mbstate_t *ps) { char fake[1]; + size_t written = 0; if (ps == NULL) ps = &internal; - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ - if (s == NULL) { s = fake; wc = L'\0'; } - if (wc == L'\0') + /* Store the UTF8 representation of WC. */ + if (wc < 0 || wc > 0x7fffffff) { - /* FIXME Write any shift sequence to get to *PS == NULL. */ - *ps = 0; - *s = '\0'; + /* This is no correct ISO 10646 character. */ + errno = EILSEQ; + return (size_t) -1; + } + + if (wc < 0x80) + { + /* It's a one byte sequence. */ + if (s != NULL) + *s = (char) wc; return 1; } - /* FIXME For now we don't handle real multi-byte encodings. */ - if ((wc & ~0xff) != 0) + for (written = 2; written < 6; ++written) + if ((wc & encoding_mask[written - 2]) == 0) + break; + + if (s != NULL) { - errno = EILSEQ; - return (size_t) -1; + size_t cnt = written; + s[0] = encoding_byte[cnt - 2]; + + --cnt; + do + { + s[cnt] = 0x80 | (wc & 0x3f); + wc >>= 6; + } + while (--cnt > 0); + s[0] |= wc; } - *s = (char) wc; - return 1; + return written; } diff --git a/wcsmbs/wcsrtombs.c b/wcsmbs/wcsrtombs.c index 9f1000937b..99ca6acc5b 100644 --- a/wcsmbs/wcsrtombs.c +++ b/wcsmbs/wcsrtombs.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU C Library. -Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu> +Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. 
The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -25,6 +25,18 @@ Boston, MA 02111-1307, USA. */ #endif +static const wchar_t encoding_mask[] = +{ + ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff +}; + +static const unsigned char encoding_byte[] = +{ + 0xc0, 0xe0, 0xf0, 0xf8, 0xfc +}; + +/* We don't need the state really because we don't have shift states + to maintain between calls to this function. */ static mbstate_t internal; size_t @@ -34,40 +46,79 @@ wcsrtombs (dst, src, len, ps) size_t len; mbstate_t *ps; { - size_t result = 0; + size_t written = 0; + const wchar_t *run = *src; if (ps == NULL) ps = &internal; - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ + if (dst == NULL) + /* The LEN parameter has to be ignored if we don't actually write + anything. */ + len = ~0; - while (len > 0 && **src != L'\0') + while (written < len) { - if ((**src & ~0xff) != 0) + wchar_t wc = *run++; + + if (wc < 0 || wc > 0x7fffffff) { + /* This is no correct ISO 10646 character. */ errno = EILSEQ; return (size_t) -1; } - if (dst != NULL) - dst[result] = (char) **src; - ++result; - ++(*src); - --len; - } - - if (len > 0) - { - if (dst != NULL) + if (wc == L'\0') + { + /* Found the end. */ + if (dst != NULL) + *dst = '\0'; + *src = NULL; + return written; + } + else if (wc < 0x80) { - dst[result] = '\0'; - *ps = 0; + /* It's a one byte sequence. */ + if (dst != NULL) + *dst++ = (char) wc; + ++written; + } + else + { + size_t step; + + for (step = 2; step < 6; ++step) + if ((wc & encoding_mask[step - 2]) == 0) + break; + + if (written + step >= len) + /* Too long. 
*/ + break; + + if (dst != NULL) + { + size_t cnt = step; + + dst[0] = encoding_byte[cnt - 2]; + + --cnt; + do + { + dst[cnt] = 0x80 | (wc & 0x3f); + wc >>= 6; + } + while (--cnt > 0); + dst[0] |= wc; + + dst += step; + } + + written += step; } - *src = NULL; } - return result; + /* Store position of first unprocessed word. */ + *src = run; + + return written; } diff --git a/wcsmbs/wctob.c b/wcsmbs/wctob.c index c27bd6baba..f541a2e97b 100644 --- a/wcsmbs/wctob.c +++ b/wcsmbs/wctob.c @@ -21,14 +21,11 @@ Boston, MA 02111-1307, USA. */ #include <wchar.h> +/* We use UTF8 encoding for multibyte strings and therefore a valid + one byte multibyte string only can have a value from 0 to 0x7f. */ int wctob (c) wint_t c; { - /*************************************************************\ - |* This is no complete implementation. While the multi-byte *| - |* character handling is not finished this will do. *| - \*************************************************************/ - - return (c & ~0xff) == 0 ? c : EOF; + return (c >= 0 && c <= 0x7f) ? c : EOF; } |