about summary refs log tree commit diff
path: root/wcsmbs
diff options
context:
space:
mode:
Diffstat (limited to 'wcsmbs')
-rw-r--r--wcsmbs/btowc.c10
-rw-r--r--wcsmbs/mbrlen.c3
-rw-r--r--wcsmbs/mbrtowc.c111
-rw-r--r--wcsmbs/mbsinit.c18
-rw-r--r--wcsmbs/mbsrtowcs.c114
-rw-r--r--wcsmbs/wchar.h20
-rw-r--r--wcsmbs/wcrtomb.c62
-rw-r--r--wcsmbs/wcsrtombs.c95
-rw-r--r--wcsmbs/wctob.c9
9 files changed, 332 insertions, 110 deletions
diff --git a/wcsmbs/btowc.c b/wcsmbs/btowc.c
index 062be7ec02..2f13cc7ce4 100644
--- a/wcsmbs/btowc.c
+++ b/wcsmbs/btowc.c
@@ -21,16 +21,14 @@ Boston, MA 02111-1307, USA.  */
 #include <wchar.h>
 
 
+/* We use UTF8 encoding for multibyte strings and therefore a valid
+   one byte multibyte string can only have a value from 0 to 0x7f.  */
 wint_t
 btowc (c)
      int c;
 {
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-  if (WEOF != (wint_t) EOF)
+  if (WEOF != (wint_t) EOF || c < 0 || c > 0x7f)
     return WEOF;
   else
-    return c;
+    return (wint_t) c;
 }
diff --git a/wcsmbs/mbrlen.c b/wcsmbs/mbrlen.c
index a50631e8d1..c5a27116be 100644
--- a/wcsmbs/mbrlen.c
+++ b/wcsmbs/mbrlen.c
@@ -26,10 +26,11 @@ static mbstate_t internal;
 
 
 size_t
-mbrlen (s, n, ps)
+__mbrlen (s, n, ps)
      const char *s;
      size_t n;
      mbstate_t *ps;
 {
   return mbrtowc (NULL, s, n, ps ?: &internal);
 }
+weak_alias (__mbrlen, mbrlen)
diff --git a/wcsmbs/mbrtowc.c b/wcsmbs/mbrtowc.c
index 2c4b0779da..9e70a0b2c9 100644
--- a/wcsmbs/mbrtowc.c
+++ b/wcsmbs/mbrtowc.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -17,50 +17,115 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA.  */
 
+#include <errno.h>
 #include <wchar.h>
 
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
+
 
 static mbstate_t internal;
 
 size_t
-mbrtowc (pwc, s, n, ps)
-     wchar_t *pwc;
-     const char *s;
-     size_t n;
-     mbstate_t *ps;
+mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 {
   wchar_t to_wide;
+  size_t used = 0;
 
   if (ps == NULL)
     ps = &internal;
 
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-
   if (s == NULL)
     {
+      /* See first paragraph of description in 7.16.6.3.2.  */
       pwc = NULL;
       s = "";
       n = 1;
     }
 
-  if (n == 0)
-    return (size_t) -2;
+  if (n > 0)
+    {
+      if (ps->count == 0)
+	{
+	  unsigned char byte = (unsigned char) *s++;
+	  ++used;
 
-  /* For now.  */
-  to_wide = (wchar_t) *s;
+	  /* We must look for a possible first byte of a UTF8 sequence.  */
+	  if (byte < 0x80)
+	    {
+	      /* One byte sequence.  */
+	      if (pwc != NULL)
+		*pwc = (wchar_t) byte;
+	      return byte ? used : 0;
+	    }
 
-  if (pwc != NULL)
-    *pwc = to_wide;
+	  if ((byte & 0xc0) == 0x80 || (byte & 0xfe) == 0xfe)
+	    {
+	      /* Oh, oh.  An encoding error.  */
+	      errno = EILSEQ;
+	      return (size_t) -1;
+	    }
 
-  if (pwc == L'\0')
-    {
-      *ps = 0;		/* This is required.  */
-      return 0;
+	  if ((byte & 0xe0) == 0xc0)
+	    {
+	      /* We expect two bytes.  */
+	      ps->count = 1;
+	      ps->value = byte & 0x1f;
+	    }
+	  else if ((byte & 0xf0) == 0xe0)
+	    {
+	      /* We expect three bytes.  */
+	      ps->count = 2;
+	      ps->value = byte & 0x0f;
+	    }
+	  else if ((byte & 0xf8) == 0xf0)
+	    {
+	      /* We expect four bytes.  */
+	      ps->count = 3;
+	      ps->value = byte & 0x07;
+	    }
+	  else if ((byte & 0xfc) == 0xf8)
+	    {
+	      /* We expect five bytes.  */
+	      ps->count = 4;
+	      ps->value = byte & 0x03;
+	    }
+	  else
+	    {
+	      /* We expect six bytes.  */
+	      ps->count = 5;
+	      ps->value = byte & 0x01;
+	    }
+	}
+
+      /* We know we have to handle a multibyte character and there are
+	 some more bytes to read.  */
+      while (used < n)
+	{
+	  /* The second through sixth bytes must be of the form 10xxxxxx.  */
+	  unsigned char byte = (unsigned char) *s++;
+	  ++used;
+
+	  if ((byte & 0xc0) != 0x80)
+	    {
+	      /* Oh, oh.  An encoding error.  */
+	      errno = EILSEQ;
+	      return (size_t) -1;
+	    }
+
+	  ps->value <<= 6;
+	  ps->value |= byte & 0x3f;
+
+	  if (--ps->count == 0)
+	    {
+	      /* The character is finished.  */
+	      if (pwc != NULL)
+		*pwc = (wchar_t) ps->value;
+	      return ps->value ? used : 0;
+	    }
+	}
     }
 
-  /* Return code (size_t)-1 cannot happend for now.  */
-  return 1;
+  return (size_t) -2;
 }
diff --git a/wcsmbs/mbsinit.c b/wcsmbs/mbsinit.c
index efbfd09347..f56ce20331 100644
--- a/wcsmbs/mbsinit.c
+++ b/wcsmbs/mbsinit.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -20,15 +20,17 @@ Boston, MA 02111-1307, USA.  */
 #include <string.h>
 #include <wchar.h>
 
-
+/* In GNU libc the conversion functions can only convert between the
+   fixed wide character representation and the multibyte
+   representation of the same character set.  Since we use ISO 10646
+   in UCS4 encoding for wide characters the best solution for
+   multibyte characters is the UTF8 encoding.  I.e., the only state
+   information is a counter of the processed bytes so far and the
+   value collected so far.  Especially, we don't have different shift
+   states.  */
 int
 mbsinit (ps)
      const mbstate_t *ps;
 {
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-
-  return ps == NULL || *ps == 0;
+  return ps == NULL || ps->count == 0;
 }
diff --git a/wcsmbs/mbsrtowcs.c b/wcsmbs/mbsrtowcs.c
index dc026b7252..712b199271 100644
--- a/wcsmbs/mbsrtowcs.c
+++ b/wcsmbs/mbsrtowcs.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -17,9 +17,16 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA.  */
 
+#include <errno.h>
 #include <wchar.h>
 
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
 
+
+/* We don't need the state really because we don't have shift states
+   to maintain between calls to this function.  */
 static mbstate_t internal;
 
 size_t
@@ -29,35 +36,102 @@ mbsrtowcs (dst, src, len, ps)
      size_t len;
      mbstate_t *ps;
 {
-  size_t result = 0;
+  size_t written = 0;
+  const char *run = *src;
 
   if (ps == NULL)
     ps = &internal;
 
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
+  if (dst == NULL)
+    /* The LEN parameter has to be ignored if we don't actually write
+       anything.  */
+    len = ~0;
 
-  while (len > 0 && **src != '\0')
+  /* Copy all words.  */
+  while (written < len)
     {
-      /* For now there is no possibly illegal MB char sequence.  */
-      if (dst != NULL)
-	dst[result] = (wchar_t) **src;
-      ++result;
-      ++(*src);
-      --len;
-    }
+      wchar_t value;
+      size_t count;
+      unsigned char byte = *run++;
 
-  if (len > 0)
-    {
+      /* We expect a start of a new multibyte character.  */
+      if (byte < 0x80)
+	{
+	  /* One byte sequence.  */
+	  count = 0;
+	  value = byte;
+	}
+      else if ((byte & 0xe0) == 0xc0)
+	{
+	  count = 1;
+	  value = byte & 0x1f;
+	}
+      else if ((byte & 0xf0) == 0xe0)
+	{
+	  /* We expect three bytes.  */
+	  count = 2;
+	  value = byte & 0x0f;
+	}
+      else if ((byte & 0xf8) == 0xf0)
+	{
+	  /* We expect four bytes.  */
+	  count = 3;
+	  value = byte & 0x07;
+	}
+      else if ((byte & 0xfc) == 0xf8)
+	{
+	  /* We expect five bytes.  */
+	  count = 4;
+	  value = byte & 0x03;
+	}
+      else if ((byte & 0xfe) == 0xfc)
+	{
+	  /* We expect six bytes.  */
+	  count = 5;
+	  value = byte & 0x01;
+	}
+      else
+	{
+	  /* This is an illegal encoding.  */
+	  errno = EILSEQ;
+	  return (size_t) -1;
+	}
+
+      /* Read the possible remaining bytes.  */
+      while (count-- > 0)
+	{
+	  byte = *run++;
+
+	  if ((byte & 0xc0) != 0x80)
+	    {
+	      /* This is an illegal encoding.  */
+	      errno = EILSEQ;
+	      return (size_t) -1;
+	    }
+
+	  value <<= 6;
+	  value |= byte & 0x3f;
+	}
+
+      /* Store value if required.  */
       if (dst != NULL)
+	*dst++ = value;
+
+      /* The whole sequence is read.  Check whether end of string is
+	 reached.  */
+      if (value == L'\0')
 	{
-	  dst[result] = L'\0';
-	  *ps = 0;
+	  /* Found the end of the string.  */
+	  *src = NULL;
+	  return written;
 	}
-      *src = NULL;
+
+      /* Increment counter of produced words.  */
+      ++written;
     }
 
-  return result;
+  /* Store address of next byte to process.  */
+  *src = run;
+
+  return written;
 }
diff --git a/wcsmbs/wchar.h b/wcsmbs/wchar.h
index cc821b8a50..806bafa655 100644
--- a/wcsmbs/wchar.h
+++ b/wcsmbs/wchar.h
@@ -48,7 +48,11 @@ typedef unsigned int wint_t;
 
 
 /* Conversion state information.  */
-typedef int mbstate_t; /* FIXME */
+typedef struct
+{
+  int count;		/* Number of bytes needed for the current character. */
+  wint_t value;		/* Value so far.  */
+} mbstate_t;
 
 #define WCHAR_MIN ((wchar_t) 0)
 #define WCHAR_MAX (~WCHAR_MIN)
@@ -145,9 +149,6 @@ extern int wctob __P ((wint_t __c));
    state.  */
 extern int mbsinit __P ((__const mbstate_t *__ps));
 
-/* Return number of bytes in multibyte character pointed to by S.  */
-extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *ps));
-
 /* Write wide character representation of multibyte character pointed
    to by S to PWC.  */
 extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n,
@@ -156,6 +157,17 @@ extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n,
 /* Write multibyte representation of wide character WC to S.  */
 extern size_t wcrtomb __P ((char *__s, wchar_t __wc, mbstate_t *__ps));
 
+/* Return number of bytes in multibyte character pointed to by S.  */
+extern size_t __mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps));
+extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps));
+
+#if defined (__OPTIMIZE__) \
+    && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* Define inline function as optimization.  */
+extern __inline size_t mbrlen (__const char *s, size_t n, mbstate_t *ps)
+{ return ps != NULL ? mbrtowc (NULL, s, n, ps) : __mbrlen (s, n, NULL); }
+#endif
+
 /* Write wide character representation of multibyte chracter string SRC
    to DST.  */
 extern size_t mbsrtowcs __P ((wchar_t *__dst, __const char **__src,
diff --git a/wcsmbs/wcrtomb.c b/wcsmbs/wcrtomb.c
index 9069fb105c..eb007a69b9 100644
--- a/wcsmbs/wcrtomb.c
+++ b/wcsmbs/wcrtomb.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -24,46 +24,68 @@ Boston, MA 02111-1307, USA.  */
 #define EILSEQ EINVAL
 #endif
 
+static const wchar_t encoding_mask[] =
+{
+  ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
+
+static const unsigned char encoding_byte[] =
+{
+  0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
 
+/* The state is not used for this UTF8 encoding.  */
 static mbstate_t internal;
 
 size_t
-wcrtomb (s, wc, ps)
-     char *s;
-     wchar_t wc;
-     mbstate_t *ps;
+wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 {
   char fake[1];
+  size_t written = 0;
 
   if (ps == NULL)
     ps = &internal;
 
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-
   if (s == NULL)
     {
       s = fake;
       wc = L'\0';
     }
 
-  if (wc == L'\0')
+  /* Store the UTF8 representation of WC.  */
+  if (wc < 0 || wc > 0x7fffffff)
     {
-      /* FIXME Write any shift sequence to get to *PS == NULL.  */
-      *ps = 0;
-      *s = '\0';
+      /* This is not a valid ISO 10646 character.  */
+      errno = EILSEQ;
+      return (size_t) -1;
+    }
+
+  if (wc < 0x80)
+    {
+      /* It's a one byte sequence.  */
+      if (s != NULL)
+	*s = (char) wc;
       return 1;
     }
 
-  /* FIXME For now we don't handle real multi-byte encodings.  */
-  if ((wc & ~0xff) != 0)
+  for (written = 2; written < 6; ++written)
+    if ((wc & encoding_mask[written - 2]) == 0)
+      break;
+
+  if (s != NULL)
     {
-      errno = EILSEQ;
-      return (size_t) -1;
+      size_t cnt = written;
+      s[0] = encoding_byte[cnt - 2];
+
+      --cnt;
+      do
+	{
+	  s[cnt] = 0x80 | (wc & 0x3f);
+	  wc >>= 6;
+	}
+      while (--cnt > 0);
+      s[0] |= wc;
     }
 
-  *s = (char) wc;
-  return 1;
+  return written;
 }
diff --git a/wcsmbs/wcsrtombs.c b/wcsmbs/wcsrtombs.c
index 9f1000937b..99ca6acc5b 100644
--- a/wcsmbs/wcsrtombs.c
+++ b/wcsmbs/wcsrtombs.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -25,6 +25,18 @@ Boston, MA 02111-1307, USA.  */
 #endif
 
 
+static const wchar_t encoding_mask[] =
+{
+  ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
+
+static const unsigned char encoding_byte[] =
+{
+  0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
+
+/* We don't need the state really because we don't have shift states
+   to maintain between calls to this function.  */
 static mbstate_t internal;
 
 size_t
@@ -34,40 +46,79 @@ wcsrtombs (dst, src, len, ps)
      size_t len;
      mbstate_t *ps;
 {
-  size_t result = 0;
+  size_t written = 0;
+  const wchar_t *run = *src;
 
   if (ps == NULL)
     ps = &internal;
 
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
+  if (dst == NULL)
+    /* The LEN parameter has to be ignored if we don't actually write
+       anything.  */
+    len = ~0;
 
-  while (len > 0 && **src != L'\0')
+  while (written < len)
     {
-      if ((**src & ~0xff) != 0)
+      wchar_t wc = *run++;
+
+      if (wc < 0 || wc > 0x7fffffff)
 	{
+	  /* This is not a valid ISO 10646 character.  */
 	  errno = EILSEQ;
 	  return (size_t) -1;
 	}
 
-      if (dst != NULL)
-	dst[result] = (char) **src;
-      ++result;
-      ++(*src);
-      --len;
-    }
-
-  if (len > 0)
-    {
-      if (dst != NULL)
+      if (wc == L'\0')
+	{
+	  /* Found the end.  */
+	  if (dst != NULL)
+	    *dst = '\0';
+	  *src = NULL;
+	  return written;
+	}
+      else if (wc < 0x80)
 	{
-	  dst[result] = '\0';
-	  *ps = 0;
+	  /* It's a one byte sequence.  */
+	  if (dst != NULL)
+	    *dst++ = (char) wc;
+	  ++written;
+	}
+      else
+	{
+	  size_t step;
+
+	  for (step = 2; step < 6; ++step)
+	    if ((wc & encoding_mask[step - 2]) == 0)
+	      break;
+
+	  if (written + step >= len)
+	    /* Too long.  */
+	    break;
+
+	  if (dst != NULL)
+	    {
+	      size_t cnt = step;
+
+	      dst[0] = encoding_byte[cnt - 2];
+
+	      --cnt;
+	      do
+		{
+		  dst[cnt] = 0x80 | (wc & 0x3f);
+		  wc >>= 6;
+		}
+	      while (--cnt > 0);
+	      dst[0] |= wc;
+
+	      dst += step;
+	    }
+
+	  written += step;
 	}
-      *src = NULL;
     }
 
-  return result;
+  /* Store position of first unprocessed word.  */
+  *src = run;
+
+  return written;
 }
diff --git a/wcsmbs/wctob.c b/wcsmbs/wctob.c
index c27bd6baba..f541a2e97b 100644
--- a/wcsmbs/wctob.c
+++ b/wcsmbs/wctob.c
@@ -21,14 +21,11 @@ Boston, MA 02111-1307, USA.  */
 #include <wchar.h>
 
 
+/* We use UTF8 encoding for multibyte strings and therefore a valid
+   one byte multibyte string can only have a value from 0 to 0x7f.  */
 int
 wctob (c)
      wint_t c;
 {
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-
-  return (c & ~0xff) == 0 ? c : EOF;
+  return (c >= 0 && c <= 0x7f) ? c : EOF;
 }