about summary refs log tree commit diff
path: root/wcsmbs/mbrtowc.c
diff options
context:
space:
mode:
Diffstat (limited to 'wcsmbs/mbrtowc.c')
-rw-r--r--wcsmbs/mbrtowc.c111
1 files changed, 88 insertions, 23 deletions
diff --git a/wcsmbs/mbrtowc.c b/wcsmbs/mbrtowc.c
index 2c4b0779da..9e70a0b2c9 100644
--- a/wcsmbs/mbrtowc.c
+++ b/wcsmbs/mbrtowc.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -17,50 +17,115 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA.  */
 
+#include <errno.h>
 #include <wchar.h>
 
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
+
 
 static mbstate_t internal;
 
 size_t
-mbrtowc (pwc, s, n, ps)
-     wchar_t *pwc;
-     const char *s;
-     size_t n;
-     mbstate_t *ps;
+mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 {
   wchar_t to_wide;
+  size_t used = 0;
 
   if (ps == NULL)
     ps = &internal;
 
-  /*************************************************************\
-  |* This is no complete implementation.  While the multi-byte *|
-  |* character handling is not finished this will do.	       *|
-  \*************************************************************/
-
   if (s == NULL)
     {
+      /* See first paragraph of description in 7.16.6.3.2.  */
       pwc = NULL;
       s = "";
       n = 1;
     }
 
-  if (n == 0)
-    return (size_t) -2;
+  if (n > 0)
+    {
+      if (ps->count == 0)
+	{
+	  unsigned char byte = (unsigned char) *s++;
+	  ++used;
 
-  /* For now.  */
-  to_wide = (wchar_t) *s;
+	  /* We must look for a possible first byte of a UTF8 sequence.  */
+	  if (byte < 0x80)
+	    {
+	      /* One byte sequence.  */
+	      if (pwc != NULL)
+		*pwc = (wchar_t) byte;
+	      return byte ? used : 0;
+	    }
 
-  if (pwc != NULL)
-    *pwc = to_wide;
+	  if ((byte & 0xc0) == 0x80 || (byte & 0xfe) == 0xfe)
+	    {
+	      /* Oh, oh.  An encoding error.  */
+	      errno = EILSEQ;
+	      return (size_t) -1;
+	    }
 
-  if (pwc == L'\0')
-    {
-      *ps = 0;		/* This is required.  */
-      return 0;
+	  if ((byte & 0xe0) == 0xc0)
+	    {
+	      /* We expect two bytes.  */
+	      ps->count = 1;
+	      ps->value = byte & 0x1f;
+	    }
+	  else if ((byte & 0xf0) == 0xe0)
+	    {
+	      /* We expect three bytes.  */
+	      ps->count = 2;
+	      ps->value = byte & 0x0f;
+	    }
+	  else if ((byte & 0xf8) == 0xf0)
+	    {
+	      /* We expect four bytes.  */
+	      ps->count = 3;
+	      ps->value = byte & 0x07;
+	    }
+	  else if ((byte & 0xfc) == 0xf8)
+	    {
+	      /* We expect five bytes.  */
+	      ps->count = 4;
+	      ps->value = byte & 0x03;
+	    }
+	  else
+	    {
+	      /* We expect six bytes.  */
+	      ps->count = 5;
+	      ps->value = byte & 0x01;
+	    }
+	}
+
+      /* We know we have to handle a multibyte character and there are
+	 some more bytes to read.  */
+      while (used < n)
+	{
+	  /* The second to sixths byte must be of the form 10xxxxxx.  */
+	  unsigned char byte = (unsigned char) *s++;
+	  ++used;
+
+	  if ((byte & 0xc0) != 0x80)
+	    {
+	      /* Oh, oh.  An encoding error.  */
+	      errno = EILSEQ;
+	      return (size_t) -1;
+	    }
+
+	  ps->value <<= 6;
+	  ps->value |= byte & 0x3f;
+
+	  if (--ps->count == 0)
+	    {
+	      /* The character is finished.  */
+	      if (pwc != NULL)
+		*pwc = (wchar_t) ps->value;
+	      return ps->value ? used : 0;
+	    }
+	}
     }
 
-  /* Return code (size_t)-1 cannot happend for now.  */
-  return 1;
+  return (size_t) -2;
 }