about summary refs log tree commit diff
path: root/wcsmbs
diff options
context:
space:
mode:
author Tom Honermann <tom@honermann.net> 2022-06-30 08:52:14 -0400
committer Adhemerval Zanella <adhemerval.zanella@linaro.org> 2022-07-06 09:29:42 -0300
commit 8bcca1db3d7c0dc900a4cad4054c1439baf73684 (patch)
tree c3d2bb8a6e32462178ba347f755b43c5ae51caff /wcsmbs
parent 598f790fb17bcfff7fedde5209933a82d7748328 (diff)
download glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.tar.gz
glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.tar.xz
glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.zip
stdlib: Implement mbrtoc8, c8rtomb, and the char8_t typedef.
This change provides implementations for the mbrtoc8 and c8rtomb
functions adopted for C++20 via WG21 P0482R6 and for C2X via WG14
N2653.  It also provides the char8_t typedef from WG14 N2653.

The mbrtoc8 and c8rtomb functions are declared in uchar.h in C2X
mode or when the _GNU_SOURCE macro or C++20 __cpp_char8_t feature
test macro is defined.

The char8_t typedef is declared in uchar.h in C2X mode or when the
_GNU_SOURCE macro is defined and the C++20 __cpp_char8_t feature
test macro is not defined (if __cpp_char8_t is defined, then char8_t
is a builtin type).

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
Diffstat (limited to 'wcsmbs')
-rw-r--r-- wcsmbs/Makefile 2
-rw-r--r-- wcsmbs/Versions 3
-rw-r--r-- wcsmbs/c8rtomb.c 132
-rw-r--r-- wcsmbs/mbrtoc8.c 126
-rw-r--r-- wcsmbs/uchar.h 21
5 files changed, 283 insertions, 1 deletions
diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile
index df9a85f4a9..bda281ad70 100644
--- a/wcsmbs/Makefile
+++ b/wcsmbs/Makefile
@@ -42,7 +42,7 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
 	    wcsmbsload mbsrtowcs_l \
 	    isoc99_wscanf isoc99_vwscanf isoc99_fwscanf isoc99_vfwscanf \
 	    isoc99_swscanf isoc99_vswscanf \
-	    mbrtoc16 c16rtomb mbrtoc32 c32rtomb
+	    mbrtoc8 c8rtomb mbrtoc16 c16rtomb mbrtoc32 c32rtomb
 
 strop-tests :=  wcscmp wcsncmp wmemcmp wcslen wcschr wcsrchr wcscpy wcsnlen \
 		wcpcpy wcsncpy wcpncpy wcscat wcsncat wcschrnul wcsspn wcspbrk \
diff --git a/wcsmbs/Versions b/wcsmbs/Versions
index 0b31c1b940..ec28acfb73 100644
--- a/wcsmbs/Versions
+++ b/wcsmbs/Versions
@@ -49,4 +49,7 @@ libc {
     wcstof32; wcstof64; wcstof32x;
     wcstof32_l; wcstof64_l; wcstof32x_l;
   }
+  GLIBC_2.36 {
+    c8rtomb; mbrtoc8;
+  }
 }
diff --git a/wcsmbs/c8rtomb.c b/wcsmbs/c8rtomb.c
new file mode 100644
index 0000000000..b564770eb5
--- /dev/null
+++ b/wcsmbs/c8rtomb.c
@@ -0,0 +1,132 @@
+/* UTF-8 to multibyte conversion.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <uchar.h>
+#include <wchar.h>
+
+
+/* Private conversion state used by c8rtomb when PS is NULL.  */
+static mbstate_t state;
+
+size_t
+c8rtomb (char *s, char8_t c8, mbstate_t *ps)
+{
+  /* Code units are accumulated across calls; wcrtomb is invoked once a
+     sequence is complete.  This depends on the converter invoked by
+     wcrtomb not using the top most bit of ps->__count or ps->__value
+     between invocations: the bit flags that trailing code units are
+     expected, __wchb[0..2] hold those seen so far, __wchb[3] counts them.  */
+
+  wchar_t wc;
+
+  if (ps == NULL)
+    ps = &state;
+
+  if (s == NULL)
+    {
+      /* If 's' is a null pointer, behave as if u8'\0' was passed as 'c8'.  If
+         this occurs for an incomplete code unit sequence, then an error will
+         be reported below.  */
+      c8 = u8""[0];
+    }
+
+  if (! (ps->__count & 0x80000000))
+    {
+      /* Initial state: no code units are pending.  */
+      if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5)
+	{
+	  /* An invalid lead code unit (0x80..0xC1 or 0xF5..0xFF).  */
+	  __set_errno (EILSEQ);
+	  return -1;
+	}
+      if (c8 >= 0xC2)
+	{
+	  /* A valid lead code unit; remember it and wait for the rest.  */
+	  ps->__count |= 0x80000000;
+	  ps->__value.__wchb[0] = c8;
+	  ps->__value.__wchb[3] = 1;
+	  return 0;
+	}
+      /* A single byte (ASCII) code unit.  */
+      wc = c8;
+    }
+  else
+    {
+      char8_t cu1 = ps->__value.__wchb[0];
+      if (ps->__value.__wchb[3] == 1)
+	{
+	  /* A single lead code unit was previously seen.  */
+	  if ((c8 < 0x80 || c8 > 0xBF)
+              || (cu1 == 0xE0 && c8 < 0xA0)
+              || (cu1 == 0xED && c8 > 0x9F)
+              || (cu1 == 0xF0 && c8 < 0x90)
+              || (cu1 == 0xF4 && c8 > 0x8F))
+	    {
+	      /* An invalid second code unit for the stored lead.  */
+	      __set_errno (EILSEQ);
+	      return -1;
+	    }
+	  if (cu1 >= 0xE0)
+	    {
+	      /* A three or four code unit sequence; more units follow.  */
+	      ps->__value.__wchb[1] = c8;
+	      ++ps->__value.__wchb[3];
+	      return 0;
+	    }
+	  wc = ((cu1 & 0x1F) << 6)
+	       + (c8 & 0x3F);
+	}
+      else
+	{
+	  char8_t cu2 = ps->__value.__wchb[1];
+	  /* The third or fourth code unit of a three or four unit sequence.  */
+	  if (c8 < 0x80 || c8 > 0xBF)
+	    {
+	      /* An invalid third or fourth code unit.  */
+	      __set_errno (EILSEQ);
+	      return -1;
+	    }
+	  if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0)
+	    {
+	      /* Third of four code units; one more is expected.  */
+	      ps->__value.__wchb[2] = c8;
+	      ++ps->__value.__wchb[3];
+	      return 0;
+	    }
+	  if (cu1 < 0xF0)
+	    {
+	      wc = ((cu1 & 0x0F) << 12)
+		   + ((cu2 & 0x3F) << 6)
+		   + (c8 & 0x3F);
+	    }
+	  else
+	    {
+	      char8_t cu3 = ps->__value.__wchb[2];
+	      wc = ((cu1 & 0x07) << 18)
+		   + ((cu2 & 0x3F) << 12)
+		   + ((cu3 & 0x3F) << 6)
+		   + (c8 & 0x3F);
+	    }
+	}
+      ps->__count &= 0x7fffffff;
+      ps->__value.__wch = 0;
+    }
+
+  return wcrtomb (s, wc, ps);
+}
diff --git a/wcsmbs/mbrtoc8.c b/wcsmbs/mbrtoc8.c
new file mode 100644
index 0000000000..dd80b5282d
--- /dev/null
+++ b/wcsmbs/mbrtoc8.c
@@ -0,0 +1,126 @@
+/* Multibyte to UTF-8 conversion.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <gconv.h>
+#include <uchar.h>
+#include <wcsmbsload.h>
+
+#include <sysdep.h>
+
+#ifndef EILSEQ
+# define EILSEQ EINVAL
+#endif
+
+
+/* Private conversion state used by mbrtoc8 when PS is NULL.  */
+static mbstate_t state;
+
+size_t
+mbrtoc8 (char8_t *pc8, const char *s, size_t n, mbstate_t *ps)
+{
+  /* One code unit is produced per call: mbrtowc decodes the character and
+     the trailing code units are handed out by later calls.  This depends
+     on the converter invoked by mbrtowc not using the top most bit of
+     ps->__count or ps->__value between invocations: the bit flags that
+     code units are yet to be written and ps->__value stores them.  */
+
+  if (ps == NULL)
+    ps = &state;
+
+  /* If state indicates that trailing code units are yet to be written, write
+     those first regardless of whether 's' is a null pointer.  */
+  if (ps->__count & 0x80000000)
+    {
+      /* ps->__value.__wchb[3] stores the index of the next code unit to
+         write.  Code units are stored in reverse order; index 0 is last.  */
+      size_t i = ps->__value.__wchb[3];
+      if (pc8 != NULL)
+	{
+	  *pc8 = ps->__value.__wchb[i];
+	}
+      if (i == 0)
+	{
+	  ps->__count &= 0x7fffffff;
+	  ps->__value.__wch = 0;
+	}
+      else
+	--ps->__value.__wchb[3];
+      return -3;
+    }
+
+  if (s == NULL)
+    {
+      /* If 's' is a null pointer, behave as if a null pointer was passed for
+         'pc8', an empty string was passed for 's', and 1 passed for 'n'.  */
+      pc8 = NULL;
+      s = "";
+      n = 1;
+    }
+
+  wchar_t wc;
+  size_t result;
+
+  result = mbrtowc (&wc, s, n, ps);
+  if (result <= n)
+    {
+      if (wc <= 0x7F)
+	{
+	  if (pc8 != NULL)
+	    *pc8 = wc;
+	}
+      else if (wc <= 0x7FF)
+	{
+	  if (pc8 != NULL)
+	    *pc8 = 0xC0 + ((wc >> 6) & 0x1F);
+	  ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+	  ps->__value.__wchb[3] = 0;
+	  ps->__count |= 0x80000000;
+	}
+      else if (wc <= 0xFFFF)
+	{
+	  if (pc8 != NULL)
+	    *pc8 = 0xE0 + ((wc >> 12) & 0x0F);
+	  ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
+	  ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+	  ps->__value.__wchb[3] = 1;
+	  ps->__count |= 0x80000000;
+	}
+      else if (wc <= 0x10FFFF)
+	{
+	  if (pc8 != NULL)
+	    *pc8 = 0xF0 + ((wc >> 18) & 0x07);
+	  ps->__value.__wchb[2] = 0x80 + ((wc >> 12) & 0x3F);
+	  ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
+	  ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+	  ps->__value.__wchb[3] = 2;
+	  ps->__count |= 0x80000000;
+	}
+    }
+  if (result == 0 && wc != 0)
+    {
+      /* mbrtowc() never returns -3.  When a MB sequence converts to multiple
+         WCs, no input is consumed when writing the subsequent WCs, giving a
+         result of 0 even though no null character was written; report -3.  */
+      result = -3;
+    }
+
+  return result;
+}
diff --git a/wcsmbs/uchar.h b/wcsmbs/uchar.h
index 051cdcbeb5..c37e8619a0 100644
--- a/wcsmbs/uchar.h
+++ b/wcsmbs/uchar.h
@@ -31,6 +31,13 @@
 #include <bits/types.h>
 #include <bits/types/mbstate_t.h>
 
+/* Declare the C2x char8_t typedef in C2x modes, unless the C++ __cpp_char8_t
+   feature test macro is defined (char8_t is then a builtin type).  */
+#if __GLIBC_USE (ISOC2X) && !defined __cpp_char8_t
+/* Define the 8-bit character type.  */
+typedef unsigned char char8_t;
+#endif
+
 #ifndef __USE_ISOCXX11
 /* Define the 16-bit and 32-bit character types.  */
 typedef __uint_least16_t char16_t;
@@ -40,6 +47,20 @@ typedef __uint_least32_t char32_t;
 
 __BEGIN_DECLS
 
+/* Declare the C2x mbrtoc8() and c8rtomb() functions in C2x modes or if
+   the C++ __cpp_char8_t feature test macro is defined.  */
+#if __GLIBC_USE (ISOC2X) || defined __cpp_char8_t
+/* Write the next char8_t code unit of the multibyte character pointed
+   to by S (at most N bytes) to PC8; one code unit is written per call.  */
+extern size_t mbrtoc8  (char8_t *__restrict __pc8,
+			const char *__restrict __s, size_t __n,
+			mbstate_t *__restrict __p) __THROW;
+
+/* Write multibyte form of the char8_t sequence completed by C8 to S.  */
+extern size_t c8rtomb  (char *__restrict __s, char8_t __c8,
+			mbstate_t *__restrict __ps) __THROW;
+#endif
+
 /* Write char16_t representation of multibyte character pointed
    to by S to PC16.  */
 extern size_t mbrtoc16 (char16_t *__restrict __pc16,