From 8bcca1db3d7c0dc900a4cad4054c1439baf73684 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Thu, 30 Jun 2022 08:52:14 -0400 Subject: stdlib: Implement mbrtoc8, c8rtomb, and the char8_t typedef. This change provides implementations for the mbrtoc8 and c8rtomb functions adopted for C++20 via WG21 P0482R6 and for C2X via WG14 N2653. It also provides the char8_t typedef from WG14 N2653. The mbrtoc8 and c8rtomb functions are declared in uchar.h in C2X mode or when the _GNU_SOURCE macro or C++20 __cpp_char8_t feature test macro is defined. The char8_t typedef is declared in uchar.h in C2X mode or when the _GNU_SOURCE macro is defined and the C++20 __cpp_char8_t feature test macro is not defined (if __cpp_char8_t is defined, then char8_t is a builtin type). Reviewed-by: Adhemerval Zanella --- wcsmbs/Makefile | 2 +- wcsmbs/Versions | 3 ++ wcsmbs/c8rtomb.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ wcsmbs/mbrtoc8.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ wcsmbs/uchar.h | 21 +++++++++ 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 wcsmbs/c8rtomb.c create mode 100644 wcsmbs/mbrtoc8.c (limited to 'wcsmbs') diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile index df9a85f4a9..bda281ad70 100644 --- a/wcsmbs/Makefile +++ b/wcsmbs/Makefile @@ -42,7 +42,7 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \ wcsmbsload mbsrtowcs_l \ isoc99_wscanf isoc99_vwscanf isoc99_fwscanf isoc99_vfwscanf \ isoc99_swscanf isoc99_vswscanf \ - mbrtoc16 c16rtomb mbrtoc32 c32rtomb + mbrtoc8 c8rtomb mbrtoc16 c16rtomb mbrtoc32 c32rtomb strop-tests := wcscmp wcsncmp wmemcmp wcslen wcschr wcsrchr wcscpy wcsnlen \ wcpcpy wcsncpy wcpncpy wcscat wcsncat wcschrnul wcsspn wcspbrk \ diff --git a/wcsmbs/Versions b/wcsmbs/Versions index 0b31c1b940..ec28acfb73 100644 --- a/wcsmbs/Versions +++ b/wcsmbs/Versions @@ -49,4 +49,7 @@ libc { wcstof32; wcstof64; wcstof32x; wcstof32_l; wcstof64_l; wcstof32x_l; } + GLIBC_2.36 { + c8rtomb; 
mbrtoc8; + } } diff --git a/wcsmbs/c8rtomb.c b/wcsmbs/c8rtomb.c new file mode 100644 index 0000000000..b564770eb5 --- /dev/null +++ b/wcsmbs/c8rtomb.c @@ -0,0 +1,132 @@ +/* UTF-8 to multibyte conversion. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + + +/* This is the private state used if PS is NULL. */ +static mbstate_t state; + +size_t +c8rtomb (char *s, char8_t c8, mbstate_t *ps) +{ + /* This implementation depends on the converter invoked by wcrtomb not + needing to retain state in either the top most bit of ps->__count or + in ps->__value between invocations. This implementation uses the + top most bit of ps->__count to indicate that trailing code units are + expected and uses ps->__value to store previously seen code units. */ + + wchar_t wc; + + if (ps == NULL) + ps = &state; + + if (s == NULL) + { + /* if 's' is a null pointer, behave as if u8'\0' was passed as 'c8'. If + this occurs for an incomplete code unit sequence, then an error will + be reported below. */ + c8 = u8""[0]; + } + + if (! (ps->__count & 0x80000000)) + { + /* Initial state. */ + if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5) + { + /* An invalid lead code unit. */ + __set_errno (EILSEQ); + return -1; + } + if (c8 >= 0xC2) + { + /* A valid lead code unit. 
*/ + ps->__count |= 0x80000000; + ps->__value.__wchb[0] = c8; + ps->__value.__wchb[3] = 1; + return 0; + } + /* A single byte (ASCII) code unit. */ + wc = c8; + } + else + { + char8_t cu1 = ps->__value.__wchb[0]; + if (ps->__value.__wchb[3] == 1) + { + /* A single lead code unit was previously seen. */ + if ((c8 < 0x80 || c8 > 0xBF) + || (cu1 == 0xE0 && c8 < 0xA0) + || (cu1 == 0xED && c8 > 0x9F) + || (cu1 == 0xF0 && c8 < 0x90) + || (cu1 == 0xF4 && c8 > 0x8F)) + { + /* An invalid second code unit. */ + __set_errno (EILSEQ); + return -1; + } + if (cu1 >= 0xE0) + { + /* A three or four code unit sequence. */ + ps->__value.__wchb[1] = c8; + ++ps->__value.__wchb[3]; + return 0; + } + wc = ((cu1 & 0x1F) << 6) + + (c8 & 0x3F); + } + else + { + char8_t cu2 = ps->__value.__wchb[1]; + /* A three or four byte code unit sequence. */ + if (c8 < 0x80 || c8 > 0xBF) + { + /* An invalid third or fourth code unit. */ + __set_errno (EILSEQ); + return -1; + } + if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0) + { + /* A four code unit sequence. */ + ps->__value.__wchb[2] = c8; + ++ps->__value.__wchb[3]; + return 0; + } + if (cu1 < 0xF0) + { + wc = ((cu1 & 0x0F) << 12) + + ((cu2 & 0x3F) << 6) + + (c8 & 0x3F); + } + else + { + char8_t cu3 = ps->__value.__wchb[2]; + wc = ((cu1 & 0x07) << 18) + + ((cu2 & 0x3F) << 12) + + ((cu3 & 0x3F) << 6) + + (c8 & 0x3F); + } + } + ps->__count &= 0x7fffffff; + ps->__value.__wch = 0; + } + + return wcrtomb (s, wc, ps); +} diff --git a/wcsmbs/mbrtoc8.c b/wcsmbs/mbrtoc8.c new file mode 100644 index 0000000000..dd80b5282d --- /dev/null +++ b/wcsmbs/mbrtoc8.c @@ -0,0 +1,126 @@ +/* Multibyte to UTF-8 conversion. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include +#include +#include +#include +#include +#include + +#include +/* NOTE(review): the header names of the #include directives above are missing in this copy of the patch (apparently stripped during extraction); restore them from the original commit. */ +#ifndef EILSEQ +# define EILSEQ EINVAL +#endif + + +/* This is the private state used if PS is NULL. */ +static mbstate_t state; + +size_t +mbrtoc8 (char8_t *pc8, const char *s, size_t n, mbstate_t *ps) +{ + /* Convert the next multibyte character in the at most N bytes at S to UTF-8 by way of mbrtowc. The first code unit is stored through PC8 (when non-null) and any trailing code units are queued in PS (the private static state when PS is NULL); each later call hands out one queued code unit with a result of (size_t) -3 before any new input is examined. Otherwise the result of mbrtowc is returned. This implementation depends on the converter invoked by mbrtowc not + needing to retain state in either the top most bit of ps->__count or + in ps->__value between invocations. This implementation uses the + top most bit of ps->__count to indicate that trailing code units are + yet to be written and uses ps->__value to store those code units. */ + + if (ps == NULL) + ps = &state; + + /* If state indicates that trailing code units are yet to be written, write + those first regardless of whether 's' is a null pointer. */ + if (ps->__count & 0x80000000) + { + /* ps->__value.__wchb[3] stores the index of the next code unit to + write. Code units are stored in reverse order, so index 0 holds the last one and the state is reset once it has been written.
*/ + size_t i = ps->__value.__wchb[3]; + if (pc8 != NULL) + { + *pc8 = ps->__value.__wchb[i]; + } + if (i == 0) + { + ps->__count &= 0x7fffffff; + ps->__value.__wch = 0; + } + else + --ps->__value.__wchb[3]; + return -3; + } + + if (s == NULL) + { + /* If 's' is a null pointer, behave as if a null pointer was passed for + 'pc8', an empty string was passed for 's', and 1 passed for 'n'. */ + pc8 = NULL; + s = ""; + n = 1; + } + + wchar_t wc; + size_t result; + /* Only look at WC when mbrtowc succeeded. The two error results are tested explicitly: a 'result <= n' comparison would also accept them when N is (size_t) -1 or (size_t) -2, and WC would then be read uninitialized. */ + result = mbrtowc (&wc, s, n, ps); + if (result != (size_t) -1 && result != (size_t) -2) + { + if (wc <= 0x7F) + { /* One code unit; nothing is queued. */ + if (pc8 != NULL) + *pc8 = wc; + } + else if (wc <= 0x7FF) + { /* Two code units: write the lead now and queue one trailing code unit. */ + if (pc8 != NULL) + *pc8 = 0xC0 + ((wc >> 6) & 0x1F); + ps->__value.__wchb[0] = 0x80 + (wc & 0x3F); + ps->__value.__wchb[3] = 0; + ps->__count |= 0x80000000; + } + else if (wc <= 0xFFFF) + { /* Three code units: write the lead now and queue two trailing code units. */ + if (pc8 != NULL) + *pc8 = 0xE0 + ((wc >> 12) & 0x0F); + ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F); + ps->__value.__wchb[0] = 0x80 + (wc & 0x3F); + ps->__value.__wchb[3] = 1; + ps->__count |= 0x80000000; + } + else if (wc <= 0x10FFFF) + { /* Four code units: write the lead now and queue three trailing code units. */ + if (pc8 != NULL) + *pc8 = 0xF0 + ((wc >> 18) & 0x07); + ps->__value.__wchb[2] = 0x80 + ((wc >> 12) & 0x3F); + ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F); + ps->__value.__wchb[0] = 0x80 + (wc & 0x3F); + ps->__value.__wchb[3] = 2; + ps->__count |= 0x80000000; + } + } + if (result == 0 && wc != 0) + { + /* mbrtowc() never returns -3. When a MB sequence converts to multiple + WCs, no input is consumed when writing the subsequent WCs resulting + in a result of 0 even if a null character wasn't written. */ + result = -3; + } + + return result; +} diff --git a/wcsmbs/uchar.h b/wcsmbs/uchar.h index 051cdcbeb5..c37e8619a0 100644 --- a/wcsmbs/uchar.h +++ b/wcsmbs/uchar.h @@ -31,6 +31,13 @@ #include #include +/* Declare the C2x char8_t typedef in C2x modes, but only if the C++ + __cpp_char8_t feature test macro is not defined. */ +#if __GLIBC_USE (ISOC2X) && !defined __cpp_char8_t +/* Define the 8-bit character type.
*/ +typedef unsigned char char8_t; +#endif + #ifndef __USE_ISOCXX11 /* Define the 16-bit and 32-bit character types. */ typedef __uint_least16_t char16_t; @@ -40,6 +47,20 @@ typedef __uint_least32_t char32_t; __BEGIN_DECLS +/* Declare the C2x mbrtoc8() and c8rtomb() functions in C2x modes or if + the C++ __cpp_char8_t feature test macro is defined. */ +#if __GLIBC_USE (ISOC2X) || defined __cpp_char8_t +/* Write char8_t representation of multibyte character pointed + to by S to PC8. */ +extern size_t mbrtoc8 (char8_t *__restrict __pc8, + const char *__restrict __s, size_t __n, + mbstate_t *__restrict __p) __THROW; + +/* Write multibyte representation of char8_t C8 to S. */ +extern size_t c8rtomb (char *__restrict __s, char8_t __c8, + mbstate_t *__restrict __ps) __THROW; +#endif + /* Write char16_t representation of multibyte character pointed to by S to PC16. */ extern size_t mbrtoc16 (char16_t *__restrict __pc16, -- cgit 1.4.1