Add generic C.UTF-8 locale (Bug 17318) codonell/c-utf8

We add a new C.UTF-8 locale. This locale is not builtin to glibc, but is provided as a distinct locale. The locale provides full support for UTF-8 and this includes full code point sorting via strcmp-based collation. The collation uses a new keyword 'strcmp_collation' which drops all collation rules and generates an empty zero rules collation to enable strcmp usage in collation. This ensures that we get full code point sorting for C.UTF-8 with a minimal 92 bytes of overhead (LC_COLLATE structure information). The new locale is added to SUPPORTED. Minimal test data for specific code points (minus those not supported by collate-test) is provided in C.UTF-8.in, and this verifies code point sorting is working reasonably across the range. The locale was tested manually with the full set of code points without failure. The locale is harmonized with locales already shipping in Gentoo, Debian, Ubuntu, Fedora, CentOS Stream, and RHEL. A new tst-iconv9 test is added which verifies the C.UTF-8 locale is generally usable. Testing for fnmatch, regexec, and recomp is provided by extending bug-regex1, bugregex19, bug-regex4, bug-regex6, transbug, tst-fnmatch, tst-regcomp-truncated, and tst-regex to use C.UTF-8. Tested on x86_64 or i686 without regression.
author: Carlos O'Donell <carlos@redhat.com> 2021-07-29 01:27:56 -0400
committer: Carlos O'Donell <carlos@redhat.com> 2021-07-29 01:31:39 -0400
commit: b2a77206b3c3bf4670acc069f6fdda76cf857d48 (patch)
tree: 3e35b2a2b9c864a91a223c9bc94c35797af57aae /localedata/locales/C
parent: 7de41f96a172e5f15bc449ec3c09dbd0ae4cf2c6 (diff)
download: glibc-codonell/c-utf8.tar.gz
glibc-codonell/c-utf8.tar.xz
glibc-codonell/c-utf8.zip
1 files changed, 194 insertions, 0 deletions
diff --git a/localedata/locales/C b/localedata/locales/C
new file mode 100644
index 0000000000..651691c724
--- /dev/null
+++ b/localedata/locales/C
@@ -0,0 +1,194 @@
+escape_char /
+comment_char %
+% Locale for C locale in UTF-8
+
+LC_IDENTIFICATION
+title      "C locale"
+source     ""
+address    ""
+contact    ""
+email      "bug-glibc-locales@gnu.org"
+tel        ""
+fax        ""
+language   ""
+territory  ""
+revision   "2.0"
+date       "2020-06-28"
+category  "i18n:2012";LC_IDENTIFICATION
+category  "i18n:2012";LC_CTYPE
+category  "i18n:2012";LC_COLLATE
+category  "i18n:2012";LC_TIME
+category  "i18n:2012";LC_NUMERIC
+category  "i18n:2012";LC_MONETARY
+category  "i18n:2012";LC_MESSAGES
+category  "i18n:2012";LC_PAPER
+category  "i18n:2012";LC_NAME
+category  "i18n:2012";LC_ADDRESS
+category  "i18n:2012";LC_TELEPHONE
+category  "i18n:2012";LC_MEASUREMENT
+END LC_IDENTIFICATION
+
+LC_CTYPE
+% Include only the i18n character type classes without any of the
+% transliteration that i18n uses by default.
+copy "i18n_ctype"
+
+% Include the neutral transliterations.  The builtin C and
+% POSIX locales have +1600 transliterations that are built into
+% the locales, and these are a superset of those.
+translit_start
+include "translit_neutral";""
+% We must use '?' for default_missing because the transliteration
+% framework includes it directly into the output and so it must
+% be compatible with ASCII if that is the target character set.
+default_missing <U003F>
+translit_end
+
+% Include the transliterations that can convert combined cahracters.
+% These are generally expected by users.
+translit_start
+include "translit_combining";""
+translit_end
+
+END LC_CTYPE
+
+LC_COLLATE
+% The keyword 'strcmp_collation' in any part of any LC_COLLATE
+% immediately discards all collation information and causes the
+% locale to use strcmp for collation comparison.  This is exactly
+% what is needed for C (ASCII) or C.UTF-8.
+strcmp_collation
+END LC_COLLATE
+
+LC_MONETARY
+
+% This is the 14652 i18n fdcc-set definition for the LC_MONETARY
+% category (except for the int_curr_symbol and currency_symbol, they are
+% empty in the 14652 i18n fdcc-set definition and also empty in
+% glibc/locale/C-monetary.c.).
+int_curr_symbol     ""
+currency_symbol     ""
+mon_decimal_point   "."
+mon_thousands_sep   ""
+mon_grouping        -1
+positive_sign       ""
+negative_sign       "-"
+int_frac_digits     -1
+frac_digits         -1
+p_cs_precedes       -1
+int_p_sep_by_space  -1
+p_sep_by_space      -1
+n_cs_precedes       -1
+int_n_sep_by_space  -1
+n_sep_by_space      -1
+p_sign_posn         -1
+n_sign_posn         -1
+%
+END LC_MONETARY
+
+LC_NUMERIC
+% This is the POSIX Locale definition for
+% the LC_NUMERIC category.
+%
+decimal_point   "."
+thousands_sep   ""
+grouping        -1
+END LC_NUMERIC
+
+LC_TIME
+% This is the POSIX Locale definition for the LC_TIME category with the
+% exception that time is per ISO 8601 and 24-hour.
+%
+% Abbreviated weekday names (%a)
+abday       "Sun";"Mon";"Tue";"Wed";"Thu";"Fri";"Sat"
+
+% Full weekday names (%A)
+day         "Sunday";"Monday";"Tuesday";"Wednesday";"Thursday";/
+            "Friday";"Saturday"
+
+% Abbreviated month names (%b)
+abmon       "Jan";"Feb";"Mar";"Apr";"May";"Jun";"Jul";"Aug";"Sep";/
+            "Oct";"Nov";"Dec"
+
+% Full month names (%B)
+mon         "January";"February";"March";"April";"May";"June";"July";/
+            "August";"September";"October";"November";"December"
+
+% Week description, consists of three fields:
+% 1. Number of days in a week.
+% 2. Gregorian date that is a first weekday (19971130 for Sunday, 19971201 for Monday).
+% 3. The weekday number to be contained in the first week of the year.
+%
+% ISO 8601 conforming applications should use the values 7, 19971201 (a
+% Monday), and 4 (Thursday), respectively.
+week    7;19971201;4
+first_weekday	1
+first_workday	2
+
+% Appropriate date and time representation (%c)
+d_t_fmt "%a %b %e %H:%M:%S %Y"
+
+% Appropriate date representation (%x)
+d_fmt   "%m/%d/%y"
+
+% Appropriate time representation (%X)
+t_fmt   "%H:%M:%S"
+
+% Appropriate AM/PM time representation (%r)
+t_fmt_ampm "%I:%M:%S %p"
+
+% Equivalent of AM/PM (%p)
+am_pm	"AM";"PM"
+
+% Appropriate date representation (date(1))   "%a %b %e %H:%M:%S %Z %Y"
+date_fmt	"%a %b %e %H:%M:%S %Z %Y"
+END LC_TIME
+
+LC_MESSAGES
+% This is the POSIX Locale definition for
+% the LC_NUMERIC category.
+%
+yesexpr "^[yY]"
+noexpr  "^[nN]"
+yesstr  "Yes"
+nostr   "No"
+END LC_MESSAGES
+
+LC_PAPER
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_PAPER category.
+% (A4 paper, this is also used in the built in C/POSIX
+% locale in glibc/locale/C-paper.c)
+height   297
+width    210
+END LC_PAPER
+
+LC_NAME
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_NAME category.
+% (also used in the built in C/POSIX locale in glibc/locale/C-name.c)
+name_fmt    "%p%t%g%t%m%t%f"
+END LC_NAME
+
+LC_ADDRESS
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_ADDRESS category.
+% (also used in the built in C/POSIX locale in glibc/locale/C-address.c)
+postal_fmt    "%a%N%f%N%d%N%b%N%s %h %e %r%N%C-%z %T%N%c%N"
+END LC_ADDRESS
+
+LC_TELEPHONE
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_TELEPHONE category.
+% "+%c %a %l"
+tel_int_fmt    "+%c %a %l"
+% (also used in the built in C/POSIX locale in glibc/locale/C-telephone.c)
+END LC_TELEPHONE
+
+LC_MEASUREMENT
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_MEASUREMENT category.
+% (same as in the built in C/POSIX locale in glibc/locale/C-measurement.c)
+%metric
+measurement    1
+END LC_MEASUREMENT
author	Carlos O'Donell <carlos@redhat.com>	2021-07-29 01:27:56 -0400
committer	Carlos O'Donell <carlos@redhat.com>	2021-07-29 01:31:39 -0400
commit	b2a77206b3c3bf4670acc069f6fdda76cf857d48 (patch)
tree	3e35b2a2b9c864a91a223c9bc94c35797af57aae /localedata/locales/C
parent	7de41f96a172e5f15bc449ec3c09dbd0ae4cf2c6 (diff)
download	glibc-codonell/c-utf8.tar.gz glibc-codonell/c-utf8.tar.xz glibc-codonell/c-utf8.zip