diff options
Diffstat (limited to 'REORG.TODO/locale/programs/ld-ctype.c')
-rw-r--r-- | REORG.TODO/locale/programs/ld-ctype.c | 4030 |
1 files changed, 4030 insertions, 0 deletions
diff --git a/REORG.TODO/locale/programs/ld-ctype.c b/REORG.TODO/locale/programs/ld-ctype.c new file mode 100644 index 0000000000..df266c20d6 --- /dev/null +++ b/REORG.TODO/locale/programs/ld-ctype.c @@ -0,0 +1,4030 @@ +/* Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.org>, 1995. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <alloca.h> +#include <byteswap.h> +#include <endian.h> +#include <errno.h> +#include <limits.h> +#include <obstack.h> +#include <stdlib.h> +#include <string.h> +#include <wchar.h> +#include <wctype.h> +#include <stdint.h> +#include <sys/uio.h> + +#include "localedef.h" +#include "charmap.h" +#include "localeinfo.h" +#include "langinfo.h" +#include "linereader.h" +#include "locfile-token.h" +#include "locfile.h" + +#include <assert.h> + + +/* The bit used for representing a special class. */ +#define BITPOS(class) ((class) - tok_upper) +#define BIT(class) (_ISbit (BITPOS (class))) +#define BITw(class) (_ISwbit (BITPOS (class))) + +#define ELEM(ctype, collection, idx, value) \ + *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \ + &ctype->collection##_act idx, value) + + +/* To be compatible with former implementations we for now restrict + the number of bits for character classes to 16. When compatibility + is not necessary anymore increase the number to 32. */ +#define char_class_t uint16_t +#define char_class32_t uint32_t + + +/* Type to describe a transliteration action. We have a possibly + multiple character from-string and a set of multiple character + to-strings. All are 32bit values since this is what is used in + the gconv functions. */ +struct translit_to_t +{ + uint32_t *str; + + struct translit_to_t *next; +}; + +struct translit_t +{ + uint32_t *from; + + const char *fname; + size_t lineno; + + struct translit_to_t *to; + + struct translit_t *next; +}; + +struct translit_ignore_t +{ + uint32_t from; + uint32_t to; + uint32_t step; + + const char *fname; + size_t lineno; + + struct translit_ignore_t *next; +}; + + +/* Type to describe a transliteration include statement. */ +struct translit_include_t +{ + const char *copy_locale; + const char *copy_repertoire; + + struct translit_include_t *next; +}; + +/* Provide some dummy pointer for empty string. */ +static uint32_t no_str[] = { 0 }; + + +/* Sparse table of uint32_t. */ +#define TABLE idx_table +#define ELEMENT uint32_t +#define DEFAULT ((uint32_t) ~0) +#define NO_ADD_LOCALE +#include "3level.h" + +#define TABLE wcwidth_table +#define ELEMENT uint8_t +#define DEFAULT 0xff +#include "3level.h" + +#define TABLE wctrans_table +#define ELEMENT int32_t +#define DEFAULT 0 +#define wctrans_table_add wctrans_table_add_internal +#include "3level.h" +#undef wctrans_table_add +/* The wctrans_table must actually store the difference between the + desired result and the argument. */ +static inline void +wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc) +{ + wctrans_table_add_internal (t, wc, mapped_wc - wc); +} + +/* Construction of sparse 3-level tables. + See wchar-lookup.h for their structure and the meaning of p and q. */ + +struct wctype_table +{ + /* Parameters. */ + unsigned int p; + unsigned int q; + /* Working representation. */ + size_t level1_alloc; + size_t level1_size; + uint32_t *level1; + size_t level2_alloc; + size_t level2_size; + uint32_t *level2; + size_t level3_alloc; + size_t level3_size; + uint32_t *level3; + size_t result_size; +}; + +static void add_locale_wctype_table (struct locale_file *file, + struct wctype_table *t); + +/* The real definition of the struct for the LC_CTYPE locale. */ +struct locale_ctype_t +{ + uint32_t *charnames; + size_t charnames_max; + size_t charnames_act; + /* An index lookup table, to speedup find_idx. */ + struct idx_table charnames_idx; + + struct repertoire_t *repertoire; + + /* We will allow up to 8 * sizeof (uint32_t) character classes. */ +#define MAX_NR_CHARCLASS (8 * sizeof (uint32_t)) + size_t nr_charclass; + const char *classnames[MAX_NR_CHARCLASS]; + uint32_t last_class_char; + uint32_t class256_collection[256]; + uint32_t *class_collection; + size_t class_collection_max; + size_t class_collection_act; + uint32_t class_done; + uint32_t class_offset; + + struct charseq **mbdigits; + size_t mbdigits_act; + size_t mbdigits_max; + uint32_t *wcdigits; + size_t wcdigits_act; + size_t wcdigits_max; + + struct charseq *mboutdigits[10]; + uint32_t wcoutdigits[10]; + size_t outdigits_act; + + /* If the following number ever turns out to be too small simply + increase it. But I doubt it will. --drepper@gnu */ +#define MAX_NR_CHARMAP 16 + const char *mapnames[MAX_NR_CHARMAP]; + uint32_t *map_collection[MAX_NR_CHARMAP]; + uint32_t map256_collection[2][256]; + size_t map_collection_max[MAX_NR_CHARMAP]; + size_t map_collection_act[MAX_NR_CHARMAP]; + size_t map_collection_nr; + size_t last_map_idx; + int tomap_done[MAX_NR_CHARMAP]; + uint32_t map_offset; + + /* Transliteration information. */ + struct translit_include_t *translit_include; + struct translit_t *translit; + struct translit_ignore_t *translit_ignore; + uint32_t ntranslit_ignore; + + uint32_t *default_missing; + const char *default_missing_file; + size_t default_missing_lineno; + + uint32_t to_nonascii; + uint32_t nonascii_case; + + /* The arrays for the binary representation. */ + char_class_t *ctype_b; + char_class32_t *ctype32_b; + uint32_t **map_b; + uint32_t **map32_b; + uint32_t **class_b; + struct wctype_table *class_3level; + struct wctrans_table *map_3level; + uint32_t *class_name_ptr; + uint32_t *map_name_ptr; + struct wcwidth_table width; + uint32_t mb_cur_max; + const char *codeset_name; + uint32_t *translit_from_idx; + uint32_t *translit_from_tbl; + uint32_t *translit_to_idx; + uint32_t *translit_to_tbl; + uint32_t translit_idx_size; + size_t translit_from_tbl_size; + size_t translit_to_tbl_size; + + struct obstack mempool; +}; + + +/* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless + whether 'int' is 16 bit, 32 bit, or 64 bit. */ +#define EMPTY ((uint32_t) ~0) + + +#define obstack_chunk_alloc xmalloc +#define obstack_chunk_free free + + +/* Prototypes for local functions. */ +static void ctype_startup (struct linereader *lr, struct localedef_t *locale, + const struct charmap_t *charmap, + struct localedef_t *copy_locale, + int ignore_content); +static void ctype_class_new (struct linereader *lr, + struct locale_ctype_t *ctype, const char *name); +static void ctype_map_new (struct linereader *lr, + struct locale_ctype_t *ctype, + const char *name, const struct charmap_t *charmap); +static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table, + size_t *max, size_t *act, uint32_t idx); +static void set_class_defaults (struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire); +static void allocate_arrays (struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire); + + +static const char *longnames[] = +{ + "zero", "one", "two", "three", "four", + "five", "six", "seven", "eight", "nine" +}; +static const char *uninames[] = +{ + "U00000030", "U00000031", "U00000032", "U00000033", "U00000034", + "U00000035", "U00000036", "U00000037", "U00000038", "U00000039" +}; +static const unsigned char digits[] = "0123456789"; + + +static void +ctype_startup (struct linereader *lr, struct localedef_t *locale, + const struct charmap_t *charmap, + struct localedef_t *copy_locale, int ignore_content) +{ + unsigned int cnt; + struct locale_ctype_t *ctype; + + if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL) + { + if (copy_locale == NULL) + { + /* Allocate the needed room. */ + locale->categories[LC_CTYPE].ctype = ctype = + (struct locale_ctype_t *) xcalloc (1, + sizeof (struct locale_ctype_t)); + + /* We have seen no names yet. */ + ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512; + ctype->charnames = (uint32_t *) xmalloc (ctype->charnames_max + * sizeof (uint32_t)); + for (cnt = 0; cnt < 256; ++cnt) + ctype->charnames[cnt] = cnt; + ctype->charnames_act = 256; + idx_table_init (&ctype->charnames_idx); + + /* Fill character class information. */ + ctype->last_class_char = ILLEGAL_CHAR_VALUE; + /* The order of the following instructions determines the bit + positions! */ + ctype_class_new (lr, ctype, "upper"); + ctype_class_new (lr, ctype, "lower"); + ctype_class_new (lr, ctype, "alpha"); + ctype_class_new (lr, ctype, "digit"); + ctype_class_new (lr, ctype, "xdigit"); + ctype_class_new (lr, ctype, "space"); + ctype_class_new (lr, ctype, "print"); + ctype_class_new (lr, ctype, "graph"); + ctype_class_new (lr, ctype, "blank"); + ctype_class_new (lr, ctype, "cntrl"); + ctype_class_new (lr, ctype, "punct"); + ctype_class_new (lr, ctype, "alnum"); + + ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512; + ctype->class_collection + = (uint32_t *) xcalloc (sizeof (unsigned long int), + ctype->class_collection_max); + ctype->class_collection_act = 256; + + /* Fill character map information. */ + ctype->last_map_idx = MAX_NR_CHARMAP; + ctype_map_new (lr, ctype, "toupper", charmap); + ctype_map_new (lr, ctype, "tolower", charmap); + + /* Fill first 256 entries in `toXXX' arrays. */ + for (cnt = 0; cnt < 256; ++cnt) + { + ctype->map_collection[0][cnt] = cnt; + ctype->map_collection[1][cnt] = cnt; + + ctype->map256_collection[0][cnt] = cnt; + ctype->map256_collection[1][cnt] = cnt; + } + + if (enc_not_ascii_compatible) + ctype->to_nonascii = 1; + + obstack_init (&ctype->mempool); + } + else + ctype = locale->categories[LC_CTYPE].ctype = + copy_locale->categories[LC_CTYPE].ctype; + } +} + + +void +ctype_finish (struct localedef_t *locale, const struct charmap_t *charmap) +{ + /* See POSIX.2, table 2-6 for the meaning of the following table. */ +#define NCLASS 12 + static const struct + { + const char *name; + const char allow[NCLASS]; + } + valid_table[NCLASS] = + { + /* The order is important. See token.h for more information. + M = Always, D = Default, - = Permitted, X = Mutually exclusive */ + { "upper", "--MX-XDDXXX-" }, + { "lower", "--MX-XDDXXX-" }, + { "alpha", "---X-XDDXXX-" }, + { "digit", "XXX--XDDXXX-" }, + { "xdigit", "-----XDDXXX-" }, + { "space", "XXXXX------X" }, + { "print", "---------X--" }, + { "graph", "---------X--" }, + { "blank", "XXXXXM-----X" }, + { "cntrl", "XXXXX-XX--XX" }, + { "punct", "XXXXX-DD-X-X" }, + { "alnum", "-----XDDXXX-" } + }; + size_t cnt; + int cls1, cls2; + uint32_t space_value; + struct charseq *space_seq; + struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype; + int warned; + const void *key; + size_t len; + void *vdata; + void *curs; + + /* Now resolve copying and also handle completely missing definitions. */ + if (ctype == NULL) + { + const char *repertoire_name; + + /* First see whether we were supposed to copy. If yes, find the + actual definition. */ + if (locale->copy_name[LC_CTYPE] != NULL) + { + /* Find the copying locale. This has to happen transitively since + the locale we are copying from might also copying another one. */ + struct localedef_t *from = locale; + + do + from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE], + from->repertoire_name, charmap); + while (from->categories[LC_CTYPE].ctype == NULL + && from->copy_name[LC_CTYPE] != NULL); + + ctype = locale->categories[LC_CTYPE].ctype + = from->categories[LC_CTYPE].ctype; + } + + /* If there is still no definition issue an warning and create an + empty one. */ + if (ctype == NULL) + { + if (! be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +No definition for %s category found"), "LC_CTYPE")); + ctype_startup (NULL, locale, charmap, NULL, 0); + ctype = locale->categories[LC_CTYPE].ctype; + } + + /* Get the repertoire we have to use. */ + repertoire_name = locale->repertoire_name ?: repertoire_global; + if (repertoire_name != NULL) + ctype->repertoire = repertoire_read (repertoire_name); + } + + /* We need the name of the currently used 8-bit character set to + make correct conversion between this 8-bit representation and the + ISO 10646 character set used internally for wide characters. */ + ctype->codeset_name = charmap->code_set_name; + if (ctype->codeset_name == NULL) + { + if (! be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +No character set name specified in charmap"))); + ctype->codeset_name = "//UNKNOWN//"; + } + + /* Set default value for classes not specified. */ + set_class_defaults (ctype, charmap, ctype->repertoire); + + /* Check according to table. */ + for (cnt = 0; cnt < ctype->class_collection_act; ++cnt) + { + uint32_t tmp = ctype->class_collection[cnt]; + + if (tmp != 0) + { + for (cls1 = 0; cls1 < NCLASS; ++cls1) + if ((tmp & _ISwbit (cls1)) != 0) + for (cls2 = 0; cls2 < NCLASS; ++cls2) + if (valid_table[cls1].allow[cls2] != '-') + { + int eq = (tmp & _ISwbit (cls2)) != 0; + switch (valid_table[cls1].allow[cls2]) + { + case 'M': + if (!eq) + { + uint32_t value = ctype->charnames[cnt]; + + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +character L'\\u%0*x' in class `%s' must be in class `%s'"), + value > 0xffff ? 8 : 4, + value, + valid_table[cls1].name, + valid_table[cls2].name)); + } + break; + + case 'X': + if (eq) + { + uint32_t value = ctype->charnames[cnt]; + + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +character L'\\u%0*x' in class `%s' must not be in class `%s'"), + value > 0xffff ? 8 : 4, + value, + valid_table[cls1].name, + valid_table[cls2].name)); + } + break; + + case 'D': + ctype->class_collection[cnt] |= _ISwbit (cls2); + break; + + default: + WITH_CUR_LOCALE (error (5, 0, _("\ +internal error in %s, line %u"), __FUNCTION__, __LINE__)); + } + } + } + } + + for (cnt = 0; cnt < 256; ++cnt) + { + uint32_t tmp = ctype->class256_collection[cnt]; + + if (tmp != 0) + { + for (cls1 = 0; cls1 < NCLASS; ++cls1) + if ((tmp & _ISbit (cls1)) != 0) + for (cls2 = 0; cls2 < NCLASS; ++cls2) + if (valid_table[cls1].allow[cls2] != '-') + { + int eq = (tmp & _ISbit (cls2)) != 0; + switch (valid_table[cls1].allow[cls2]) + { + case 'M': + if (!eq) + { + char buf[17]; + + snprintf (buf, sizeof buf, "\\%Zo", cnt); + + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +character '%s' in class `%s' must be in class `%s'"), + buf, + valid_table[cls1].name, + valid_table[cls2].name)); + } + break; + + case 'X': + if (eq) + { + char buf[17]; + + snprintf (buf, sizeof buf, "\\%Zo", cnt); + + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +character '%s' in class `%s' must not be in class `%s'"), + buf, + valid_table[cls1].name, + valid_table[cls2].name)); + } + break; + + case 'D': + ctype->class256_collection[cnt] |= _ISbit (cls2); + break; + + default: + WITH_CUR_LOCALE (error (5, 0, _("\ +internal error in %s, line %u"), __FUNCTION__, __LINE__)); + } + } + } + } + + /* ... and now test <SP> as a special case. */ + space_value = 32; + if (((cnt = BITPOS (tok_space), + (ELEM (ctype, class_collection, , space_value) + & BITw (tok_space)) == 0) + || (cnt = BITPOS (tok_blank), + (ELEM (ctype, class_collection, , space_value) + & BITw (tok_blank)) == 0))) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"), + valid_table[cnt].name)); + } + else if (((cnt = BITPOS (tok_punct), + (ELEM (ctype, class_collection, , space_value) + & BITw (tok_punct)) != 0) + || (cnt = BITPOS (tok_graph), + (ELEM (ctype, class_collection, , space_value) + & BITw (tok_graph)) + != 0))) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +<SP> character must not be in class `%s'"), + valid_table[cnt].name)); + } + else + ELEM (ctype, class_collection, , space_value) |= BITw (tok_print); + + space_seq = charmap_find_value (charmap, "SP", 2); + if (space_seq == NULL) + space_seq = charmap_find_value (charmap, "space", 5); + if (space_seq == NULL) + space_seq = charmap_find_value (charmap, "U00000020", 9); + if (space_seq == NULL || space_seq->nbytes != 1) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +character <SP> not defined in character map"))); + } + else if (((cnt = BITPOS (tok_space), + (ctype->class256_collection[space_seq->bytes[0]] + & BIT (tok_space)) == 0) + || (cnt = BITPOS (tok_blank), + (ctype->class256_collection[space_seq->bytes[0]] + & BIT (tok_blank)) == 0))) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"), + valid_table[cnt].name)); + } + else if (((cnt = BITPOS (tok_punct), + (ctype->class256_collection[space_seq->bytes[0]] + & BIT (tok_punct)) != 0) + || (cnt = BITPOS (tok_graph), + (ctype->class256_collection[space_seq->bytes[0]] + & BIT (tok_graph)) != 0))) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +<SP> character must not be in class `%s'"), + valid_table[cnt].name)); + } + else + ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print); + + /* Check whether all single-byte characters make to their upper/lowercase + equivalent according to the ASCII rules. */ + for (cnt = 'A'; cnt <= 'Z'; ++cnt) + { + uint32_t uppval = ctype->map256_collection[0][cnt]; + uint32_t lowval = ctype->map256_collection[1][cnt]; + uint32_t lowuppval = ctype->map256_collection[0][lowval]; + uint32_t lowlowval = ctype->map256_collection[1][lowval]; + + if (uppval != cnt + || lowval != cnt + 0x20 + || lowuppval != cnt + || lowlowval != cnt + 0x20) + ctype->nonascii_case = 1; + } + for (cnt = 0; cnt < 256; ++cnt) + if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z') + if (ctype->map256_collection[0][cnt] != cnt + || ctype->map256_collection[1][cnt] != cnt) + ctype->nonascii_case = 1; + + /* Now that the tests are done make sure the name array contains all + characters which are handled in the WIDTH section of the + character set definition file. */ + if (charmap->width_rules != NULL) + for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt) + { + unsigned char bytes[charmap->mb_cur_max]; + int nbytes = charmap->width_rules[cnt].from->nbytes; + + /* We have the range of character for which the width is + specified described using byte sequences of the multibyte + charset. We have to convert this to UCS4 now. And we + cannot simply convert the beginning and the end of the + sequence, we have to iterate over the byte sequence and + convert it for every single character. */ + memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes); + + while (nbytes < charmap->width_rules[cnt].to->nbytes + || memcmp (bytes, charmap->width_rules[cnt].to->bytes, + nbytes) <= 0) + { + /* Find the UCS value for `bytes'. */ + int inner; + uint32_t wch; + struct charseq *seq + = charmap_find_symbol (charmap, (char *) bytes, nbytes); + + if (seq == NULL) + wch = ILLEGAL_CHAR_VALUE; + else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE) + wch = seq->ucs4; + else + wch = repertoire_find_value (ctype->repertoire, seq->name, + strlen (seq->name)); + + if (wch != ILLEGAL_CHAR_VALUE) + /* We are only interested in the side-effects of the + `find_idx' call. It will add appropriate entries in + the name array if this is necessary. */ + (void) find_idx (ctype, NULL, NULL, NULL, wch); + + /* "Increment" the bytes sequence. */ + inner = nbytes - 1; + while (inner >= 0 && bytes[inner] == 0xff) + --inner; + + if (inner < 0) + { + /* We have to extend the byte sequence. */ + if (nbytes >= charmap->width_rules[cnt].to->nbytes) + break; + + bytes[0] = 1; + memset (&bytes[1], 0, nbytes); + ++nbytes; + } + else + { + ++bytes[inner]; + while (++inner < nbytes) + bytes[inner] = 0; + } + } + } + + /* Now set all the other characters of the character set to the + default width. */ + curs = NULL; + while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0) + { + struct charseq *data = (struct charseq *) vdata; + + if (data->ucs4 == UNINITIALIZED_CHAR_VALUE) + data->ucs4 = repertoire_find_value (ctype->repertoire, + data->name, len); + + if (data->ucs4 != ILLEGAL_CHAR_VALUE) + (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4); + } + + /* There must be a multiple of 10 digits. */ + if (ctype->mbdigits_act % 10 != 0) + { + assert (ctype->mbdigits_act == ctype->wcdigits_act); + ctype->wcdigits_act -= ctype->mbdigits_act % 10; + ctype->mbdigits_act -= ctype->mbdigits_act % 10; + WITH_CUR_LOCALE (error (0, 0, _("\ +`digit' category has not entries in groups of ten"))); + } + + /* Check the input digits. There must be a multiple of ten available. + In each group it could be that one or the other character is missing. + In this case the whole group must be removed. */ + cnt = 0; + while (cnt < ctype->mbdigits_act) + { + size_t inner; + for (inner = 0; inner < 10; ++inner) + if (ctype->mbdigits[cnt + inner] == NULL) + break; + + if (inner == 10) + cnt += 10; + else + { + /* Remove the group. */ + memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10], + ((ctype->wcdigits_act - cnt - 10) + * sizeof (ctype->mbdigits[0]))); + ctype->mbdigits_act -= 10; + } + } + + /* If no input digits are given use the default. */ + if (ctype->mbdigits_act == 0) + { + if (ctype->mbdigits_max == 0) + { + ctype->mbdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool, + 10 * sizeof (struct charseq *)); + ctype->mbdigits_max = 10; + } + + for (cnt = 0; cnt < 10; ++cnt) + { + ctype->mbdigits[cnt] = charmap_find_symbol (charmap, + (char *) digits + cnt, 1); + if (ctype->mbdigits[cnt] == NULL) + { + ctype->mbdigits[cnt] = charmap_find_symbol (charmap, + longnames[cnt], + strlen (longnames[cnt])); + if (ctype->mbdigits[cnt] == NULL) + { + /* Hum, this ain't good. */ + WITH_CUR_LOCALE (error (0, 0, _("\ +no input digits defined and none of the standard names in the charmap"))); + + ctype->mbdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool, + sizeof (struct charseq) + 1); + + /* This is better than nothing. */ + ctype->mbdigits[cnt]->bytes[0] = digits[cnt]; + ctype->mbdigits[cnt]->nbytes = 1; + } + } + } + + ctype->mbdigits_act = 10; + } + + /* Check the wide character input digits. There must be a multiple + of ten available. In each group it could be that one or the other + character is missing. In this case the whole group must be + removed. */ + cnt = 0; + while (cnt < ctype->wcdigits_act) + { + size_t inner; + for (inner = 0; inner < 10; ++inner) + if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE) + break; + + if (inner == 10) + cnt += 10; + else + { + /* Remove the group. */ + memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10], + ((ctype->wcdigits_act - cnt - 10) + * sizeof (ctype->wcdigits[0]))); + ctype->wcdigits_act -= 10; + } + } + + /* If no input digits are given use the default. */ + if (ctype->wcdigits_act == 0) + { + if (ctype->wcdigits_max == 0) + { + ctype->wcdigits = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool, + 10 * sizeof (uint32_t)); + ctype->wcdigits_max = 10; + } + + for (cnt = 0; cnt < 10; ++cnt) + ctype->wcdigits[cnt] = L'0' + cnt; + + ctype->mbdigits_act = 10; + } + + /* Check the outdigits. */ + warned = 0; + for (cnt = 0; cnt < 10; ++cnt) + if (ctype->mboutdigits[cnt] == NULL) + { + static struct charseq replace[2]; + + if (!warned) + { + WITH_CUR_LOCALE (error (0, 0, _("\ +not all characters used in `outdigit' are available in the charmap"))); + warned = 1; + } + + replace[0].nbytes = 1; + replace[0].bytes[0] = '?'; + replace[0].bytes[1] = '\0'; + ctype->mboutdigits[cnt] = &replace[0]; + } + + warned = 0; + for (cnt = 0; cnt < 10; ++cnt) + if (ctype->wcoutdigits[cnt] == 0) + { + if (!warned) + { + WITH_CUR_LOCALE (error (0, 0, _("\ +not all characters used in `outdigit' are available in the repertoire"))); + warned = 1; + } + + ctype->wcoutdigits[cnt] = L'?'; + } + + /* Sort the entries in the translit_ignore list. */ + if (ctype->translit_ignore != NULL) + { + struct translit_ignore_t *firstp = ctype->translit_ignore; + struct translit_ignore_t *runp; + + ctype->ntranslit_ignore = 1; + + for (runp = firstp->next; runp != NULL; runp = runp->next) + { + struct translit_ignore_t *lastp = NULL; + struct translit_ignore_t *cmpp; + + ++ctype->ntranslit_ignore; + + for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next) + if (runp->from < cmpp->from) + break; + + runp->next = lastp; + if (lastp == NULL) + firstp = runp; + } + + ctype->translit_ignore = firstp; + } +} + + +void +ctype_output (struct localedef_t *locale, const struct charmap_t *charmap, + const char *output_path) +{ + struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype; + const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1) + + ctype->nr_charclass + ctype->map_collection_nr); + struct locale_file file; + uint32_t default_missing_len; + size_t elem, cnt; + + /* Now prepare the output: Find the sizes of the table we can use. */ + allocate_arrays (ctype, charmap, ctype->repertoire); + + default_missing_len = (ctype->default_missing + ? wcslen ((wchar_t *) ctype->default_missing) + : 0); + + init_locale_data (&file, nelems); + for (elem = 0; elem < nelems; ++elem) + { + if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)) + switch (elem) + { +#define CTYPE_EMPTY(name) \ + case name: \ + add_locale_empty (&file); \ + break + + CTYPE_EMPTY(_NL_CTYPE_GAP1); + CTYPE_EMPTY(_NL_CTYPE_GAP2); + CTYPE_EMPTY(_NL_CTYPE_GAP3); + CTYPE_EMPTY(_NL_CTYPE_GAP4); + CTYPE_EMPTY(_NL_CTYPE_GAP5); + CTYPE_EMPTY(_NL_CTYPE_GAP6); + +#define CTYPE_RAW_DATA(name, base, size) \ + case _NL_ITEM_INDEX (name): \ + add_locale_raw_data (&file, base, size); \ + break + + CTYPE_RAW_DATA (_NL_CTYPE_CLASS, + ctype->ctype_b, + (256 + 128) * sizeof (char_class_t)); + +#define CTYPE_UINT32_ARRAY(name, base, n_elems) \ + case _NL_ITEM_INDEX (name): \ + add_locale_uint32_array (&file, base, n_elems); \ + break + + CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER, ctype->map_b[0], 256 + 128); + CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER, ctype->map_b[1], 256 + 128); + CTYPE_UINT32_ARRAY (_NL_CTYPE_TOUPPER32, ctype->map32_b[0], 256); + CTYPE_UINT32_ARRAY (_NL_CTYPE_TOLOWER32, ctype->map32_b[1], 256); + CTYPE_RAW_DATA (_NL_CTYPE_CLASS32, + ctype->ctype32_b, + 256 * sizeof (char_class32_t)); + +#define CTYPE_UINT32(name, value) \ + case _NL_ITEM_INDEX (name): \ + add_locale_uint32 (&file, value); \ + break + + CTYPE_UINT32 (_NL_CTYPE_CLASS_OFFSET, ctype->class_offset); + CTYPE_UINT32 (_NL_CTYPE_MAP_OFFSET, ctype->map_offset); + CTYPE_UINT32 (_NL_CTYPE_TRANSLIT_TAB_SIZE, ctype->translit_idx_size); + + CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_IDX, + ctype->translit_from_idx, + ctype->translit_idx_size); + + CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_FROM_TBL, + ctype->translit_from_tbl, + ctype->translit_from_tbl_size + / sizeof (uint32_t)); + + CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_IDX, + ctype->translit_to_idx, + ctype->translit_idx_size); + + CTYPE_UINT32_ARRAY (_NL_CTYPE_TRANSLIT_TO_TBL, + ctype->translit_to_tbl, + ctype->translit_to_tbl_size / sizeof (uint32_t)); + + case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES): + /* The class name array. */ + start_locale_structure (&file); + for (cnt = 0; cnt < ctype->nr_charclass; ++cnt) + add_locale_string (&file, ctype->classnames[cnt]); + add_locale_char (&file, 0); + align_locale_data (&file, LOCFILE_ALIGN); + end_locale_structure (&file); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES): + /* The class name array. */ + start_locale_structure (&file); + for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt) + add_locale_string (&file, ctype->mapnames[cnt]); + add_locale_char (&file, 0); + align_locale_data (&file, LOCFILE_ALIGN); + end_locale_structure (&file); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_WIDTH): + add_locale_wcwidth_table (&file, &ctype->width); + break; + + CTYPE_UINT32 (_NL_CTYPE_MB_CUR_MAX, ctype->mb_cur_max); + + case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME): + add_locale_string (&file, ctype->codeset_name); + break; + + CTYPE_UINT32 (_NL_CTYPE_MAP_TO_NONASCII, ctype->to_nonascii); + + CTYPE_UINT32 (_NL_CTYPE_NONASCII_CASE, ctype->nonascii_case); + + case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN): + add_locale_uint32 (&file, ctype->mbdigits_act / 10); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN): + add_locale_uint32 (&file, ctype->wcdigits_act / 10); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB): + start_locale_structure (&file); + for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB); + cnt < ctype->mbdigits_act; cnt += 10) + { + add_locale_raw_data (&file, ctype->mbdigits[cnt]->bytes, + ctype->mbdigits[cnt]->nbytes); + add_locale_char (&file, 0); + } + end_locale_structure (&file); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB): + start_locale_structure (&file); + cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB); + add_locale_raw_data (&file, ctype->mboutdigits[cnt]->bytes, + ctype->mboutdigits[cnt]->nbytes); + add_locale_char (&file, 0); + end_locale_structure (&file); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC): + start_locale_structure (&file); + for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC); + cnt < ctype->wcdigits_act; cnt += 10) + add_locale_uint32 (&file, ctype->wcdigits[cnt]); + end_locale_structure (&file); + break; + + case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC): + cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC); + add_locale_uint32 (&file, ctype->wcoutdigits[cnt]); + break; + + case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN): + add_locale_uint32 (&file, default_missing_len); + break; + + case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING): + add_locale_uint32_array (&file, ctype->default_missing, + default_missing_len); + break; + + case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN): + add_locale_uint32 (&file, ctype->ntranslit_ignore); + break; + + case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE): + start_locale_structure (&file); + { + struct translit_ignore_t *runp; + for (runp = ctype->translit_ignore; runp != NULL; + runp = runp->next) + { + add_locale_uint32 (&file, runp->from); + add_locale_uint32 (&file, runp->to); + add_locale_uint32 (&file, runp->step); + } + } + end_locale_structure (&file); + break; + + default: + assert (! "unknown CTYPE element"); + } + else + { + /* Handle extra maps. */ + size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1); + if (nr < ctype->nr_charclass) + { + start_locale_prelude (&file); + add_locale_uint32_array (&file, ctype->class_b[nr], 256 / 32); + end_locale_prelude (&file); + add_locale_wctype_table (&file, &ctype->class_3level[nr]); + } + else + { + nr -= ctype->nr_charclass; + assert (nr < ctype->map_collection_nr); + add_locale_wctrans_table (&file, &ctype->map_3level[nr]); + } + } + } + + write_locale_data (output_path, LC_CTYPE, "LC_CTYPE", &file); +} + + +/* Local functions. */ +static void +ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype, + const char *name) +{ + size_t cnt; + + for (cnt = 0; cnt < ctype->nr_charclass; ++cnt) + if (strcmp (ctype->classnames[cnt], name) == 0) + break; + + if (cnt < ctype->nr_charclass) + { + lr_error (lr, _("character class `%s' already defined"), name); + return; + } + + if (ctype->nr_charclass == MAX_NR_CHARCLASS) + /* Exit code 2 is prescribed in P1003.2b. */ + WITH_CUR_LOCALE (error (2, 0, _("\ +implementation limit: no more than %Zd character classes allowed"), + MAX_NR_CHARCLASS)); + + ctype->classnames[ctype->nr_charclass++] = name; +} + + +static void +ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype, + const char *name, const struct charmap_t *charmap) +{ + size_t max_chars = 0; + size_t cnt; + + for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt) + { + if (strcmp (ctype->mapnames[cnt], name) == 0) + break; + + if (max_chars < ctype->map_collection_max[cnt]) + max_chars = ctype->map_collection_max[cnt]; + } + + if (cnt < ctype->map_collection_nr) + { + lr_error (lr, _("character map `%s' already defined"), name); + return; + } + + if (ctype->map_collection_nr == MAX_NR_CHARMAP) + /* Exit code 2 is prescribed in P1003.2b. */ + WITH_CUR_LOCALE (error (2, 0, _("\ +implementation limit: no more than %d character maps allowed"), + MAX_NR_CHARMAP)); + + ctype->mapnames[cnt] = name; + + if (max_chars == 0) + ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512; + else + ctype->map_collection_max[cnt] = max_chars; + + ctype->map_collection[cnt] = (uint32_t *) + xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]); + ctype->map_collection_act[cnt] = 256; + + ++ctype->map_collection_nr; +} + + +/* We have to be prepared that TABLE, MAX, and ACT can be NULL. This + is possible if we only want to extend the name array. */ +static uint32_t * +find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max, + size_t *act, uint32_t idx) +{ + size_t cnt; + + if (idx < 256) + return table == NULL ? NULL : &(*table)[idx]; + + /* Use the charnames_idx lookup table instead of the slow search loop. */ +#if 1 + cnt = idx_table_get (&ctype->charnames_idx, idx); + if (cnt == EMPTY) + /* Not found. */ + cnt = ctype->charnames_act; +#else + for (cnt = 256; cnt < ctype->charnames_act; ++cnt) + if (ctype->charnames[cnt] == idx) + break; +#endif + + /* We have to distinguish two cases: the name is found or not. */ + if (cnt == ctype->charnames_act) + { + /* Extend the name array. */ + if (ctype->charnames_act == ctype->charnames_max) + { + ctype->charnames_max *= 2; + ctype->charnames = (uint32_t *) + xrealloc (ctype->charnames, + sizeof (uint32_t) * ctype->charnames_max); + } + ctype->charnames[ctype->charnames_act++] = idx; + idx_table_add (&ctype->charnames_idx, idx, cnt); + } + + if (table == NULL) + /* We have done everything we are asked to do. */ + return NULL; + + if (max == NULL) + /* The caller does not want to extend the table. */ + return (cnt >= *act ? NULL : &(*table)[cnt]); + + if (cnt >= *act) + { + if (cnt >= *max) + { + size_t old_max = *max; + do + *max *= 2; + while (*max <= cnt); + + *table = + (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t)); + memset (&(*table)[old_max], '\0', + (*max - old_max) * sizeof (uint32_t)); + } + + *act = cnt + 1; + } + + return &(*table)[cnt]; +} + + +static int +get_character (struct token *now, const struct charmap_t *charmap, + struct repertoire_t *repertoire, + struct charseq **seqp, uint32_t *wchp) +{ + if (now->tok == tok_bsymbol) + { + /* This will hopefully be the normal case. */ + *wchp = repertoire_find_value (repertoire, now->val.str.startmb, + now->val.str.lenmb); + *seqp = charmap_find_value (charmap, now->val.str.startmb, + now->val.str.lenmb); + } + else if (now->tok == tok_ucs4) + { + char utmp[10]; + + snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4); + *seqp = charmap_find_value (charmap, utmp, 9); + + if (*seqp == NULL) + *seqp = repertoire_find_seq (repertoire, now->val.ucs4); + + if (*seqp == NULL) + { + /* Compute the value in the charmap from the UCS value. */ + const char *symbol = repertoire_find_symbol (repertoire, + now->val.ucs4); + + if (symbol == NULL) + *seqp = NULL; + else + *seqp = charmap_find_value (charmap, symbol, strlen (symbol)); + + if (*seqp == NULL) + { + if (repertoire != NULL) + { + /* Insert a negative entry. */ + static const struct charseq negative + = { .ucs4 = ILLEGAL_CHAR_VALUE }; + uint32_t *newp = obstack_alloc (&repertoire->mem_pool, + sizeof (uint32_t)); + *newp = now->val.ucs4; + + insert_entry (&repertoire->seq_table, newp, + sizeof (uint32_t), (void *) &negative); + } + } + else + (*seqp)->ucs4 = now->val.ucs4; + } + else if ((*seqp)->ucs4 != now->val.ucs4) + *seqp = NULL; + + *wchp = now->val.ucs4; + } + else if (now->tok == tok_charcode) + { + /* We must map from the byte code to UCS4. */ + *seqp = charmap_find_symbol (charmap, now->val.str.startmb, + now->val.str.lenmb); + + if (*seqp == NULL) + *wchp = ILLEGAL_CHAR_VALUE; + else + { + if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE) + (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name, + strlen ((*seqp)->name)); + *wchp = (*seqp)->ucs4; + } + } + else + return 1; + + return 0; +} + + +/* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and + the .(2). counterparts. */ +static void +charclass_symbolic_ellipsis (struct linereader *ldfile, + struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire, + struct token *now, + const char *last_str, + unsigned long int class256_bit, + unsigned long int class_bit, int base, + int ignore_content, int handle_digits, int step) +{ + const char *nowstr = now->val.str.startmb; + char tmp[now->val.str.lenmb + 1]; + const char *cp; + char *endp; + unsigned long int from; + unsigned long int to; + + /* We have to compute the ellipsis values using the symbolic names. */ + assert (last_str != NULL); + + if (strlen (last_str) != now->val.str.lenmb) + { + invalid_range: + lr_error (ldfile, + _("`%s' and `%.*s' are not valid names for symbolic range"), + last_str, (int) now->val.str.lenmb, nowstr); + return; + } + + if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0) + /* Nothing to do, the names are the same. */ + return; + + for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp) + ; + + errno = 0; + from = strtoul (cp, &endp, base); + if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0') + goto invalid_range; + + to = strtoul (nowstr + (cp - last_str), &endp, base); + if ((to == UINT_MAX && errno == ERANGE) + || (endp - nowstr) != now->val.str.lenmb || from >= to) + goto invalid_range; + + /* OK, we have a range FROM - TO. Now we can create the symbolic names. */ + if (!ignore_content) + { + now->val.str.startmb = tmp; + while ((from += step) <= to) + { + struct charseq *seq; + uint32_t wch; + + sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"), + (int) (cp - last_str), last_str, + (int) (now->val.str.lenmb - (cp - last_str)), + from); + + get_character (now, charmap, repertoire, &seq, &wch); + + if (seq != NULL && seq->nbytes == 1) + /* Yep, we can store information about this byte sequence. */ + ctype->class256_collection[seq->bytes[0]] |= class256_bit; + + if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0) + /* We have the UCS4 position. */ + *find_idx (ctype, &ctype->class_collection, + &ctype->class_collection_max, + &ctype->class_collection_act, wch) |= class_bit; + + if (handle_digits == 1) + { + /* We must store the digit values. */ + if (ctype->mbdigits_act == ctype->mbdigits_max) + { + ctype->mbdigits_max *= 2; + ctype->mbdigits = xrealloc (ctype->mbdigits, + (ctype->mbdigits_max + * sizeof (char *))); + ctype->wcdigits_max *= 2; + ctype->wcdigits = xrealloc (ctype->wcdigits, + (ctype->wcdigits_max + * sizeof (uint32_t))); + } + + ctype->mbdigits[ctype->mbdigits_act++] = seq; + ctype->wcdigits[ctype->wcdigits_act++] = wch; + } + else if (handle_digits == 2) + { + /* We must store the digit values. */ + if (ctype->outdigits_act >= 10) + { + lr_error (ldfile, _("\ +%s: field `%s' does not contain exactly ten entries"), + "LC_CTYPE", "outdigit"); + return; + } + + ctype->mboutdigits[ctype->outdigits_act] = seq; + ctype->wcoutdigits[ctype->outdigits_act] = wch; + ++ctype->outdigits_act; + } + } + } +} + + +/* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */ +static void +charclass_ucs4_ellipsis (struct linereader *ldfile, + struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire, + struct token *now, uint32_t last_wch, + unsigned long int class256_bit, + unsigned long int class_bit, int ignore_content, + int handle_digits, int step) +{ + if (last_wch > now->val.ucs4) + { + lr_error (ldfile, _("\ +to-value <U%0*X> of range is smaller than from-value <U%0*X>"), + (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4, + (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch); + return; + } + + if (!ignore_content) + while ((last_wch += step) <= now->val.ucs4) + { + /* We have to find out whether there is a byte sequence corresponding + to this UCS4 value. */ + struct charseq *seq; + char utmp[10]; + + snprintf (utmp, sizeof (utmp), "U%08X", last_wch); + seq = charmap_find_value (charmap, utmp, 9); + if (seq == NULL) + { + snprintf (utmp, sizeof (utmp), "U%04X", last_wch); + seq = charmap_find_value (charmap, utmp, 5); + } + + if (seq == NULL) + /* Try looking in the repertoire map. */ + seq = repertoire_find_seq (repertoire, last_wch); + + /* If this is the first time we look for this sequence create a new + entry. */ + if (seq == NULL) + { + static const struct charseq negative + = { .ucs4 = ILLEGAL_CHAR_VALUE }; + + /* Find the symbolic name for this UCS4 value. */ + if (repertoire != NULL) + { + const char *symbol = repertoire_find_symbol (repertoire, + last_wch); + uint32_t *newp = obstack_alloc (&repertoire->mem_pool, + sizeof (uint32_t)); + *newp = last_wch; + + if (symbol != NULL) + /* We have a name, now search the multibyte value. */ + seq = charmap_find_value (charmap, symbol, strlen (symbol)); + + if (seq == NULL) + /* We have to create a fake entry. */ + seq = (struct charseq *) &negative; + else + seq->ucs4 = last_wch; + + insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t), + seq); + } + else + /* We have to create a fake entry. */ + seq = (struct charseq *) &negative; + } + + /* We have a name, now search the multibyte value. */ + if (seq->ucs4 == last_wch && seq->nbytes == 1) + /* Yep, we can store information about this byte sequence. */ + ctype->class256_collection[(size_t) seq->bytes[0]] + |= class256_bit; + + /* And of course we have the UCS4 position. */ + if (class_bit != 0) + *find_idx (ctype, &ctype->class_collection, + &ctype->class_collection_max, + &ctype->class_collection_act, last_wch) |= class_bit; + + if (handle_digits == 1) + { + /* We must store the digit values. */ + if (ctype->mbdigits_act == ctype->mbdigits_max) + { + ctype->mbdigits_max *= 2; + ctype->mbdigits = xrealloc (ctype->mbdigits, + (ctype->mbdigits_max + * sizeof (char *))); + ctype->wcdigits_max *= 2; + ctype->wcdigits = xrealloc (ctype->wcdigits, + (ctype->wcdigits_max + * sizeof (uint32_t))); + } + + ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch + ? seq : NULL); + ctype->wcdigits[ctype->wcdigits_act++] = last_wch; + } + else if (handle_digits == 2) + { + /* We must store the digit values. */ + if (ctype->outdigits_act >= 10) + { + lr_error (ldfile, _("\ +%s: field `%s' does not contain exactly ten entries"), + "LC_CTYPE", "outdigit"); + return; + } + + ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch + ? seq : NULL); + ctype->wcoutdigits[ctype->outdigits_act] = last_wch; + ++ctype->outdigits_act; + } + } +} + + +/* Ellipsis as in `/xea/x12.../xea/x34'. */ +static void +charclass_charcode_ellipsis (struct linereader *ldfile, + struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire, + struct token *now, char *last_charcode, + uint32_t last_charcode_len, + unsigned long int class256_bit, + unsigned long int class_bit, int ignore_content, + int handle_digits) +{ + /* First check whether the to-value is larger. */ + if (now->val.charcode.nbytes != last_charcode_len) + { + lr_error (ldfile, _("\ +start and end character sequence of range must have the same length")); + return; + } + + if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0) + { + lr_error (ldfile, _("\ +to-value character sequence is smaller than from-value sequence")); + return; + } + + if (!ignore_content) + { + do + { + /* Increment the byte sequence value. */ + struct charseq *seq; + uint32_t wch; + int i; + + for (i = last_charcode_len - 1; i >= 0; --i) + if (++last_charcode[i] != 0) + break; + + if (last_charcode_len == 1) + /* Of course we have the charcode value. */ + ctype->class256_collection[(size_t) last_charcode[0]] + |= class256_bit; + + /* Find the symbolic name. */ + seq = charmap_find_symbol (charmap, last_charcode, + last_charcode_len); + if (seq != NULL) + { + if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE) + seq->ucs4 = repertoire_find_value (repertoire, seq->name, + strlen (seq->name)); + wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4; + + if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0) + *find_idx (ctype, &ctype->class_collection, + &ctype->class_collection_max, + &ctype->class_collection_act, wch) |= class_bit; + } + else + wch = ILLEGAL_CHAR_VALUE; + + if (handle_digits == 1) + { + /* We must store the digit values. */ + if (ctype->mbdigits_act == ctype->mbdigits_max) + { + ctype->mbdigits_max *= 2; + ctype->mbdigits = xrealloc (ctype->mbdigits, + (ctype->mbdigits_max + * sizeof (char *))); + ctype->wcdigits_max *= 2; + ctype->wcdigits = xrealloc (ctype->wcdigits, + (ctype->wcdigits_max + * sizeof (uint32_t))); + } + + seq = xmalloc (sizeof (struct charseq) + last_charcode_len); + memcpy ((char *) (seq + 1), last_charcode, last_charcode_len); + seq->nbytes = last_charcode_len; + + ctype->mbdigits[ctype->mbdigits_act++] = seq; + ctype->wcdigits[ctype->wcdigits_act++] = wch; + } + else if (handle_digits == 2) + { + struct charseq *seq; + /* We must store the digit values. */ + if (ctype->outdigits_act >= 10) + { + lr_error (ldfile, _("\ +%s: field `%s' does not contain exactly ten entries"), + "LC_CTYPE", "outdigit"); + return; + } + + seq = xmalloc (sizeof (struct charseq) + last_charcode_len); + memcpy ((char *) (seq + 1), last_charcode, last_charcode_len); + seq->nbytes = last_charcode_len; + + ctype->mboutdigits[ctype->outdigits_act] = seq; + ctype->wcoutdigits[ctype->outdigits_act] = wch; + ++ctype->outdigits_act; + } + } + while (memcmp (last_charcode, now->val.charcode.bytes, + last_charcode_len) != 0); + } +} + + +static uint32_t * +find_translit2 (struct locale_ctype_t *ctype, const struct charmap_t *charmap, + uint32_t wch) +{ + struct translit_t *trunp = ctype->translit; + struct translit_ignore_t *tirunp = ctype->translit_ignore; + + while (trunp != NULL) + { + /* XXX We simplify things here. The transliterations we look + for are only allowed to have one character. */ + if (trunp->from[0] == wch && trunp->from[1] == 0) + { + /* Found it. Now look for a transliteration which can be + represented with the character set. */ + struct translit_to_t *torunp = trunp->to; + + while (torunp != NULL) + { + int i; + + for (i = 0; torunp->str[i] != 0; ++i) + { + char utmp[10]; + + snprintf (utmp, sizeof (utmp), "U%08X", torunp->str[i]); + if (charmap_find_value (charmap, utmp, 9) == NULL) + /* This character cannot be represented. */ + break; + } + + if (torunp->str[i] == 0) + return torunp->str; + + torunp = torunp->next; + } + + break; + } + + trunp = trunp->next; + } + + /* Check for ignored chars. */ + while (tirunp != NULL) + { + if (tirunp->from <= wch && tirunp->to >= wch) + { + uint32_t wi; + + for (wi = tirunp->from; wi <= wch; wi += tirunp->step) + if (wi == wch) + return no_str; + } + } + + /* Nothing found. */ + return NULL; +} + + +uint32_t * +find_translit (struct localedef_t *locale, const struct charmap_t *charmap, + uint32_t wch) +{ + struct locale_ctype_t *ctype; + uint32_t *result = NULL; + + assert (locale != NULL); + ctype = locale->categories[LC_CTYPE].ctype; + + if (ctype == NULL) + return NULL; + + if (ctype->translit != NULL) + result = find_translit2 (ctype, charmap, wch); + + if (result == NULL) + { + struct translit_include_t *irunp = ctype->translit_include; + + while (irunp != NULL && result == NULL) + { + result = find_translit (find_locale (CTYPE_LOCALE, + irunp->copy_locale, + irunp->copy_repertoire, + charmap), + charmap, wch); + irunp = irunp->next; + } + } + + return result; +} + + +/* Read one transliteration entry. */ +static uint32_t * +read_widestring (struct linereader *ldfile, struct token *now, + const struct charmap_t *charmap, + struct repertoire_t *repertoire) +{ + uint32_t *wstr; + + if (now->tok == tok_default_missing) + /* The special name "" will denote this case. */ + wstr = no_str; + else if (now->tok == tok_bsymbol) + { + /* Get the value from the repertoire. */ + wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t)); + wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb, + now->val.str.lenmb); + if (wstr[0] == ILLEGAL_CHAR_VALUE) + { + /* We cannot proceed, we don't know the UCS4 value. */ + free (wstr); + return NULL; + } + + wstr[1] = 0; + } + else if (now->tok == tok_ucs4) + { + wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t)); + wstr[0] = now->val.ucs4; + wstr[1] = 0; + } + else if (now->tok == tok_charcode) + { + /* Argh, we have to convert to the symbol name first and then to the + UCS4 value. */ + struct charseq *seq = charmap_find_symbol (charmap, + now->val.str.startmb, + now->val.str.lenmb); + if (seq == NULL) + /* Cannot find the UCS4 value. */ + return NULL; + + if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE) + seq->ucs4 = repertoire_find_value (repertoire, seq->name, + strlen (seq->name)); + if (seq->ucs4 == ILLEGAL_CHAR_VALUE) + /* We cannot proceed, we don't know the UCS4 value. */ + return NULL; + + wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t)); + wstr[0] = seq->ucs4; + wstr[1] = 0; + } + else if (now->tok == tok_string) + { + wstr = now->val.str.startwc; + if (wstr == NULL || wstr[0] == 0) + return NULL; + } + else + { + if (now->tok != tok_eol && now->tok != tok_eof) + lr_ignore_rest (ldfile, 0); + SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE"); + return (uint32_t *) -1l; + } + + return wstr; +} + + +static void +read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype, + struct token *now, const struct charmap_t *charmap, + struct repertoire_t *repertoire) +{ + uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire); + struct translit_t *result; + struct translit_to_t **top; + struct obstack *ob = &ctype->mempool; + int first; + int ignore; + + if (from_wstr == NULL) + /* There is no valid from string. */ + return; + + result = (struct translit_t *) obstack_alloc (ob, + sizeof (struct translit_t)); + result->from = from_wstr; + result->fname = ldfile->fname; + result->lineno = ldfile->lineno; + result->next = NULL; + result->to = NULL; + top = &result->to; + first = 1; + ignore = 0; + + while (1) + { + uint32_t *to_wstr; + + /* Next we have one or more transliterations. They are + separated by semicolons. */ + now = lr_token (ldfile, charmap, NULL, repertoire, verbose); + + if (!first && (now->tok == tok_semicolon || now->tok == tok_eol)) + { + /* One string read. */ + const uint32_t zero = 0; + + if (!ignore) + { + obstack_grow (ob, &zero, 4); + to_wstr = obstack_finish (ob); + + *top = obstack_alloc (ob, sizeof (struct translit_to_t)); + (*top)->str = to_wstr; + (*top)->next = NULL; + } + + if (now->tok == tok_eol) + { + result->next = ctype->translit; + ctype->translit = result; + return; + } + + if (!ignore) + top = &(*top)->next; + ignore = 0; + } + else + { + to_wstr = read_widestring (ldfile, now, charmap, repertoire); + if (to_wstr == (uint32_t *) -1l) + { + /* An error occurred. */ + obstack_free (ob, result); + return; + } + + if (to_wstr == NULL) + ignore = 1; + else + /* This value is usable. */ + obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4); + + first = 0; + } + } +} + + +static void +read_translit_ignore_entry (struct linereader *ldfile, + struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire) +{ + /* We expect a semicolon-separated list of characters we ignore. We are + only interested in the wide character definitions. These must be + single characters, possibly defining a range when an ellipsis is used. */ + while (1) + { + struct token *now = lr_token (ldfile, charmap, NULL, repertoire, + verbose); + struct translit_ignore_t *newp; + uint32_t from; + + if (now->tok == tok_eol || now->tok == tok_eof) + { + lr_error (ldfile, + _("premature end of `translit_ignore' definition")); + return; + } + + if (now->tok != tok_bsymbol && now->tok != tok_ucs4) + { + lr_error (ldfile, _("syntax error")); + lr_ignore_rest (ldfile, 0); + return; + } + + if (now->tok == tok_ucs4) + from = now->val.ucs4; + else + /* Try to get the value. */ + from = repertoire_find_value (repertoire, now->val.str.startmb, + now->val.str.lenmb); + + if (from == ILLEGAL_CHAR_VALUE) + { + lr_error (ldfile, "invalid character name"); + newp = NULL; + } + else + { + newp = (struct translit_ignore_t *) + obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t)); + newp->from = from; + newp->to = from; + newp->step = 1; + + newp->next = ctype->translit_ignore; + ctype->translit_ignore = newp; + } + + /* Now we expect either a semicolon, an ellipsis, or the end of the + line. */ + now = lr_token (ldfile, charmap, NULL, repertoire, verbose); + + if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2) + { + /* XXX Should we bother implementing `....'? `...' certainly + will not be implemented. */ + uint32_t to; + int step = now->tok == tok_ellipsis2_2 ? 2 : 1; + + now = lr_token (ldfile, charmap, NULL, repertoire, verbose); + + if (now->tok == tok_eol || now->tok == tok_eof) + { + lr_error (ldfile, + _("premature end of `translit_ignore' definition")); + return; + } + + if (now->tok != tok_bsymbol && now->tok != tok_ucs4) + { + lr_error (ldfile, _("syntax error")); + lr_ignore_rest (ldfile, 0); + return; + } + + if (now->tok == tok_ucs4) + to = now->val.ucs4; + else + /* Try to get the value. */ + to = repertoire_find_value (repertoire, now->val.str.startmb, + now->val.str.lenmb); + + if (to == ILLEGAL_CHAR_VALUE) + lr_error (ldfile, "invalid character name"); + else + { + /* Make sure the `to'-value is larger. */ + if (to >= from) + { + newp->to = to; + newp->step = step; + } + else + lr_error (ldfile, _("\ +to-value <U%0*X> of range is smaller than from-value <U%0*X>"), + (to | from) < 65536 ? 4 : 8, to, + (to | from) < 65536 ? 4 : 8, from); + } + + /* And the next token. */ + now = lr_token (ldfile, charmap, NULL, repertoire, verbose); + } + + if (now->tok == tok_eol || now->tok == tok_eof) + /* We are done. */ + return; + + if (now->tok == tok_semicolon) + /* Next round. */ + continue; + + /* If we come here something is wrong. */ + lr_error (ldfile, _("syntax error")); + lr_ignore_rest (ldfile, 0); + return; + } +} + + +/* The parser for the LC_CTYPE section of the locale definition. */ +void +ctype_read (struct linereader *ldfile, struct localedef_t *result, + const struct charmap_t *charmap, const char *repertoire_name, + int ignore_content) +{ + struct repertoire_t *repertoire = NULL; + struct locale_ctype_t *ctype; + struct token *now; + enum token_t nowtok; + size_t cnt; + uint32_t last_wch = 0; + enum token_t last_token; + enum token_t ellipsis_token; + int step; + char last_charcode[16]; + size_t last_charcode_len = 0; + const char *last_str = NULL; + int mapidx; + struct localedef_t *copy_locale = NULL; + + /* Get the repertoire we have to use. */ + if (repertoire_name != NULL) + repertoire = repertoire_read (repertoire_name); + + /* The rest of the line containing `LC_CTYPE' must be free. */ + lr_ignore_rest (ldfile, 1); + + + do + { + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + nowtok = now->tok; + } + while (nowtok == tok_eol); + + /* If we see `copy' now we are almost done. */ + if (nowtok == tok_copy) + { + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_string) + { + SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE"); + + skip_category: + do + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + while (now->tok != tok_eof && now->tok != tok_end); + + if (now->tok != tok_eof + || (now = lr_token (ldfile, charmap, NULL, NULL, verbose), + now->tok == tok_eof)) + lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE"); + else if (now->tok != tok_lc_ctype) + { + lr_error (ldfile, _("\ +%1$s: definition does not end with `END %1$s'"), "LC_CTYPE"); + lr_ignore_rest (ldfile, 0); + } + else + lr_ignore_rest (ldfile, 1); + + return; + } + + if (! ignore_content) + { + /* Get the locale definition. */ + copy_locale = load_locale (LC_CTYPE, now->val.str.startmb, + repertoire_name, charmap, NULL); + if ((copy_locale->avail & CTYPE_LOCALE) == 0) + { + /* Not yet loaded. So do it now. */ + if (locfile_read (copy_locale, charmap) != 0) + goto skip_category; + } + + if (copy_locale->categories[LC_CTYPE].ctype == NULL) + return; + } + + lr_ignore_rest (ldfile, 1); + + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + nowtok = now->tok; + } + + /* Prepare the data structures. */ + ctype_startup (ldfile, result, charmap, copy_locale, ignore_content); + ctype = result->categories[LC_CTYPE].ctype; + + /* Remember the repertoire we use. */ + if (!ignore_content) + ctype->repertoire = repertoire; + + while (1) + { + unsigned long int class_bit = 0; + unsigned long int class256_bit = 0; + int handle_digits = 0; + + /* Of course we don't proceed beyond the end of file. */ + if (nowtok == tok_eof) + break; + + /* Ingore empty lines. */ + if (nowtok == tok_eol) + { + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + nowtok = now->tok; + continue; + } + + switch (nowtok) + { + case tok_charclass: + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + while (now->tok == tok_ident || now->tok == tok_string) + { + ctype_class_new (ldfile, ctype, now->val.str.startmb); + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_semicolon) + break; + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + } + if (now->tok != tok_eol) + SYNTAX_ERROR (_("\ +%s: syntax error in definition of new character class"), "LC_CTYPE"); + break; + + case tok_charconv: + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + while (now->tok == tok_ident || now->tok == tok_string) + { + ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap); + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_semicolon) + break; + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + } + if (now->tok != tok_eol) + SYNTAX_ERROR (_("\ +%s: syntax error in definition of new character map"), "LC_CTYPE"); + break; + + case tok_class: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + /* We simply forget the `class' keyword and use the following + operand to determine the bit. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_ident || now->tok == tok_string) + { + /* Must can be one of the predefined class names. */ + for (cnt = 0; cnt < ctype->nr_charclass; ++cnt) + if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0) + break; + if (cnt >= ctype->nr_charclass) + { + /* OK, it's a new class. */ + ctype_class_new (ldfile, ctype, now->val.str.startmb); + + class_bit = _ISwbit (ctype->nr_charclass - 1); + } + else + { + class_bit = _ISwbit (cnt); + + free (now->val.str.startmb); + } + } + else if (now->tok == tok_digit) + goto handle_tok_digit; + else if (now->tok < tok_upper || now->tok > tok_blank) + goto err_label; + else + { + class_bit = BITw (now->tok); + class256_bit = BIT (now->tok); + } + + /* The next character must be a semicolon. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_semicolon) + goto err_label; + goto read_charclass; + + case tok_upper: + case tok_lower: + case tok_alpha: + case tok_alnum: + case tok_space: + case tok_cntrl: + case tok_punct: + case tok_graph: + case tok_print: + case tok_xdigit: + case tok_blank: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + class_bit = BITw (now->tok); + class256_bit = BIT (now->tok); + handle_digits = 0; + read_charclass: + ctype->class_done |= class_bit; + last_token = tok_none; + ellipsis_token = tok_none; + step = 1; + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + while (now->tok != tok_eol && now->tok != tok_eof) + { + uint32_t wch; + struct charseq *seq; + + if (ellipsis_token == tok_none) + { + if (get_character (now, charmap, repertoire, &seq, &wch)) + goto err_label; + + if (!ignore_content && seq != NULL && seq->nbytes == 1) + /* Yep, we can store information about this byte + sequence. */ + ctype->class256_collection[seq->bytes[0]] |= class256_bit; + + if (!ignore_content && wch != ILLEGAL_CHAR_VALUE + && class_bit != 0) + /* We have the UCS4 position. */ + *find_idx (ctype, &ctype->class_collection, + &ctype->class_collection_max, + &ctype->class_collection_act, wch) |= class_bit; + + last_token = now->tok; + /* Terminate the string. */ + if (last_token == tok_bsymbol) + { + now->val.str.startmb[now->val.str.lenmb] = '\0'; + last_str = now->val.str.startmb; + } + else + last_str = NULL; + last_wch = wch; + memcpy (last_charcode, now->val.charcode.bytes, 16); + last_charcode_len = now->val.charcode.nbytes; + + if (!ignore_content && handle_digits == 1) + { + /* We must store the digit values. */ + if (ctype->mbdigits_act == ctype->mbdigits_max) + { + ctype->mbdigits_max += 10; + ctype->mbdigits = xrealloc (ctype->mbdigits, + (ctype->mbdigits_max + * sizeof (char *))); + ctype->wcdigits_max += 10; + ctype->wcdigits = xrealloc (ctype->wcdigits, + (ctype->wcdigits_max + * sizeof (uint32_t))); + } + + ctype->mbdigits[ctype->mbdigits_act++] = seq; + ctype->wcdigits[ctype->wcdigits_act++] = wch; + } + else if (!ignore_content && handle_digits == 2) + { + /* We must store the digit values. */ + if (ctype->outdigits_act >= 10) + { + lr_error (ldfile, _("\ +%s: field `%s' does not contain exactly ten entries"), + "LC_CTYPE", "outdigit"); + lr_ignore_rest (ldfile, 0); + break; + } + + ctype->mboutdigits[ctype->outdigits_act] = seq; + ctype->wcoutdigits[ctype->outdigits_act] = wch; + ++ctype->outdigits_act; + } + } + else + { + /* Now it gets complicated. We have to resolve the + ellipsis problem. First we must distinguish between + the different kind of ellipsis and this must match the + tokens we have seen. */ + assert (last_token != tok_none); + + if (last_token != now->tok) + { + lr_error (ldfile, _("\ +ellipsis range must be marked by two operands of same type")); + lr_ignore_rest (ldfile, 0); + break; + } + + if (last_token == tok_bsymbol) + { + if (ellipsis_token == tok_ellipsis3) + lr_error (ldfile, _("with symbolic name range values \ +the absolute ellipsis `...' must not be used")); + + charclass_symbolic_ellipsis (ldfile, ctype, charmap, + repertoire, now, last_str, + class256_bit, class_bit, + (ellipsis_token + == tok_ellipsis4 + ? 10 : 16), + ignore_content, + handle_digits, step); + } + else if (last_token == tok_ucs4) + { + if (ellipsis_token != tok_ellipsis2) + lr_error (ldfile, _("\ +with UCS range values one must use the hexadecimal symbolic ellipsis `..'")); + + charclass_ucs4_ellipsis (ldfile, ctype, charmap, + repertoire, now, last_wch, + class256_bit, class_bit, + ignore_content, handle_digits, + step); + } + else + { + assert (last_token == tok_charcode); + + if (ellipsis_token != tok_ellipsis3) + lr_error (ldfile, _("\ +with character code range values one must use the absolute ellipsis `...'")); + + charclass_charcode_ellipsis (ldfile, ctype, charmap, + repertoire, now, + last_charcode, + last_charcode_len, + class256_bit, class_bit, + ignore_content, + handle_digits); + } + + /* Now we have used the last value. */ + last_token = tok_none; + } + + /* Next we expect a semicolon or the end of the line. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_eol || now->tok == tok_eof) + break; + + if (last_token != tok_none + && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2) + { + if (now->tok == tok_ellipsis2_2) + { + now->tok = tok_ellipsis2; + step = 2; + } + else if (now->tok == tok_ellipsis4_2) + { + now->tok = tok_ellipsis4; + step = 2; + } + + ellipsis_token = now->tok; + + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + continue; + } + + if (now->tok != tok_semicolon) + goto err_label; + + /* And get the next character. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + + ellipsis_token = tok_none; + step = 1; + } + break; + + case tok_digit: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + handle_tok_digit: + class_bit = _ISwdigit; + class256_bit = _ISdigit; + handle_digits = 1; + goto read_charclass; + + case tok_outdigit: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + if (ctype->outdigits_act != 0) + lr_error (ldfile, _("\ +%s: field `%s' declared more than once"), + "LC_CTYPE", "outdigit"); + class_bit = 0; + class256_bit = 0; + handle_digits = 2; + goto read_charclass; + + case tok_toupper: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + mapidx = 0; + goto read_mapping; + + case tok_tolower: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + mapidx = 1; + goto read_mapping; + + case tok_map: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + /* We simply forget the `map' keyword and use the following + operand to determine the mapping. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_ident || now->tok == tok_string) + { + size_t cnt; + + for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt) + if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0) + break; + + if (cnt < ctype->map_collection_nr) + free (now->val.str.startmb); + else + /* OK, it's a new map. */ + ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap); + + mapidx = cnt; + } + else if (now->tok < tok_toupper || now->tok > tok_tolower) + goto err_label; + else + mapidx = now->tok - tok_toupper; + + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + /* This better should be a semicolon. */ + if (now->tok != tok_semicolon) + goto err_label; + + read_mapping: + /* Test whether this mapping was already defined. */ + if (ctype->tomap_done[mapidx]) + { + lr_error (ldfile, _("duplicated definition for mapping `%s'"), + ctype->mapnames[mapidx]); + lr_ignore_rest (ldfile, 0); + break; + } + ctype->tomap_done[mapidx] = 1; + + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + while (now->tok != tok_eol && now->tok != tok_eof) + { + struct charseq *from_seq; + uint32_t from_wch; + struct charseq *to_seq; + uint32_t to_wch; + + /* Every pair starts with an opening brace. */ + if (now->tok != tok_open_brace) + goto err_label; + + /* Next comes the from-value. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (get_character (now, charmap, repertoire, &from_seq, + &from_wch) != 0) + goto err_label; + + /* The next is a comma. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_comma) + goto err_label; + + /* And the other value. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (get_character (now, charmap, repertoire, &to_seq, + &to_wch) != 0) + goto err_label; + + /* And the last thing is the closing brace. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_close_brace) + goto err_label; + + if (!ignore_content) + { + /* Check whether the mapping converts from an ASCII value + to a non-ASCII value. */ + if (from_seq != NULL && from_seq->nbytes == 1 + && isascii (from_seq->bytes[0]) + && to_seq != NULL && (to_seq->nbytes != 1 + || !isascii (to_seq->bytes[0]))) + ctype->to_nonascii = 1; + + if (mapidx < 2 && from_seq != NULL && to_seq != NULL + && from_seq->nbytes == 1 && to_seq->nbytes == 1) + /* We can use this value. */ + ctype->map256_collection[mapidx][from_seq->bytes[0]] + = to_seq->bytes[0]; + + if (from_wch != ILLEGAL_CHAR_VALUE + && to_wch != ILLEGAL_CHAR_VALUE) + /* Both correct values. */ + *find_idx (ctype, &ctype->map_collection[mapidx], + &ctype->map_collection_max[mapidx], + &ctype->map_collection_act[mapidx], + from_wch) = to_wch; + } + + /* Now comes a semicolon or the end of the line/file. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_semicolon) + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + } + break; + + case tok_translit_start: + /* Ignore the entire translit section with its peculiar syntax + if we don't need the input. */ + if (ignore_content) + { + do + { + lr_ignore_rest (ldfile, 0); + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + } + while (now->tok != tok_translit_end && now->tok != tok_eof); + + if (now->tok == tok_eof) + lr_error (ldfile, _(\ +"%s: `translit_start' section does not end with `translit_end'"), + "LC_CTYPE"); + + break; + } + + /* The rest of the line better should be empty. */ + lr_ignore_rest (ldfile, 1); + + /* We count here the number of allocated entries in the `translit' + array. */ + cnt = 0; + + ldfile->translate_strings = 1; + ldfile->return_widestr = 1; + + /* We proceed until we see the `translit_end' token. */ + while (now = lr_token (ldfile, charmap, NULL, repertoire, verbose), + now->tok != tok_translit_end && now->tok != tok_eof) + { + if (now->tok == tok_eol) + /* Ignore empty lines. */ + continue; + + if (now->tok == tok_include) + { + /* We have to include locale. */ + const char *locale_name; + const char *repertoire_name; + struct translit_include_t *include_stmt, **include_ptr; + + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + /* This should be a string or an identifier. In any + case something to name a locale. */ + if (now->tok != tok_string && now->tok != tok_ident) + { + translit_syntax: + lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE"); + lr_ignore_rest (ldfile, 0); + continue; + } + locale_name = now->val.str.startmb; + + /* Next should be a semicolon. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok != tok_semicolon) + goto translit_syntax; + + /* Now the repertoire name. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if ((now->tok != tok_string && now->tok != tok_ident) + || now->val.str.startmb == NULL) + goto translit_syntax; + repertoire_name = now->val.str.startmb; + if (repertoire_name[0] == '\0') + /* Ignore the empty string. */ + repertoire_name = NULL; + + /* Save the include statement for later processing. */ + include_stmt = (struct translit_include_t *) + xmalloc (sizeof (struct translit_include_t)); + include_stmt->copy_locale = locale_name; + include_stmt->copy_repertoire = repertoire_name; + include_stmt->next = NULL; + + include_ptr = &ctype->translit_include; + while (*include_ptr != NULL) + include_ptr = &(*include_ptr)->next; + *include_ptr = include_stmt; + + /* The rest of the line must be empty. */ + lr_ignore_rest (ldfile, 1); + + /* Make sure the locale is read. */ + add_to_readlist (LC_CTYPE, locale_name, repertoire_name, + 1, NULL); + continue; + } + else if (now->tok == tok_default_missing) + { + uint32_t *wstr; + + while (1) + { + /* We expect a single character or string as the + argument. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + wstr = read_widestring (ldfile, now, charmap, + repertoire); + + if (wstr != NULL) + { + if (ctype->default_missing != NULL) + { + lr_error (ldfile, _("\ +%s: duplicate `default_missing' definition"), "LC_CTYPE"); + WITH_CUR_LOCALE (error_at_line (0, 0, + ctype->default_missing_file, + ctype->default_missing_lineno, + _("\ +previous definition was here"))); + } + else + { + ctype->default_missing = wstr; + ctype->default_missing_file = ldfile->fname; + ctype->default_missing_lineno = ldfile->lineno; + } + /* We can have more entries, ignore them. */ + lr_ignore_rest (ldfile, 0); + break; + } + else if (wstr == (uint32_t *) -1l) + /* This was an syntax error. */ + break; + + /* Maybe there is another replacement we can use. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_eol || now->tok == tok_eof) + { + /* Nothing found. We tell the user. */ + lr_error (ldfile, _("\ +%s: no representable `default_missing' definition found"), "LC_CTYPE"); + break; + } + if (now->tok != tok_semicolon) + goto translit_syntax; + } + + continue; + } + else if (now->tok == tok_translit_ignore) + { + read_translit_ignore_entry (ldfile, ctype, charmap, + repertoire); + continue; + } + + read_translit_entry (ldfile, ctype, now, charmap, repertoire); + } + ldfile->return_widestr = 0; + + if (now->tok == tok_eof) + lr_error (ldfile, _(\ +"%s: `translit_start' section does not end with `translit_end'"), + "LC_CTYPE"); + + break; + + case tok_ident: + /* Ignore the rest of the line if we don't need the input of + this line. */ + if (ignore_content) + { + lr_ignore_rest (ldfile, 0); + break; + } + + /* This could mean one of several things. First test whether + it's a character class name. */ + for (cnt = 0; cnt < ctype->nr_charclass; ++cnt) + if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0) + break; + if (cnt < ctype->nr_charclass) + { + class_bit = _ISwbit (cnt); + class256_bit = cnt <= 11 ? _ISbit (cnt) : 0; + free (now->val.str.startmb); + goto read_charclass; + } + for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt) + if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0) + break; + if (cnt < ctype->map_collection_nr) + { + mapidx = cnt; + free (now->val.str.startmb); + goto read_mapping; + } + break; + + case tok_end: + /* Next we assume `LC_CTYPE'. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + if (now->tok == tok_eof) + break; + if (now->tok == tok_eol) + lr_error (ldfile, _("%s: incomplete `END' line"), + "LC_CTYPE"); + else if (now->tok != tok_lc_ctype) + lr_error (ldfile, _("\ +%1$s: definition does not end with `END %1$s'"), "LC_CTYPE"); + lr_ignore_rest (ldfile, now->tok == tok_lc_ctype); + return; + + default: + err_label: + if (now->tok != tok_eof) + SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE"); + } + + /* Prepare for the next round. */ + now = lr_token (ldfile, charmap, NULL, NULL, verbose); + nowtok = now->tok; + } + + /* When we come here we reached the end of the file. */ + lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE"); +} + + +/* Subroutine of set_class_defaults, below. */ +static void +set_one_default (struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + int bitpos, int from, int to) +{ + char tmp[2]; + int ch; + int bit = _ISbit (bitpos); + int bitw = _ISwbit (bitpos); + /* Define string. */ + strcpy (tmp, "?"); + + for (ch = from; ch <= to; ++ch) + { + struct charseq *seq; + tmp[0] = ch; + + seq = charmap_find_value (charmap, tmp, 1); + if (seq == NULL) + { + char buf[10]; + sprintf (buf, "U%08X", ch); + seq = charmap_find_value (charmap, buf, 9); + } + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", tmp)); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", tmp)); + else + ctype->class256_collection[seq->bytes[0]] |= bit; + + /* No need to search here, the ASCII value is also the Unicode + value. */ + ELEM (ctype, class_collection, , ch) |= bitw; + } +} + +static void +set_class_defaults (struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct repertoire_t *repertoire) +{ +#define set_default(bitpos, from, to) \ + set_one_default (ctype, charmap, bitpos, from, to) + + /* These function defines the default values for the classes and conversions + according to POSIX.2 2.5.2.1. + It may seem that the order of these if-blocks is arbitrary but it is NOT. + Don't move them unless you know what you do! */ + + /* Set default values if keyword was not present. */ + if ((ctype->class_done & BITw (tok_upper)) == 0) + /* "If this keyword [lower] is not specified, the lowercase letters + `A' through `Z', ..., shall automatically belong to this class, + with implementation defined character values." [P1003.2, 2.5.2.1] */ + set_default (BITPOS (tok_upper), 'A', 'Z'); + + if ((ctype->class_done & BITw (tok_lower)) == 0) + /* "If this keyword [lower] is not specified, the lowercase letters + `a' through `z', ..., shall automatically belong to this class, + with implementation defined character values." [P1003.2, 2.5.2.1] */ + set_default (BITPOS (tok_lower), 'a', 'z'); + + if ((ctype->class_done & BITw (tok_alpha)) == 0) + { + /* Table 2-6 in P1003.2 says that characters in class `upper' or + class `lower' *must* be in class `alpha'. */ + unsigned long int mask = BIT (tok_upper) | BIT (tok_lower); + unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower); + + for (size_t cnt = 0; cnt < 256; ++cnt) + if ((ctype->class256_collection[cnt] & mask) != 0) + ctype->class256_collection[cnt] |= BIT (tok_alpha); + + for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt) + if ((ctype->class_collection[cnt] & maskw) != 0) + ctype->class_collection[cnt] |= BITw (tok_alpha); + } + + if ((ctype->class_done & BITw (tok_digit)) == 0) + /* "If this keyword [digit] is not specified, the digits `0' through + `9', ..., shall automatically belong to this class, with + implementation-defined character values." [P1003.2, 2.5.2.1] */ + set_default (BITPOS (tok_digit), '0', '9'); + + /* "Only characters specified for the `alpha' and `digit' keyword + shall be specified. Characters specified for the keyword `alpha' + and `digit' are automatically included in this class. */ + { + unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit); + unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit); + + for (size_t cnt = 0; cnt < 256; ++cnt) + if ((ctype->class256_collection[cnt] & mask) != 0) + ctype->class256_collection[cnt] |= BIT (tok_alnum); + + for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt) + if ((ctype->class_collection[cnt] & maskw) != 0) + ctype->class_collection[cnt] |= BITw (tok_alnum); + } + + if ((ctype->class_done & BITw (tok_space)) == 0) + /* "If this keyword [space] is not specified, the characters <space>, + <form-feed>, <newline>, <carriage-return>, <tab>, and + <vertical-tab>, ..., shall automatically belong to this class, + with implementation-defined character values." [P1003.2, 2.5.2.1] */ + { + struct charseq *seq; + + seq = charmap_find_value (charmap, "space", 5); + if (seq == NULL) + seq = charmap_find_value (charmap, "SP", 2); + if (seq == NULL) + seq = charmap_find_value (charmap, "U00000020", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<space>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<space>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L' ') |= BITw (tok_space); + + seq = charmap_find_value (charmap, "form-feed", 9); + if (seq == NULL) + seq = charmap_find_value (charmap, "U0000000C", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<form-feed>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<form-feed>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space); + + + seq = charmap_find_value (charmap, "newline", 7); + if (seq == NULL) + seq = charmap_find_value (charmap, "U0000000A", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<newline>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<newline>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space); + + + seq = charmap_find_value (charmap, "carriage-return", 15); + if (seq == NULL) + seq = charmap_find_value (charmap, "U0000000D", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<carriage-return>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<carriage-return>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space); + + + seq = charmap_find_value (charmap, "tab", 3); + if (seq == NULL) + seq = charmap_find_value (charmap, "U00000009", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<tab>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<tab>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space); + + + seq = charmap_find_value (charmap, "vertical-tab", 12); + if (seq == NULL) + seq = charmap_find_value (charmap, "U0000000B", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<vertical-tab>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<vertical-tab>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space); + } + + if ((ctype->class_done & BITw (tok_xdigit)) == 0) + /* "If this keyword is not specified, the digits `0' to `9', the + uppercase letters `A' through `F', and the lowercase letters `a' + through `f', ..., shell automatically belong to this class, with + implementation defined character values." [P1003.2, 2.5.2.1] */ + { + set_default (BITPOS (tok_xdigit), '0', '9'); + set_default (BITPOS (tok_xdigit), 'A', 'F'); + set_default (BITPOS (tok_xdigit), 'a', 'f'); + } + + if ((ctype->class_done & BITw (tok_blank)) == 0) + /* "If this keyword [blank] is unspecified, the characters <space> and + <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */ + { + struct charseq *seq; + + seq = charmap_find_value (charmap, "space", 5); + if (seq == NULL) + seq = charmap_find_value (charmap, "SP", 2); + if (seq == NULL) + seq = charmap_find_value (charmap, "U00000020", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<space>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<space>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank); + + /* No need to search. */ + ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank); + + + seq = charmap_find_value (charmap, "tab", 3); + if (seq == NULL) + seq = charmap_find_value (charmap, "U00000009", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<tab>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<tab>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank); + + /* No need to search. */ + ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank); + } + + if ((ctype->class_done & BITw (tok_graph)) == 0) + /* "If this keyword [graph] is not specified, characters specified for + the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct', + shall belong to this character class." [P1003.2, 2.5.2.1] */ + { + unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) | + BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct); + unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) | + BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) | + BITw (tok_punct); + + for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt) + if ((ctype->class_collection[cnt] & maskw) != 0) + ctype->class_collection[cnt] |= BITw (tok_graph); + + for (size_t cnt = 0; cnt < 256; ++cnt) + if ((ctype->class256_collection[cnt] & mask) != 0) + ctype->class256_collection[cnt] |= BIT (tok_graph); + } + + if ((ctype->class_done & BITw (tok_print)) == 0) + /* "If this keyword [print] is not provided, characters specified for + the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct', + and the <space> character shall belong to this character class." + [P1003.2, 2.5.2.1] */ + { + unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) | + BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct); + unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) | + BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) | + BITw (tok_punct); + struct charseq *seq; + + for (size_t cnt = 0; cnt < ctype->class_collection_act; ++cnt) + if ((ctype->class_collection[cnt] & maskw) != 0) + ctype->class_collection[cnt] |= BITw (tok_print); + + for (size_t cnt = 0; cnt < 256; ++cnt) + if ((ctype->class256_collection[cnt] & mask) != 0) + ctype->class256_collection[cnt] |= BIT (tok_print); + + + seq = charmap_find_value (charmap, "space", 5); + if (seq == NULL) + seq = charmap_find_value (charmap, "SP", 2); + if (seq == NULL) + seq = charmap_find_value (charmap, "U00000020", 9); + if (seq == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", "<space>")); + } + else if (seq->nbytes != 1) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' in charmap not representable with one byte"), + "LC_CTYPE", "<space>")); + else + ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print); + + /* No need to search. */ + ELEM (ctype, class_collection, , L' ') |= BITw (tok_print); + } + + if (ctype->tomap_done[0] == 0) + /* "If this keyword [toupper] is not specified, the lowercase letters + `a' through `z', and their corresponding uppercase letters `A' to + `Z', ..., shall automatically be included, with implementation- + defined character values." [P1003.2, 2.5.2.1] */ + { + char tmp[4]; + int ch; + + strcpy (tmp, "<?>"); + + for (ch = 'a'; ch <= 'z'; ++ch) + { + struct charseq *seq_from, *seq_to; + + tmp[1] = (char) ch; + + seq_from = charmap_find_value (charmap, &tmp[1], 1); + if (seq_from == NULL) + { + char buf[10]; + sprintf (buf, "U%08X", ch); + seq_from = charmap_find_value (charmap, buf, 9); + } + if (seq_from == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", tmp)); + } + else if (seq_from->nbytes != 1) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' needed as default value not representable with one byte"), + "LC_CTYPE", tmp)); + } + else + { + /* This conversion is implementation defined. */ + tmp[1] = (char) (ch + ('A' - 'a')); + seq_to = charmap_find_value (charmap, &tmp[1], 1); + if (seq_to == NULL) + { + char buf[10]; + sprintf (buf, "U%08X", ch + ('A' - 'a')); + seq_to = charmap_find_value (charmap, buf, 9); + } + if (seq_to == NULL) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' not defined while needed as default value"), + "LC_CTYPE", tmp)); + } + else if (seq_to->nbytes != 1) + { + if (!be_quiet) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: character `%s' needed as default value not representable with one byte"), + "LC_CTYPE", tmp)); + } + else + /* The index [0] is determined by the order of the + `ctype_map_newP' calls in `ctype_startup'. */ + ctype->map256_collection[0][seq_from->bytes[0]] + = seq_to->bytes[0]; + } + + /* No need to search. */ + ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a'); + } + } + + if (ctype->tomap_done[1] == 0) + /* "If this keyword [tolower] is not specified, the mapping shall be + the reverse mapping of the one specified to `toupper'." [P1003.2] */ + { + for (size_t cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt) + if (ctype->map_collection[0][cnt] != 0) + ELEM (ctype, map_collection, [1], + ctype->map_collection[0][cnt]) + = ctype->charnames[cnt]; + + for (size_t cnt = 0; cnt < 256; ++cnt) + if (ctype->map256_collection[0][cnt] != 0) + ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt; + } + + if (ctype->outdigits_act != 10) + { + if (ctype->outdigits_act != 0) + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: field `%s' does not contain exactly ten entries"), + "LC_CTYPE", "outdigit")); + + for (size_t cnt = ctype->outdigits_act; cnt < 10; ++cnt) + { + ctype->mboutdigits[cnt] = charmap_find_symbol (charmap, + (char *) digits + cnt, + 1); + + if (ctype->mboutdigits[cnt] == NULL) + ctype->mboutdigits[cnt] = charmap_find_symbol (charmap, + longnames[cnt], + strlen (longnames[cnt])); + + if (ctype->mboutdigits[cnt] == NULL) + ctype->mboutdigits[cnt] = charmap_find_symbol (charmap, + uninames[cnt], 9); + + if (ctype->mboutdigits[cnt] == NULL) + { + /* Provide a replacement. */ + WITH_CUR_LOCALE (error (0, 0, _("\ +no output digits defined and none of the standard names in the charmap"))); + + ctype->mboutdigits[cnt] = obstack_alloc (&((struct charmap_t *) charmap)->mem_pool, + sizeof (struct charseq) + + 1); + + /* This is better than nothing. */ + ctype->mboutdigits[cnt]->bytes[0] = digits[cnt]; + ctype->mboutdigits[cnt]->nbytes = 1; + } + + ctype->wcoutdigits[cnt] = L'0' + cnt; + } + + ctype->outdigits_act = 10; + } + +#undef set_default +} + + +/* Initialize. Assumes t->p and t->q have already been set. */ +static inline void +wctype_table_init (struct wctype_table *t) +{ + t->level1 = NULL; + t->level1_alloc = t->level1_size = 0; + t->level2 = NULL; + t->level2_alloc = t->level2_size = 0; + t->level3 = NULL; + t->level3_alloc = t->level3_size = 0; +} + +/* Retrieve an entry. */ +static inline int +wctype_table_get (struct wctype_table *t, uint32_t wc) +{ + uint32_t index1 = wc >> (t->q + t->p + 5); + if (index1 < t->level1_size) + { + uint32_t lookup1 = t->level1[index1]; + if (lookup1 != EMPTY) + { + uint32_t index2 = ((wc >> (t->p + 5)) & ((1 << t->q) - 1)) + + (lookup1 << t->q); + uint32_t lookup2 = t->level2[index2]; + if (lookup2 != EMPTY) + { + uint32_t index3 = ((wc >> 5) & ((1 << t->p) - 1)) + + (lookup2 << t->p); + uint32_t lookup3 = t->level3[index3]; + uint32_t index4 = wc & 0x1f; + + return (lookup3 >> index4) & 1; + } + } + } + return 0; +} + +/* Add one entry. */ +static void +wctype_table_add (struct wctype_table *t, uint32_t wc) +{ + uint32_t index1 = wc >> (t->q + t->p + 5); + uint32_t index2 = (wc >> (t->p + 5)) & ((1 << t->q) - 1); + uint32_t index3 = (wc >> 5) & ((1 << t->p) - 1); + uint32_t index4 = wc & 0x1f; + size_t i, i1, i2; + + if (index1 >= t->level1_size) + { + if (index1 >= t->level1_alloc) + { + size_t alloc = 2 * t->level1_alloc; + if (alloc <= index1) + alloc = index1 + 1; + t->level1 = (uint32_t *) xrealloc ((char *) t->level1, + alloc * sizeof (uint32_t)); + t->level1_alloc = alloc; + } + while (index1 >= t->level1_size) + t->level1[t->level1_size++] = EMPTY; + } + + if (t->level1[index1] == EMPTY) + { + if (t->level2_size == t->level2_alloc) + { + size_t alloc = 2 * t->level2_alloc + 1; + t->level2 = (uint32_t *) xrealloc ((char *) t->level2, + (alloc << t->q) * sizeof (uint32_t)); + t->level2_alloc = alloc; + } + i1 = t->level2_size << t->q; + i2 = (t->level2_size + 1) << t->q; + for (i = i1; i < i2; i++) + t->level2[i] = EMPTY; + t->level1[index1] = t->level2_size++; + } + + index2 += t->level1[index1] << t->q; + + if (t->level2[index2] == EMPTY) + { + if (t->level3_size == t->level3_alloc) + { + size_t alloc = 2 * t->level3_alloc + 1; + t->level3 = (uint32_t *) xrealloc ((char *) t->level3, + (alloc << t->p) * sizeof (uint32_t)); + t->level3_alloc = alloc; + } + i1 = t->level3_size << t->p; + i2 = (t->level3_size + 1) << t->p; + for (i = i1; i < i2; i++) + t->level3[i] = 0; + t->level2[index2] = t->level3_size++; + } + + index3 += t->level2[index2] << t->p; + + t->level3[index3] |= (uint32_t)1 << index4; +} + +/* Finalize and shrink. */ +static void +add_locale_wctype_table (struct locale_file *file, struct wctype_table *t) +{ + size_t i, j, k; + uint32_t reorder3[t->level3_size]; + uint32_t reorder2[t->level2_size]; + uint32_t level2_offset, level3_offset; + + /* Uniquify level3 blocks. */ + k = 0; + for (j = 0; j < t->level3_size; j++) + { + for (i = 0; i < k; i++) + if (memcmp (&t->level3[i << t->p], &t->level3[j << t->p], + (1 << t->p) * sizeof (uint32_t)) == 0) + break; + /* Relocate block j to block i. */ + reorder3[j] = i; + if (i == k) + { + if (i != j) + memcpy (&t->level3[i << t->p], &t->level3[j << t->p], + (1 << t->p) * sizeof (uint32_t)); + k++; + } + } + t->level3_size = k; + + for (i = 0; i < (t->level2_size << t->q); i++) + if (t->level2[i] != EMPTY) + t->level2[i] = reorder3[t->level2[i]]; + + /* Uniquify level2 blocks. */ + k = 0; + for (j = 0; j < t->level2_size; j++) + { + for (i = 0; i < k; i++) + if (memcmp (&t->level2[i << t->q], &t->level2[j << t->q], + (1 << t->q) * sizeof (uint32_t)) == 0) + break; + /* Relocate block j to block i. */ + reorder2[j] = i; + if (i == k) + { + if (i != j) + memcpy (&t->level2[i << t->q], &t->level2[j << t->q], + (1 << t->q) * sizeof (uint32_t)); + k++; + } + } + t->level2_size = k; + + for (i = 0; i < t->level1_size; i++) + if (t->level1[i] != EMPTY) + t->level1[i] = reorder2[t->level1[i]]; + + t->result_size = + 5 * sizeof (uint32_t) + + t->level1_size * sizeof (uint32_t) + + (t->level2_size << t->q) * sizeof (uint32_t) + + (t->level3_size << t->p) * sizeof (uint32_t); + + level2_offset = + 5 * sizeof (uint32_t) + + t->level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t->level1_size * sizeof (uint32_t) + + (t->level2_size << t->q) * sizeof (uint32_t); + + start_locale_structure (file); + add_locale_uint32 (file, t->q + t->p + 5); + add_locale_uint32 (file, t->level1_size); + add_locale_uint32 (file, t->p + 5); + add_locale_uint32 (file, (1 << t->q) - 1); + add_locale_uint32 (file, (1 << t->p) - 1); + + for (i = 0; i < t->level1_size; i++) + add_locale_uint32 + (file, + t->level1[i] == EMPTY + ? 0 + : (t->level1[i] << t->q) * sizeof (uint32_t) + level2_offset); + + for (i = 0; i < (t->level2_size << t->q); i++) + add_locale_uint32 + (file, + t->level2[i] == EMPTY + ? 0 + : (t->level2[i] << t->p) * sizeof (uint32_t) + level3_offset); + + add_locale_uint32_array (file, t->level3, t->level3_size << t->p); + end_locale_structure (file); + + if (t->level1_alloc > 0) + free (t->level1); + if (t->level2_alloc > 0) + free (t->level2); + if (t->level3_alloc > 0) + free (t->level3); +} + +/* Flattens the included transliterations into a translit list. + Inserts them in the list at `cursor', and returns the new cursor. */ +static struct translit_t ** +translit_flatten (struct locale_ctype_t *ctype, + const struct charmap_t *charmap, + struct translit_t **cursor) +{ + while (ctype->translit_include != NULL) + { + const char *copy_locale = ctype->translit_include->copy_locale; + const char *copy_repertoire = ctype->translit_include->copy_repertoire; + struct localedef_t *other; + + /* Unchain the include statement. During the depth-first traversal + we don't want to visit any locale more than once. */ + ctype->translit_include = ctype->translit_include->next; + + other = find_locale (LC_CTYPE, copy_locale, copy_repertoire, charmap); + + if (other == NULL || other->categories[LC_CTYPE].ctype == NULL) + { + WITH_CUR_LOCALE (error (0, 0, _("\ +%s: transliteration data from locale `%s' not available"), + "LC_CTYPE", copy_locale)); + } + else + { + struct locale_ctype_t *other_ctype = + other->categories[LC_CTYPE].ctype; + + cursor = translit_flatten (other_ctype, charmap, cursor); + assert (other_ctype->translit_include == NULL); + + if (other_ctype->translit != NULL) + { + /* Insert the other_ctype->translit list at *cursor. */ + struct translit_t *endp = other_ctype->translit; + while (endp->next != NULL) + endp = endp->next; + + endp->next = *cursor; + *cursor = other_ctype->translit; + + /* Avoid any risk of circular lists. */ + other_ctype->translit = NULL; + + cursor = &endp->next; + } + + if (ctype->default_missing == NULL) + ctype->default_missing = other_ctype->default_missing; + } + } + + return cursor; +} + +static void +allocate_arrays (struct locale_ctype_t *ctype, const struct charmap_t *charmap, + struct repertoire_t *repertoire) +{ + size_t idx, nr; + const void *key; + size_t len; + void *vdata; + void *curs; + + /* You wonder about this amount of memory? This is only because some + users do not manage to address the array with unsigned values or + data types with range >= 256. '\200' would result in the array + index -128. To help these poor people we duplicate the entries for + 128 up to 255 below the entry for \0. */ + ctype->ctype_b = (char_class_t *) xcalloc (256 + 128, sizeof (char_class_t)); + ctype->ctype32_b = (char_class32_t *) xcalloc (256, sizeof (char_class32_t)); + ctype->class_b = (uint32_t **) + xmalloc (ctype->nr_charclass * sizeof (uint32_t *)); + ctype->class_3level = (struct wctype_table *) + xmalloc (ctype->nr_charclass * sizeof (struct wctype_table)); + + /* This is the array accessed using the multibyte string elements. */ + for (idx = 0; idx < 256; ++idx) + ctype->ctype_b[128 + idx] = ctype->class256_collection[idx]; + + /* Mirror first 127 entries. We must take care that entry -1 is not + mirrored because EOF == -1. */ + for (idx = 0; idx < 127; ++idx) + ctype->ctype_b[idx] = ctype->ctype_b[256 + idx]; + + /* The 32 bit array contains all characters < 0x100. */ + for (idx = 0; idx < ctype->class_collection_act; ++idx) + if (ctype->charnames[idx] < 0x100) + ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx]; + + for (nr = 0; nr < ctype->nr_charclass; nr++) + { + ctype->class_b[nr] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t)); + + /* We only set CLASS_B for the bits in the ISO C classes, not + the user defined classes. The number should not change but + who knows. */ +#define LAST_ISO_C_BIT 11 + if (nr <= LAST_ISO_C_BIT) + for (idx = 0; idx < 256; ++idx) + if (ctype->class256_collection[idx] & _ISbit (nr)) + ctype->class_b[nr][idx >> 5] |= (uint32_t) 1 << (idx & 0x1f); + } + + for (nr = 0; nr < ctype->nr_charclass; nr++) + { + struct wctype_table *t; + + t = &ctype->class_3level[nr]; + t->p = 4; /* or: 5 */ + t->q = 7; /* or: 6 */ + wctype_table_init (t); + + for (idx = 0; idx < ctype->class_collection_act; ++idx) + if (ctype->class_collection[idx] & _ISwbit (nr)) + wctype_table_add (t, ctype->charnames[idx]); + + if (verbose) + WITH_CUR_LOCALE (fprintf (stderr, _("\ +%s: table for class \"%s\": %lu bytes\n"), + "LC_CTYPE", ctype->classnames[nr], + (unsigned long int) t->result_size)); + } + + /* Room for table of mappings. */ + ctype->map_b = (uint32_t **) xmalloc (2 * sizeof (uint32_t *)); + ctype->map32_b = (uint32_t **) xmalloc (ctype->map_collection_nr + * sizeof (uint32_t *)); + ctype->map_3level = (struct wctrans_table *) + xmalloc (ctype->map_collection_nr * sizeof (struct wctrans_table)); + + /* Fill in all mappings. */ + for (idx = 0; idx < 2; ++idx) + { + unsigned int idx2; + + /* Allocate table. */ + ctype->map_b[idx] = (uint32_t *) + xmalloc ((256 + 128) * sizeof (uint32_t)); + + /* Copy values from collection. */ + for (idx2 = 0; idx2 < 256; ++idx2) + ctype->map_b[idx][128 + idx2] = ctype->map256_collection[idx][idx2]; + + /* Mirror first 127 entries. We must take care not to map entry + -1 because EOF == -1. */ + for (idx2 = 0; idx2 < 127; ++idx2) + ctype->map_b[idx][idx2] = ctype->map_b[idx][256 + idx2]; + + /* EOF must map to EOF. */ + ctype->map_b[idx][127] = EOF; + } + + for (idx = 0; idx < ctype->map_collection_nr; ++idx) + { + unsigned int idx2; + + /* Allocate table. */ + ctype->map32_b[idx] = (uint32_t *) xmalloc (256 * sizeof (uint32_t)); + + /* Copy values from collection. Default is identity mapping. */ + for (idx2 = 0; idx2 < 256; ++idx2) + ctype->map32_b[idx][idx2] = + (ctype->map_collection[idx][idx2] != 0 + ? ctype->map_collection[idx][idx2] + : idx2); + } + + for (nr = 0; nr < ctype->map_collection_nr; nr++) + { + struct wctrans_table *t; + + t = &ctype->map_3level[nr]; + t->p = 7; + t->q = 9; + wctrans_table_init (t); + + for (idx = 0; idx < ctype->map_collection_act[nr]; ++idx) + if (ctype->map_collection[nr][idx] != 0) + wctrans_table_add (t, ctype->charnames[idx], + ctype->map_collection[nr][idx]); + + if (verbose) + WITH_CUR_LOCALE (fprintf (stderr, _("\ +%s: table for map \"%s\": %lu bytes\n"), + "LC_CTYPE", ctype->mapnames[nr], + (unsigned long int) t->result_size)); + } + + /* Extra array for class and map names. */ + ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass + * sizeof (uint32_t)); + ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr + * sizeof (uint32_t)); + + ctype->class_offset = _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1); + ctype->map_offset = ctype->class_offset + ctype->nr_charclass; + + /* Array for width information. Because the expected widths are very + small (never larger than 2) we use only one single byte. This + saves space. + We put only printable characters in the table. wcwidth is specified + to return -1 for non-printable characters. Doing the check here + saves a run-time check. + But we put L'\0' in the table. This again saves a run-time check. */ + { + struct wcwidth_table *t; + + t = &ctype->width; + t->p = 7; + t->q = 9; + wcwidth_table_init (t); + + /* First set all the printable characters of the character set to + the default width. */ + curs = NULL; + while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0) + { + struct charseq *data = (struct charseq *) vdata; + + if (data->ucs4 == UNINITIALIZED_CHAR_VALUE) + data->ucs4 = repertoire_find_value (ctype->repertoire, + data->name, len); + + if (data->ucs4 != ILLEGAL_CHAR_VALUE) + { + uint32_t *class_bits = + find_idx (ctype, &ctype->class_collection, NULL, + &ctype->class_collection_act, data->ucs4); + + if (class_bits != NULL && (*class_bits & BITw (tok_print))) + wcwidth_table_add (t, data->ucs4, charmap->width_default); + } + } + + /* Now add the explicitly specified widths. */ + if (charmap->width_rules != NULL) + for (size_t cnt = 0; cnt < charmap->nwidth_rules; ++cnt) + { + unsigned char bytes[charmap->mb_cur_max]; + int nbytes = charmap->width_rules[cnt].from->nbytes; + + /* We have the range of character for which the width is + specified described using byte sequences of the multibyte + charset. We have to convert this to UCS4 now. And we + cannot simply convert the beginning and the end of the + sequence, we have to iterate over the byte sequence and + convert it for every single character. */ + memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes); + + while (nbytes < charmap->width_rules[cnt].to->nbytes + || memcmp (bytes, charmap->width_rules[cnt].to->bytes, + nbytes) <= 0) + { + /* Find the UCS value for `bytes'. */ + int inner; + uint32_t wch; + struct charseq *seq = + charmap_find_symbol (charmap, (char *) bytes, nbytes); + + if (seq == NULL) + wch = ILLEGAL_CHAR_VALUE; + else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE) + wch = seq->ucs4; + else + wch = repertoire_find_value (ctype->repertoire, seq->name, + strlen (seq->name)); + + if (wch != ILLEGAL_CHAR_VALUE) + { + /* Store the value. */ + uint32_t *class_bits = + find_idx (ctype, &ctype->class_collection, NULL, + &ctype->class_collection_act, wch); + + if (class_bits != NULL && (*class_bits & BITw (tok_print))) + wcwidth_table_add (t, wch, + charmap->width_rules[cnt].width); + } + + /* "Increment" the bytes sequence. */ + inner = nbytes - 1; + while (inner >= 0 && bytes[inner] == 0xff) + --inner; + + if (inner < 0) + { + /* We have to extend the byte sequence. */ + if (nbytes >= charmap->width_rules[cnt].to->nbytes) + break; + + bytes[0] = 1; + memset (&bytes[1], 0, nbytes); + ++nbytes; + } + else + { + ++bytes[inner]; + while (++inner < nbytes) + bytes[inner] = 0; + } + } + } + + /* Set the width of L'\0' to 0. */ + wcwidth_table_add (t, 0, 0); + + if (verbose) + WITH_CUR_LOCALE (fprintf (stderr, _("%s: table for width: %lu bytes\n"), + "LC_CTYPE", (unsigned long int) t->result_size)); + } + + /* Set MB_CUR_MAX. */ + ctype->mb_cur_max = charmap->mb_cur_max; + + /* Now determine the table for the transliteration information. + + XXX It is not yet clear to me whether it is worth implementing a + complicated algorithm which uses a hash table to locate the entries. + For now I'll use a simple array which can be searching using binary + search. */ + if (ctype->translit_include != NULL) + /* Traverse the locales mentioned in the `include' statements in a + depth-first way and fold in their transliteration information. */ + translit_flatten (ctype, charmap, &ctype->translit); + + if (ctype->translit != NULL) + { + /* First count how many entries we have. This is the upper limit + since some entries from the included files might be overwritten. */ + size_t number = 0; + struct translit_t *runp = ctype->translit; + struct translit_t **sorted; + size_t from_len, to_len; + + while (runp != NULL) + { + ++number; + runp = runp->next; + } + + /* Next we allocate an array large enough and fill in the values. */ + sorted = (struct translit_t **) alloca (number + * sizeof (struct translit_t **)); + runp = ctype->translit; + number = 0; + do + { + /* Search for the place where to insert this string. + XXX Better use a real sorting algorithm later. */ + size_t idx = 0; + int replace = 0; + + while (idx < number) + { + int res = wcscmp ((const wchar_t *) sorted[idx]->from, + (const wchar_t *) runp->from); + if (res == 0) + { + replace = 1; + break; + } + if (res > 0) + break; + ++idx; + } + + if (replace) + sorted[idx] = runp; + else + { + memmove (&sorted[idx + 1], &sorted[idx], + (number - idx) * sizeof (struct translit_t *)); + sorted[idx] = runp; + ++number; + } + + runp = runp->next; + } + while (runp != NULL); + + /* The next step is putting all the possible transliteration + strings in one memory block so that we can write it out. + We need several different blocks: + - index to the from-string array + - from-string array + - index to the to-string array + - to-string array. + */ + from_len = to_len = 0; + for (size_t cnt = 0; cnt < number; ++cnt) + { + struct translit_to_t *srunp; + from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1; + srunp = sorted[cnt]->to; + while (srunp != NULL) + { + to_len += wcslen ((const wchar_t *) srunp->str) + 1; + srunp = srunp->next; + } + /* Plus one for the extra NUL character marking the end of + the list for the current entry. */ + ++to_len; + } + + /* We can allocate the arrays for the results. */ + ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t)); + ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t)); + ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t)); + ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t)); + + from_len = 0; + to_len = 0; + for (size_t cnt = 0; cnt < number; ++cnt) + { + size_t len; + struct translit_to_t *srunp; + + ctype->translit_from_idx[cnt] = from_len; + ctype->translit_to_idx[cnt] = to_len; + + len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1; + wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len], + (const wchar_t *) sorted[cnt]->from, len); + from_len += len; + + ctype->translit_to_idx[cnt] = to_len; + srunp = sorted[cnt]->to; + while (srunp != NULL) + { + len = wcslen ((const wchar_t *) srunp->str) + 1; + wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len], + (const wchar_t *) srunp->str, len); + to_len += len; + srunp = srunp->next; + } + ctype->translit_to_tbl[to_len++] = L'\0'; + } + + /* Store the information about the length. */ + ctype->translit_idx_size = number; + ctype->translit_from_tbl_size = from_len * sizeof (uint32_t); + ctype->translit_to_tbl_size = to_len * sizeof (uint32_t); + } + else + { + ctype->translit_from_idx = no_str; + ctype->translit_from_tbl = no_str; + ctype->translit_to_tbl = no_str; + ctype->translit_idx_size = 0; + ctype->translit_from_tbl_size = 0; + ctype->translit_to_tbl_size = 0; + } +} |