diff options
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | locale/C-collate.c | 130 | ||||
-rw-r--r-- | locale/Makefile | 2 | ||||
-rw-r--r-- | locale/categories.def | 29 | ||||
-rw-r--r-- | locale/elem-hash.h | 34 | ||||
-rw-r--r-- | locale/langinfo.h | 3 | ||||
-rw-r--r-- | locale/programs/ld-collate.c | 119 | ||||
-rw-r--r-- | posix/regex.c | 168 |
8 files changed, 356 insertions, 144 deletions
diff --git a/ChangeLog b/ChangeLog index 59846bf425..47132bdea2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +1999-12-31 Ulrich Drepper <drepper@cygnus.com> + + * locale/langinfo.h: Add constants for wide character collation + symbol table. + * locale/categories.def: Add appropriate entries for collate symbol + table entries. + * locale/C-collate.c: Add initializers for new entries. + Remove commented out code. + * locale/elem-hash.h: New file. + * locale/Makefile (distribute): Add elem-hash.h. + * locale/programs/ld-collate.c: Implement output of collate symbol + table. + + * posix/regex.c: Implement collation class handling. + 1999-12-30 Ulrich Drepper <drepper@cygnus.com> * posix/regex.c (regex_compile): Implement equivalence class handling. diff --git a/locale/C-collate.c b/locale/C-collate.c index 7302682b30..679ed30871 100644 --- a/locale/C-collate.c +++ b/locale/C-collate.c @@ -20,137 +20,12 @@ #include <endian.h> #include "localeinfo.h" -#if 0 -/* These tables' entries contain values which make the function behave - according to POSIX.2 Table 2-8 ``LC_COLLATE Category Definition in - the POSIX Locale''. 
*/ - -const uint32_t _nl_C_LC_COLLATE_symbol_hash[446] = -{ - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x00000154u, 0x00000060u, 0xffffffffu, 0xffffffffu, - 0x0000004fu, 0x0000001au, 0x00000085u, 0x00000030u, 0xffffffffu, 0xffffffffu, - 0x000002beu, 0x000000fau, 0xffffffffu, 0xffffffffu, 0x0000014eu, 0x0000005eu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000000bbu, 0x00000044u, - 0xffffffffu, 0xffffffffu, 0x000000efu, 0x0000004cu, 0x00000147u, 0x0000005cu, - 0x000000a0u, 0x0000003eu, 0x00000000u, 0x00000000u, 0x00000038u, 0x00000016u, - 0x00000094u, 0x00000038u, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x00000140u, 0x0000005au, 0x0000018cu, 0x00000076u, - 0x0000007du, 0x0000002cu, 0xffffffffu, 0xffffffffu, 0x00000115u, 0x00000052u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x00000285u, 0x000000deu, - 0x00000171u, 0x0000006cu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0x00000289u, 0x000000e2u, 0x000002d8u, 0x000000feu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x00000022u, 0x00000010u, - 0x0000028fu, 0x000000e8u, 0x00000069u, 0x00000022u, 0x0000006du, 0x00000024u, - 0x00000071u, 0x00000026u, 0x00000075u, 0x00000028u, 0xffffffffu, 0xffffffffu, - 0x00000295u, 0x000000eeu, 0xffffffffu, 0xffffffffu, 0x00000297u, 0x000000f0u, - 0xffffffffu, 0xffffffffu, 0x00000299u, 0x000000f2u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x00000213u, 0x000000b6u, - 0xffffffffu, 0xffffffffu, 0x00000014u, 0x0000000au, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x00000227u, 0x000000b8u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x0000015du, 0x00000064u, - 0xffffffffu, 0xffffffffu, 0x000001ffu, 0x000000a2u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 
0xffffffffu, 0xffffffffu, 0xffffffffu, 0x0000013au, 0x00000058u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x00000010u, 0x00000008u, - 0x000001dfu, 0x00000082u, 0x000001e1u, 0x00000084u, 0x00000167u, 0x00000068u, - 0x00000004u, 0x00000002u, 0x000001e7u, 0x0000008au, 0x00000186u, 0x00000074u, - 0x000001ebu, 0x0000008eu, 0x000001edu, 0x00000090u, 0x000001efu, 0x00000092u, - 0x000001f1u, 0x00000094u, 0x000001f3u, 0x00000096u, 0x000001f5u, 0x00000098u, - 0x000001f7u, 0x0000009au, 0x000001f9u, 0x0000009cu, 0x000001a5u, 0x0000007au, - 0x000001fdu, 0x000000a0u, 0x00000030u, 0x00000014u, 0x00000201u, 0x000000a4u, - 0x00000203u, 0x000000a6u, 0x00000205u, 0x000000a8u, 0x00000207u, 0x000000aau, - 0x00000209u, 0x000000acu, 0x0000020bu, 0x000000aeu, 0x0000020du, 0x000000b0u, - 0x0000020fu, 0x000000b2u, 0x00000211u, 0x000000b4u, 0xffffffffu, 0xffffffffu, - 0x0000009cu, 0x0000003cu, 0xffffffffu, 0xffffffffu, 0x00000098u, 0x0000003au, - 0x0000016cu, 0x0000006au, 0xffffffffu, 0xffffffffu, 0x00000269u, 0x000000c2u, - 0x0000026bu, 0x000000c4u, 0x0000026du, 0x000000c6u, 0x0000026fu, 0x000000c8u, - 0x00000271u, 0x000000cau, 0x00000273u, 0x000000ccu, 0x00000275u, 0x000000ceu, - 0x00000277u, 0x000000d0u, 0x00000279u, 0x000000d2u, 0x0000027bu, 0x000000d4u, - 0x0000027du, 0x000000d6u, 0x0000027fu, 0x000000d8u, 0x00000281u, 0x000000dau, - 0x00000283u, 0x000000dcu, 0x00000090u, 0x00000036u, 0x00000287u, 0x000000e0u, - 0x0000005fu, 0x0000001cu, 0x0000028bu, 0x000000e4u, 0x0000028du, 0x000000e6u, - 0x00000089u, 0x00000032u, 0x000001c3u, 0x0000007eu, 0x00000293u, 0x000000ecu, - 0x00000062u, 0x0000001eu, 0x000001b1u, 0x0000007cu, 0x00000130u, 0x00000056u, - 0x0000029bu, 0x000000f4u, 0x00000196u, 0x00000078u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x00000081u, 0x0000002eu, 0x00000251u, 0x000000beu, - 0x00000079u, 0x0000002au, 0x0000029du, 0x000000f6u, 0xffffffffu, 0xffffffffu, - 0x0000025cu, 
0x000000c0u, 0xffffffffu, 0xffffffffu, 0x0000002cu, 0x00000012u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000000a4u, 0x00000040u, - 0xffffffffu, 0xffffffffu, 0x000002b0u, 0x000000f8u, 0xffffffffu, 0xffffffffu, - 0x000000f9u, 0x0000004eu, 0xffffffffu, 0xffffffffu, 0x0000001cu, 0x0000000eu, - 0xffffffffu, 0xffffffffu, 0x0000017bu, 0x00000070u, 0x0000000cu, 0x00000006u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000001e3u, 0x00000086u, - 0xffffffffu, 0xffffffffu, 0x000001e5u, 0x00000088u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x000001d1u, 0x00000080u, 0x000001e9u, 0x0000008cu, - 0x0000008cu, 0x00000034u, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0x00000291u, 0x000000eau, 0xffffffffu, 0xffffffffu, - 0x00000008u, 0x00000004u, 0xffffffffu, 0xffffffffu, 0x00000181u, 0x00000072u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x00000231u, 0x000000bau, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000000cau, 0x00000046u, - 0x00000246u, 0x000000bcu, 0xffffffffu, 0xffffffffu, 0x000001fbu, 0x0000009eu, - 0x000000d6u, 0x00000048u, 0x00000018u, 0x0000000cu, 0xffffffffu, 0xffffffffu, - 0x00000159u, 0x00000062u, 0xffffffffu, 0xffffffffu, 0x000000aau, 0x00000042u, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000000e2u, 0x0000004au, - 0x00000175u, 0x0000006eu, 0xffffffffu, 0xffffffffu, 0x00000104u, 0x00000050u, - 0x00000065u, 0x00000020u, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0x000002d2u, 0x000000fcu, - 0xffffffffu, 0xffffffffu, 0x00000161u, 0x00000066u, 0x00000045u, 0x00000018u, - 0xffffffffu, 0xffffffffu, 0x00000127u, 0x00000054u, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu, - 0xffffffffu, 
0xffffffffu -}; - -const char _nl_C_LC_COLLATE_symbol_strings[732] = - "NUL\0" "SOH\0" "STX\0" "ETX\0" "EOT\0" "ENQ\0" "ACK\0" "alert\0" - "backspace\0" "tab\0" "newline\0" "vertical-tab\0" "form-feed\0" - "carriage-return\0" "SI\0" "SO\0" "DLE\0" "DC1\0" "DC2\0" "DC3\0" "DC4\0" - "NAK\0" "SYN\0" "ETB\0" "CAN\0" "EM\0" "SUB\0" "ESC\0" "IS4\0" "IS3\0" - "IS2\0" "IS1\0" "space\0" "exclamation-mark\0" "quotation-mark\0" - "number-sign\0" "dollar-sign\0" "percent-sign\0" "ampersand\0" - "apostrophe\0" "left-parenthesis\0" "right-parenthesis\0" "asterisk\0" - "plus-sign\0" "comma\0" "hyphen\0" "period\0" "slash\0" "zero\0" "one\0" - "two\0" "three\0" "four\0" "five\0" "six\0" "seven\0" "eight\0" "nine\0" - "colon\0" "semicolon\0" "less-than-sign\0" "equals-sign\0" - "greater-than-sign\0" "question-mark\0" "commercial-at\0" "A\0" "B\0" "C\0" - "D\0" "E\0" "F\0" "G\0" "H\0" "I\0" "J\0" "K\0" "L\0" "M\0" "N\0" "O\0" - "P\0" "Q\0" "R\0" "S\0" "T\0" "U\0" "V\0" "W\0" "X\0" "Y\0" "Z\0" - "left-square-bracket\0" "backslash\0" "right-square-bracket\0" - "circumflex\0" "underscore\0" "grave-accent\0" "a\0" "b\0" "c\0" "d\0" "e\0" - "f\0" "g\0" "h\0" "i\0" "j\0" "k\0" "l\0" "m\0" "n\0" "o\0" "p\0" "q\0" - "r\0" "s\0" "t\0" "u\0" "v\0" "w\0" "x\0" "y\0" "z\0" "left-curly-bracket\0" - "vertical-line\0" "right-curly-bracket\0" "tilde\0" "DEL\0"; - -const uint32_t _nl_C_LC_COLLATE_symbol_classes[256] = -{ - 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, - 1, 8, 1, 9, 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, - 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21, 1, 22, 1, 23, - 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31, - 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, - 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, - 1, 48, 1, 49, 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, - 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61, 1, 62, 1, 63, - 1, 64, 1, 65, 1, 66, 1, 67, 1, 68, 1, 69, 1, 70, 1, 71, - 1, 72, 1, 73, 1, 74, 1, 75, 1, 76, 1, 77, 1, 78, 1, 79, - 1, 80, 1, 81, 1, 
82, 1, 83, 1, 84, 1, 85, 1, 86, 1, 87, - 1, 88, 1, 89, 1, 90, 1, 91, 1, 92, 1, 93, 1, 94, 1, 95, - 1, 96, 1, 97, 1, 98, 1, 99, 1, 100, 1, 101, 1, 102, 1, 103, - 1, 104, 1, 105, 1, 106, 1, 107, 1, 108, 1, 109, 1, 110, 1, 111, - 1, 112, 1, 113, 1, 114, 1, 115, 1, 116, 1, 117, 1, 118, 1, 119, - 1, 120, 1, 121, 1, 122, 1, 123, 1, 124, 1, 125, 1, 126, 1, 127 -}; -#endif - const struct locale_data _nl_C_LC_COLLATE = { _nl_C_name, NULL, 0, 0, /* no file mapped */ UNDELETABLE, - 13, + 16, { { word: 0 }, { string: NULL }, @@ -164,6 +39,9 @@ const struct locale_data _nl_C_LC_COLLATE = { string: NULL }, { string: NULL }, { string: NULL }, + { string: NULL }, + { word: 0 }, + { string: NULL }, { string: NULL } } }; diff --git a/locale/Makefile b/locale/Makefile index a087d7278f..9a34847e99 100644 --- a/locale/Makefile +++ b/locale/Makefile @@ -23,7 +23,7 @@ subdir := locale headers = locale.h langinfo.h xlocale.h distribute = localeinfo.h categories.def iso-639.def iso-3166.def \ - iso-4217.def weight.h strlen-hash.h \ + iso-4217.def weight.h strlen-hash.h elem-hash.h \ $(addprefix programs/, \ locale.c localedef.c \ $(localedef-modules:=.c) $(locale-modules:=.c) \ diff --git a/locale/categories.def b/locale/categories.def index 7ebb8536a5..e055d74a74 100644 --- a/locale/categories.def +++ b/locale/categories.def @@ -42,19 +42,22 @@ DEFINE_CATEGORY ( LC_COLLATE, "LC_COLLATE", ( - DEFINE_ELEMENT (_NL_COLLATE_NRULES, "collate-nrules", std, word) - DEFINE_ELEMENT (_NL_COLLATE_RULESETS, "collate-rulesets", std, string) - DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string) - DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string) - DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string) - DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string) - DEFINE_ELEMENT (_NL_COLLATE_HASH_SIZE, "collate-hash-size", std, word) - DEFINE_ELEMENT (_NL_COLLATE_HASH_LAYERS, "collate-hash-layers", std, word) - DEFINE_ELEMENT 
(_NL_COLLATE_NAMES, "collate-names", std, string) - DEFINE_ELEMENT (_NL_COLLATE_TABLEWC, "collate-tablewc", std, string) - DEFINE_ELEMENT (_NL_COLLATE_WEIGHTWC, "collate-weightwc", std, string) - DEFINE_ELEMENT (_NL_COLLATE_EXTRAWC, "collate-extrawc", std, string) - DEFINE_ELEMENT (_NL_COLLATE_INDIRECTWC, "collate-indirectwc", std, string) + DEFINE_ELEMENT (_NL_COLLATE_NRULES, "collate-nrules", std, word) + DEFINE_ELEMENT (_NL_COLLATE_RULESETS, "collate-rulesets", std, string) + DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string) + DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string) + DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string) + DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string) + DEFINE_ELEMENT (_NL_COLLATE_HASH_SIZE, "collate-hash-size", std, word) + DEFINE_ELEMENT (_NL_COLLATE_HASH_LAYERS, "collate-hash-layers", std, word) + DEFINE_ELEMENT (_NL_COLLATE_NAMES, "collate-names", std, string) + DEFINE_ELEMENT (_NL_COLLATE_TABLEWC, "collate-tablewc", std, string) + DEFINE_ELEMENT (_NL_COLLATE_WEIGHTWC, "collate-weightwc", std, string) + DEFINE_ELEMENT (_NL_COLLATE_EXTRAWC, "collate-extrawc", std, string) + DEFINE_ELEMENT (_NL_COLLATE_INDIRECTWC, "collate-indirectwc", std, string) + DEFINE_ELEMENT (_NL_COLLATE_SYMB_HASH_SIZEMB, "collate-symb-hash-sizemb", std, word) + DEFINE_ELEMENT (_NL_COLLATE_SYMB_TABLEMB, "collate-symb-tablemb", std, string) + DEFINE_ELEMENT (_NL_COLLATE_SYMB_EXTRAMB, "collate-symb-extramb", std, string) ), NO_POSTLOAD) diff --git a/locale/elem-hash.h b/locale/elem-hash.h new file mode 100644 index 0000000000..0529214e6c --- /dev/null +++ b/locale/elem-hash.h @@ -0,0 +1,34 @@ +/* Copyright (C) 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper, <drepper@cygnus.com>. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + + +/* The hashing function used for the table with collation symbols. Starting from the length N, each of the N bytes at STR is folded in (read as unsigned char) by shifting the accumulator left three bits and adding the byte; the accumulator is unsigned so that the shift wraps instead of overflowing. */ +static inline int32_t +elem_hash (const char *str, int_fast32_t n) +{ + uint32_t result = n; + + while (n-- > 0) + { + result <<= 3; + result += (unsigned char) *str++; + } + + return (int32_t) result; +} diff --git a/locale/langinfo.h b/locale/langinfo.h index e46fc65184..8ba42fa036 100644 --- a/locale/langinfo.h +++ b/locale/langinfo.h @@ -243,6 +243,9 @@ enum _NL_COLLATE_WEIGHTWC, _NL_COLLATE_EXTRAWC, _NL_COLLATE_INDIRECTWC, + _NL_COLLATE_SYMB_HASH_SIZEMB, + _NL_COLLATE_SYMB_TABLEMB, + _NL_COLLATE_SYMB_EXTRAMB, _NL_NUM_LC_COLLATE, /* LC_CTYPE category: character classification. diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index 3eff699e7b..8eb47d7f8e 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -25,12 +25,14 @@ #include <error.h> #include <stdlib.h> #include <wchar.h> +#include <sys/param.h> #include "charmap.h" #include "localeinfo.h" #include "linereader.h" #include "locfile.h" #include "localedef.h" +#include "elem-hash.h" /* Uncomment the following line in the production version. */ /* #define NDEBUG 1 */ @@ -88,11 +90,13 @@ struct element_t we changed if necessary but I doubt this is necessary. 
*/ unsigned int used_in_level; + struct element_list_t *weights; + /* Index in the `weight' table in the output file for the character. */ + int32_t weights_idx; + /* Nonzero if this is a real character definition. */ int is_character; - struct element_list_t *weights; - /* Where does the definition come from. */ const char *file; size_t line; @@ -297,6 +301,7 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, /* Will be allocated later. */ newp->weights = NULL; + newp->weights_idx = 0; newp->file = NULL; newp->line = 0; @@ -1804,6 +1809,9 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate, obstack_grow (pool, buf, len); } + /* Remember the index. */ + elem->weights_idx = retval; + return retval | ((elem->section->ruleidx & 0x7f) << 24); } @@ -1866,7 +1874,10 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, uint32_t *names; uint32_t *tablewc; size_t table_size; + uint32_t elem_size; + uint32_t *elem_table; int i; + struct element_t *runp; data.magic = LIMAGIC (LC_COLLATE); data.n = nelems; @@ -2381,6 +2392,110 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, ++cnt; + /* Finally write the table with collation element names out. It is + a hash table with a simple function which gets the name of the + character as the input. One character might have many names. The + value associated with the name is an index into the weight table + where we are then interested in the first-level weight value. + + To determine how large the table should be we are counting the + elements have to put in. Since we are using internal chaining + using a secondary hash function we have to make the table a bit + larger to avoid extremely long search times. We can achieve + good results with a 40% larger table than there are entries. */ + elem_size = 0; + runp = collate->start; + while (runp != NULL) + { + if (runp->mbs != NULL && runp->weights != NULL) + /* Yep, the element really counts. 
*/ + ++elem_size; + + runp = runp->next; + } + /* Add 40% and find the next prime number, but never go below 257 slots so the table is always larger than the number of entries. */ + elem_size = MAX (next_prime (elem_size * 1.4), 257); + + /* Allocate the table. Each entry consists of two words: the hash + value and an index in a secondary table which provides the index + into the weight table and the string itself (so that a match can + be determined). */ + elem_table = (uint32_t *) obstack_alloc (&extrapool, + elem_size * 2 * sizeof (uint32_t)); + memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t)); + + /* Now add the elements. */ + runp = collate->start; + while (runp != NULL) + { + if (runp->mbs != NULL && runp->weights != NULL) + { + /* Compute the hash value of the name. */ + uint32_t namelen = strlen (runp->name); + uint32_t hash = elem_hash (runp->name, namelen); + size_t idx = hash % elem_size; + + if (elem_table[idx * 2] != 0) + { + /* The spot is already taken. Try iterating using the value + from the secondary hashing function. */ + size_t iter = hash % (elem_size - 2); + + do + { + idx += iter; + if (idx >= elem_size) + idx -= elem_size; + } + while (elem_table[idx * 2] != 0); + } + + /* This is the spot where we will insert the value (reached with or without probing). */ + elem_table[idx * 2] = hash; + elem_table[idx * 2 + 1] = obstack_object_size (&extrapool); + + /* Now add the index into the weights table. We know the + address is always 32bit aligned. */ + if (sizeof (int) == sizeof (int32_t)) + obstack_int_grow (&extrapool, runp->weights_idx); + else + obstack_grow (&extrapool, &runp->weights_idx, + sizeof (int32_t)); + + /* Then the string itself including length. */ + obstack_1grow (&extrapool, namelen); + obstack_grow (&extrapool, runp->name, namelen); + + /* And align again to 32 bits. */ + if ((1 + namelen) % sizeof (int32_t) != 0) + obstack_grow (&extrapool, "\0\0", + (sizeof (int32_t) + - (1 + namelen) % sizeof (int32_t))); + } + + runp = runp->next; + } + + /* Prepare to write out this data. 
*/ + assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB)); + iov[2 + cnt].iov_base = &elem_size; + iov[2 + cnt].iov_len = sizeof (int32_t); + idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; + ++cnt; + + assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB)); + iov[2 + cnt].iov_base = elem_table; + iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t); + idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; + ++cnt; + + assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB)); + iov[2 + cnt].iov_len = obstack_object_size (&extrapool); + iov[2 + cnt].iov_base = obstack_finish (&extrapool); + idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; + ++cnt; + + assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)); write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov); diff --git a/posix/regex.c b/posix/regex.c index e1a6917f71..a59f5d4a71 100644 --- a/posix/regex.c +++ b/posix/regex.c @@ -82,6 +82,7 @@ /* We are also using some library internals. */ # include <locale/localeinfo.h> +# include <locale/elem-hash.h> # include <langinfo.h> #endif @@ -2378,12 +2379,13 @@ regex_compile (pattern, size, syntax, bufp) had_char_class = false; } } -#ifdef _LIBC else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') { unsigned char str[MB_LEN_MAX + 1]; +#ifdef _LIBC uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +#endif PATFETCH (c); c1 = 0; @@ -2412,7 +2414,9 @@ regex_compile (pattern, size, syntax, bufp) character set and therefore we cannot have character with more than one byte in the multibyte representation. */ +#ifdef _LIBC if (nrules == 0) +#endif { if (c1 != 1) FREE_STACK_RETURN (REG_ECOLLATE); @@ -2424,6 +2428,7 @@ regex_compile (pattern, size, syntax, bufp) /* Set the bit for the character. 
*/ SET_LIST_BIT (str[0]); } +#ifdef _LIBC else { /* Try to match the byte sequence in `str' against @@ -2495,8 +2500,168 @@ regex_compile (pattern, size, syntax, bufp) } } } +#endif had_char_class = true; } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') + { + unsigned char str[128]; /* Should be large enough. */ +#ifdef _LIBC + uint32_t nrules = + _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); +#endif + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[.'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == '.' && *p == ']') || p == pend) + break; + if (c1 < sizeof (str) - 1) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == '.' && *p == ']' && str[0] != '\0') + { + /* If we have no collation data we use the default + collation in which each character is the name + for its own class which contains only the one + character. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ +#ifdef _LIBC + if (nrules == 0) +#endif + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the collating + symbol. */ + PATFETCH (c); + + /* Set the bit for the character. */ + SET_LIST_BIT (str[0]); + } +#ifdef _LIBC + else + { + /* Try to match the byte sequence in `str' against + those known to the collate implementation. + First find out whether the bytes in `str' are + actually from exactly one character. 
*/ + const unsigned char *weights; + int32_t table_size; + const int32_t *table; + const int32_t *symb_table; + const unsigned char *extra; + int32_t idx; + int32_t elem; + const unsigned char *cp = str; + int32_t weight; + int32_t second; + int32_t hash; + int ch; + + table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + weights = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); + table_size = + _NL_CURRENT_WORD (LC_COLLATE, + _NL_COLLATE_SYMB_HASH_SIZEMB); + symb_table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_TABLEMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, + _NL_COLLATE_SYMB_EXTRAMB); + + /* Locate the character in the hashing table. */ + hash = elem_hash (str, c1); + + idx = 0; + elem = hash % table_size; + second = hash % (table_size - 2); + while (symb_table[2 * elem] != 0) + { + /* First compare the hashing value. */ + if (symb_table[2 * elem] == hash + && (c1 == extra[symb_table[2 * elem + 1] + + sizeof (int32_t)]) + && memcmp (str, + &extra[symb_table[2 * elem + 1] + + sizeof (int32_t) + 1], + c1) == 0) + { + /* Yep, this is the entry. */ + idx = *((int32_t *) + (extra + + symb_table[2 * elem + 1])); + break; + } + + /* Next entry. */ + elem += second; + } + + if (symb_table[2 * elem] == 0) + /* This is no valid character. */ + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Now we have to go throught the whole table + and find all characters which have the same + first level weight. + + XXX Note that this is not entirely correct. + we would have to match multibyte sequences + but this is not possible with the current + implementation. */ + for (ch = 1; ch < 256; ++ch) + /* XXX This test would have to be changed if we + would allow matching multibyte sequences. */ + if (table[ch] > 0) + { + int32_t idx2 = table[ch]; + size_t len = weights[idx2]; + + /* Test whether the lenghts match. 
*/ + if (weights[idx] == len) + { + /* They do. Now compare the bytes of + the weight. */ + size_t cnt = 0; + + while (cnt < len + && (weights[idx + 1 + cnt] + == weights[idx2 + 1 + cnt])) + ++cnt; + + if (cnt == len) + /* They match. Mark the character as + acceptable. */ + SET_LIST_BIT (ch); + } + } + } +#endif + had_char_class = false; + } else { c1++; @@ -2507,7 +2672,6 @@ regex_compile (pattern, size, syntax, bufp) had_char_class = false; } } -#endif else { had_char_class = false; |