From 450bf66ef223ad83e7032920652445817865770b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 25 Dec 1999 23:41:39 +0000 Subject: Update. 1999-12-25 Ulrich Drepper * locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the indirect table. * locale/langinfo.h: Likewise. * locale/categories.def: Likewise. Remove reference to postload functions. * locale/lc-collate.c (_nl_postload_collate): Removed. Also remove __collate_tablemb, __collate_weightmb, and __collate_extramb. * locale/localeinfo.h: Remove declaration for removed variables above. Remove prototype for _nl_get_era_entry. * locale/weight.h: Complete rewrite for new collate implementation. * locale/programs/ld-collate.c: Many changes to make output file usable in strxfrm/strcoll. * string/strxfrm.c: Complete rewrite for new collate implementation. * wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation locally. 1999-12-25 Shinya Hanataka * locale/programs/ld-ctype.c (allocate_arrays): Correctly assign transformation values for chars >255. * wctype/wctrans.c: Return pointer unmodified. --- ChangeLog | 24 ++ locale/C-collate.c | 3 +- locale/categories.def | 3 +- locale/langinfo.h | 1 + locale/lc-collate.c | 18 -- locale/localeinfo.h | 9 - locale/programs/ld-collate.c | 121 ++++++---- locale/programs/ld-ctype.c | 6 +- locale/weight.h | 251 +++++++-------------- string/strxfrm.c | 525 ++++++++++++++++++++++++++----------------- wcsmbs/wcsxfrm.c | 23 +- wctype/wctrans.c | 2 +- 12 files changed, 526 insertions(+), 460 deletions(-) diff --git a/ChangeLog b/ChangeLog index 257dee79f4..f0d5a021c6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +1999-12-25 Ulrich Drepper + + * locale/C-collate.c (_nl_C_LC_COLLATE): Add one more entry for the + indirect table. + * locale/langinfo.h: Likewise. + * locale/categories.def: Likewise. Remove reference to postload + functions. + * locale/lc-collate.c (_nl_postload_collate): Removed. 
Also remove + __collate_tablemb, __collate_weightmb, and __collate_extramb. + * locale/localeinfo.h: Remove declaration for removed variables above. + Remove prototype for _nl_get_era_entry. + * locale/weight.h: Complete rewrite for new collate implementation. + * locale/programs/ld-collate.c: Many changes to make output file + usable in strxfrm/strcoll. + * string/strxfrm.c: Complete rewrite for new collate implementation. + * wcsmbs/wcsxfrm.c: Don't use strxfrm.c, implement dummy implementation + locally. + +1999-12-25 Shinya Hanataka + + * locale/programs/ld-ctype.c (allocate_arrays): Correctly assign + transformation values for chars >255. + * wctype/wctrans.c: Return pointer unmodified. + 1999-12-24 Ulrich Drepper * sysdeps/posix/system.c (__libc_system): Check whether command diff --git a/locale/C-collate.c b/locale/C-collate.c index 94f6e0f60f..7875f5de22 100644 --- a/locale/C-collate.c +++ b/locale/C-collate.c @@ -150,12 +150,13 @@ const struct locale_data _nl_C_LC_COLLATE = _nl_C_name, NULL, 0, 0, /* no file mapped */ UNDELETABLE, - 5, + 6, { { word: 0 }, { string: NULL }, { string: NULL }, { string: NULL }, + { string: NULL }, { string: NULL } } }; diff --git a/locale/categories.def b/locale/categories.def index 06d79ed202..40fc74213c 100644 --- a/locale/categories.def +++ b/locale/categories.def @@ -47,7 +47,8 @@ DEFINE_CATEGORY DEFINE_ELEMENT (_NL_COLLATE_TABLEMB, "collate-tablemb", std, string) DEFINE_ELEMENT (_NL_COLLATE_WEIGHTMB, "collate-weightmb", std, string) DEFINE_ELEMENT (_NL_COLLATE_EXTRAMB, "collate-extramb", std, string) - ), _nl_postload_collate) + DEFINE_ELEMENT (_NL_COLLATE_INDIRECTMB, "collate-indirectmb", std, string) + ), NO_POSTLOAD) /* The actual definition of ctype is meaningless here. 
It is hard coded in diff --git a/locale/langinfo.h b/locale/langinfo.h index ff48fab35f..3f39298c17 100644 --- a/locale/langinfo.h +++ b/locale/langinfo.h @@ -235,6 +235,7 @@ enum _NL_COLLATE_TABLEMB, _NL_COLLATE_WEIGHTMB, _NL_COLLATE_EXTRAMB, + _NL_COLLATE_INDIRECTMB, _NL_NUM_LC_COLLATE, /* LC_CTYPE category: character classification. diff --git a/locale/lc-collate.c b/locale/lc-collate.c index 02262b5ce2..623be06e26 100644 --- a/locale/lc-collate.c +++ b/locale/lc-collate.c @@ -22,21 +22,3 @@ _NL_CURRENT_DEFINE (LC_COLLATE); - -const int32_t *__collate_tablemb; -const unsigned char *__collate_weightmb; -const unsigned char *__collate_extramb; - -/* We are called after loading LC_CTYPE data to load it into - the variables used by the collation functions and regex. */ -void -_nl_postload_collate (void) -{ -#define paste(a,b) paste1(a,b) -#define paste1(a,b) a##b -#define current(x) _NL_CURRENT (LC_COLLATE, paste(_NL_COLLATE_,x)) - - __collate_tablemb = (const int32_t *) current (TABLEMB); - __collate_weightmb = (const unsigned char *) current (WEIGHTMB); - __collate_extramb = (const unsigned char *) current (EXTRAMB); -} diff --git a/locale/localeinfo.h b/locale/localeinfo.h index 078e205f4f..ced96ac4a9 100644 --- a/locale/localeinfo.h +++ b/locale/localeinfo.h @@ -165,9 +165,6 @@ extern void _nl_unload_locale (struct locale_data *locale); extern void _nl_remove_locale (int locale, struct locale_data *data); -/* initialize `era' entries */ -extern void _nl_init_era_entries (void); - /* Return `era' entry which corresponds to TP. Used in strftime. */ extern struct era_entry *_nl_get_era_entry (const struct tm *tp); @@ -180,10 +177,4 @@ extern const char *_nl_get_alt_digit (unsigned int number); /* Similar, but now for wide characters. */ extern const wchar_t *_nl_get_walt_digit (unsigned int number); - -/* Global variables for LC_COLLATE category data. 
*/ -extern const int32_t *__collate_tablemb; -extern const unsigned char *__collate_extrweightmb; -extern const unsigned char *__collate_extramb; - #endif /* localeinfo.h */ diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index 65229275ff..c629bd477a 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -137,9 +137,6 @@ struct locale_collate_t /* To make handling of errors easier we have another section. */ struct section_list error_section; - /* Number of sorting rules given in order_start line. */ - uint32_t nrules; - /* Start of the order list. */ struct element_t *start; @@ -176,7 +173,7 @@ struct locale_collate_t /* We have a few global variables which are used for reading all LC_COLLATE category descriptions in all files. */ -static int nrules; +static uint32_t nrules; /* These are definitions used by some of the functions for handling @@ -426,7 +423,7 @@ read_directions (struct linereader *ldfile, struct token *arg, if (! warned) { lr_error (ldfile, _("\ -%s: `%s' mentioned twice in definition of weight %d in category `%s'"), +%s: `%s' mentioned twice in definition of weight %d"), "LC_COLLATE", "position", cnt + 1); } } @@ -450,7 +447,13 @@ read_directions (struct linereader *ldfile, struct token *arg, /* See whether we have to increment the counter. */ if (arg->tok != tok_comma && rules[cnt] != 0) - ++cnt; + { + /* Add the default `forward' if we have seen only `position'. */ + if (rules[cnt] == sort_position) + rules[cnt] = sort_position | sort_forward; + + ++cnt; + } if (arg->tok == tok_eof || arg->tok == tok_eol) /* End of line or file, so we exit the loop. 
*/ @@ -876,7 +879,7 @@ insert_value (struct linereader *ldfile, struct token *arg, elem->nmbs = seq->nbytes; } - if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE) + if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE) { uint32_t wcs[2] = { wc, 0 }; @@ -1552,7 +1555,7 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap) } -static inline int32_t +static int32_t output_weight (struct obstack *pool, struct locale_collate_t *collate, struct element_t *elem) { @@ -1575,25 +1578,18 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate, int len = 0; int i; - /* Add the direction. */ - obstack_1grow (pool, elem->section->rules[cnt]); - for (i = 0; i < elem->weights[cnt].cnt; ++i) - /* Encode the weight value. */ - if (elem->weights[cnt].w[i] == NULL) - { - /* This entry was IGNORE. */ - buf[len++] = IGNORE_CHAR; - } - else + /* Encode the weight value. We do nothing for IGNORE entries. */ + if (elem->weights[cnt].w[i] != NULL) len += utf8_encode (&buf[len], elem->weights[cnt].w[i]->mborder[cnt]); /* And add the buffer content. 
*/ + obstack_1grow (pool, len); obstack_grow (pool, buf, len); } - return retval; + return retval | ((elem->section->ruleidx & 0x7f) << 24); } @@ -1611,11 +1607,13 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, int32_t tablemb[256]; struct obstack weightpool; struct obstack extrapool; + struct obstack indirectpool; struct section_list *sect; int i; obstack_init (&weightpool); obstack_init (&extrapool); + obstack_init (&indirectpool); data.magic = LIMAGIC (LC_COLLATE); data.n = nelems; @@ -1629,7 +1627,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, cnt = 0; assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES)); - iov[2 + cnt].iov_base = &collate->nrules; + iov[2 + cnt].iov_base = &nrules; iov[2 + cnt].iov_len = sizeof (uint32_t); idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; ++cnt; @@ -1638,7 +1636,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next) if (sect->ruleidx == i) { - obstack_grow (&weightpool, sect->rules, nrules); + int j; + + obstack_make_room (&weightpool, nrules); + + for (j = 0; j < nrules; ++j) + obstack_1grow_fast (&weightpool, sect->rules[j]); ++i; } /* And align the output. */ @@ -1674,7 +1677,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, && collate->mbheads[ch]->nmbs == 1) { tablemb[ch] = output_weight (&weightpool, collate, - collate->mbheads[ch]); + collate->mbheads[ch]); } else { @@ -1719,38 +1722,60 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, { int i; - /* More than one consecutive entry. We mark this by having - a negative index into the weight table. */ - weightidx = -weightidx; - /* Now add first the initial byte sequence. */ added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1) + __alignof__ (int32_t) - 1) & ~(__alignof__ (int32_t) - 1)); obstack_make_room (&extrapool, added); + /* More than one consecutive entry. 
We mark this by having + a negative index into the indirect table. */ if (sizeof (int32_t) == sizeof (int)) - obstack_int_grow_fast (&extrapool, weightidx); + obstack_int_grow_fast (&extrapool, + obstack_object_size (&indirectpool) + / sizeof (int32_t)); else - obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); - obstack_1grow_fast (&extrapool, runp->section->ruleidx); + { + int32_t i = (obstack_object_size (&indirectpool) + / sizeof (int32_t)); + obstack_grow (&extrapool, &i, sizeof (int32_t)); + } obstack_1grow_fast (&extrapool, runp->nmbs - 1); for (i = 1; i < runp->nmbs; ++i) obstack_1grow_fast (&extrapool, runp->mbs[i]); - /* Now find the end of the consecutive sequence. */ - do - runp = runp->next; - while (runp->mbnext != NULL - && runp->nmbs == runp->mbnext->nmbs - && memcmp (runp->mbs, runp->mbnext->mbs, - runp->nmbs - 1) == 0 - && (runp->mbs[runp->nmbs - 1] + 1 - == runp->mbnext->mbs[runp->nmbs - 1])); - - /* And add the end by sequence. Without length this time. */ + /* Now find the end of the consecutive sequence and + add all the indices in the indirect pool. */ + while (1) + { + if (sizeof (int32_t) == sizeof (int)) + obstack_int_grow_fast (&extrapool, weightidx); + else + obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); + + runp = runp->next; + if (runp->mbnext == NULL + || runp->nmbs != runp->mbnext->nmbs + || memcmp (runp->mbs, runp->mbnext->mbs, + runp->nmbs - 1) != 0 + || (runp->mbs[runp->nmbs - 1] + 1 + != runp->mbnext->mbs[runp->nmbs - 1])) + break; + + /* Insert the weight. */ + weightidx = output_weight (&weightpool, collate, runp); + } + + /* And add the end byte sequence. Without length this + time. 
*/ for (i = 1; i < runp->nmbs; ++i) obstack_1grow_fast (&extrapool, runp->mbs[i]); + + weightidx = output_weight (&weightpool, collate, runp); + if (sizeof (int32_t) == sizeof (int)) + obstack_int_grow_fast (&extrapool, weightidx); + else + obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); } else { @@ -1768,7 +1793,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, obstack_int_grow_fast (&extrapool, weightidx); else obstack_grow (&extrapool, &weightidx, sizeof (int32_t)); - obstack_1grow_fast (&extrapool, runp->section->ruleidx); obstack_1grow_fast (&extrapool, runp->nmbs - 1); for (i = 1; i < runp->nmbs; ++i) obstack_1grow_fast (&extrapool, runp->mbs[i]); @@ -1835,6 +1859,12 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; ++cnt; + assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)); + iov[2 + cnt].iov_len = obstack_object_size (&indirectpool); + iov[2 + cnt].iov_base = obstack_finish (&indirectpool); + idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len; + ++cnt; + assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)); @@ -1842,6 +1872,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap, obstack_free (&weightpool, NULL); obstack_free (&extrapool, NULL); + obstack_free (&indirectpool, NULL); } @@ -2291,16 +2322,16 @@ error while adding equivalent collating symbol")); uint32_t cnt; /* This means we have exactly one rule: `forward'. */ - if (collate->nrules > 1) + if (nrules > 1) lr_error (ldfile, _("\ %s: invalid number of sorting rules"), "LC_COLLATE"); else - collate->nrules = 1; + nrules = 1; sp->rules = obstack_alloc (&collate->mempool, (sizeof (enum coll_sort_rule) - * collate->nrules)); - for (cnt = 0; cnt < collate->nrules; ++cnt) + * nrules)); + for (cnt = 0; cnt < nrules; ++cnt) sp->rules[cnt] = sort_forward; /* Next line. 
*/ diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c index 86d086021d..d98b7bdfd2 100644 --- a/locale/programs/ld-ctype.c +++ b/locale/programs/ld-ctype.c @@ -3073,10 +3073,8 @@ Computing table size for character classes might take a while..."), while (idx2 < ctype->map_collection_act[idx]) { if (ctype->map_collection[idx][idx2] != 0) - *find_idx (ctype, &ctype->map32[idx], - &ctype->map_collection_max[idx], - &ctype->map_collection_act[idx], - ctype->names[idx2]) = ctype->map_collection[idx][idx2]; + ctype->map32[idx][ctype->charnames[idx2]] = + ctype->map_collection[idx][idx2]; ++idx2; } } diff --git a/locale/weight.h b/locale/weight.h index 6e31e2d495..356ee57855 100644 --- a/locale/weight.h +++ b/locale/weight.h @@ -17,191 +17,106 @@ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include -#include -#include -#include "localeinfo.h" - -#ifndef STRING_TYPE -# error STRING_TYPE not defined -#endif +/* Find index of weight. */ +static inline int32_t +findidx (const unsigned char **cpp) +{ + int_fast32_t i = table[*(*cpp)++]; + const unsigned char *cp; -#ifndef USTRING_TYPE -# error USTRING_TYPE not defined -#endif + if (i >= 0) + /* This is an index into the weight table. Cool. */ + return i; -typedef struct weight_t -{ - struct weight_t *prev; - struct weight_t *next; - struct data_pair + /* Oh well, more than one sequence starting with this byte. + Search for the correct one. */ + cp = &extra[-i]; + while (1) { - int number; - const uint32_t *value; - } data[0]; -} weight_t; - - -/* The following five macros grant access to the values in the - collate locale file that do not depend on byte order. 
*/ -#ifndef USE_IN_EXTENDED_LOCALE_MODEL -# define collate_nrules \ - (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES)) -# define collate_hash_size \ - (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE)) -# define collate_hash_layers \ - (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS)) -# define collate_undefined \ - (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_UNDEFINED_WC)) -# define collate_rules \ - ((uint32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULES)) - -static __inline void get_weight (const STRING_TYPE **str, weight_t *result); -static __inline void -get_weight (const STRING_TYPE **str, weight_t *result) -#else -# define collate_nrules \ - current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word -# define collate_hash_size \ - current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word -# define collate_hash_layers \ - current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word -# define collate_undefined \ - current->values[_NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED_WC)].word -# define collate_rules \ - ((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULES)].string) - -static __inline void get_weight (const STRING_TYPE **str, weight_t *result, - struct locale_data *current, - const uint32_t *__collate_tablewc, - const uint32_t *__collate_extrawc); -static __inline void -get_weight (const STRING_TYPE **str, weight_t *result, - struct locale_data *current, const uint32_t *__collate_tablewc, - const uint32_t *__collate_extrawc) -#endif -{ - unsigned int ch = *((USTRING_TYPE *) (*str))++; - size_t slot; + size_t nhere; + const unsigned char *usrc = *cpp; - if (sizeof (STRING_TYPE) == 1) - slot = ch * (collate_nrules + 1); - else - { - const size_t level_size = collate_hash_size * (collate_nrules + 1); - size_t level; + /* The first thing is the index. */ + i = *((int32_t *) cp); + cp += sizeof (int32_t); - slot = (ch % collate_hash_size) * (collate_nrules + 1); + /* Next is the length of the byte sequence. 
These are always + short byte sequences so there is no reason to call any + function (even if they are inlined). */ + nhere = *cp++; - level = 0; - while (__collate_tablewc[slot] != (uint32_t) ch) + if (i >= 0) { - if (__collate_tablewc[slot + 1] == 0 - || ++level >= collate_hash_layers) - { - size_t idx = collate_undefined; - size_t cnt; + /* It is a single character. If it matches we found our + index. Note that at the end of each list there is an + entry of length zero which represents the single byte + sequence. The first (and here only) byte was tested + already. */ + size_t cnt; - for (cnt = 0; cnt < collate_nrules; ++cnt) - { - result->data[cnt].number = __collate_extrawc[idx++]; - result->data[cnt].value = &__collate_extrawc[idx]; - idx += result->data[cnt].number; - } - /* The Unix standard requires that a character outside - the domain is signalled by setting `errno'. */ - __set_errno (EINVAL); - return; - } - slot += level_size; - } - } + for (cnt = 0; cnt < nhere; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; - if (__collate_tablewc[slot + 1] != (uint32_t) FORWARD_CHAR) - { - /* We have a simple form. One value for each weight. */ - size_t cnt; + if (cnt == nhere) + { + /* Found it. */ + *cpp += nhere; + return i; + } - for (cnt = 0; cnt < collate_nrules; ++cnt) - { - result->data[cnt].number = 1; - result->data[cnt].value = &__collate_tablewc[slot + 1 + cnt]; + /* Up to the next entry. */ + cp += nhere; } - return; - } + else + { + /* This is a range of characters. First decide whether the + current byte sequence lies in the range. */ + size_t cnt; + size_t offset = 0; - /* We now look for any collation element which starts with CH. - There might none, but the last list member is a catch-all case - because it is simple the character CH. The value of this entry - might be the same as UNDEFINED. 
*/ - slot = __collate_tablewc[slot + 2]; + for (cnt = 0; cnt < nhere; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; - while (1) - { - size_t idx; + if (cnt != nhere) + { + if (cp[cnt] > usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + continue; + } - /* This is a comparison between a uint32_t array (aka wchar_t) and - an 8-bit string. */ - for (idx = 0; __collate_extrawc[slot + 2 + idx] != 0; ++idx) - if (__collate_extrawc[slot + 2 + idx] != (uint32_t) (*str)[idx]) - break; + /* Test against the end of the range. */ + for (cnt = 0; cnt < nhere; ++cnt) + if (cp[nhere + cnt] != usrc[cnt]) + break; - /* When the loop finished with all character of the collation - element used, we found the longest prefix. */ - if (__collate_extrawc[slot + 2 + idx] == 0) - { - size_t cnt; + if (cnt != nhere && cp[nhere + cnt] < usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + continue; + } - *str += idx; - idx += slot + 3; - for (cnt = 0; cnt < collate_nrules; ++cnt) - { - result->data[cnt].number = __collate_extrawc[idx++]; - result->data[cnt].value = &__collate_extrawc[idx]; - idx += result->data[cnt].number; + /* This range matches the next characters. Now find + the offset in the indirect table. */ + for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt); + + do + { + offset <<= 8; + offset += usrc[cnt] - cp[cnt]; + } + while (++cnt < nhere); } - return; - } - /* To next entry in list. */ - slot += __collate_extrawc[slot]; + *cpp += nhere; + return offset; + } } -} - -/* To process a string efficiently we retrieve all information about - the string at once. The following macro constructs a double linked - list of this information. It is a macro because we use `alloca' - and we use a double linked list because of the backward collation - order. - - We have this strange extra macro since the functions which use the - given locale (not the global one) cannot use the global tables. 
*/ -#ifndef USE_IN_EXTENDED_LOCALE_MODEL -# define call_get_weight(strp, newp) get_weight ((strp), (newp)) -#else -# define call_get_weight(strp, newp) \ - get_weight ((strp), (newp), current, collate_table, collate_extra) -#endif - -#define get_string(str, forw, backw) \ - do \ - { \ - weight_t *newp; \ - while (*str != '\0') \ - { \ - newp = (weight_t *) alloca (sizeof (weight_t) \ - + (collate_nrules \ - * sizeof (struct data_pair))); \ - \ - newp->prev = backw; \ - if (backw == NULL) \ - forw = newp; \ - else \ - backw->next = newp; \ - newp->next = NULL; \ - backw = newp; \ - call_get_weight (&str, newp); \ - } \ - } \ - while (0) + /* NOTREACHED */ + return 0x43219876; +} diff --git a/string/strxfrm.c b/string/strxfrm.c index 2a3a8a9032..344e65b957 100644 --- a/string/strxfrm.c +++ b/string/strxfrm.c @@ -17,282 +17,397 @@ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include #include +#include #include #include -#ifndef WIDE_VERSION -# define STRING_TYPE char -# define USTRING_TYPE unsigned char -# define L_(Ch) Ch -# ifdef USE_IN_EXTENDED_LOCALE_MODEL -# define STRXFRM __strxfrm_l -# else -# define STRXFRM strxfrm -# endif -# define STRLEN strlen -# define STPNCPY __stpncpy -#endif +#include "../locale/localeinfo.h" -#ifndef USE_IN_EXTENDED_LOCALE_MODEL -size_t -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n) +#ifdef USE_IN_EXTENDED_LOCALE_MODEL +# define STRXFRM __strxfrm_l #else -size_t -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) +# define STRXFRM strxfrm #endif -{ - if (n != 0) - STPNCPY (dest, src, n); - return STRLEN (src); -} - -#if 0 -/* Include the shared helper functions. `strxfrm'/`wcsxfrm' also use - these functions. */ -#include "../locale/weight.h" - -#ifndef WIDE_VERSION -/* Write 32 bit value UTF-8 encoded but only if enough space is left. 
*/ -static __inline size_t -print_val (u_int32_t value, char *dest, size_t max, size_t act) +/* These are definitions used by some of the functions for handling + UTF-8 encoding below. */ +static const uint32_t encoding_mask[] = { - char tmp[6]; - int idx = 0; + ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff +}; - if (value < 0x80) - tmp[idx++] = (char) value; - else - { - tmp[idx++] = '\x80' + (char) (value & 0x3f); - value >>= 6; - - if (value < 0x20) - tmp[idx++] = '\xc0' + (char) value; - else - { - tmp[idx++] = '\x80' + (char) (value & 0x3f); - value >>= 6; - - if (value < 0x10) - tmp[idx++] = '\xe0' + (char) value; - else - { - tmp[idx++] = '\x80' + (char) (value & 0x3f); - value >>= 6; - - if (value < 0x08) - tmp[idx++] = '\xf0' + (char) value; - else - { - tmp[idx++] = '\x80' + (char) (value & 0x3f); - value >>= 6; - - if (value < 0x04) - tmp[idx++] = '\xf8' + (char) value; - else - { - tmp[idx++] = '\x80' + (char) (value & 0x3f); - tmp[idx++] = '\xfc' + (char) (value >> 6); - } - } - } - } - } +static const unsigned char encoding_byte[] = +{ + 0xc0, 0xe0, 0xf0, 0xf8, 0xfc +}; - while (idx-- > 0) - { - if (act < max) - dest[act] = tmp[idx]; - ++act; - } - return act; -} -#else -static __inline size_t -print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act) +/* We need UTF-8 encoding of numbers. */ +static inline int +utf8_encode (char *buf, int val) { - /* We cannot really assume wchar_t is 32 bits wide. But it is for - GCC and so we don't do much optimization for the other case. 
*/ - if (sizeof (wchar_t) == 4) + char *startp = buf; + int retval; + + if (val < 0x80) { - if (act < max) - dest[act] = (wchar_t) value; - ++act; + *buf++ = (char) val; + retval = 1; } else { - wchar_t tmp[3]; - size_t idx = 0; + int step; - if (value < 0x8000) - tmp[idx++] = (wchar_t) act; - else - { - tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff)); - value >>= 14; - if (value < 0x2000) - tmp[idx++] = (wchar_t) (0xc000 + value); - else - { - tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff)); - value >>= 14; - tmp[idx++] = (wchar_t) (0xe000 + value); - } - } - while (idx-- > 0) + for (step = 2; step < 6; ++step) + if ((val & encoding_mask[step - 2]) == 0) + break; + retval = step; + + *buf = encoding_byte[step - 2]; + --step; + do { - if (act < max) - dest[act] = tmp[idx]; - ++act; + buf[step] = 0x80 | (val & 0x3f); + val >>= 6; } + while (--step > 0); + *buf |= val; } - return act; + + return buf - startp; } -#endif -/* Transform SRC into a form such that the result of strcmp - on two strings that have been transformed by strxfrm is - the same as the result of strcoll on the two strings before - their transformation. The transformed string is put in at - most N characters of DEST and its length is returned. 
*/ #ifndef USE_IN_EXTENDED_LOCALE_MODEL size_t -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n) +STRXFRM (char *dest, const char *src, size_t n) #else size_t -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) +STRXFRM (char *dest, const char *src, size_t n, __locale_t l) #endif { #ifdef USE_IN_EXTENDED_LOCALE_MODEL struct locale_data *current = l->__locales[LC_COLLATE]; -# if BYTE_ORDER == BIG_ENDIAN - const u_int32_t *collate_table = (const u_int32_t *) - current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string; - const u_int32_t *collate_extra = (const u_int32_t *) - current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string; -# elif BYTE_ORDER == LITTLE_ENDIAN - const u_int32_t *collate_table = (const u_int32_t *) - current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string; - const u_int32_t *collate_extra = (const u_int32_t *) - current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string; -# else -# error bizarre byte order -# endif + uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string); +#else + uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); #endif - weight_t *forw = NULL; - weight_t *backw = NULL; - size_t pass; - size_t written; - - /* If the current locale does not specify locale data we use normal - 8-bit string comparison. */ - if (collate_nrules == 0) + /* We don't assign the following values right away since it might be + unnecessary in case there are no rules. 
*/ + const unsigned char *rulesets; + const int32_t *table; + const unsigned char *weights; + const unsigned char *extra; + const int32_t *indirect; + uint_fast32_t pass; + size_t needed; + const unsigned char *usrc; + size_t srclen = strlen (src); + int32_t *idxarr; + unsigned char *rulearr; + size_t idxmax; + size_t idxcnt; + int use_malloc = 0; + +#include "../locale/weight.h" + + if (nrules == 0) { if (n != 0) - STPNCPY (dest, src, n); + __stpncpy (dest, src, n); - return STRLEN (src); + return srclen; } +#ifdef USE_IN_EXTENDED_LOCALE_MODEL + rulesets = (const unsigned char *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; + table = (const int32_t *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string; + weights = (const unsigned char *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string; + extra = (const unsigned char *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string; + indirect = (const int32_t *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string; +#else + rulesets = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS); + table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + weights = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); + extra = (const unsigned char *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); + indirect = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); +#endif + /* Handle an empty string as a special case. */ - if (*src == '\0') + if (srclen == 0) { if (n != 0) - *dest = '\0'; + *dest = '\0'; return 1; } - /* Get full information about the string. This means we get - information for all passes in a special data structure. */ - get_string (src, forw, backw); + /* We need the elements of the string as unsigned values since they + are used as indices. 
*/ + usrc = (const unsigned char *) src; + + /* Perform the first pass over the string and while doing this find + and store the weights for each character. Since we want this to + be as fast as possible we are using `alloca' to store the temporary + values. But since there is no limit on the length of the string + we have to use `malloc' if the string is too long. We should be + very conservative here. */ + if (srclen >= 16384) + { + idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1)); + rulearr = (unsigned char *) &idxarr[srclen]; + + if (idxarr == NULL) + /* No memory. Well, go with the stack then. + + XXX Once this implementation is stable we will handle this + differently. Instead of precomputing the indices we will + do this in time. This means, though, that this happens for + every pass again. */ + goto try_stack; + use_malloc = 1; + } + else + { + try_stack: + idxarr = (int32_t *) alloca (srclen * sizeof (int32_t)); + rulearr = (unsigned char *) alloca (srclen); + } - /* Now we have all the information. In at most the given number of - passes we can finally decide about the order. */ - written = 0; - for (pass = 0; pass < collate_nrules; ++pass) + idxmax = 0; + do { - int forward = (collate_rules[pass] & sort_forward) != 0; - const weight_t *run = forward ? forw : backw; - int idx = forward ? 0 : run->data[pass].number - 1; + int32_t tmp = findidx (&usrc); + rulearr[idxmax] = tmp >> 24; + idxarr[idxmax] = tmp & 0x80ffffff; - while (1) + ++idxmax; + } + while (*usrc != '\0'); + + /* Now the passes over the weights. We now use the indices we found + before. */ + needed = 0; + for (pass = 0; pass < nrules; ++pass) + { + size_t backw_stop = ~0ul; + int rule = rulesets[rulearr[0] * nrules + pass]; + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = rule & sort_position; + + if (position == 0) { - int ignore = 0; - u_int32_t w = 0; - - /* Here we have to check for IGNORE entries. 
If these are - found we count them and go on with he next value. */ - while (run != NULL - && ((w = run->data[pass].value[idx]) - == (u_int32_t) IGNORE_CHAR)) + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { - ++ignore; - if (forward - ? ++idx >= run->data[pass].number - : --idx < 0) + if ((rule & sort_forward) != 0) { - weight_t *nextp = forward ? run->next : run->prev; - if (nextp == NULL) + size_t len; + + if (backw_stop != ~0ul) { - w = 0; - /* No more non-INGOREd elements means lowest - possible value. */ - ignore = -1; + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt - 1; backw >= backw_stop; --backw) + { + len = weights[idxarr[backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + + backw_stop = ~0ul; } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[idxcnt]++]; else - idx = forward ? 0 : nextp->data[pass].number - 1; - run = nextp; + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[idxcnt] += len; + } + } + else + { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; } - /* Stop if all characters are processed. */ - if (run == NULL) - break; - /* Now we have information of the number of ignored weights - and the value of the next weight. We have to add 2 - because 0 means EOS and 1 is the intermediate string end. */ - if ((collate_rules[pass] & sort_position) != 0) - written = print_val (ignore + 2, dest, n, written); + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. 
*/ + size_t backw; - if (w != 0) - written = print_val (w, dest, n, written); + for (backw = idxcnt - 1; backw >= backw_stop; --backw) + { + size_t len = weights[idxarr[backw]++]; - /* We have to increment the index counters. */ - if (forward) + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + } + } + else + { + int val = 1; + char buf[7]; + size_t buflen; + size_t i; + + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { - if (++idx >= run->data[pass].number) + if ((rule & sort_forward) != 0) + { + size_t len; + + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt - 1; backw >= backw_stop; --backw) + { + len = weights[idxarr[backw]++]; + if (len != 0) + { + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[backw] + i]; + } + idxarr[backw] += len; + needed += buflen + len; + val = 1; + } + else + ++val; + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (len != 0) + { + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[idxcnt] + i]; + } + idxarr[idxcnt] += len; + needed += buflen + len; + val = 1; + } + else + /* Note that we don't have to increment `idxarr[idxcnt]' + since the length is zero. */ + ++val; + } + else { - run = run->next; - idx = 0; + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; } - else + + if (backw_stop != ~0) { - if (--idx < 0) + /* Handle the pushed elements now. 
*/ + size_t backw; + + for (backw = idxmax - 1; backw >= backw_stop; --backw) { - run = run->prev; - if (run != NULL) - idx = run->data[pass].number - 1; + size_t len = weights[idxarr[backw]++]; + if (len != 0) + { + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[backw] + i]; + } + idxarr[backw] += len; + needed += buflen + len; + val = 1; + } + else + ++val; } } } - /* Write marker for end of word. */ - if (pass + 1 < collate_nrules) - written = print_val (1, dest, n, written); + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < nrules ? '\1' : '\0'; + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && dest[needed - 2] == '\1') + { + /* Remove the \1 byte. */ + --needed; + dest[needed - 1] = '\0'; } - /* Terminate string. */ - if (written < n) - dest[written] = L_('\0'); + /* Free the memory if needed. */ + if (use_malloc) + free (idxarr); - /* Return length without counting the terminating '\0'. */ - return written; + return needed; } -#endif diff --git a/wcsmbs/wcsxfrm.c b/wcsmbs/wcsxfrm.c index e41251f559..99a359399e 100644 --- a/wcsmbs/wcsxfrm.c +++ b/wcsmbs/wcsxfrm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc. +/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. 
@@ -19,16 +19,23 @@ #include -#define WIDE_VERSION 1 -#define STRING_TYPE wchar_t -#define USTRING_TYPE wint_t -#define L_(Ch) L##Ch #ifdef USE_IN_EXTENDED_LOCALE_MODEL # define STRXFRM __wcsxfrm_l #else # define STRXFRM wcsxfrm #endif -#define STRLEN __wcslen -#define STPNCPY __wcpncpy -#include + +#ifndef USE_IN_EXTENDED_LOCALE_MODEL +size_t +STRXFRM (wchar_t *dest, const wchar_t *src, size_t n) +#else +size_t +STRXFRM (wchar_t *dest, const wchar_t *src, size_t n, __locale_t l) +#endif +{ + if (n != 0) + __wcpncpy (dest, src, n); + + return __wcslen (src); +} diff --git a/wctype/wctrans.c b/wctype/wctrans.c index a5b4a32aac..5d7b5bda38 100644 --- a/wctype/wctrans.c +++ b/wctype/wctrans.c @@ -52,5 +52,5 @@ wctrans (const char *property) /* We have to search the table. */ result = (int32_t *) _NL_CURRENT (LC_CTYPE, _NL_NUM_LC_CTYPE + cnt - 2); - return (wctrans_t) (result + 128); + return (wctrans_t) result; } -- cgit 1.4.1