diff options
Diffstat (limited to 'REORG.TODO/string/strxfrm_l.c')
-rw-r--r-- | REORG.TODO/string/strxfrm_l.c | 747 |
1 files changed, 747 insertions, 0 deletions
diff --git a/REORG.TODO/string/strxfrm_l.c b/REORG.TODO/string/strxfrm_l.c new file mode 100644 index 0000000000..dd98a4caaf --- /dev/null +++ b/REORG.TODO/string/strxfrm_l.c @@ -0,0 +1,747 @@ +/* Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper <drepper@gnu.org>, 1995. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <langinfo.h> +#include <locale.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/param.h> + +#ifndef STRING_TYPE +# define STRING_TYPE char +# define USTRING_TYPE unsigned char +# define STRXFRM __strxfrm_l +# define STRLEN strlen +# define STPNCPY __stpncpy +# define WEIGHT_H "../locale/weight.h" +# define SUFFIX MB +# define L(arg) arg +#endif + +#define CONCAT(a,b) CONCAT1(a,b) +#define CONCAT1(a,b) a##b + +/* Maximum string size that is calculated with cached indices. Right now this + is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be + lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */ +#define SMALL_STR_SIZE 4095 + +#include "../locale/localeinfo.h" +#include WEIGHT_H + +/* Group locale data for shorter parameter lists. */ +typedef struct +{ + uint_fast32_t nrules; + unsigned char *rulesets; + USTRING_TYPE *weights; + int32_t *table; + USTRING_TYPE *extra; + int32_t *indirect; +} locale_data_t; + +#ifndef WIDE_CHAR_VERSION + +/* We need UTF-8 encoding of numbers. */ +static int +utf8_encode (char *buf, int val) +{ + int retval; + + if (val < 0x80) + { + *buf++ = (char) val; + retval = 1; + } + else + { + int step; + + for (step = 2; step < 6; ++step) + if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) + break; + retval = step; + + *buf = (unsigned char) (~0xff >> step); + --step; + do + { + buf[step] = 0x80 | (val & 0x3f); + val >>= 6; + } + while (--step > 0); + *buf |= val; + } + + return retval; +} +#endif + +/* Find next weight and rule index. Inlined since called for every char. */ +static __always_inline size_t +find_idx (const USTRING_TYPE **us, int32_t *weight_idx, + unsigned char *rule_idx, const locale_data_t *l_data, const int pass) +{ + int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us, + -1); + *rule_idx = tmp >> 24; + int32_t idx = tmp & 0xffffff; + size_t len = l_data->weights[idx++]; + + /* Skip over indices of previous levels. */ + for (int i = 0; i < pass; i++) + { + idx += len; + len = l_data->weights[idx++]; + } + + *weight_idx = idx; + return len; +} + +static int +find_position (const USTRING_TYPE *us, const locale_data_t *l_data, + const int pass) +{ + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE *usrc = us; + + find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass); + return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position; +} + +/* Do the transformation. */ +static size_t +do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n, + const locale_data_t *l_data) +{ + int32_t weight_idx; + unsigned char rule_idx; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + + /* Now the passes over the weights. */ + for (pass = 0; pass < l_data->nrules; ++pass) + { + size_t backw_len = 0; + last_needed = needed; + const USTRING_TYPE *cur = usrc; + const USTRING_TYPE *backw_start = NULL; + + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = find_position (cur, l_data, pass); + + if (position == 0) + { + while (*cur != L('\0')) + { + const USTRING_TYPE *pos = cur; + size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data, + pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) + { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) + { + for (size_t i = backw_len; i > 0; ) + { + int32_t weight_idx; + unsigned char rule_idx; + size_t len = find_idx (&backw_start, &weight_idx, + &rule_idx, l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = + l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (needed + len < n) + while (len-- > 0) + dest[needed++] = l_data->weights[weight_idx++]; + else + /* No more characters fit into the buffer. */ + needed += len; + } + else + { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len += len; + } + } + + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) + { + for (size_t i = backw_len; i > 0; ) + { + size_t len = find_idx (&backw_start, &weight_idx, &rule_idx, + l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = + l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + } + } + else + { + int val = 1; +#ifndef WIDE_CHAR_VERSION + char buf[7]; + size_t buflen; +#endif + size_t i; + + while (*cur != L('\0')) + { + const USTRING_TYPE *pos = cur; + size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data, + pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) + { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) + { + for (size_t p = backw_len; p > 0; p--) + { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE *backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx (&backw_cur, &weight_idx, + &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = find_idx (&backw_cur, &weight_idx, + &rule_idx, l_data, pass); + + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1 + len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + l_data->weights[weight_idx + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + l_data->weights[weight_idx + i]; + } + needed += buflen + len; +#endif + val = 1; + } + else + ++val; + } + + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1 + len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + l_data->weights[weight_idx + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + l_data->weights[weight_idx + i]; + } + needed += buflen + len; +#endif + val = 1; + } + else + ++val; + } + else + { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len++; + } + } + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) + { + for (size_t p = backw_len; p > 0; p--) + { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE *backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx (&backw_cur, &weight_idx, + &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = find_idx (&backw_cur, &weight_idx, + &rule_idx, l_data, pass); + + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1 + len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + l_data->weights[weight_idx + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + l_data->weights[weight_idx + i]; + } + needed += buflen + len; +#endif + val = 1; + } + else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) + { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +/* Do the transformation using weight-index and rule cache. */ +static size_t +do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data, + size_t idxmax, int32_t *idxarr, const unsigned char *rulearr) +{ + uint_fast32_t nrules = l_data->nrules; + unsigned char *rulesets = l_data->rulesets; + USTRING_TYPE *weights = l_data->weights; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + size_t idxcnt; + + /* Now the passes over the weights. */ + for (pass = 0; pass < nrules; ++pass) + { + size_t backw_stop = ~0ul; + int rule = rulesets[rulearr[0] * nrules + pass]; + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = rule & sort_position; + + last_needed = needed; + if (position == 0) + { + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) + { + if ((rule & sort_forward) != 0) + { + size_t len; + + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop; ) + { + --backw; + len = weights[idxarr[backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[idxcnt]++]; + else + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[idxcnt] += len; + } + } + else + { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxcnt; + while (backw > backw_stop) + { + size_t len = weights[idxarr[--backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else + { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + } + } + else + { + int val = 1; +#ifndef WIDE_CHAR_VERSION + char buf[7]; + size_t buflen; +#endif + size_t i; + + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) + { + if ((rule & sort_forward) != 0) + { + size_t len; + + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop; ) + { + --backw; + len = weights[idxarr[backw]++]; + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1 + len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + weights[idxarr[backw] + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[backw] + i]; + } + needed += buflen + len; +#endif + idxarr[backw] += len; + val = 1; + } + else + ++val; + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1+ len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + weights[idxarr[idxcnt] + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[idxcnt] + i]; + } + needed += buflen + len; +#endif + idxarr[idxcnt] += len; + val = 1; + } + else + /* Note that we don't have to increment `idxarr[idxcnt]' + since the length is zero. */ + ++val; + } + else + { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + if (backw_stop != ~0ul) + { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxmax - 1; + while (backw > backw_stop) + { + size_t len = weights[idxarr[--backw]++]; + if (len != 0) + { +#ifdef WIDE_CHAR_VERSION + if (needed + 1 + len < n) + { + dest[needed] = val; + for (i = 0; i < len; ++i) + dest[needed + 1 + i] = + weights[idxarr[backw] + i]; + } + needed += 1 + len; +#else + buflen = utf8_encode (buf, val); + if (needed + buflen + len < n) + { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = + weights[idxarr[backw] + i]; + } + needed += buflen + len; +#endif + idxarr[backw] += len; + val = 1; + } + else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) + { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +size_t +STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) +{ + locale_data_t l_data; + struct __locale_data *current = l->__locales[LC_COLLATE]; + l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word; + + /* Handle byte comparison case. */ + if (l_data.nrules == 0) + { + size_t srclen = STRLEN (src); + + if (n != 0) + STPNCPY (dest, src, MIN (srclen + 1, n)); + + return srclen; + } + + /* Handle an empty string, code hereafter relies on strlen (src) > 0. */ + if (*src == L('\0')) + { + if (n != 0) + *dest = L('\0'); + return 0; + } + + /* Get the locale data. */ + l_data.rulesets = (unsigned char *) + current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; + l_data.table = (int32_t *) + current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string; + l_data.weights = (USTRING_TYPE *) + current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string; + l_data.extra = (USTRING_TYPE *) + current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string; + l_data.indirect = (int32_t *) + current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string; + + assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0); + assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0); + assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0); + assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0); + + /* We need the elements of the string as unsigned values since they + are used as indeces. */ + const USTRING_TYPE *usrc = (const USTRING_TYPE *) src; + + /* Allocate cache for small strings on the stack and fill it with weight and + rule indices. If the cache size is not sufficient, continue with the + uncached xfrm version. */ + size_t idxmax = 0; + const USTRING_TYPE *cur = usrc; + int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t)); + unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1); + + do + { + int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur, + -1); + rulearr[idxmax] = tmp >> 24; + idxarr[idxmax] = tmp & 0xffffff; + + ++idxmax; + } + while (*cur != L('\0') && idxmax < SMALL_STR_SIZE); + + /* This element is only read, the value never used but to determine + another value which then is ignored. */ + rulearr[idxmax] = '\0'; + + /* Do the transformation. */ + if (*cur == L('\0')) + return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr); + else + return do_xfrm (usrc, dest, n, &l_data); +} +libc_hidden_def (STRXFRM) + +#ifndef WIDE_CHAR_VERSION +weak_alias (__strxfrm_l, strxfrm_l) +#endif |