diff options
author | Ulrich Drepper <drepper@redhat.com> | 2005-09-28 17:33:18 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2005-09-28 17:33:18 +0000 |
commit | 2c05d33f90861d074dc12808dafbde30f487b1a0 (patch) | |
tree | 654e4c8e7c500eca23285264cbb19a0945f05638 /posix/regcomp.c | |
parent | 1873e3cd1a7f5173d20d9060f3be825f31a53a39 (diff) | |
download | glibc-2c05d33f90861d074dc12808dafbde30f487b1a0.tar.gz glibc-2c05d33f90861d074dc12808dafbde30f487b1a0.tar.xz glibc-2c05d33f90861d074dc12808dafbde30f487b1a0.zip |
[BZ #1302]
2005-09-06 Paul Eggert <eggert@cs.ucla.edu> Ulrich Drepper <drepper@redhat.com> [BZ #1302] Change bitset word type from unsigned int to unsigned long int, as this has better performance on typical 64-bit hosts. Change bitset type name to bitset_t. * posix/regcomp.c (build_equiv_class, build_charclass): (build_range_exp, build_collating_symbol): Prefer bitset_t to re_bitset_ptr_t in prototypes, when the actual argument is a bitset. This is merely a style issue, but it makes it clearer that an entire array is expected. (re_compile_fastmap_iter, init_dfa, init_word_char, optimize_subexps, lower_subexp): Adjust for new bitset_t definition. (lower_subexp, parse_bracket_exp, built_charclass_op): Likewise. * posix/regex_internal.h (bitset_set, bitset_clear, bitset_contain, bitset_not, bitset_merge, bitset_set_all, bitset_mask): Likewise. * posix/regexec.c (check_dst_limits_calc_pos_1, check_subexp_matching_top, build_trtable, group_nodes_into_DFAstates): Likewise. * posix/regcomp.c (utf8_sb_map): Don't assume initializer == 0xffffffff. * posix/regex_internal.h (BITSET_WORD_BITS): Renamed from UINT_BITS. All uses changed. (BITSET_WORDS): Renamed from BITSET_UINTS. All uses changed. (bitset_word_t): New type, replacing 'unsigned int' for bitset uses. All uses changed. (BITSET_WORD_MAX): New macro. (bitset_set, bitset_clear, bitset_contain, bitset_empty, (bitset_set_all, bitset_copy): Adjust for bitset_t change. (bitset_empty, bitset_copy): Prefer sizeof (bitset_t) to multiplying it out ourselves. (bitset_not_merge): Remove; unused. (bitset_contain): Return bool, not unsigned int with one bit on. All callers changed. * posix/regexec.c (build_trtable): Don't assume bitset_t has no stricter alignment than re_node_set; do this by defining a new internal type struct dests_alloc and using it to allocate memory.
Diffstat (limited to 'posix/regcomp.c')
-rw-r--r-- | posix/regcomp.c | 93 |
1 files changed, 45 insertions, 48 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c index bf374a8b61..fde262b83c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -113,21 +113,21 @@ static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, # endif /* not RE_ENABLE_I18N */ #endif /* not _LIBC */ #ifdef RE_ENABLE_I18N -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, int *equiv_class_alloc, const unsigned char *name); static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, + bitset_t sbcset, re_charset_t *mbcset, int *char_class_alloc, const unsigned char *class_name, reg_syntax_t syntax); #else /* not RE_ENABLE_I18N */ -static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, +static reg_errcode_t build_equiv_class (bitset_t sbcset, const unsigned char *name); static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, - re_bitset_ptr_t sbcset, + bitset_t sbcset, const unsigned char *class_name, reg_syntax_t syntax); #endif /* not RE_ENABLE_I18N */ @@ -354,7 +354,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) && dfa->nodes[node].type == CHARACTER && dfa->nodes[node].mb_partial) *p++ = dfa->nodes[node].opr.c; - memset (&state, 0, sizeof (state)); + memset (&state, '\0', sizeof (state)); if (mbrtowc (&wc, (const char *) buf, p - buf, &state) == p - buf && (__wcrtomb ((char *) buf, towlower (wc), &state) @@ -365,11 +365,15 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) } else if (type == SIMPLE_BRACKET) { - int i, j, ch; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1u << j)) - re_set_fastmap (fastmap, icase, ch); + int i, ch; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + { + int j; + bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (w & ((bitset_word_t) 1 << j)) + re_set_fastmap (fastmap, icase, ch); + } } #ifdef RE_ENABLE_I18N else if (type == COMPLEX_BRACKET) @@ -388,13 +392,11 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) is a valid collation element, and don't catch 'b' since 'b' is the only collation element which starts from 'b'. */ - int j, ch; const int32_t *table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) - if (table[ch] < 0) - re_set_fastmap (fastmap, icase, ch); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); } # else if (dfa->mb_cur_max > 1) @@ -581,14 +583,10 @@ weak_alias (__regerror, regerror) UTF-8 is used. Otherwise we would allocate memory just to initialize it the same all the time. UTF-8 is the preferred encoding so this is a worthwhile optimization. */ -static const bitset utf8_sb_map = +static const bitset_t utf8_sb_map = { /* Set the first 128 bits. */ -# if UINT_MAX == 0xffffffff - 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff -# else -# error "Add case for new unsigned int size" -# endif + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX }; #endif @@ -908,20 +906,17 @@ init_dfa (dfa, pat_len) { int i, j, ch; - dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); if (BE (dfa->sb_char == NULL, 0)) return REG_ESPACE; - /* Clear all bits by, then set those corresponding to single - byte chars. */ - bitset_empty (dfa->sb_char); - - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) { wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1u << j; + dfa->sb_char[i] |= (bitset_word_t) 1 << j; # ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; @@ -946,10 +941,10 @@ init_word_char (dfa) { int i, j, ch; dfa->word_ops_used = 1; - for (i = 0, ch = 0; i < BITSET_UINTS; ++i) - for (j = 0; j < UINT_BITS; ++j, ++ch) + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1u << j; + dfa->word_char[i] |= (bitset_word_t) 1 << j; } /* Free the work area which are only used while compiling. */ @@ -1096,8 +1091,9 @@ optimize_utf8 (dfa) case COMPLEX_BRACKET: return; case SIMPLE_BRACKET: - /* Just double check. */ - for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i) + /* Just double check. The non-ASCII range starts at 0x80. */ + assert (0x80 % BITSET_WORD_BITS == 0); + for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) if (dfa->nodes[node].opr.sbcset[i]) return; break; @@ -1282,8 +1278,8 @@ optimize_subexps (extra, node) node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map) - dfa->used_bkref_map &= ~(1u << other_idx); + if (other_idx < BITSET_WORD_BITS) + dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); } return REG_NOERROR; @@ -1331,8 +1327,9 @@ lower_subexp (err, preg, node) very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL - && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map - || !(dfa->used_bkref_map & (1u << node->token.opr.idx)))) + && (node->token.opr.idx >= BITSET_WORD_BITS + || !(dfa->used_bkref_map + & ((bitset_word_t) 1 << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -2666,7 +2663,7 @@ build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) # else /* not RE_ENABLE_I18N */ build_range_exp (sbcset, start_elem, end_elem) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; + bitset_t sbcset; bracket_elem_t *start_elem, *end_elem; { unsigned int start_ch, end_ch; @@ -2788,7 +2785,7 @@ build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) # else /* not RE_ENABLE_I18N */ build_collating_symbol (sbcset, name) # endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *name; { size_t name_len = strlen ((const char *) name); @@ -2931,7 +2928,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) re_charset_t *mbcset; int *range_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; bracket_elem_t *start_elem, *end_elem; { unsigned int ch; @@ -3014,7 +3011,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) re_charset_t *mbcset; int *coll_sym_alloc; - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *name; { int32_t elem, idx; @@ -3099,7 +3096,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) _NL_COLLATE_SYMB_EXTRAMB); } #endif - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ @@ -3309,12 +3306,12 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); if (BE (mbc_tree == NULL, 0)) goto parse_bracket_exp_espace; - for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) + for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) if (sbcset[sbc_idx]) break; /* If there are no bits set in sbcset, there is no point of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ - if (sbc_idx < BITSET_UINTS) + if (sbc_idx < BITSET_WORDS) { /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; @@ -3464,7 +3461,7 @@ build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) #else /* not RE_ENABLE_I18N */ build_equiv_class (sbcset, name) #endif /* not RE_ENABLE_I18N */ - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *name; { #if defined _LIBC @@ -3560,7 +3557,7 @@ build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax) build_charclass (trans, sbcset, class_name, syntax) #endif /* not RE_ENABLE_I18N */ RE_TRANSLATE_TYPE trans; - re_bitset_ptr_t sbcset; + bitset_t sbcset; const unsigned char *class_name; reg_syntax_t syntax; { @@ -3649,7 +3646,7 @@ build_charclass_op (dfa, trans, class_name, extra, non_match, err) re_token_t br_token; bin_tree_t *tree; - sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); + sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); #ifdef RE_ENABLE_I18N mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); #endif /* RE_ENABLE_I18N */ |