From 02f3550c8bf47ecff6b548bc8ba3219d234a41a3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 26 Jan 2005 22:42:49 +0000 Subject: [BZ #605, BZ #611] Update. 2004-12-13 Paolo Bonzini Separate parsing and creation of the NFA. Avoided recursion on the (very unbalanced) parse tree. [BZ #611] * posix/regcomp.c (struct subexp_optimize, analyze_tree, calc_epsdest, re_dfa_add_tree_node, mark_opt_subexp_iter): Removed. (optimize_subexps, duplicate_tree, calc_first, calc_next, mark_opt_subexp): Rewritten. (preorder, postorder, lower_subexps, lower_subexp, link_nfa_nodes, create_token_tree, free_tree, free_token): New. (analyze): Accept a regex_t *. Invoke the passes via the preorder and postorder generic visitors. Do not initialize the fields in the re_dfa_t that represent the transitions. (free_dfa_content): Use free_token. (re_compile_internal): Analyze before UTF-8 optimizations. Do not include optimization of subexpressions. (create_initial_state): Fetch the DFA node index from the first node's bin_tree_t *. (optimize_utf8): Abort on unexpected nodes, including OP_DUP_QUESTION. Return on COMPLEX_BRACKET. (duplicate_node_closure): Fix comment. (duplicate_node): Do not initialize the fields in the re_dfa_t that represent the transitions. (calc_eclosure, calc_inveclosure): Do not handle OP_DELETED_SUBEXP. (create_tree): Remove final argument. All callers adjusted. Rewritten to use create_token_tree. (parse_reg_exp, parse_branch, parse_expression, parse_bracket_exp, build_charclass_op): Use create_tree or create_token_tree instead of re_dfa_add_tree_node. (parse_dup_op): Likewise. Also free the tree using free_tree for "{0}", and lower OP_DUP_QUESTION to OP_ALT: "a?" is equivalent to "a|". Adjust invocation of mark_opt_subexp. (parse_sub_exp): Create a single SUBEXP node. * posix/regex_internal.c (re_dfa_add_node): Remove last parameter, always perform as if it was 1. Do not initialize OPT_SUBEXP and DUPLICATED, and initialize the DFA fields representing the transitions. * posix/regex_internal.h (re_dfa_add_node): Adjust prototype. (re_token_type_t): Move OP_DUP_PLUS and OP_DUP_QUESTION to the tokens section. Add a tree-only code SUBEXP. Remove OP_DELETED_SUBEXP. (bin_tree_t): Include a full re_token_t for TOKEN. Turn FIRST and NEXT into pointers to trees. Remove ECLOSURE. 2004-12-28 Paolo Bonzini [BZ #605] * posix/regcomp.c (parse_bracket_exp): Do not modify DFA nodes that were already created. * posix/regex_internal.c (re_dfa_add_node): Set accept_mb field in the token if needed. (create_ci_newstate, create_cd_newstate): Set accept_mb field from the tokens' field. * posix/regex_internal.h (re_token_t): Add accept_mb field. (ACCEPT_MB_NODE): Removed. * posix/regexec.c (proceed_next_node, transit_states_mb, build_sifted_states, check_arrival_add_next_nodes): Use accept_mb instead of ACCEPT_MB_NODE. --- posix/regex_internal.c | 77 ++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) (limited to 'posix/regex_internal.c') diff --git a/posix/regex_internal.c b/posix/regex_internal.c index f15cb575eb..dbd3f24ca3 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -1330,47 +1330,49 @@ re_node_set_remove_at (set, idx) Or return -1, if an error will be occured. */ static int -re_dfa_add_node (dfa, token, mode) +re_dfa_add_node (dfa, token) re_dfa_t *dfa; re_token_t token; - int mode; { + int type = token.type; if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) { int new_nodes_alloc = dfa->nodes_alloc * 2; + int *new_nexts, *new_indices; + re_node_set *new_edests, *new_eclosures, *new_inveclosures; + re_token_t *new_array = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); if (BE (new_array == NULL, 0)) return -1; dfa->nodes = new_array; - if (mode) - { - int *new_nexts, *new_indices; - re_node_set *new_edests, *new_eclosures, *new_inveclosures; - - new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); - new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); - new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); - new_eclosures = re_realloc (dfa->eclosures, re_node_set, - new_nodes_alloc); - new_inveclosures = re_realloc (dfa->inveclosures, re_node_set, - new_nodes_alloc); - if (BE (new_nexts == NULL || new_indices == NULL - || new_edests == NULL || new_eclosures == NULL - || new_inveclosures == NULL, 0)) - return -1; - dfa->nexts = new_nexts; - dfa->org_indices = new_indices; - dfa->edests = new_edests; - dfa->eclosures = new_eclosures; - dfa->inveclosures = new_inveclosures; - } + new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); + new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); + new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); + new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); + new_inveclosures = re_realloc (dfa->inveclosures, re_node_set, + new_nodes_alloc); + if (BE (new_nexts == NULL || new_indices == NULL + || new_edests == NULL || new_eclosures == NULL + || new_inveclosures == NULL, 0)) + return -1; + dfa->nexts = new_nexts; + dfa->org_indices = new_indices; + dfa->edests = new_edests; + dfa->eclosures = new_eclosures; + dfa->inveclosures = new_inveclosures; dfa->nodes_alloc = new_nodes_alloc; } dfa->nodes[dfa->nodes_len] = token; - dfa->nodes[dfa->nodes_len].opt_subexp = 0; - dfa->nodes[dfa->nodes_len].duplicated = 0; dfa->nodes[dfa->nodes_len].constraint = 0; +#ifdef RE_ENABLE_I18N + dfa->nodes[dfa->nodes_len].accept_mb = + (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; +#endif + dfa->nexts[dfa->nodes_len] = -1; + re_node_set_init_empty (dfa->edests + dfa->nodes_len); + re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); + re_node_set_init_empty (dfa->inveclosures + dfa->nodes_len); return dfa->nodes_len++; } @@ -1551,16 +1553,13 @@ create_ci_newstate (dfa, nodes, hash) re_token_type_t type = node->type; if (type == CHARACTER && !node->constraint) continue; +#ifdef RE_ENABLE_I18N + newstate->accept_mb |= node->accept_mb; +#endif /* RE_ENABLE_I18N */ /* If the state has the halt node, the state is a halt state. */ - else if (type == END_OF_RE) + if (type == END_OF_RE) newstate->halt = 1; -#ifdef RE_ENABLE_I18N - else if (type == COMPLEX_BRACKET - || type == OP_UTF8_PERIOD - || (type == OP_PERIOD && dfa->mb_cur_max > 1)) - newstate->accept_mb = 1; -#endif /* RE_ENABLE_I18N */ else if (type == OP_BACK_REF) newstate->has_backref = 1; else if (type == ANCHOR || node->constraint) @@ -1611,15 +1610,13 @@ create_cd_newstate (dfa, nodes, context, hash) if (type == CHARACTER && !constraint) continue; - /* If the state has the halt node, the state is a halt state. */ - else if (type == END_OF_RE) - newstate->halt = 1; #ifdef RE_ENABLE_I18N - else if (type == COMPLEX_BRACKET - || type == OP_UTF8_PERIOD - || (type == OP_PERIOD && dfa->mb_cur_max > 1)) - newstate->accept_mb = 1; + newstate->accept_mb |= node->accept_mb; #endif /* RE_ENABLE_I18N */ + + /* If the state has the halt node, the state is a halt state. */ + if (type == END_OF_RE) + newstate->halt = 1; else if (type == OP_BACK_REF) newstate->has_backref = 1; else if (type == ANCHOR) -- cgit 1.4.1