From 02f3550c8bf47ecff6b548bc8ba3219d234a41a3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 26 Jan 2005 22:42:49 +0000 Subject: [BZ #605, BZ #611] Update. 2004-12-13 Paolo Bonzini Separate parsing and creation of the NFA. Avoided recursion on the (very unbalanced) parse tree. [BZ #611] * posix/regcomp.c (struct subexp_optimize, analyze_tree, calc_epsdest, re_dfa_add_tree_node, mark_opt_subexp_iter): Removed. (optimize_subexps, duplicate_tree, calc_first, calc_next, mark_opt_subexp): Rewritten. (preorder, postorder, lower_subexps, lower_subexp, link_nfa_nodes, create_token_tree, free_tree, free_token): New. (analyze): Accept a regex_t *. Invoke the passes via the preorder and postorder generic visitors. Do not initialize the fields in the re_dfa_t that represent the transitions. (free_dfa_content): Use free_token. (re_compile_internal): Analyze before UTF-8 optimizations. Do not include optimization of subexpressions. (create_initial_state): Fetch the DFA node index from the first node's bin_tree_t *. (optimize_utf8): Abort on unexpected nodes, including OP_DUP_QUESTION. Return on COMPLEX_BRACKET. (duplicate_node_closure): Fix comment. (duplicate_node): Do not initialize the fields in the re_dfa_t that represent the transitions. (calc_eclosure, calc_inveclosure): Do not handle OP_DELETED_SUBEXP. (create_tree): Remove final argument. All callers adjusted. Rewritten to use create_token_tree. (parse_reg_exp, parse_branch, parse_expression, parse_bracket_exp, build_charclass_op): Use create_tree or create_token_tree instead of re_dfa_add_tree_node. (parse_dup_op): Likewise. Also free the tree using free_tree for "{0}", and lower OP_DUP_QUESTION to OP_ALT: "a?" is equivalent to "a|". Adjust invocation of mark_opt_subexp. (parse_sub_exp): Create a single SUBEXP node. * posix/regex_internal.c (re_dfa_add_node): Remove last parameter, always perform as if it was 1. Do not initialize OPT_SUBEXP and DUPLICATED, and initialize the DFA fields representing the transitions. * posix/regex_internal.h (re_dfa_add_node): Adjust prototype. (re_token_type_t): Move OP_DUP_PLUS and OP_DUP_QUESTION to the tokens section. Add a tree-only code SUBEXP. Remove OP_DELETED_SUBEXP. (bin_tree_t): Include a full re_token_t for TOKEN. Turn FIRST and NEXT into pointers to trees. Remove ECLOSURE. 2004-12-28 Paolo Bonzini [BZ #605] * posix/regcomp.c (parse_bracket_exp): Do not modify DFA nodes that were already created. * posix/regex_internal.c (re_dfa_add_node): Set accept_mb field in the token if needed. (create_ci_newstate, create_cd_newstate): Set accept_mb field from the tokens' field. * posix/regex_internal.h (re_token_t): Add accept_mb field. (ACCEPT_MB_NODE): Removed. * posix/regexec.c (proceed_next_node, transit_states_mb, build_sifted_states, check_arrival_add_next_nodes): Use accept_mb instead of ACCEPT_MB_NODE. --- posix/regex_internal.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'posix/regex_internal.h') diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 694a69965a..f065cf449d 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -189,16 +189,16 @@ typedef enum OP_CLOSE_SUBEXP = EPSILON_BIT | 1, OP_ALT = EPSILON_BIT | 2, OP_DUP_ASTERISK = EPSILON_BIT | 3, - OP_DUP_PLUS = EPSILON_BIT | 4, - OP_DUP_QUESTION = EPSILON_BIT | 5, - ANCHOR = EPSILON_BIT | 6, - OP_DELETED_SUBEXP = EPSILON_BIT | 7, + ANCHOR = EPSILON_BIT | 4, /* Tree type, these are used only by tree. */ CONCAT = 16, + SUBEXP = 17, /* Token type, these are used only by token. */ - OP_OPEN_BRACKET = 17, + OP_DUP_PLUS = 18, + OP_DUP_QUESTION, + OP_OPEN_BRACKET, OP_CLOSE_BRACKET, OP_CHARSET_RANGE, OP_OPEN_DUP_NUM, @@ -287,6 +287,7 @@ typedef struct unsigned int duplicated : 1; unsigned int opt_subexp : 1; #ifdef RE_ENABLE_I18N + unsigned int accept_mb : 1; /* These 2 bits can be moved into the union if needed (e.g. if running out of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */ unsigned int mb_partial : 1; @@ -295,8 +296,6 @@ typedef struct } re_token_t; #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT) -#define ACCEPT_MB_NODE(type) \ - ((type) >= OP_PERIOD && (type) <= OP_UTF8_PERIOD) struct re_string_t { @@ -432,15 +431,14 @@ struct bin_tree_t struct bin_tree_t *parent; struct bin_tree_t *left; struct bin_tree_t *right; + struct bin_tree_t *first; + struct bin_tree_t *next; + + re_token_t token; /* `node_idx' is the index in dfa->nodes, if `type' == 0. Otherwise `type' indicate the type of this node. */ - re_token_type_t type; int node_idx; - - int first; - int next; - re_node_set eclosure; }; typedef struct bin_tree_t bin_tree_t; @@ -680,7 +678,7 @@ static void re_node_set_remove_at (re_node_set *set, int idx) internal_function; (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1)) #define re_node_set_empty(p) ((p)->nelem = 0) #define re_node_set_free(set) re_free ((set)->elems) -static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token, int mode) internal_function; +static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function; static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes) internal_function; static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err, -- cgit 1.4.1