about summary refs log tree commit diff
path: root/posix/regcomp.c
diff options
context:
space:
mode:
author: Ulrich Drepper <drepper@redhat.com> 2003-11-12 19:09:20 +0000
committer: Ulrich Drepper <drepper@redhat.com> 2003-11-12 19:09:20 +0000
commit: 14744156b935eb7fb1a2013fdc3ce6613defa94d (patch)
tree: b6dfa5ab8046feddf82d275f25b9baee162dddfc /posix/regcomp.c
parent: 3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1 (diff)
download: glibc-14744156b935eb7fb1a2013fdc3ce6613defa94d.tar.gz
glibc-14744156b935eb7fb1a2013fdc3ce6613defa94d.tar.xz
glibc-14744156b935eb7fb1a2013fdc3ce6613defa94d.zip
Update.
	* posix/regcomp.c (optimize_utf8): New function.
	(re_compile_fastmap_iter): Use dfa->mb_cur_max > 1 instead
	of !icase.
	(re_compile_internal): Call optimize_utf8 if not case insensitive
	and in UTF-8 locale.
	* posix/regex_internal.h: Ifdef out some prototypes if
	RE_NO_INTERNAL_PROTOTYPES is defined to shut up warnings.
	* posix/Makefile (tests): Add bug-regex20.
	(bug-regex20-ENV): Add LOCPATH.
	* posix/bug-regex20.c: New test.

2003-11-12  Jakub Jelinek  <jakub@redhat.com>
Diffstat (limited to 'posix/regcomp.c')
-rw-r--r-- posix/regcomp.c 68
1 files changed, 66 insertions, 2 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 82d4bb1c57..ce91ef6807 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -30,6 +30,9 @@ static void free_charset (re_charset_t *cset);
 #endif /* RE_ENABLE_I18N */
 static void free_workarea_compile (regex_t *preg);
 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
+#ifdef RE_ENABLE_I18N
+static void optimize_utf8 (re_dfa_t *dfa);
+#endif
 static reg_errcode_t analyze (re_dfa_t *dfa);
 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -322,7 +325,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap)
 	{
 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 #ifdef RE_ENABLE_I18N
-	  if ((bufp->syntax & RE_ICASE) && !icase)
+	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 	    {
 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 	      wchar_t wc;
@@ -389,7 +392,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap)
 	      memset (&state, '\0', sizeof (state));
 	      __wcrtomb (buf, cset->mbchars[i], &state);
 	      re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
-	      if ((bufp->syntax & RE_ICASE) && !icase)
+	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 		{
 		  __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 		  re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
@@ -760,6 +763,12 @@ re_compile_internal (preg, pattern, length, syntax)
   if (BE (dfa->str_tree == NULL, 0))
     goto re_compile_internal_free_return;
 
+#ifdef RE_ENABLE_I18N
+  /* If possible, do searching in single byte encoding to speed things up.  */
+  if (dfa->is_utf8 && !(syntax & RE_ICASE))
+    optimize_utf8 (dfa);
+#endif
+
   /* Analyze the tree and collect information which is necessary to
      create the dfa.  */
   err = analyze (dfa);
@@ -945,6 +954,61 @@ create_initial_state (dfa)
   return REG_NOERROR;
 }
 
+#ifdef RE_ENABLE_I18N
+/* Scan every node of DFA.  If each one is of a kind accepted below (no
+   CHARACTER >= 0x80, only line/buffer anchors), switch to single byte
+   search: set mb_cur_max to 1, clear is_utf8, has_mb_node = nbackref > 0.  */
+
+static void
+optimize_utf8 (dfa)
+     re_dfa_t *dfa;
+{
+  int node;	/* Index into dfa->nodes.  */
+
+  for (node = 0; node < dfa->nodes_len; ++node)
+    switch (dfa->nodes[node].type)
+      {
+      case CHARACTER:
+        /* Chars >= 0x80 are optimizable in some cases (e.g. when not
+	   followed by DUP operator, not in bracket etc.).
+	   For now punt on them all: return with DFA left unmodified.  */
+	if (dfa->nodes[node].opr.c >= 0x80)
+	  return;
+	break;
+      case ANCHOR:
+	switch (dfa->nodes[node].opr.idx)
+	  {
+	  case LINE_FIRST:
+	  case LINE_LAST:
+	  case BUF_FIRST:
+	  case BUF_LAST:
+	    break;
+	  default:
+	    /* Word anchors etc. cannot be handled; give up.  */
+	    return;
+	  }
+	break;
+      case OP_BACK_REF:
+      case OP_ALT:
+      case END_OF_RE:
+      case BACK_SLASH:
+      case OP_DUP_ASTERISK:
+      case OP_DUP_QUESTION:
+      case OP_DUP_PLUS:
+      case OP_OPEN_SUBEXP:
+      case OP_CLOSE_SUBEXP:
+	break;	/* Accepted node type: keep scanning.  */
+      default:
+	return;	/* Any node type not listed above: give up.  */
+      }
+
+  /* Every node was accepted: the search can be in single byte locale.  */
+  dfa->mb_cur_max = 1;
+  dfa->is_utf8 = 0;
+  dfa->has_mb_node = dfa->nbackref > 0;
+}
+#endif
+
 /* Analyze the structure tree, and calculate "first", "next", "edest",
    "eclosure", and "inveclosure".  */