about summary refs log tree commit diff
path: root/posix/regex_internal.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2005-09-07 01:15:33 +0000
committerUlrich Drepper <drepper@redhat.com>2005-09-07 01:15:33 +0000
commit01ed6ceb7c440f0695726463ee9ee307921ea97e (patch)
treeeda9aef2d63fd0c0f39e51208e7bb9d463d65661 /posix/regex_internal.c
parent2d87db5b5341bd6b714f175c1c268b7136444a71 (diff)
downloadglibc-01ed6ceb7c440f0695726463ee9ee307921ea97e.tar.gz
glibc-01ed6ceb7c440f0695726463ee9ee307921ea97e.tar.xz
glibc-01ed6ceb7c440f0695726463ee9ee307921ea97e.zip
* posix/regex_internal.c (re_string_reconstruct): Avoid calling
	mbrtowc for very simple UTF-8 case.

2005-09-01  Paul Eggert  <eggert@cs.ucla.edu>

	* posix/regex_internal.c (build_wcs_upper_buffer): Fix portability
	bugs in int versus size_t comparisons.

2005-09-06  Ulrich Drepper  <drepper@redhat.com>

	* posix/regex_internal.c (re_acquire_state): Make DFA pointer arg
	a pointer-to-const.
	(re_acquire_state_context): Likewise.
	* posix/regex_internal.h: Adjust prototypes.

2005-08-31  Jim Meyering  <jim@meyering.net>

	* posix/regcomp.c (search_duplicated_node): Make first pointer arg
	a pointer-to-const.
	* posix/regex_internal.c (create_ci_newstate, create_cd_newstate,
	register_state): Likewise.
	* posix/regexec.c (search_cur_bkref_entry, check_dst_limits):
	(check_dst_limits_calc_pos_1, check_dst_limits_calc_pos):
	(group_nodes_into_DFAstates): Likewise.

	* posix/regexec.c (re_search_internal): Simplify update of
	rm_so and rm_eo by replacing "if (A == B) A += C - B;"
	with the equivalent of "if (A == B) A = C;".

2005-09-06  Ulrich Drepper  <drepper@redhat.com>

	* posix/regcomp.c (re_compile_internal): Change third parameter type
	to size_t.
	(init_dfa): Likewise.  Make sure that arithmetic on pat_len doesn't
	overflow.
	* posix/regex_internal.h (struct re_dfa_t): Change type of nodes_alloc
	and nodes_len to size_t.
	* posix/regex_internal.c (re_dfa_add_node): Use size_t as type for
	new_nodes_alloc.  Check for overflow.

2005-08-31  Paul Eggert  <eggert@cs.ucla.edu>

	* posix/regcomp.c (re_compile_fastmap_iter, init_dfa, init_word_char):
	(optimize_subexps, lower_subexp):
	Don't assume 1<<31 has defined behavior on hosts with 32-bit int,
	since the signed shift might overflow.  Use 1u<<31 instead.
	* posix/regex_internal.h (bitset_set, bitset_clear, bitset_contain):
	Likewise.
	* posix/regexec.c (check_dst_limits_calc_pos_1): Likewise.
	(check_subexp_matching_top): Likewise.
	* posix/regcomp.c (optimize_subexps, lower_subexp):
	Use CHAR_BIT rather than 8, for clarity.
	* posix/regexec.c (check_dst_limits_calc_pos_1):
	(check_subexp_matching_top): Likewise.
	* posix/regcomp.c (init_dfa): Make table_size unsigned, so that we
	don't have to worry about portability issues when shifting it left.
	Remove no-longer-needed test for table_size > 0.
	* posix/regcomp.c (parse_sub_exp): Do not shift more bits than there
	are in a word, as the resulting behavior is undefined.
	* posix/regexec.c (check_dst_limits_calc_pos_1): Likewise;
	in one case, a <= should have been an <, and in another case the
	whole test was missing.
	* posix/regex_internal.h (BYTE_BITS): Remove.  All uses changed to
	the standard name CHAR_BIT.
Diffstat (limited to 'posix/regex_internal.c')
-rw-r--r--posix/regex_internal.c120
1 files changed, 69 insertions, 51 deletions
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index 821ed7f45f..240e8872b3 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -26,12 +26,13 @@ static void re_string_construct_common (const char *str, int len,
 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
 				 wint_t *last_wc) internal_function;
 #endif /* RE_ENABLE_I18N */
-static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
+static reg_errcode_t register_state (const re_dfa_t *dfa,
+				     re_dfastate_t *newstate,
 				     unsigned int hash) internal_function;
-static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
+static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
 					  const re_node_set *nodes,
 					  unsigned int hash) internal_function;
-static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
+static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
 					  const re_node_set *nodes,
 					  unsigned int context,
 					  unsigned int hash) internal_function;
@@ -654,37 +655,50 @@ re_string_reconstruct (pstr, idx, eflags)
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 		  end = raw + (offset - pstr->mb_cur_max);
-		  for (p = raw + offset - 1; p >= end; --p)
-		    if ((*p & 0xc0) != 0x80)
-		      {
-			mbstate_t cur_state;
-			wchar_t wc2;
-			int mlen = raw + pstr->len - p;
-			unsigned char buf[6];
-
-			q = p;
-			if (BE (pstr->trans != NULL, 0))
-			  {
-			    int i = mlen < 6 ? mlen : 6;
-			    while (--i >= 0)
-			      buf[i] = pstr->trans[p[i]];
-			    q = buf;
-			  }
-			/* XXX Don't use mbrtowc, we know which conversion
-			   to use (UTF-8 -> UCS4).  */
-			memset (&cur_state, 0, sizeof (cur_state));
-			mlen = (mbrtowc (&wc2, (const char *) p, mlen,
-					 &cur_state)
-				- (raw + offset - p));
-			if (mlen >= 0)
-			  {
-			    memset (&pstr->cur_state, '\0',
-				    sizeof (mbstate_t));
-			    pstr->valid_len = mlen;
-			    wc = wc2;
-			  }
-			break;
-		      }
+		  p = raw + offset - 1;
+#ifdef _LIBC
+		  /* We know the wchar_t encoding is UCS4, so for the simple
+		     case, ASCII characters, skip the conversion step.  */
+		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
+		    {
+		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+		      pstr->valid_len = 0;
+		      wc = (wchar_t) *p;
+		    }
+		  else
+#endif
+		    for (; p >= end; --p)
+		      if ((*p & 0xc0) != 0x80)
+			{
+			  mbstate_t cur_state;
+			  wchar_t wc2;
+			  int mlen = raw + pstr->len - p;
+			  unsigned char buf[6];
+			  size_t mbclen;
+
+			  q = p;
+			  if (BE (pstr->trans != NULL, 0))
+			    {
+			      int i = mlen < 6 ? mlen : 6;
+			      while (--i >= 0)
+				buf[i] = pstr->trans[p[i]];
+			      q = buf;
+			    }
+			  /* XXX Don't use mbrtowc, we know which conversion
+			     to use (UTF-8 -> UCS4).  */
+			  memset (&cur_state, 0, sizeof (cur_state));
+			  mbclen = mbrtowc (&wc2, (const char *) p, mlen,
+					    &cur_state);
+			  if (raw + offset - p <= mbclen
+			      && mbclen < (size_t) -2)
+			    {
+			      memset (&pstr->cur_state, '\0',
+				      sizeof (mbstate_t));
+			      pstr->valid_len = mbclen - (raw + offset - p);
+			      wc = wc2;
+			    }
+			  break;
+			}
 		}
 
 	      if (wc == WEOF)
@@ -738,15 +752,15 @@ re_string_reconstruct (pstr, idx, eflags)
     }
   else
 #endif /* RE_ENABLE_I18N */
-  if (BE (pstr->mbs_allocated, 0))
-    {
-      if (pstr->icase)
-	build_upper_buffer (pstr);
-      else if (pstr->trans != NULL)
-	re_string_translate_buffer (pstr);
-    }
-  else
-    pstr->valid_len = pstr->len;
+    if (BE (pstr->mbs_allocated, 0))
+      {
+	if (pstr->icase)
+	  build_upper_buffer (pstr);
+	else if (pstr->trans != NULL)
+	  re_string_translate_buffer (pstr);
+      }
+    else
+      pstr->valid_len = pstr->len;
 
   pstr->cur_idx = 0;
   return REG_NOERROR;
@@ -1345,12 +1359,16 @@ re_dfa_add_node (dfa, token)
   int type = token.type;
   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
     {
-      int new_nodes_alloc = dfa->nodes_alloc * 2;
+      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
       int *new_nexts, *new_indices;
       re_node_set *new_edests, *new_eclosures;
+      re_token_t *new_nodes;
+
+      /* Avoid overflows.  */
+      if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
+	return -1;
 
-      re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t,
-					  new_nodes_alloc);
+      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
       if (BE (new_nodes == NULL, 0))
 	return -1;
       dfa->nodes = new_nodes;
@@ -1403,7 +1421,7 @@ calc_state_hash (nodes, context)
 static re_dfastate_t*
 re_acquire_state (err, dfa, nodes)
      reg_errcode_t *err;
-     re_dfa_t *dfa;
+     const re_dfa_t *dfa;
      const re_node_set *nodes;
 {
   unsigned int hash;
@@ -1448,7 +1466,7 @@ re_acquire_state (err, dfa, nodes)
 static re_dfastate_t*
 re_acquire_state_context (err, dfa, nodes, context)
      reg_errcode_t *err;
-     re_dfa_t *dfa;
+     const re_dfa_t *dfa;
      const re_node_set *nodes;
      unsigned int context;
 {
@@ -1486,7 +1504,7 @@ re_acquire_state_context (err, dfa, nodes, context)
 
 static reg_errcode_t
 register_state (dfa, newstate, hash)
-     re_dfa_t *dfa;
+     const re_dfa_t *dfa;
      re_dfastate_t *newstate;
      unsigned int hash;
 {
@@ -1525,7 +1543,7 @@ register_state (dfa, newstate, hash)
 
 static re_dfastate_t *
 create_ci_newstate (dfa, nodes, hash)
-     re_dfa_t *dfa;
+     const re_dfa_t *dfa;
      const re_node_set *nodes;
      unsigned int hash;
 {
@@ -1576,7 +1594,7 @@ create_ci_newstate (dfa, nodes, hash)
 
 static re_dfastate_t *
 create_cd_newstate (dfa, nodes, context, hash)
-     re_dfa_t *dfa;
+     const re_dfa_t *dfa;
      const re_node_set *nodes;
      unsigned int context, hash;
 {