about summary refs log tree commit diff
path: root/posix/regexec.c
diff options
context:
space:
mode:
Diffstat (limited to 'posix/regexec.c')
-rw-r--r--posix/regexec.c352
1 files changed, 193 insertions, 159 deletions
diff --git a/posix/regexec.c b/posix/regexec.c
index 72ae70b916..a03df2636a 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -22,8 +22,6 @@ static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
 				     int n) internal_function;
 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
 static void match_ctx_free (re_match_context_t *cache) internal_function;
-static void match_ctx_free_subtops (re_match_context_t *mctx)
-     internal_function;
 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
 					  int str_idx, int from, int to)
      internal_function;
@@ -606,15 +604,16 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
   reg_errcode_t err;
   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
   int left_lim, right_lim, incr;
-  int fl_longest_match, match_first, match_last = -1;
-  int fast_translate, sb;
+  int fl_longest_match, match_first, match_kind, match_last = -1;
+  int sb, ch;
 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
   re_match_context_t mctx = { .dfa = dfa };
 #else
   re_match_context_t mctx;
 #endif
-  char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
-		    && range && !preg->can_be_null) ? preg->fastmap : NULL);
+  char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
+		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
+  unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 
 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
   memset (&mctx, '\0', sizeof (re_match_context_t));
@@ -685,88 +684,100 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
   left_lim = (range < 0) ? start + range : start;
   right_lim = (range < 0) ? start : start + range;
   sb = dfa->mb_cur_max == 1;
-  fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
-
-  for (;;)
+  match_kind =
+    (fastmap
+     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
+	| (range >= 0 ? 2 : 0)
+	| (t != NULL ? 1 : 0))
+     : 8);
+
+  for (;; match_first += incr)
     {
-      /* At first get the current byte from input string.  */
-      if (fastmap)
+      err = REG_NOMATCH;
+      if (match_first < left_lim || right_lim < match_first)
+	goto free_return;
+
+      /* Advance as rapidly as possible through the string, until we
+	 find a plausible place to start matching.  This may be done
+	 with varying efficiency, so there are various possibilities:
+	 only the most common of them are specialized, in order to
+	 save on code size.  We use a switch statement for speed.  */
+      switch (match_kind)
 	{
-	  if (BE (fast_translate, 1))
+	case 8:
+	  /* No fastmap.  */
+	  break;
+
+	case 7:
+	  /* Fastmap with single-byte translation, match forward.  */
+	  while (BE (match_first < right_lim, 1)
+		 && !fastmap[t[(unsigned char) string[match_first]]])
+	    ++match_first;
+	  goto forward_match_found_start_or_reached_end;
+
+	case 6:
+	  /* Fastmap without translation, match forward.  */
+	  while (BE (match_first < right_lim, 1)
+		 && !fastmap[(unsigned char) string[match_first]])
+	    ++match_first;
+
+	forward_match_found_start_or_reached_end:
+	  if (BE (match_first == right_lim, 0))
 	    {
-	      unsigned RE_TRANSLATE_TYPE t
-		= (unsigned RE_TRANSLATE_TYPE) preg->translate;
-	      if (BE (range >= 0, 1))
-		{
-		  if (BE (t != NULL, 0))
-		    {
-		      while (BE (match_first < right_lim, 1)
-			     && !fastmap[t[(unsigned char) string[match_first]]])
-			++match_first;
-		    }
-		  else
-		    {
-		      while (BE (match_first < right_lim, 1)
-			     && !fastmap[(unsigned char) string[match_first]])
-			++match_first;
-		    }
-		  if (BE (match_first == right_lim, 0))
-		    {
-		      int ch = match_first >= length
-			       ? 0 : (unsigned char) string[match_first];
-		      if (!fastmap[t ? t[ch] : ch])
-			break;
-		    }
-		}
-	      else
-		{
-		  while (match_first >= left_lim)
-		    {
-		      int ch = match_first >= length
-			       ? 0 : (unsigned char) string[match_first];
-		      if (fastmap[t ? t[ch] : ch])
-			break;
-		      --match_first;
-		    }
-		  if (match_first < left_lim)
-		    break;
-		}
+	      ch = match_first >= length
+		       ? 0 : (unsigned char) string[match_first];
+	      if (!fastmap[t ? t[ch] : ch])
+		goto free_return;
 	    }
-	  else
+	  break;
+
+	case 4:
+	case 5:
+	  /* Fastmap without multi-byte translation, match backwards.  */
+	  while (match_first >= left_lim)
 	    {
-	      int ch;
+	      ch = match_first >= length
+		       ? 0 : (unsigned char) string[match_first];
+	      if (fastmap[t ? t[ch] : ch])
+		break;
+	      --match_first;
+	    }
+	  if (match_first < left_lim)
+	    goto free_return;
+	  break;
 
-	      do
+	default:
+	  /* In this case, we can't determine easily the current byte,
+	     since it might be a component byte of a multibyte
+	     character.  Then we use the constructed buffer instead.  */
+	  for (;;)
+	    {
+	      /* If MATCH_FIRST is out of the valid range, reconstruct the
+		 buffers.  */
+	      unsigned int offset = match_first - mctx.input.raw_mbs_idx;
+	      if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 		{
-		  /* In this case, we can't determine easily the current byte,
-		     since it might be a component byte of a multibyte
-		     character.  Then we use the constructed buffer
-		     instead.  */
-		  /* If MATCH_FIRST is out of the valid range, reconstruct the
-		     buffers.  */
-		  if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
-		      <= match_first
-		      || match_first < mctx.input.raw_mbs_idx)
-		    {
-		      err = re_string_reconstruct (&mctx.input, match_first,
-						   eflags);
-		      if (BE (err != REG_NOERROR, 0))
-			goto free_return;
-		    }
-		  /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
-		     Note that MATCH_FIRST must not be smaller than 0.  */
-		  ch = ((match_first >= length) ? 0
-		       : re_string_byte_at (&mctx.input,
-					    match_first
-					    - mctx.input.raw_mbs_idx));
-		  if (fastmap[ch])
-		    break;
-		  match_first += incr;
+		  err = re_string_reconstruct (&mctx.input, match_first,
+					       eflags);
+		  if (BE (err != REG_NOERROR, 0))
+		    goto free_return;
+
+		  offset = match_first - mctx.input.raw_mbs_idx;
 		}
-	      while (match_first >= left_lim && match_first <= right_lim);
-	      if (! fastmap[ch])
+	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
+		 Note that MATCH_FIRST must not be smaller than 0.  */
+	      ch = (match_first >= length
+		    ? 0 : re_string_byte_at (&mctx.input, offset));
+	      if (fastmap[ch])
 		break;
+	      match_first += incr;
+	      if (match_first < left_lim || match_first > right_lim)
+	        {
+	          err = REG_NOMATCH;
+	          goto free_return;
+	        }
 	    }
+	  break;
 	}
 
       /* Reconstruct the buffers so that the matcher can assume that
@@ -774,57 +785,60 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
       err = re_string_reconstruct (&mctx.input, match_first, eflags);
       if (BE (err != REG_NOERROR, 0))
 	goto free_return;
+
 #ifdef RE_ENABLE_I18N
-     /* Eliminate it when it is a component of a multibyte character
-	 and isn't the head of a multibyte character.  */
-      if (sb || re_string_first_byte (&mctx.input, 0))
+     /* Don't consider this char as a possible match start if it part,
+	yet isn't the head, of a multibyte character.  */
+      if (!sb && !re_string_first_byte (&mctx.input, 0))
+	continue;
 #endif
+
+      /* It seems to be appropriate one, then use the matcher.  */
+      /* We assume that the matching starts from 0.  */
+      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
+      match_last = check_matching (&mctx, fl_longest_match,
+				   range >= 0 ? &match_first : NULL);
+      if (match_last != -1)
 	{
-	  /* It seems to be appropriate one, then use the matcher.  */
-	  /* We assume that the matching starts from 0.  */
-	  mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
-	  match_last = check_matching (&mctx, fl_longest_match,
-				       range >= 0 ? &match_first : NULL);
-	  if (match_last != -1)
+	  if (BE (match_last == -2, 0))
 	    {
-	      if (BE (match_last == -2, 0))
+	      err = REG_ESPACE;
+	      goto free_return;
+	    }
+	  else
+	    {
+	      mctx.match_last = match_last;
+	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 		{
-		  err = REG_ESPACE;
-		  goto free_return;
+		  re_dfastate_t *pstate = mctx.state_log[match_last];
+		  mctx.last_node = check_halt_state_context (&mctx, pstate,
+							     match_last);
 		}
-	      else
+	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
+		  || dfa->nbackref)
 		{
-		  mctx.match_last = match_last;
-		  if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
-		    {
-		      re_dfastate_t *pstate = mctx.state_log[match_last];
-		      mctx.last_node = check_halt_state_context (&mctx, pstate,
-								 match_last);
-		    }
-		  if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
-		      || dfa->nbackref)
-		    {
-		      err = prune_impossible_nodes (&mctx);
-		      if (err == REG_NOERROR)
-			break;
-		      if (BE (err != REG_NOMATCH, 0))
-			goto free_return;
-		      match_last = -1;
-		    }
-		  else
-		    break; /* We found a match.  */
+		  err = prune_impossible_nodes (&mctx);
+		  if (err == REG_NOERROR)
+		    break;
+		  if (BE (err != REG_NOMATCH, 0))
+		    goto free_return;
+		  match_last = -1;
 		}
+	      else
+		break; /* We found a match.  */
 	    }
-	  match_ctx_clean (&mctx);
 	}
-      /* Update counter.  */
-      match_first += incr;
-      if (match_first < left_lim || right_lim < match_first)
-	break;
+
+      match_ctx_clean (&mctx);
     }
 
+#ifdef DEBUG
+  assert (match_last != -1);
+  assert (err == REG_NOERROR);
+#endif
+
   /* Set pmatch[] if we need.  */
-  if (match_last != -1 && nmatch > 0)
+  if (nmatch > 0)
     {
       int reg_idx;
 
@@ -869,7 +883,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 	    pmatch[reg_idx].rm_eo += match_first;
 	  }
     }
-  err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
+
  free_return:
   re_free (mctx.state_log);
   if (dfa->nbackref)
@@ -1072,6 +1086,20 @@ check_matching (mctx, fl_longest_match, p_match_first)
   while (!re_string_eoi (&mctx->input))
     {
       re_dfastate_t *old_state = cur_state;
+      int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
+
+      if (BE (next_char_idx >= mctx->input.bufs_len, 0)
+          || (BE (next_char_idx >= mctx->input.valid_len, 0)
+              && mctx->input.valid_len < mctx->input.len))
+        {
+          err = extend_buffers (mctx);
+          if (BE (err != REG_NOERROR, 0))
+	    {
+	      assert (err == REG_ESPACE);
+	      return -2;
+	    }
+        }
+
       cur_state = transit_state (&err, mctx, cur_state);
       if (mctx->state_log != NULL)
 	cur_state = merge_state_with_log (&err, mctx, cur_state);
@@ -1090,10 +1118,10 @@ check_matching (mctx, fl_longest_match, p_match_first)
 	    break;
 	}
 
-      if (at_init_state)
+      if (BE (at_init_state, 0))
 	{
 	  if (old_state == cur_state)
-	    next_start_idx = re_string_cur_idx (&mctx->input);
+	    next_start_idx = next_char_idx;
 	  else
 	    at_init_state = 0;
 	}
@@ -1109,13 +1137,16 @@ check_matching (mctx, fl_longest_match, p_match_first)
 	      /* We found an appropriate halt state.  */
 	      match_last = re_string_cur_idx (&mctx->input);
 	      match = 1;
+
+	      /* We found a match, do not modify match_first below.  */
+	      p_match_first = NULL;
 	      if (!fl_longest_match)
 		break;
 	    }
 	}
-   }
+    }
 
-  if (match_last == -1 && p_match_first)
+  if (p_match_first)
     *p_match_first += next_start_idx;
 
   return match_last;
@@ -1854,7 +1885,12 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
 	      {
 		int dst, cpos;
 
-		if (ent->node != node || ent->subexp_from != ent->subexp_to)
+		if (ent->node != node)
+		  continue;
+
+		if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
+		    && (ent->eps_reachable_subexps_map
+			& (1 << (subexp_idx - 1))) == 0)
 		  continue;
 
 		/* Recurse trying to reach the OP_OPEN_SUBEXP and
@@ -1875,11 +1911,13 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
 		cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
 						    subexp_idx, dst, bkref_idx);
 
-		if (cpos == -1 && (boundaries & 1))
+		if (cpos == -1 /* && (boundaries & 1) */)
 		  return -1;
 
-		if (cpos == 0 /* && (boundaries & 2) */)
+		if (cpos == 0 && (boundaries & 2))
 		  return 0;
+
+		ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
 	      }
 	    while (ent++->more);
 	    break;
@@ -2167,23 +2205,14 @@ transit_state (err, mctx, state)
   re_dfastate_t **trtable;
   unsigned char ch;
 
-  if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
-      || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
-	  && mctx->input.valid_len < mctx->input.len))
+#ifdef RE_ENABLE_I18N
+  /* If the current state can accept multibyte.  */
+  if (BE (state->accept_mb, 0))
     {
-      *err = extend_buffers (mctx);
+      *err = transit_state_mb (mctx, state);
       if (BE (*err != REG_NOERROR, 0))
 	return NULL;
     }
-
-#ifdef RE_ENABLE_I18N
-      /* If the current state can accept multibyte.  */
-      if (state->accept_mb)
-	{
-	  *err = transit_state_mb (mctx, state);
-	  if (BE (*err != REG_NOERROR, 0))
-	    return NULL;
-	}
 #endif /* RE_ENABLE_I18N */
 
   /* Then decide the next state with the single byte.  */
@@ -4078,28 +4107,6 @@ static void
 match_ctx_clean (mctx)
     re_match_context_t *mctx;
 {
-  match_ctx_free_subtops (mctx);
-  mctx->nsub_tops = 0;
-  mctx->nbkref_ents = 0;
-}
-
-/* Free all the memory associated with MCTX.  */
-
-static void
-match_ctx_free (mctx)
-    re_match_context_t *mctx;
-{
-  match_ctx_free_subtops (mctx);
-  re_free (mctx->sub_tops);
-  re_free (mctx->bkref_ents);
-}
-
-/* Free all the memory associated with MCTX->SUB_TOPS.  */
-
-static void
-match_ctx_free_subtops (mctx)
-     re_match_context_t *mctx;
-{
   int st_idx;
   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
     {
@@ -4119,6 +4126,21 @@ match_ctx_free_subtops (mctx)
 	}
       free (top);
     }
+
+  mctx->nsub_tops = 0;
+  mctx->nbkref_ents = 0;
+}
+
+/* Free all the memory associated with MCTX.  */
+
+static void
+match_ctx_free (mctx)
+    re_match_context_t *mctx;
+{
+  /* First, free all the memory associated with MCTX->SUB_TOPS.  */
+  match_ctx_clean (mctx);
+  re_free (mctx->sub_tops);
+  re_free (mctx->bkref_ents);
 }
 
 /* Add a new backreference entry to MCTX.
@@ -4154,6 +4176,18 @@ match_ctx_add_entry (mctx, node, str_idx, from, to)
   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
+
+  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
+     If bit N is clear, means that this entry won't epsilon-transition to
+     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
+     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
+     such node.
+
+     A backreference does not epsilon-transition unless it is empty, so set
+     to all zeros if FROM != TO.  */
+  mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
+    = (from == to ? ~0 : 0);
+
   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
   if (mctx->max_mb_elem_len < to - from)
     mctx->max_mb_elem_len = to - from;