about summary refs log tree commit diff
path: root/posix/regex_internal.c
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@redhat.com> 2003-11-12 17:47:46 +0000
committer Ulrich Drepper <drepper@redhat.com> 2003-11-12 17:47:46 +0000
commit 3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1 (patch)
tree d993d46e33582e130e8324c5bd7fb7817d2a7f54 /posix/regex_internal.c
parent e432c0378ecfa545948d4be68190ab470446554b (diff)
download glibc-3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1.tar.gz
glibc-3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1.tar.xz
glibc-3c0fb5745f66c8920ed4cfa8d3ead55216b15ec1.zip
Update.
2003-11-12  Jakub Jelinek  <jakub@redhat.com>

	* io/ftw.c (NFTW_NEW_NAME, NFTW_OLD_NAME): Add prototypes.

2003-11-12  Jakub Jelinek  <jakub@redhat.com>

	* posix/tst-regex.c (umemlen): New variable.
	(test_expr): Add expectedicase argument.  Test case insensitive
	searches as well as backwards searches (case sensitive and
	insensitive) too.
	(run_test): Add icase argument.  Use it to compute regcomp flags.
	(run_test_backwards): New function.
	(main): Cast read to size_t to avoid warning.  Set umemlen.
	Add expectedicase arguments to test_expr.
	* posix/regex_internal.c (re_string_reconstruct): If is_utf8,
	find previous character by walking back instead of converting
	all chars from beginning.

2003-11-12  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.h (struct re_string_t): Add is_utf8
	and mb_cur_max fields.
	(struct re_dfa_t): Likewise.  Reorder fields to make structure
	smaller on 64-bit arches.
	(re_string_allocate, re_string_construct): Add mb_cur_max and
	is_utf8 arguments.
	(re_string_char_size_at, re_string_wchar_at): Use pstr->mb_cur_max
	instead of MB_CUR_MAX.
	* posix/regcomp.c (re_compile_fastmap_iter): Use dfa->mb_cur_max
	instead of MB_CUR_MAX.
	(re_compile_internal): Pass new arguments to re_string_construct.
	(init_dfa): Initialize mb_cur_max and is_utf8 fields.
	(peek_token, peek_token_bracket): Use input->mb_cur_max instead
	of MB_CUR_MAX.
	(parse_expression, parse_bracket_exp, parse_charclass_op): Use
	dfa->mb_cur_max instead of MB_CUR_MAX.
	* posix/regex_internal.c (re_string_construct_common): Add
	mb_cur_max and is_utf8 arguments.  Initialize fields with them.
	(re_string_allocate, re_string_construct): Add mb_cur_max and
	is_utf8 arguments, pass them to re_string_construct_common.
	Use mb_cur_max instead of MB_CUR_MAX.
	(re_string_realloc_buffers): Use pstr->mb_cur_max instead of
	MB_CUR_MAX.
	(re_string_reconstruct): Likewise.
	(re_string_context_at): Use input->mb_cur_max instead of
	MB_CUR_MAX.
	(create_ci_newstate, create_cd_newstate): Use dfa->mb_cur_max
	instead of MB_CUR_MAX.
	* posix/regexec.c (re_search_internal): Likewise.
	Pass new arguments to re_string_allocate.
	(check_matching, transit_state_sb): Use dfa->mb_cur_max instead of
	MB_CUR_MAX.
	(extend_buffers): Use pstr->mb_cur_max instead of MB_CUR_MAX.

2003-11-12  Jakub Jelinek  <jakub@redhat.com>

	* posix/Makefile (tests): Add bug-regex19.
	(bug-regex19-ENV): Add LOCPATH.
	* posix/bug-regex19.c: New test.
Diffstat (limited to 'posix/regex_internal.c')
-rw-r--r-- posix/regex_internal.c 86
1 files changed, 61 insertions, 25 deletions
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index e328243a57..329fc81de9 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -20,7 +20,8 @@
 
 static void re_string_construct_common (const char *str, int len,
 					re_string_t *pstr,
-					RE_TRANSLATE_TYPE trans, int icase);
+					RE_TRANSLATE_TYPE trans, int icase,
+					int mb_cur_max, int is_utf8);
 #ifdef RE_ENABLE_I18N
 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
 				 wint_t *last_wc);
@@ -46,15 +47,17 @@ static unsigned int inline calc_state_hash (const re_node_set *nodes,
    re_string_reconstruct before using the object.  */
 
 static reg_errcode_t
-re_string_allocate (pstr, str, len, init_len, trans, icase)
+re_string_allocate (pstr, str, len, init_len, trans, icase,
+		    mb_cur_max, is_utf8)
      re_string_t *pstr;
      const char *str;
-     int len, init_len, icase;
+     int len, init_len, icase, mb_cur_max, is_utf8;
      RE_TRANSLATE_TYPE trans;
 {
   reg_errcode_t ret;
   int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
-  re_string_construct_common (str, len, pstr, trans, icase);
+  re_string_construct_common (str, len, pstr, trans, icase,
+			      mb_cur_max, is_utf8);
   pstr->stop = pstr->len;
 
   ret = re_string_realloc_buffers (pstr, init_buf_len);
@@ -65,21 +68,22 @@ re_string_allocate (pstr, str, len, init_len, trans, icase)
 		    : (unsigned char *) str);
   pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
   pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
-		     || MB_CUR_MAX > 1) ? pstr->valid_len : len;
+		     || mb_cur_max > 1) ? pstr->valid_len : len;
   return REG_NOERROR;
 }
 
 /* This function allocate the buffers, and initialize them.  */
 
 static reg_errcode_t
-re_string_construct (pstr, str, len, trans, icase)
+re_string_construct (pstr, str, len, trans, icase, mb_cur_max, is_utf8)
      re_string_t *pstr;
      const char *str;
-     int len, icase;
+     int len, icase, mb_cur_max, is_utf8;
      RE_TRANSLATE_TYPE trans;
 {
   reg_errcode_t ret;
-  re_string_construct_common (str, len, pstr, trans, icase);
+  re_string_construct_common (str, len, pstr, trans, icase,
+			      mb_cur_max, is_utf8);
   pstr->stop = pstr->len;
   /* Set 0 so that this function can initialize whole buffers.  */
   pstr->valid_len = 0;
@@ -97,7 +101,7 @@ re_string_construct (pstr, str, len, trans, icase)
   if (icase)
     {
 #ifdef RE_ENABLE_I18N
-      if (MB_CUR_MAX > 1)
+      if (mb_cur_max > 1)
 	build_wcs_upper_buffer (pstr);
       else
 #endif /* RE_ENABLE_I18N  */
@@ -106,7 +110,7 @@ re_string_construct (pstr, str, len, trans, icase)
   else
     {
 #ifdef RE_ENABLE_I18N
-      if (MB_CUR_MAX > 1)
+      if (mb_cur_max > 1)
 	build_wcs_buffer (pstr);
       else
 #endif /* RE_ENABLE_I18N  */
@@ -131,7 +135,7 @@ re_string_realloc_buffers (pstr, new_buf_len)
      int new_buf_len;
 {
 #ifdef RE_ENABLE_I18N
-  if (MB_CUR_MAX > 1)
+  if (pstr->mb_cur_max > 1)
     {
       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
       if (BE (new_array == NULL, 0))
@@ -163,18 +167,20 @@ re_string_realloc_buffers (pstr, new_buf_len)
 
 
 static void
-re_string_construct_common (str, len, pstr, trans, icase)
+re_string_construct_common (str, len, pstr, trans, icase, mb_cur_max, is_utf8)
      const char *str;
      int len;
      re_string_t *pstr;
      RE_TRANSLATE_TYPE trans;
-     int icase;
+     int icase, mb_cur_max, is_utf8;
 {
   memset (pstr, '\0', sizeof (re_string_t));
   pstr->raw_mbs = (const unsigned char *) str;
   pstr->len = len;
   pstr->trans = trans;
   pstr->icase = icase ? 1 : 0;
+  pstr->mb_cur_max = mb_cur_max;
+  pstr->is_utf8 = is_utf8;
 }
 
 #ifdef RE_ENABLE_I18N
@@ -372,7 +378,7 @@ re_string_translate_buffer (pstr)
 }
 
 /* This function re-construct the buffers.
-   Concretely, convert to wide character in case of MB_CUR_MAX > 1,
+   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
    convert to upper case in case of REG_ICASE, apply translation.  */
 
 static reg_errcode_t
@@ -385,7 +391,7 @@ re_string_reconstruct (pstr, idx, eflags, newline)
     {
       /* Reset buffer.  */
 #ifdef RE_ENABLE_I18N
-      if (MB_CUR_MAX > 1)
+      if (pstr->mb_cur_max > 1)
 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 #endif /* RE_ENABLE_I18N */
       pstr->len += pstr->raw_mbs_idx;
@@ -409,7 +415,7 @@ re_string_reconstruct (pstr, idx, eflags, newline)
 	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
 						    newline);
 #ifdef RE_ENABLE_I18N
-	  if (MB_CUR_MAX > 1)
+	  if (pstr->mb_cur_max > 1)
 	    memmove (pstr->wcs, pstr->wcs + offset,
 		     (pstr->valid_len - offset) * sizeof (wint_t));
 #endif /* RE_ENABLE_I18N */
@@ -429,13 +435,43 @@ re_string_reconstruct (pstr, idx, eflags, newline)
 	  /* No, skip all characters until IDX.  */
 	  pstr->valid_len = 0;
 #ifdef RE_ENABLE_I18N
-	  if (MB_CUR_MAX > 1)
+	  if (pstr->mb_cur_max > 1)
 	    {
 	      int wcs_idx;
-	      wint_t wc;
-	      pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
-	      for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
-		pstr->wcs[wcs_idx] = WEOF;
+	      wint_t wc = WEOF;
+
+	      if (pstr->is_utf8)
+		{
+		  const unsigned char *raw, *p, *end;
+
+		  /* Special case UTF-8.  Multi-byte chars start with any
+		     byte other than 0x80 - 0xbf.  */
+		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
+		  end = raw + (pstr->valid_len > offset - pstr->mb_cur_max
+			       ? pstr->valid_len : offset - pstr->mb_cur_max);
+		  for (p = raw + offset - 1; p >= end; --p)
+		    if ((*p & 0xc0) != 0x80)
+		      {
+			mbstate_t cur_state;
+			wchar_t wc2;
+
+			memset (&cur_state, 0, sizeof (cur_state));
+			if (mbrtowc (&wc2, p, raw + offset - p, &cur_state)
+			    == raw + offset - p)
+			  {
+			    memset (&pstr->cur_state, '\0',
+				    sizeof (mbstate_t));
+			    wc = wc2;
+			  }
+			break;
+		      }
+		}
+	      if (wc == WEOF)
+		{
+		  pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+		    pstr->wcs[wcs_idx] = WEOF;
+		}
 	      if (pstr->trans && wc <= 0xff)
 		wc = pstr->trans[wc];
 	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
@@ -467,7 +503,7 @@ re_string_reconstruct (pstr, idx, eflags, newline)
 
   /* Then build the buffers.  */
 #ifdef RE_ENABLE_I18N
-  if (MB_CUR_MAX > 1)
+  if (pstr->mb_cur_max > 1)
     {
       if (pstr->icase)
 	build_wcs_upper_buffer (pstr);
@@ -519,7 +555,7 @@ re_string_context_at (input, idx, eflags, newline_anchor)
 		: CONTEXT_NEWLINE | CONTEXT_ENDBUF);
     }
 #ifdef RE_ENABLE_I18N
-  if (MB_CUR_MAX > 1)
+  if (input->mb_cur_max > 1)
     {
       wint_t wc;
       int wc_idx = idx;
@@ -1157,7 +1193,7 @@ create_ci_newstate (dfa, nodes, hash)
 	newstate->halt = 1;
 #ifdef RE_ENABLE_I18N
       else if (type == COMPLEX_BRACKET
-	       || (type == OP_PERIOD && MB_CUR_MAX > 1))
+	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
 	newstate->accept_mb = 1;
 #endif /* RE_ENABLE_I18N */
       else if (type == OP_BACK_REF)
@@ -1208,7 +1244,7 @@ create_cd_newstate (dfa, nodes, context, hash)
 	newstate->halt = 1;
 #ifdef RE_ENABLE_I18N
       else if (type == COMPLEX_BRACKET
-	       || (type == OP_PERIOD && MB_CUR_MAX > 1))
+	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
 	newstate->accept_mb = 1;
 #endif /* RE_ENABLE_I18N */
       else if (type == OP_BACK_REF)