about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog7
-rw-r--r--locale/programs/ld-collate.c45
-rw-r--r--posix/PTESTS12
-rw-r--r--posix/ptestcases.h12
-rw-r--r--posix/regex.c91
5 files changed, 86 insertions, 81 deletions
diff --git a/ChangeLog b/ChangeLog
index 8c10f3a301..0192430ee2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 1999-12-31  Ulrich Drepper  <drepper@cygnus.com>
 
+	* locale/programs/ld-collate.c (collate_output): Emit correct
+	information for collation elements.
+	Don't write over end of array idx.
+	* posix/regex.c: Handle also collation elements at end of range.
+
+	* posix/PTESTS: Fix a few typos.
+
 	* posix/bits/posix2_lim.h: Remove _POSIX2_EQUIV_CLASS_MAX.  I have
 	no idea where this came from.
 	* sysdeps/posix/sysconf.c: Remove _POSIX2_EQUIV_CLASS_MAX
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index 8eb47d7f8e..2cbea388b2 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -91,8 +91,6 @@ struct element_t
   unsigned int used_in_level;
 
   struct element_list_t *weights;
-  /* Index in the `weight' table in the output file for the character.  */
-  int32_t weights_idx;
 
   /* Nonzero if this is a real character definition.  */
   int is_character;
@@ -301,7 +299,6 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 
   /* Will be allocated later.  */
   newp->weights = NULL;
-  newp->weights_idx = 0;
 
   newp->file = NULL;
   newp->line = 0;
@@ -1809,9 +1806,6 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
       obstack_grow (pool, buf, len);
     }
 
-  /* Remember the index.  */
-  elem->weights_idx = retval;
-
   return retval | ((elem->section->ruleidx & 0x7f) << 24);
 }
 
@@ -1899,11 +1893,26 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
   /* If we have no LC_COLLATE data emit only the number of rules as zero.  */
   if (collate == NULL)
     {
+      int32_t dummy = 0;
+
       while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
 	{
-	  iov[2 + cnt].iov_base = (char *) "";
-	  iov[2 + cnt].iov_len = 0;
-	  idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+	  /* The words have to be handled specially.  */
+	  if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)
+	      || cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)
+	      || cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
+	    {
+	      iov[2 + cnt].iov_base = &dummy;
+	      iov[2 + cnt].iov_len = sizeof (int32_t);
+	    }
+	  else
+	    {
+	      iov[2 + cnt].iov_base = (char *) "";
+	      iov[2 + cnt].iov_len = 0;
+	    }
+
+	  if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
+	    idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
 	  ++cnt;
 	}
 
@@ -2453,23 +2462,20 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
 	      elem_table[idx * 2] = hash;
 	      elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
 
-	      /* Now add the index into the weights table.  We know the
-		 address is always 32bit aligned.  */
-	      if (sizeof (int) == sizeof (int32_t))
-		obstack_int_grow (&extrapool, runp->weights_idx);
-	      else
-		obstack_grow (&extrapool, &runp->weights_idx,
-			      sizeof (int32_t));
-
 	      /* The the string itself including length.  */
 	      obstack_1grow (&extrapool, namelen);
 	      obstack_grow (&extrapool, runp->name, namelen);
 
+	      /* And the multibyte representation.  */
+	      obstack_1grow (&extrapool, runp->nmbs);
+	      obstack_grow (&extrapool, runp->mbs, runp->nmbs);
+
 	      /* And align again to 32 bits.  */
-	      if ((1 + namelen) % sizeof (int32_t) != 0)
+	      if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
 		obstack_grow (&extrapool, "\0\0",
 			      (sizeof (int32_t)
-			       - (1 + namelen) % sizeof (int32_t)));
+			       - ((1 + namelen + 1 + runp->nmbs)
+				  % sizeof (int32_t))));
 	    }
 	}
 
@@ -2492,7 +2498,6 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
-  idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
   ++cnt;
 
 
diff --git a/posix/PTESTS b/posix/PTESTS
index 3580c81a03..b017f5b3f2 100644
--- a/posix/PTESTS
+++ b/posix/PTESTS
@@ -115,7 +115,7 @@
 3¦3¦[][.-.]-0]¦ab0-]¦
 3¦3¦[A-[.].]c]¦ab]!¦
 # GA122
--2¦-2¦[[.ch]]¦abc¦
+-2¦-2¦[[.ch.]]¦abc¦
 -2¦-2¦[[.ab.][.CD.][.EF.]]¦yZabCDEFQ9¦
 # GA125
 2¦2¦[[=a=]b]¦Abc¦
@@ -163,12 +163,12 @@
 2¦6¦bc[d-w]xy¦abchxyz¦
 # GA129
 1¦1¦[a-cd-f]¦dbccde¦
--1¦-1¦[a-ce-f¦dBCCdE¦
+-1¦-1¦[a-ce-f]¦dBCCdE¦
 2¦4¦b[n-zA-M]Y¦absY9Z¦
 2¦4¦b[n-zA-M]Y¦abGY9Z¦
 # GA130
 3¦3¦[-xy]¦ac-¦
-2¦4¦[c[-xy]D¦ac-D+¦
+2¦4¦c[-xy]D¦ac-D+¦
 2¦2¦[--/]¦a.b¦
 2¦4¦c[--/]D¦ac.D+b¦
 2¦2¦[^-ac]¦abcde-¦
@@ -189,7 +189,7 @@
 3¦4¦[a-c][e-f]¦acbedf¦
 4¦8¦abc*XYZ¦890abXYZ#*¦
 4¦9¦abc*XYZ¦890abcXYZ#*¦
-4¦15¦abc*XYZ¦890abccccccccXYZ#*¦
+4¦15¦abc*XYZ¦890abcccccccXYZ#*¦
 -1¦-1¦abc*XYZ¦890abc*XYZ#*¦
 # GA132
 2¦4¦\(*bc\)¦a*bc¦
@@ -267,7 +267,7 @@
 1¦1¦^a¦abc¦
 -1¦-1¦^b¦abc¦
 -1¦-1¦^[a-zA-Z]¦99Nine¦
-1¦4¦^[a-zA-Z]¦Nine99¦
+1¦4¦^[a-zA-Z]*¦Nine99¦
 # GA145(1)
 1¦2¦\(^a\)\1¦aabc¦
 -1¦-1¦\(^a\)\1¦^a^abc¦
@@ -284,7 +284,7 @@
 3¦3¦a$¦cba¦
 -1¦-1¦a$¦abc¦
 5¦7¦[a-z]*$¦99ZZxyz¦
--1¦-1¦[a-z]*$¦99ZZxyz99¦
+9¦9¦[a-z]*$¦99ZZxyz99¦
 3¦3¦$$¦ab$¦
 -1¦-1¦$$¦$ab¦
 3¦3¦\$$¦ab$¦
diff --git a/posix/ptestcases.h b/posix/ptestcases.h
index d6e099c82b..87f584d2e3 100644
--- a/posix/ptestcases.h
+++ b/posix/ptestcases.h
@@ -110,7 +110,7 @@
   { 3, 3, "[][.-.]-0]", "ab0-]",  },
   { 3, 3, "[A-[.].]c]", "ab]!",  },
   { 0, 0, "GA122", NULL, },
-  { -2, -2, "[[.ch]]", "abc",  },
+  { -2, -2, "[[.ch.]]", "abc",  },
   { -2, -2, "[[.ab.][.CD.][.EF.]]", "yZabCDEFQ9",  },
   { 0, 0, "GA125", NULL, },
   { 2, 2, "[[=a=]b]", "Abc",  },
@@ -158,12 +158,12 @@
   { 2, 6, "bc[d-w]xy", "abchxyz",  },
   { 0, 0, "GA129", NULL, },
   { 1, 1, "[a-cd-f]", "dbccde",  },
-  { -1, -1, "[a-ce-f", "dBCCdE",  },
+  { -1, -1, "[a-ce-f]", "dBCCdE",  },
   { 2, 4, "b[n-zA-M]Y", "absY9Z",  },
   { 2, 4, "b[n-zA-M]Y", "abGY9Z",  },
   { 0, 0, "GA130", NULL, },
   { 3, 3, "[-xy]", "ac-",  },
-  { 2, 4, "[c[-xy]D", "ac-D+",  },
+  { 2, 4, "c[-xy]D", "ac-D+",  },
   { 2, 2, "[--/]", "a.b",  },
   { 2, 4, "c[--/]D", "ac.D+b",  },
   { 2, 2, "[^-ac]", "abcde-",  },
@@ -184,7 +184,7 @@
   { 3, 4, "[a-c][e-f]", "acbedf",  },
   { 4, 8, "abc*XYZ", "890abXYZ#*",  },
   { 4, 9, "abc*XYZ", "890abcXYZ#*",  },
-  { 4, 15, "abc*XYZ", "890abccccccccXYZ#*",  },
+  { 4, 15, "abc*XYZ", "890abcccccccXYZ#*",  },
   { -1, -1, "abc*XYZ", "890abc*XYZ#*",  },
   { 0, 0, "GA132", NULL, },
   { 2, 4, "\\(*bc\\)", "a*bc",  },
@@ -262,7 +262,7 @@
   { 1, 1, "^a", "abc",  },
   { -1, -1, "^b", "abc",  },
   { -1, -1, "^[a-zA-Z]", "99Nine",  },
-  { 1, 4, "^[a-zA-Z]", "Nine99",  },
+  { 1, 4, "^[a-zA-Z]*", "Nine99",  },
   { 0, 0, "GA145(1)", NULL, },
   { 1, 2, "\\(^a\\)\\1", "aabc",  },
   { -1, -1, "\\(^a\\)\\1", "^a^abc",  },
@@ -274,7 +274,7 @@
   { 3, 3, "a$", "cba",  },
   { -1, -1, "a$", "abc",  },
   { 5, 7, "[a-z]*$", "99ZZxyz",  },
-  { -1, -1, "[a-z]*$", "99ZZxyz99",  },
+  { 9, 9, "[a-z]*$", "99ZZxyz99",  },
   { 3, 3, "$$", "ab$",  },
   { -1, -1, "$$", "$ab",  },
   { 3, 3, "\\$$", "ab$",  },
diff --git a/posix/regex.c b/posix/regex.c
index a59f5d4a71..d036a7dd3a 100644
--- a/posix/regex.c
+++ b/posix/regex.c
@@ -1570,7 +1570,8 @@ static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
 					   reg_syntax_t syntax));
 static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
 					   reg_syntax_t syntax));
-static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
+static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
+					      const char **p_ptr,
 					      const char *pend,
 					      char *translate,
 					      reg_syntax_t syntax,
@@ -2174,6 +2175,7 @@ regex_compile (pattern, size, syntax, bufp)
         case '[':
           {
             boolean had_char_class = false;
+	    unsigned int range_start = 0xffffffff;
 
             if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
@@ -2217,6 +2219,7 @@ regex_compile (pattern, size, syntax, bufp)
 
                     PATFETCH (c1);
                     SET_LIST_BIT (c1);
+		    range_start = c1;
                     continue;
                   }
 
@@ -2241,8 +2244,10 @@ regex_compile (pattern, size, syntax, bufp)
                     && *p != ']')
                   {
                     reg_errcode_t ret
-                      = compile_range (&p, pend, translate, syntax, b);
+                      = compile_range (range_start, &p, pend, translate,
+				       syntax, b);
                     if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+		    range_start = 0xffffffff;
                   }
 
                 else if (p[0] == '-' && p[1] != ']')
@@ -2252,8 +2257,9 @@ regex_compile (pattern, size, syntax, bufp)
 		    /* Move past the `-'.  */
                     PATFETCH (c1);
 
-                    ret = compile_range (&p, pend, translate, syntax, b);
+                    ret = compile_range (c, &p, pend, translate, syntax, b);
                     if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+		    range_start = 0xffffffff;
                   }
 
                 /* See if we're at the beginning of a possible character
@@ -2376,6 +2382,7 @@ regex_compile (pattern, size, syntax, bufp)
                           PATUNFETCH;
                         SET_LIST_BIT ('[');
                         SET_LIST_BIT (':');
+			range_start = ':';
                         had_char_class = false;
                       }
                   }
@@ -2503,6 +2510,16 @@ regex_compile (pattern, size, syntax, bufp)
 #endif
 			had_char_class = true;
 		      }
+                    else
+                      {
+                        c1++;
+                        while (c1--)
+                          PATUNFETCH;
+                        SET_LIST_BIT ('[');
+                        SET_LIST_BIT ('=');
+			range_start = '=';
+                        had_char_class = false;
+                      }
 		  }
                 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
 		  {
@@ -2553,6 +2570,7 @@ regex_compile (pattern, size, syntax, bufp)
 
 			    /* Set the bit for the character.  */
 			    SET_LIST_BIT (str[0]);
+			    range_start = ((const unsigned char *) str)[0];
 			  }
 #ifdef _LIBC
 			else
@@ -2561,9 +2579,7 @@ regex_compile (pattern, size, syntax, bufp)
 			       those known to the collate implementation.
 			       First find out whether the bytes in `str' are
 			       actually from exactly one character.  */
-			    const unsigned char *weights;
 			    int32_t table_size;
-			    const int32_t *table;
 			    const int32_t *symb_table;
 			    const unsigned char *extra;
 			    int32_t idx;
@@ -2574,10 +2590,6 @@ regex_compile (pattern, size, syntax, bufp)
 			    int32_t hash;
 			    int ch;
 
-			    table = (const int32_t *)
-			      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
-			    weights = (const unsigned char *)
-			      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
 			    table_size =
 			      _NL_CURRENT_WORD (LC_COLLATE,
 						_NL_COLLATE_SYMB_HASH_SIZEMB);
@@ -2598,17 +2610,15 @@ regex_compile (pattern, size, syntax, bufp)
 			      {
 				/* First compare the hashing value.  */
 				if (symb_table[2 * elem] == hash
-				    && (c1 == extra[symb_table[2 * elem + 1]
-						   + sizeof (int32_t)])
+				    && c1 == extra[symb_table[2 * elem + 1]]
 				    && memcmp (str,
 					       &extra[symb_table[2 * elem + 1]
-						     + sizeof (int32_t) + 1],
+						     + 1],
 					       c1) == 0)
 				  {
 				    /* Yep, this is the entry.  */
-				    idx = *((int32_t *)
-					    (extra
-					     + symb_table[2 * elem + 1]));
+				    idx = symb_table[2 * elem + 1];
+				    idx += 1 + extra[idx];
 				    break;
 				  }
 
@@ -2624,40 +2634,21 @@ regex_compile (pattern, size, syntax, bufp)
 			       class.  */
 			    PATFETCH (c);
 
-			    /* Now we have to go throught the whole table
-			       and find all characters which have the same
-			       first level weight.
+			    /* Now add the multibyte character(s) we found
+			       to the accepted list.
 
 			       XXX Note that this is not entirely correct.
 			       we would have to match multibyte sequences
 			       but this is not possible with the current
-			       implementation.  */
-			    for (ch = 1; ch < 256; ++ch)
-			      /* XXX This test would have to be changed if we
-				 would allow matching multibyte sequences.  */
-			      if (table[ch] > 0)
-				{
-				  int32_t idx2 = table[ch];
-				  size_t len = weights[idx2];
-
-				  /* Test whether the lenghts match.  */
-				  if (weights[idx] == len)
-				    {
-				      /* They do.  New compare the bytes of
-					 the weight.  */
-				      size_t cnt = 0;
-
-				      while (cnt < len
-					     && (weights[idx + 1 + cnt]
-						 == weights[idx2 + 1 + cnt]))
-					++len;
-
-				      if (cnt == len)
-					/* They match.  Mark the character as
-					   acceptable.  */
-					SET_LIST_BIT (ch);
-				    }
-				}
+			       implementation.  Also, we have to match
+			       collating symbols, which expand to more than
+			       one byte, as a whole and not allow the
+			       individual bytes.  */
+			    c1 = extra[idx++];
+			    if (c1 == 1)
+			      range_start = extra[idx];
+			    while (c1-- > 0)
+			      SET_LIST_BIT (extra[idx++]);
 			  }
 #endif
 			had_char_class = false;
@@ -2668,7 +2659,8 @@ regex_compile (pattern, size, syntax, bufp)
                         while (c1--)
                           PATUNFETCH;
                         SET_LIST_BIT ('[');
-                        SET_LIST_BIT ('=');
+                        SET_LIST_BIT ('.');
+			range_start = '.';
                         had_char_class = false;
                       }
 		  }
@@ -2676,6 +2668,7 @@ regex_compile (pattern, size, syntax, bufp)
                   {
                     had_char_class = false;
                     SET_LIST_BIT (c);
+		    range_start = c;
                   }
               }
 
@@ -3425,7 +3418,8 @@ group_in_compile_stack (compile_stack, regnum)
    `regex_compile' itself.  */
 
 static reg_errcode_t
-compile_range (p_ptr, pend, translate, syntax, b)
+compile_range (range_start, p_ptr, pend, translate, syntax, b)
+     unsigned int range_start;
     const char **p_ptr, *pend;
     RE_TRANSLATE_TYPE translate;
     reg_syntax_t syntax;
@@ -3434,7 +3428,7 @@ compile_range (p_ptr, pend, translate, syntax, b)
   unsigned this_char;
 
   const char *p = *p_ptr;
-  unsigned int range_start, range_end;
+  unsigned int range_end;
 
   if (p == pend)
     return REG_ERANGE;
@@ -3447,7 +3441,6 @@ compile_range (p_ptr, pend, translate, syntax, b)
      We also want to fetch the endpoints without translating them; the
      appropriate translation is done in the bit-setting loop below.  */
   /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *.  */
-  range_start = ((const unsigned char *) p)[-2];
   range_end   = ((const unsigned char *) p)[0];
 
   /* Have to increment the pointer into the pattern string, so the