about summary refs log tree commit diff
path: root/locale/programs/ld-collate.c
diff options
context:
space:
mode:
Diffstat (limited to 'locale/programs/ld-collate.c')
-rw-r--r--locale/programs/ld-collate.c317
1 files changed, 288 insertions, 29 deletions
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index 42fd601064..87005e86ab 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -73,7 +73,8 @@ struct element_t
 
   const char *mbs;
   const uint32_t *wcs;
-  int order;
+  int mborder;
+  int wcorder;
 
   struct element_list_t *weights;
 
@@ -87,6 +88,9 @@ struct element_t
   /* Predecessor and successor in the order list.  */
   struct element_t *last;
   struct element_t *next;
+
+  /* Next element in multibyte output list.  */
+  struct element_t *mbnext;
 };
 
 /* Special element value.  */
@@ -151,6 +155,10 @@ struct locale_collate_t
      that the definitions from more than one input file contains information.
      Therefore we keep all relevant input in a list.  */
   struct locale_collate_t *next;
+
+  /* Arrays with heads of the list for each of the leading bytes in
+     the multibyte sequences.  */
+  struct element_t *mbheads[256];
 };
 
 
@@ -176,7 +184,7 @@ make_seclist_elem (struct locale_collate_t *collate, const char *string,
 
 
 static struct element_t *
-new_element (struct locale_collate_t *collate, const char *mbs,
+new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 	     const uint32_t *wcs, const char *name, size_t namelen)
 {
   struct element_t *newp;
@@ -185,7 +193,10 @@ new_element (struct locale_collate_t *collate, const char *mbs,
 					     sizeof (*newp));
   newp->name = name == NULL ? NULL : obstack_copy (&collate->mempool,
 						   name, namelen);
-  newp->mbs = mbs;
+  if (mbs != NULL)
+    newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
+  else
+    newp->mbs = NULL;
   if (wcs != NULL)
     {
       size_t nwcs = wcslen ((wchar_t *) wcs) + 1;
@@ -196,7 +207,8 @@ new_element (struct locale_collate_t *collate, const char *mbs,
     }
   else
     newp->wcs = NULL;
-  newp->order = 0;
+  newp->mborder = 0;
+  newp->wcorder = 0;
 
   /* Will be allocated later.  */
   newp->weights = NULL;
@@ -209,6 +221,8 @@ new_element (struct locale_collate_t *collate, const char *mbs,
   newp->last = NULL;
   newp->next = NULL;
 
+  newp->mbnext = NULL;
+
   return newp;
 }
 
@@ -457,14 +471,15 @@ find_element (struct linereader *ldfile, struct locale_collate_t *collate,
 	  result = sym->order;
 
 	  if (result == NULL)
-	    result = sym->order = new_element (collate, NULL, NULL, NULL, 0);
+	    result = sym->order = new_element (collate, NULL, 0, NULL,
+					       NULL, 0);
 	}
       else if (find_entry (&collate->elem_table, str, len,
 			   (void **) &result) != 0)
 	{
 	  /* It's also no collation element.  So it is an character
 	     element defined later.  */
-	  result = new_element (collate, NULL, NULL, str, len);
+	  result = new_element (collate, NULL, 0, NULL, str, len);
 	  if (result != NULL)
 	    /* Insert it into the sequence table.  */
 	    insert_entry (&collate->seq_table, str, len, result);
@@ -499,6 +514,8 @@ insert_weights (struct linereader *ldfile, struct element_t *elem,
   elem->line = ldfile->lineno;
   elem->last = collate->cursor;
   elem->next = collate->cursor ? collate->cursor->next : NULL;
+  if (collate->cursor != NULL)
+    collate->cursor->next = elem;
   elem->weights = (struct element_list_t *)
     obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
   memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
@@ -683,7 +700,7 @@ insert_weights (struct linereader *ldfile, struct element_t *elem,
 }
 
 
-static void
+static int
 insert_value (struct linereader *ldfile, struct token *arg,
 	      struct charmap_t *charmap, struct repertoire_t *repertoire,
 	      struct locale_collate_t *collate)
@@ -721,14 +738,14 @@ insert_value (struct linereader *ldfile, struct token *arg,
 	  elem = sym->order;
 
 	  if (elem == NULL)
-	    elem = sym->order = new_element (collate, NULL, NULL, NULL, 0);
+	    elem = sym->order = new_element (collate, NULL, 0, NULL, NULL, 0);
 	}
       else if (find_entry (&collate->elem_table, arg->val.str.startmb,
 			   arg->val.str.lenmb, (void **) &elem) != 0)
 	{
 	  /* It's also no collation element.  Therefore ignore it.  */
 	  lr_ignore_rest (ldfile, 0);
-	  return;
+	  return 1;
 	}
     }
   else
@@ -741,6 +758,7 @@ insert_value (struct linereader *ldfile, struct token *arg,
 
 	  /* We have to allocate an entry.  */
 	  elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
+			      seq != NULL ? seq->nbytes : 0,
 			      wcs, arg->val.str.startmb, arg->val.str.lenmb);
 
 	  /* And add it to the table.  */
@@ -755,14 +773,16 @@ insert_value (struct linereader *ldfile, struct token *arg,
   if (elem->next != NULL || (collate->cursor != NULL
 			     && elem->next == collate->cursor))
     {
-      lr_error (ldfile, _("order for `%.*s' already defined at %s:%Z"),
+      lr_error (ldfile, _("order for `%.*s' already defined at %s:%zu"),
 		arg->val.str.lenmb, arg->val.str.startmb,
 		elem->file, elem->line);
       lr_ignore_rest (ldfile, 0);
-      return;
+      return 1;
     }
 
   insert_weights (ldfile, elem, charmap, repertoire, collate, tok_none);
+
+  return 0;
 }
 
 
@@ -780,8 +800,11 @@ handle_ellipsis (struct linereader *ldfile, struct token *arg,
   startp = collate->cursor;
 
   /* Process and add the end-entry.  */
-  if (arg != NULL)
-    insert_value (ldfile, arg, charmap, repertoire, collate);
+  if (arg != NULL
+      && insert_value (ldfile, arg, charmap, repertoire, collate))
+    /* Something went wrong with inserting the to-value.  This means
+       we cannot process the ellipsis.  */
+    return;
 
   /* Reset the cursor.  */
   collate->cursor = startp;
@@ -805,7 +828,168 @@ handle_ellipsis (struct linereader *ldfile, struct token *arg,
 
   if (ellipsis == tok_ellipsis3)
     {
-      /* XXX */
+      /* One requirement we make here: the length of the byte
+	 sequences for the first and end character must be the same.
+	 This is mainly to prevent unwanted effects and this is often
+	 not what is wanted.  */
+      size_t len = (startp->mbs != NULL ? strlen (startp->mbs)
+		    : (endp->mbs != NULL ? strlen (endp->mbs) : 0));
+      char mbcnt[len + 1];
+      char mbend[len + 1];
+
+      /* Well, this should be caught somewhere else already.  Just to
+	 make sure.  */
+      assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
+      assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
+
+      if (startp != NULL && endp != NULL
+	  && startp->mbs != NULL && endp->mbs != NULL
+	  && strlen (startp->mbs) != strlen (endp->mbs))
+	{
+	  lr_error (ldfile, _("\
+%s: byte sequences of first and last character must have the same length"),
+		    "LC_COLLATE");
+	  return;
+	}
+
+      /* Determine whether we have to generate multibyte sequences.  */
+      if ((startp == NULL || startp->mbs != NULL)
+	  && (endp == NULL || endp->mbs != NULL))
+	{
+	  int cnt;
+	  int ret;
+
+	  /* Prepare the beginning byte sequence.  This is either from the
+	     beginning byte sequence or it is all nulls if it was an
+	     initial ellipsis.  */
+	  if (startp == NULL || startp->mbs == NULL)
+	    memset (mbcnt, '\0', len);
+	  else
+	    {
+	      memcpy (mbcnt, startp->mbs, len);
+
+	      /* And increment it so that the value is the first one we will
+		 try to insert.  */
+	      for (cnt = len - 1; cnt >= 0; --cnt)
+		if (++mbcnt[cnt] != '\0')
+		  break;
+	    }
+	  mbcnt[len] = '\0';
+
+	  /* And the end sequence.  */
+	  if (endp == NULL || endp->mbs == NULL)
+	    memset (mbend, '\0', len);
+	  else
+	    memcpy (mbend, endp->mbs, len);
+	  mbend[len] = '\0';
+
+	  /* Test whether we have a correct range.  */
+	  ret = memcmp (mbcnt, mbend, len);
+	  if (ret >= 0)
+	    {
+	      if (ret > 0)
+		lr_error (ldfile, _("%s: byte sequence of first character of \
+sequence is not lower than that of the last character"), "LC_COLLATE");
+	      return;
+	    }
+
+	  /* Generate the byte sequences data.  */
+	  while (1)
+	    {
+	      struct charseq *seq;
+
+	      /* Quite a bit of work ahead.  We have to find the character
+		 definition for the byte sequence and then determine the
+		 wide character belonging to it.  */
+	      seq = charmap_find_symbol (charmap, mbcnt, len);
+	      if (seq != NULL)
+		{
+		  struct element_t *elem;
+		  size_t namelen;
+
+		  if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
+		    seq->ucs4 = repertoire_find_value (repertoire, seq->name,
+						       strlen (seq->name));
+
+		  /* I don't this this can ever happen.  */
+		  assert (seq->name != NULL);
+		  namelen = strlen (seq->name);
+
+		  /* Now we are ready to insert the new value in the
+		     sequence.  Find out whether the element is
+		     already known.  */
+		  if (find_entry (&collate->seq_table, seq->name, namelen,
+				  (void **) &elem) != 0)
+		    {
+		      uint32_t wcs[2] = { seq->ucs4, 0 };
+
+		      /* We have to allocate an entry.  */
+		      elem = new_element (collate, mbcnt, len, wcs, seq->name,
+					  namelen);
+
+		      /* And add it to the table.  */
+		      if (insert_entry (&collate->seq_table, seq->name,
+					namelen, elem) != 0)
+			/* This cannot happen.  */
+			assert (! "Internal error");
+		    }
+
+		  /* Test whether this element is not already in the list.  */
+		  if (elem->next != NULL || (collate->cursor != NULL
+					     && elem->next == collate->cursor))
+		    {
+		      lr_error (ldfile, _("\
+order for `%.*s' already defined at %s:%zu"),
+				namelen, seq->name, elem->file, elem->line);
+		      goto increment;
+		    }
+
+		  /* Enqueue the new element.  */
+		  elem->last = collate->cursor;
+		  elem->next = collate->cursor->next;
+		  elem->last->next = elem;
+		  if (elem->next != NULL)
+		    elem->next->last = elem;
+		  collate->cursor = elem;
+
+		 /* Add the weight value.  We take them from the
+		    `ellipsis_weights' member of `collate'.  */
+		  elem->weights = (struct element_list_t *)
+		    obstack_alloc (&collate->mempool,
+				   nrules * sizeof (struct element_list_t));
+		  for (cnt = 0; cnt < nrules; ++cnt)
+		    if (collate->ellipsis_weight.weights[cnt].cnt == 1
+			&& (collate->ellipsis_weight.weights[cnt].w[0]
+			    == ELEMENT_ELLIPSIS2))
+		      {
+			elem->weights[cnt].w = (struct element_t **)
+			  obstack_alloc (&collate->mempool,
+					 sizeof (struct element_t *));
+			elem->weights[cnt].w[0] = elem;
+			elem->weights[cnt].cnt = 1;
+		      }
+		    else
+		      {
+			/* Simly use the weight from `ellipsis_weight'.  */
+			elem->weights[cnt].w =
+			  collate->ellipsis_weight.weights[cnt].w;
+			elem->weights[cnt].cnt =
+			  collate->ellipsis_weight.weights[cnt].cnt;
+		      }
+		}
+
+	      /* Increment for the next round.  */
+	    increment:
+	      for (cnt = len - 1; cnt >= 0; --cnt)
+		if (++mbcnt[cnt] != '\0')
+		  break;
+
+	      /* Find out whether this was all.  */
+	      if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
+		/* Yep, that's all.  */
+		break;
+	    }
+	}
     }
   else
     {
@@ -883,7 +1067,7 @@ handle_ellipsis (struct linereader *ldfile, struct token *arg,
 					     && elem->next == collate->cursor))
 		    {
 		      lr_error (ldfile, _("\
-%s: order for `%.*s' already defined at %s:%Z"),
+%s: order for `%.*s' already defined at %s:%zu"),
 				"LC_COLLATE", lenfrom, buf,
 				elem->file, elem->line);
 		      continue;
@@ -923,6 +1107,7 @@ handle_ellipsis (struct linereader *ldfile, struct token *arg,
 		      /* We have to allocate an entry.  */
 		      elem = new_element (collate,
 					  seq != NULL ? seq->bytes : NULL,
+					  seq != NULL ? seq->nbytes : 0,
 					  wc == ILLEGAL_CHAR_VALUE
 					  ? NULL : wcs,
 					  buf, lenfrom);
@@ -1023,6 +1208,67 @@ collate_startup (struct linereader *ldfile, struct localedef_t *locale,
 void
 collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
 {
+  /* Now is the time when we can assign the individual collation
+     values for all the symbols.  We have possibly different values
+     for the wide- and the multibyte-character symbols.  This is done
+     since it might make a difference in the encoding if there is in
+     some cases no multibyte-character but there are wide-characters.
+     (The other way around it is not important since theencoded
+     collation value in the wide-character case is 32 bits wide and
+     therefore requires no encoding).
+
+     The lowest collation value assigned is 2.  Zero is reserved for
+     the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
+     functions and 1 is used to separate the individual passes for the
+     different rules.
+
+     We also have to construct is list with all the bytes/words which
+     can come first in a sequence, followed by all the elements which
+     also start with this byte/word.  The order is reverse which has
+     among others the important effect that longer strings are located
+     first in the list.  This is required for the output data since
+     the algorithm used in `strcoll' etc depends on this.
+
+     The multibyte case is easy.  We simply sort into an array with
+     256 elements.  */
+  struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
+  int mbact = 2;
+  int wcact = 2;
+  struct element_t *runp = collate->start;
+
+  while (runp != NULL)
+    {
+      if (runp->mbs != NULL)
+	{
+	  struct element_t **eptr;
+
+	  /* Determine the order.  */
+	  runp->mborder = mbact++;
+
+	  /* Find the point where to insert in the list.  */
+	  eptr = &collate->mbheads[(unsigned int) runp->mbs[0]];
+	  while (*eptr != NULL)
+	    {
+	      /* Check which string is larger, the one we want to insert
+		 or the current element of the list we are looking at.  */
+	      assert (runp->mbs[0] == (*eptr)->mbs[0]);
+	      if (strcmp (runp->mbs, (*eptr)->mbs) > 0)
+		break;
+
+	      eptr = &(*eptr)->mbnext;
+	    }
+
+	  /* Set the pointers.  */
+	  runp->mbnext = *eptr;
+	  *eptr = runp;
+	}
+
+      if (runp->wcs != NULL)
+	runp->wcorder = wcact++;
+
+      /* Up to the next entry.  */
+      runp = runp->next;
+    }
 }
 
 
@@ -1257,7 +1503,8 @@ collate_read (struct linereader *ldfile, struct localedef_t *result,
 		      if (insert_entry (&collate->elem_table,
 					symbol, symbol_len,
 					new_element (collate,
-						     NULL, NULL, NULL, 0)) < 0)
+						     NULL, 0, NULL, symbol,
+						     symbol_len)) < 0)
 			lr_error (ldfile, _("\
 error while adding collating element"));
 		    }
@@ -1519,9 +1766,12 @@ error while adding equivalent collating symbol"));
 	    goto err_label;
 
 	  /* Handle ellipsis at end of list.  */
-	  if (was_ellipsis)
-	    /* XXX */
-	    abort ();
+	  if (was_ellipsis != tok_none)
+	    {
+	      handle_ellipsis (ldfile, NULL, was_ellipsis, charmap, repertoire,
+			       collate);
+	      was_ellipsis = tok_none;
+	    }
 
 	  state = 2;
 	  lr_ignore_rest (ldfile, 1);
@@ -1543,9 +1793,12 @@ error while adding equivalent collating symbol"));
 	      state = 2;
 
 	      /* Handle ellipsis at end of list.  */
-	      if (was_ellipsis)
-		/* XXX */
-		abort ();
+	      if (was_ellipsis != tok_none)
+		{
+		  handle_ellipsis (ldfile, arg, was_ellipsis, charmap,
+				   repertoire, collate);
+		  was_ellipsis = tok_none;
+		}
 	    }
 	  else if (state != 2 && state != 3)
 	    goto err_label;
@@ -1610,9 +1863,12 @@ error while adding equivalent collating symbol"));
 	      state = 2;
 
 	      /* Handle ellipsis at end of list.  */
-	      if (was_ellipsis)
-		/* XXX */
-		abort ();
+	      if (was_ellipsis != tok_none)
+		{
+		  handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
+				   repertoire, collate);
+		  was_ellipsis = tok_none;
+		}
 	    }
 	  else if (state == 3)
 	    {
@@ -1848,7 +2104,7 @@ error while adding equivalent collating symbol"));
 		  && collate->undefined.next == collate->cursor))
 	    {
 	      lr_error (ldfile,
-			_("%s: order for `%.*s' already defined at %s:%Z"),
+			_("%s: order for `%.*s' already defined at %s:%zu"),
 			"LC_COLLATE", 9, "UNDEFINED", collate->undefined.file,
 			collate->undefined.line);
 	      lr_ignore_rest (ldfile, 0);
@@ -1892,9 +2148,12 @@ error while adding equivalent collating symbol"));
 			    "LC_COLLATE");
 
 		  /* Handle ellipsis at end of list.  */
-		  if (was_ellipsis)
-		    /* XXX */
-		    abort ();
+		  if (was_ellipsis != tok_none)
+		    {
+		      handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
+				       repertoire, collate);
+		      was_ellipsis = tok_none;
+		    }
 		}
 	      else if (state == 3)
 		error (0, 0, _("%s: missing `reorder-end' keyword"),