about summary refs log tree commit diff
path: root/locale/programs/ld-collate.c
diff options
context:
space:
mode:
Diffstat (limited to 'locale/programs/ld-collate.c')
-rw-r--r--locale/programs/ld-collate.c402
1 files changed, 392 insertions, 10 deletions
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index 4bdf0b2256..77e946535d 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public License as
@@ -34,6 +34,7 @@ Boston, MA 02111-1307, USA.  */
 #include "locales.h"
 #include "simple-hash.h"
 #include "stringtrans.h"
+#include "strlen-hash.h"
 
 /* Uncomment the following line in the production version.  */
 /* #define NDEBUG 1 */
@@ -83,7 +84,7 @@ typedef struct element_t
 } element_t;
 
 
-/* The real definition of the struct for the LC_CTYPE locale.  */
+/* The real definition of the struct for the LC_COLLATE locale.  */
 struct locale_collate_t
 {
   /* Collate symbol table.  Simple mapping to number.  */
@@ -275,15 +276,13 @@ collate_finish (struct localedef_t *locale, struct charset_t *charset)
   collate->undefined_len = 2;	/* For the name: 1 x wchar_t + L'\0'.  */
   for (cnt = 0; cnt < collate->nrules; ++cnt)
     collate->undefined_len += 1 + collate->undefined.ordering[cnt];
-
-  /* Collating symbols are not used anymore.  */
-  (void) delete_hash (&collate->symbols);
 }
 
 
 
 void
-collate_output (struct localedef_t *locale, const char *output_path)
+collate_output (struct localedef_t *locale, struct charset_t *charset,
+		const char *output_path)
 {
   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
   u_int32_t table_size, table_best, level_best, sum_best;
@@ -296,10 +295,29 @@ collate_output (struct localedef_t *locale, const char *output_path)
   struct locale_file data;
   u_int32_t idx[nelems];
   struct obstack non_simple;
+  struct obstack string_pool;
   size_t cnt, entry_size;
   u_int32_t undefined_offset = UINT_MAX;
   u_int32_t *table, *extra, *table2, *extra2;
   size_t extra_len;
+  u_int32_t element_hash_tab_size;
+  u_int32_t *element_hash_tab;
+  u_int32_t *element_hash_tab_ob;
+  u_int32_t element_string_pool_size;
+  char *element_string_pool;
+  u_int32_t element_value_size;
+  wchar_t *element_value;
+  wchar_t *element_value_ob;
+  u_int32_t symbols_hash_tab_size;
+  u_int32_t *symbols_hash_tab;
+  u_int32_t *symbols_hash_tab_ob;
+  u_int32_t symbols_string_pool_size;
+  char *symbols_string_pool;
+  u_int32_t symbols_class_size;
+  u_int32_t *symbols_class;
+  u_int32_t *symbols_class_ob;
+  hash_table *hash_tab;
+  unsigned int dummy_weights[collate->nrules + 1];
 
   sum_best = UINT_MAX;
   table_best = 0xffff;
@@ -342,6 +360,7 @@ Computing table size for collation information might take a while..."),
   fputs (_(" done\n"), stderr);
 
   obstack_init (&non_simple);
+  obstack_init (&string_pool);
 
   data.magic = LIMAGIC (LC_COLLATE);
   data.n = nelems;
@@ -608,6 +627,258 @@ Computing table size for collation information might take a while..."),
   for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
     extra2[cnt] = SWAPU32 (extra2[cnt]);
 
+  /* We need a simple hashing table to get a collation-element->chars
+     mapping.  We again use internal hasing using a secondary hashing
+     function.
+
+     Each string has an associate hashing value V, computed by a
+     fixed function.  To locate the string we use open addressing with
+     double hashing.  The first index will be V % M, where M is the
+     size of the hashing table.  If no entry is found, iterating with
+     a second, independent hashing function takes place.  This second
+     value will be 1 + V % (M - 2).  The approximate number of probes
+     will be
+
+	  for unsuccessful search: (1 - N / M) ^ -1
+	  for successful search:   - (N / M) ^ -1 * ln (1 - N / M)
+
+     where N is the number of keys.
+
+     If we now choose M to be the next prime bigger than 4 / 3 * N,
+     we get the values 4 and 1.85 resp.  Because unsuccesful searches
+     are unlikely this is a good value.  Formulas: [Knuth, The Art of
+     Computer Programming, Volume 3, Sorting and Searching, 1973,
+     Addison Wesley]  */
+  if (collate->elements.filled == 0)
+    {
+      /* We don't need any element table since there are no collating
+	 elements.  */
+      element_hash_tab_size = 0;
+      element_hash_tab = NULL;
+      element_hash_tab_ob = NULL;
+      element_string_pool_size = 0;
+      element_string_pool = NULL;
+      element_value_size = 0;
+      element_value = NULL;
+      element_value_ob = NULL;
+    }
+  else
+    {
+      void *ptr;		/* Running pointer.  */
+      const char *key;		/* Key for current bucket.  */
+      size_t keylen;		/* Length of key data.  */
+      const element_t *data;	/* Data, i.e., the character sequence.  */
+
+      element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
+      if (element_hash_tab_size < 7)
+	/* We need a minimum to make the following code work.  */
+	element_hash_tab_size = 7;
+
+      element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
+						      * sizeof (u_int32_t)));
+      memset (element_hash_tab, '\377', (2 * element_hash_tab_size
+					 * sizeof (u_int32_t)));
+
+      ptr = NULL;
+      while (iterate_table (&collate->elements, &ptr, (const void **) &key,
+			    &keylen, (void **) &data) == 0)
+	{
+	  size_t hash_val = hash_string (key, keylen);
+	  size_t idx = hash_val % element_hash_tab_size;
+
+	  if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
+	    {
+	      /* We need the second hashing function.  */
+	      size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
+
+	      do
+		if (idx >= element_hash_tab_size - c)
+		  idx -= element_hash_tab_size - c;
+		else
+		  idx += c;
+	      while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
+	    }
+
+	  element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
+	  element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
+					   / sizeof (wchar_t));
+
+	  obstack_grow0 (&non_simple, key, keylen);
+	  obstack_grow (&string_pool, data->name,
+			(wcslen (data->name) + 1) * sizeof (wchar_t));
+	}
+
+      if (obstack_object_size (&non_simple) % 4 != 0)
+	obstack_blank (&non_simple,
+		       4 - (obstack_object_size (&non_simple) % 4));
+      element_string_pool_size = obstack_object_size (&non_simple);
+      element_string_pool = obstack_finish (&non_simple);
+
+      element_value_size = obstack_object_size (&string_pool);
+      element_value = obstack_finish (&string_pool);
+
+      /* Create the tables for the other byte order.  */
+      element_hash_tab_ob = obstack_alloc (&non_simple,
+					   (2 * element_hash_tab_size
+					    * sizeof (u_int32_t)));
+      for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
+	element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
+
+      element_value_ob = obstack_alloc (&string_pool, element_value_size);
+      if (sizeof (wchar_t) != 4)
+	{
+	  fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
+	  abort ();
+	}
+      for (cnt = 0; cnt < element_value_size / 4; ++cnt)
+	element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
+    }
+
+  /* Store collation elements as map to collation class.  There are
+     three kinds of symbols:
+       - simple characters
+       - collation elements
+       - collation symbols
+     We need to make a table which lets the user to access the primary
+     weight based on the symbol string.  */
+  symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
+					    + collate->elements.filled
+					    + collate->symbols.filled)) / 3);
+  symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
+						  * sizeof (u_int32_t)));
+  memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
+				     * sizeof (u_int32_t)));
+
+  /* Now fill the array.  First the symbols from the character set,
+     then the collation elements and last the collation symbols.  */
+  hash_tab = &charset->char_table;
+  while (1)
+    {
+      void *ptr;	/* Running pointer.  */
+      const char *key;	/* Key for current bucket.  */
+      size_t keylen;	/* Length of key data.  */
+      void *data;	/* Data.  */
+
+      ptr = NULL;
+      while (iterate_table (hash_tab, &ptr, (const void **) &key,
+			    &keylen, (void **) &data) == 0)
+	{
+	  size_t hash_val;
+	  size_t idx;
+	  u_int32_t word;
+	  unsigned int *weights;
+
+	  if (hash_tab == &charset->char_table
+	      || hash_tab == &collate->elements)
+	    {
+	      element_t *lastp, *firstp;
+	      wchar_t dummy_name[2];
+	      const wchar_t *name;
+	      size_t name_len;
+
+	      if (hash_tab == &charset->char_table)
+		{
+		  dummy_name[0] = (wchar_t) ((unsigned long int) data);
+		  dummy_name[1] = L'\0';
+		  name = dummy_name;
+		  name_len = sizeof (wchar_t);
+		}
+	      else
+		{
+		  element_t *elemp = (element_t *) data;
+		  name = elemp->name;
+		  name_len = wcslen (name) * sizeof (wchar_t);
+		}
+
+	      /* First check whether this character is used at all.  */
+	      if (find_entry (&collate->result, name, name_len,
+			      (void *) &firstp) < 0)
+		/* The symbol is not directly mentioned in the collation.
+		   I.e., we use the value for UNDEFINED.  */
+		lastp = &collate->undefined;
+	      else
+		{
+		  /* The entry for the simple character is always found at
+		     the end.  */
+		  lastp = firstp;
+		  while (lastp->next != NULL && wcscmp (name, lastp->name))
+		    lastp = lastp->next;
+		}
+
+	      weights = lastp->ordering;
+	    }
+	  else
+	    {
+	      dummy_weights[0] = 1;
+	      dummy_weights[collate->nrules]
+		= (unsigned int) ((unsigned long int) data);
+
+	      weights = dummy_weights;
+	    }
+
+	  /* In LASTP->ordering we now have the collation class.
+	     Determine the place in the hashing table next.  */
+	  hash_val = hash_string (key, keylen);
+	  idx = hash_val % symbols_hash_tab_size;
+
+	  if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
+	    {
+	      /* We need the second hashing function.  */
+	      size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
+
+	      do
+		if (idx >= symbols_hash_tab_size - c)
+		  idx -= symbols_hash_tab_size - c;
+		else
+		  idx += c;
+	      while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
+	    }
+
+	  symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
+	  symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
+					   / sizeof (u_int32_t));
+
+	  obstack_grow0 (&string_pool, key, keylen);
+	  /* Adding the first weight looks complicated.  We have to deal
+	     with the kind it is stored and with the fact that original
+	     form uses `unsigned int's while we need `u_int32_t' here.  */
+	  word = weights[0];
+	  obstack_grow (&non_simple, &word, sizeof (u_int32_t));
+	  for (cnt = 0; cnt < weights[0]; ++cnt)
+	    {
+	      word = weights[collate->nrules + cnt];
+	      obstack_grow (&non_simple, &word, sizeof (u_int32_t));
+	    }
+	}
+
+      if (hash_tab == &charset->char_table)
+	hash_tab = &collate->elements;
+      else if (hash_tab == &collate->elements)
+	hash_tab = &collate->symbols;
+      else
+	break;
+    }
+
+  /* Now we have the complete tables.  */
+  if (obstack_object_size (&string_pool) % 4 != 0)
+    obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
+  symbols_string_pool_size = obstack_object_size (&string_pool);
+  symbols_string_pool = obstack_finish (&string_pool);
+
+  symbols_class_size = obstack_object_size (&non_simple);
+  symbols_class = obstack_finish (&non_simple);
+
+  /* Generate tables with other byte order.  */
+  symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
+						     * sizeof (u_int32_t)));
+  for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
+    symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
+
+  symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
+  for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
+    symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
+
+
   /* Store table adresses and lengths.   */
 #if __BYTE_ORDER == __BIG_ENDIAN
   iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
@@ -642,12 +913,124 @@ Computing table size for collation information might take a while..."),
   iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
   iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
 
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
+    = &element_hash_tab_size;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
+    = sizeof (u_int32_t);
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
+    = element_hash_tab;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
+    = 2 * element_hash_tab_size * sizeof (u_int32_t);
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
+    = element_hash_tab_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
+    = 2 * element_hash_tab_size * sizeof (u_int32_t);
+#else
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
+    = element_hash_tab;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
+    = 2 * element_hash_tab_size * sizeof (u_int32_t);
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
+    = element_hash_tab_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
+    = 2 * element_hash_tab_size * sizeof (u_int32_t);
+#endif
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
+    = element_string_pool;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
+    = element_string_pool_size;
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
+    = element_value;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
+    = element_value_size;
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
+    = element_value_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
+    = element_value_size;
+#else
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
+    = element_value;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
+    = element_value_size;
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
+    = element_value_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
+    = element_value_size;
+#endif
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
+    = &symbols_hash_tab_size;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
+    = sizeof (u_int32_t);
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
+    = symbols_hash_tab;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
+    = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
+    = symbols_hash_tab_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
+    = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
+#else
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
+    = symbols_hash_tab;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
+    = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
+    = symbols_hash_tab_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
+    = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
+#endif
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
+    = symbols_string_pool;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
+    = symbols_string_pool_size;
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
+    = symbols_class;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_CLASS_EB)].iov_len
+    = symbols_class_size;
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
+    = symbols_class_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
+    = symbols_class_size;
+#else
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
+    = symbols_class;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
+    = symbols_class_size;
+
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
+    = symbols_class_ob;
+  iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
+    = symbols_class_size;
+#endif
+
   /* Update idx array.  */
   idx[0] = iov[0].iov_len + iov[1].iov_len;
   for (cnt = 1; cnt < nelems; ++cnt)
     idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
 
   write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
+
+  obstack_free (&non_simple, NULL);
+  obstack_free (&string_pool, NULL);
 }
 
 
@@ -729,7 +1112,7 @@ collate_element_from (struct linereader *lr, struct localedef_t *locale,
 
   if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
     {
-      lr_error (lr, _("illegal colltion element"));
+      lr_error (lr, _("illegal collation element"));
       return;
     }
 
@@ -762,8 +1145,7 @@ collate_element_from (struct linereader *lr, struct localedef_t *locale,
 	    {
 	      if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
 			     elemp) < 0)
-		error (EXIT_FAILURE, 0,
-		       _("\
+		error (EXIT_FAILURE, 0, _("\
 error while inserting collation element into hash table"));
 	    }
 	  else
@@ -1019,7 +1401,7 @@ line before ellipsis does not contain definition for character constant"));
     }
 
   /* Now it's time to handle the ellipsis in the previous line.  We do
-     this only when the last line contained an definition for an
+     this only when the last line contained an definition for a
      character, the current line also defines an character, the
      character code for the later is bigger than the former.  */
   if (collate->was_ellipsis)