about summary refs log tree commit diff
path: root/locale/programs
diff options
context:
space:
mode:
Diffstat (limited to 'locale/programs')
-rw-r--r--locale/programs/ld-collate.c119
1 files changed, 117 insertions, 2 deletions
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index 3eff699e7b..8eb47d7f8e 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -25,12 +25,14 @@
 #include <error.h>
 #include <stdlib.h>
 #include <wchar.h>
+#include <sys/param.h>
 
 #include "charmap.h"
 #include "localeinfo.h"
 #include "linereader.h"
 #include "locfile.h"
 #include "localedef.h"
+#include "elem-hash.h"
 
 /* Uncomment the following line in the production version.  */
 /* #define NDEBUG 1 */
@@ -88,11 +90,13 @@ struct element_t
      we changed if necessary but I doubt this is necessary.  */
   unsigned int used_in_level;
 
+  struct element_list_t *weights;
+  /* Index in the `weight' table in the output file for the character.  */
+  int32_t weights_idx;
+
   /* Nonzero if this is a real character definition.  */
   int is_character;
 
-  struct element_list_t *weights;
-
   /* Where does the definition come from.  */
   const char *file;
   size_t line;
@@ -297,6 +301,7 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 
   /* Will be allocated later.  */
   newp->weights = NULL;
+  newp->weights_idx = 0;
 
   newp->file = NULL;
   newp->line = 0;
@@ -1804,6 +1809,9 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
       obstack_grow (pool, buf, len);
     }
 
+  /* Remember the index.  */
+  elem->weights_idx = retval;
+
   return retval | ((elem->section->ruleidx & 0x7f) << 24);
 }
 
@@ -1866,7 +1874,10 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
   uint32_t *names;
   uint32_t *tablewc;
   size_t table_size;
+  uint32_t elem_size;
+  uint32_t *elem_table;
   int i;
+  struct element_t *runp;
 
   data.magic = LIMAGIC (LC_COLLATE);
   data.n = nelems;
@@ -2381,6 +2392,110 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
   ++cnt;
 
 
+  /* Finally write the table with collation element names out.  It is
+     a hash table with a simple function which gets the name of the
+     character as the input.  One character might have many names.  The
+     value associated with the name is an index into the weight table
+     where we are then interested in the first-level weight value.
+
+     To determine how large the table should be we are counting the
+     elements have to put in.  Since we are using internal chaining
+     using a secondary hash function we have to make the table a bit
+     larger to avoid extremely long search times.  We can achieve
+     good results with a 40% larger table than there are entries.  */
+  elem_size = 0;
+  runp = collate->start;
+  while (runp != NULL)
+    {
+      if (runp->mbs != NULL && runp->weights != NULL)
+	/* Yep, the element really counts.  */
+	++elem_size;
+
+      runp = runp->next;
+    }
+  /* Add 40% and find the next prime number.  */
+  elem_size = MIN (next_prime (elem_size * 1.4), 257);
+
+  /* Allocate the table.  Each entry consists of two words: the hash
+     value and an index in a secondary table which provides the index
+     into the weight table and the string itself (so that a match can
+     be determined).  */
+  elem_table = (uint32_t *) obstack_alloc (&extrapool,
+					   elem_size * 2 * sizeof (uint32_t));
+  memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
+
+  /* Now add the elements.  */
+  runp = collate->start;
+  while (runp != NULL)
+    {
+      if (runp->mbs != NULL && runp->weights != NULL)
+	{
+	  /* Compute the hash value of the name.  */
+	  uint32_t namelen = strlen (runp->name);
+	  uint32_t hash = elem_hash (runp->name, namelen);
+	  size_t idx = hash % elem_size;
+
+	  if (elem_table[idx * 2] != 0)
+	    {
+	      /* The spot is already take.  Try iterating using the value
+		 from the secondary hashing function.  */
+	      size_t iter = hash % (elem_size - 2);
+
+	      do
+		{
+		  idx += iter;
+		  if (idx >= elem_size)
+		    idx -= elem_size;
+		}
+	      while (elem_table[idx * 2] != 0);
+
+	      /* This is the spot where we will insert the value.  */
+	      elem_table[idx * 2] = hash;
+	      elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
+
+	      /* Now add the index into the weights table.  We know the
+		 address is always 32bit aligned.  */
+	      if (sizeof (int) == sizeof (int32_t))
+		obstack_int_grow (&extrapool, runp->weights_idx);
+	      else
+		obstack_grow (&extrapool, &runp->weights_idx,
+			      sizeof (int32_t));
+
+	      /* The the string itself including length.  */
+	      obstack_1grow (&extrapool, namelen);
+	      obstack_grow (&extrapool, runp->name, namelen);
+
+	      /* And align again to 32 bits.  */
+	      if ((1 + namelen) % sizeof (int32_t) != 0)
+		obstack_grow (&extrapool, "\0\0",
+			      (sizeof (int32_t)
+			       - (1 + namelen) % sizeof (int32_t)));
+	    }
+	}
+
+      runp = runp->next;
+    }
+
+  /* Prepare to write out this data.  */
+  assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
+  iov[2 + cnt].iov_base = &elem_size;
+  iov[2 + cnt].iov_len = sizeof (int32_t);
+  idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+  ++cnt;
+
+  assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
+  iov[2 + cnt].iov_base = elem_table;
+  iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
+  idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+  ++cnt;
+
+  assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
+  iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
+  iov[2 + cnt].iov_base = obstack_finish (&extrapool);
+  idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+  ++cnt;
+
+
   assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
 
   write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);