about summary refs log tree commit diff
path: root/string/strxfrm_l.c
diff options
context:
space:
mode:
Diffstat (limited to 'string/strxfrm_l.c')
-rw-r--r--string/strxfrm_l.c429
1 files changed, 426 insertions, 3 deletions
diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
index 264ab9bfe3..44b605168a 100644
--- a/string/strxfrm_l.c
+++ b/string/strxfrm_l.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995,96,97,2002 Free Software Foundation, Inc.
+/* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
 
@@ -17,7 +17,430 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#define USE_IN_EXTENDED_LOCALE_MODEL	1
-#include <strxfrm.c>
+#include <assert.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
 
+#ifndef STRING_TYPE
+# define STRING_TYPE char
+# define USTRING_TYPE unsigned char
+# define STRXFRM __strxfrm_l
+# define STRCMP strcmp
+# define STRLEN strlen
+# define STPNCPY __stpncpy
+# define WEIGHT_H "../locale/weight.h"
+# define SUFFIX	MB
+# define L(arg) arg
+#endif
+
+#define CONCAT(a,b) CONCAT1(a,b)
+#define CONCAT1(a,b) a##b
+
+#include "../locale/localeinfo.h"
+
+
+#ifndef WIDE_CHAR_VERSION
+
+/* We need UTF-8 encoding of numbers.  */
+static int
+utf8_encode (char *buf, int val)
+{
+  int retval;
+
+  if (val < 0x80)
+    {
+      *buf++ = (char) val;
+      retval = 1;
+    }
+  else
+    {
+      int step;
+
+      for (step = 2; step < 6; ++step)
+	if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
+	  break;
+      retval = step;
+
+      *buf = (unsigned char) (~0xff >> step);
+      --step;
+      do
+	{
+	  buf[step] = 0x80 | (val & 0x3f);
+	  val >>= 6;
+	}
+      while (--step > 0);
+      *buf |= val;
+    }
+
+  return retval;
+}
+#endif
+
+
+size_t
+STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+{
+  struct locale_data *current = l->__locales[LC_COLLATE];
+  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
+  /* We don't assign the following values right away since it might be
+     unnecessary in case there are no rules.  */
+  const unsigned char *rulesets;
+  const int32_t *table;
+  const USTRING_TYPE *weights;
+  const USTRING_TYPE *extra;
+  const int32_t *indirect;
+  uint_fast32_t pass;
+  size_t needed;
+  const USTRING_TYPE *usrc;
+  size_t srclen = STRLEN (src);
+  int32_t *idxarr;
+  unsigned char *rulearr;
+  size_t idxmax;
+  size_t idxcnt;
+  int use_malloc;
+
+#include WEIGHT_H
+
+  if (nrules == 0)
+    {
+      if (n != 0)
+	STPNCPY (dest, src, MIN (srclen + 1, n));
+
+      return srclen;
+    }
+
+  rulesets = (const unsigned char *)
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+  table = (const int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
+  weights = (const USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
+  extra = (const USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
+  indirect = (const int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
+  use_malloc = 0;
+
+  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
+  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
+  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
+  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
+
+  /* Handle an empty string as a special case.  */
+  if (srclen == 0)
+    {
+      if (n != 0)
+        *dest = L('\0');
+      return 0;
+    }
+
+  /* We need the elements of the string as unsigned values since they
+     are used as indeces.  */
+  usrc = (const USTRING_TYPE *) src;
+
+  /* Perform the first pass over the string and while doing this find
+     and store the weights for each character.  Since we want this to
+     be as fast as possible we are using `alloca' to store the temporary
+     values.  But since there is no limit on the length of the string
+     we have to use `malloc' if the string is too long.  We should be
+     very conservative here.  */
+  if (! __libc_use_alloca (srclen))
+    {
+      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
+      rulearr = (unsigned char *) &idxarr[srclen];
+
+      if (idxarr == NULL)
+	/* No memory.  Well, go with the stack then.
+
+	   XXX Once this implementation is stable we will handle this
+	   differently.  Instead of precomputing the indeces we will
+	   do this in time.  This means, though, that this happens for
+	   every pass again.  */
+	goto try_stack;
+      use_malloc = 1;
+    }
+  else
+    {
+    try_stack:
+      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
+      rulearr = (unsigned char *) alloca (srclen + 1);
+    }
+
+  idxmax = 0;
+  do
+    {
+      int32_t tmp = findidx (&usrc);
+      rulearr[idxmax] = tmp >> 24;
+      idxarr[idxmax] = tmp & 0xffffff;
+
+      ++idxmax;
+    }
+  while (*usrc != L('\0'));
+
+  /* This element is only read, the value never used but to determine
+     another value which then is ignored.  */
+  rulearr[idxmax] = '\0';
+
+  /* Now the passes over the weights.  We now use the indeces we found
+     before.  */
+  needed = 0;
+  for (pass = 0; pass < nrules; ++pass)
+    {
+      size_t backw_stop = ~0ul;
+      int rule = rulesets[rulearr[0] * nrules + pass];
+      /* We assume that if a rule has defined `position' in one section
+	 this is true for all of them.  */
+      int position = rule & sort_position;
+
+      if (position == 0)
+	{
+	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+	    {
+	      if ((rule & sort_forward) != 0)
+		{
+		  size_t len;
+
+		  if (backw_stop != ~0ul)
+		    {
+		      /* Handle the pushed elements now.  */
+		      size_t backw;
+
+		      for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+			{
+			  len = weights[idxarr[backw]++];
+
+			  if (needed + len < n)
+			    while (len-- > 0)
+			      dest[needed++] = weights[idxarr[backw]++];
+			  else
+			    {
+				/* No more characters fit into the buffer.  */
+			      needed += len;
+			      idxarr[backw] += len;
+			    }
+			}
+
+		      backw_stop = ~0ul;
+		    }
+
+		  /* Now handle the forward element.  */
+		  len = weights[idxarr[idxcnt]++];
+		  if (needed + len < n)
+		    while (len-- > 0)
+		      dest[needed++] = weights[idxarr[idxcnt]++];
+		  else
+		    {
+		      /* No more characters fit into the buffer.  */
+		      needed += len;
+		      idxarr[idxcnt] += len;
+		    }
+		}
+	      else
+		{
+		  /* Remember where the backwards series started.  */
+		  if (backw_stop == ~0ul)
+		    backw_stop = idxcnt;
+		}
+
+	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+	    }
+
+
+	  if (backw_stop != ~0ul)
+	    {
+	      /* Handle the pushed elements now.  */
+	      size_t backw;
+
+	      backw = idxcnt;
+	      while (backw > backw_stop)
+		{
+		  size_t len = weights[idxarr[--backw]++];
+
+		  if (needed + len < n)
+		    while (len-- > 0)
+		      dest[needed++] = weights[idxarr[backw]++];
+		  else
+		    {
+		      /* No more characters fit into the buffer.  */
+		      needed += len;
+		      idxarr[backw] += len;
+		    }
+		}
+	    }
+	}
+      else
+	{
+	  int val = 1;
+#ifndef WIDE_CHAR_VERSION
+	  char buf[7];
+	  size_t buflen;
+#endif
+	  size_t i;
+
+	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+	    {
+	      if ((rule & sort_forward) != 0)
+		{
+		  size_t len;
+
+		  if (backw_stop != ~0ul)
+		    {
+		     /* Handle the pushed elements now.  */
+		      size_t backw;
+
+		      for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+			{
+			  len = weights[idxarr[backw]++];
+			  if (len != 0)
+			    {
+#ifdef WIDE_CHAR_VERSION
+			      if (needed + 1 + len < n)
+				{
+				  dest[needed] = val;
+				  for (i = 0; i < len; ++i)
+				    dest[needed + 1 + i] =
+				      weights[idxarr[backw] + i];
+				}
+			      needed += 1 + len;
+#else
+			      buflen = utf8_encode (buf, val);
+			      if (needed + buflen + len < n)
+				{
+				  for (i = 0; i < buflen; ++i)
+				    dest[needed + i] = buf[i];
+				  for (i = 0; i < len; ++i)
+				    dest[needed + buflen + i] =
+				      weights[idxarr[backw] + i];
+				}
+			      needed += buflen + len;
+#endif
+			      idxarr[backw] += len;
+			      val = 1;
+			    }
+			  else
+			    ++val;
+			}
+
+		      backw_stop = ~0ul;
+		    }
+
+		  /* Now handle the forward element.  */
+		  len = weights[idxarr[idxcnt]++];
+		  if (len != 0)
+		    {
+#ifdef WIDE_CHAR_VERSION
+		      if (needed + 1+ len < n)
+			{
+			  dest[needed] = val;
+			  for (i = 0; i < len; ++i)
+			    dest[needed + 1 + i] =
+			      weights[idxarr[idxcnt] + i];
+			}
+		      needed += 1 + len;
+#else
+		      buflen = utf8_encode (buf, val);
+		      if (needed + buflen + len < n)
+			{
+			  for (i = 0; i < buflen; ++i)
+			    dest[needed + i] = buf[i];
+			  for (i = 0; i < len; ++i)
+			    dest[needed + buflen + i] =
+			      weights[idxarr[idxcnt] + i];
+			}
+		      needed += buflen + len;
+#endif
+		      idxarr[idxcnt] += len;
+		      val = 1;
+		    }
+		  else
+		    /* Note that we don't have to increment `idxarr[idxcnt]'
+		       since the length is zero.  */
+		    ++val;
+		}
+	      else
+		{
+		  /* Remember where the backwards series started.  */
+		  if (backw_stop == ~0ul)
+		    backw_stop = idxcnt;
+		}
+
+	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+	    }
+
+	  if (backw_stop != ~0ul)
+	    {
+	      /* Handle the pushed elements now.  */
+	      size_t backw;
+
+	      backw = idxmax - 1;
+	      while (backw > backw_stop)
+		{
+		  size_t len = weights[idxarr[--backw]++];
+		  if (len != 0)
+		    {
+#ifdef WIDE_CHAR_VERSION
+		      if (needed + 1 + len < n)
+			{
+			  dest[needed] = val;
+			  for (i = 0; i < len; ++i)
+			    dest[needed + 1 + i] =
+			      weights[idxarr[backw] + i];
+			}
+		      needed += 1 + len;
+#else
+		      buflen = utf8_encode (buf, val);
+		      if (needed + buflen + len < n)
+			{
+			  for (i = 0; i < buflen; ++i)
+			    dest[needed + i] = buf[i];
+			  for (i = 0; i < len; ++i)
+			    dest[needed + buflen + i] =
+			      weights[idxarr[backw] + i];
+			}
+		      needed += buflen + len;
+#endif
+		      idxarr[backw] += len;
+		      val = 1;
+		    }
+		  else
+		    ++val;
+		}
+	    }
+	}
+
+      /* Finally store the byte to separate the passes or terminate
+	 the string.  */
+      if (needed < n)
+	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
+      ++needed;
+    }
+
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
+    {
+      /* Remove the \1 byte.  */
+      --needed;
+      dest[needed - 1] = L('\0');
+    }
+
+  /* Free the memory if needed.  */
+  if (use_malloc)
+    free (idxarr);
+
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
+libc_hidden_def (STRXFRM)
+
+#ifndef WIDE_CHAR_VERSION
 weak_alias (__strxfrm_l, strxfrm_l)
+#endif