about summary refs log tree commit diff
path: root/string/strcoll_l.c
diff options
context:
space:
mode:
authorLeonhard Holz <leonhard.holz@web.de>2015-05-12 11:37:52 +0200
committerOndřej Bílka <neleai@seznam.cz>2015-05-12 11:37:52 +0200
commitf13c2a8dff2329c6692a80176262ceaaf8a6f74e (patch)
treedd2443fba95dd55830d0d0e745a055ef3981c506 /string/strcoll_l.c
parent34cb304e5a6df706e186d504b69af974bfc15a2f (diff)
downloadglibc-f13c2a8dff2329c6692a80176262ceaaf8a6f74e.tar.gz
glibc-f13c2a8dff2329c6692a80176262ceaaf8a6f74e.tar.xz
glibc-f13c2a8dff2329c6692a80176262ceaaf8a6f74e.zip
Improve strcoll with strdiff.
This patch improves strcoll hot case by finding first byte that
mismatches. That is in likely case enough to determine comparison
result.
Diffstat (limited to 'string/strcoll_l.c')
-rw-r--r--string/strcoll_l.c38
1 files changed, 37 insertions, 1 deletions
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 658d5b9b90..0fa005f0f9 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -29,6 +29,7 @@
 # define STRING_TYPE char
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
+# define STRDIFF __strdiff
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -41,6 +42,20 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
+#define MASK_UTF8_7BIT  (1 << 7)
+#define MASK_UTF8_START (3 << 6)
+
+size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
 /* Track status while looking for sequences in a string.  */
 typedef struct
 {
@@ -255,9 +270,29 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   const USTRING_TYPE *extra;
   const int32_t *indirect;
 
+  /* In case there is no locale specific sort order (C / POSIX).  */
   if (nrules == 0)
     return STRCMP (s1, s2);
 
+  /* Fast forward to the position of the first difference.  Needs to be
+     encoding aware as the byte-by-byte comparison can stop in the middle
+     of a char sequence for multibyte encodings like UTF-8.  */
+  uint_fast32_t encoding =
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
+  if (encoding != __cet_other)
+    {
+      size_t diff = STRDIFF (s1, s2);
+      if (diff > 0)
+	{
+	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
+	    do
+	      diff--;
+	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
+	  s1 += diff;
+	  s2 += diff;
+	}
+    }
+
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
     return (*s1 != '\0') - (*s2 != '\0');
@@ -321,7 +356,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 		     byte-level comparison to ensure that we don't waste time
 		     going through multiple passes for totally equal strings
 		     before proceeding to subsequent passes.  */
-		  if (pass == 0 && STRCMP (s1, s2) == 0)
+		  if (pass == 0 && encoding == __cet_other &&
+		      STRCMP (s1, s2) == 0)
 		    return result;
 		  else
 		    break;