summary refs log tree commit diff
path: root/string/strcoll_l.c
diff options
context:
space:
mode:
Diffstat (limited to 'string/strcoll_l.c')
-rw-r--r--string/strcoll_l.c38
1 files changed, 37 insertions, 1 deletions
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 658d5b9b90..0fa005f0f9 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -29,6 +29,7 @@
 # define STRING_TYPE char
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
+# define STRDIFF __strdiff
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -41,6 +42,20 @@
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
+#define MASK_UTF8_7BIT  (1 << 7)
+#define MASK_UTF8_START (3 << 6)
+
+size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
 /* Track status while looking for sequences in a string.  */
 typedef struct
 {
@@ -255,9 +270,29 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   const USTRING_TYPE *extra;
   const int32_t *indirect;
 
+  /* In case there is no locale specific sort order (C / POSIX).  */
   if (nrules == 0)
     return STRCMP (s1, s2);
 
+  /* Fast forward to the position of the first difference.  Needs to be
+     encoding aware as the byte-by-byte comparison can stop in the middle
+     of a char sequence for multibyte encodings like UTF-8.  */
+  uint_fast32_t encoding =
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
+  if (encoding != __cet_other)
+    {
+      size_t diff = STRDIFF (s1, s2);
+      if (diff > 0)
+	{
+	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
+	    do
+	      diff--;
+	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
+	  s1 += diff;
+	  s2 += diff;
+	}
+    }
+
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
     return (*s1 != '\0') - (*s2 != '\0');
@@ -321,7 +356,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 		     byte-level comparison to ensure that we don't waste time
 		     going through multiple passes for totally equal strings
 		     before proceeding to subsequent passes.  */
-		  if (pass == 0 && STRCMP (s1, s2) == 0)
+		  if (pass == 0 && encoding == __cet_other &&
+		      STRCMP (s1, s2) == 0)
 		    return result;
 		  else
 		    break;