about summary refs log tree commit diff
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2010-07-30 00:14:04 -0700
committerUlrich Drepper <drepper@redhat.com>2010-07-30 00:14:04 -0700
commit42e08a5438ddbd9d550d914733c0bc5ba96d79ec (patch)
tree5a9f393d2b0b213db465584b0d6b4f01d277b02a
parentfe36dd025ea34c5c082b688592618ec72369b96b (diff)
downloadglibc-42e08a5438ddbd9d550d914733c0bc5ba96d79ec.tar.gz
glibc-42e08a5438ddbd9d550d914733c0bc5ba96d79ec.tar.xz
glibc-42e08a5438ddbd9d550d914733c0bc5ba96d79ec.zip
Implement optimized strcasecmp for x86-64.
-rw-r--r--ChangeLog11
-rw-r--r--NEWS2
-rw-r--r--string/Makefile2
-rw-r--r--string/test-strcasecmp.c276
-rw-r--r--sysdeps/x86_64/Makefile3
-rw-r--r--sysdeps/x86_64/locale-defines.sym11
-rw-r--r--sysdeps/x86_64/strcasecmp.S1
-rw-r--r--sysdeps/x86_64/strcasecmp_l-nonascii.c5
-rw-r--r--sysdeps/x86_64/strcasecmp_l.S6
-rw-r--r--sysdeps/x86_64/strcmp.S136
10 files changed, 449 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index f19b63b908..7b8c416f0c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2010-07-30  Ulrich Drepper  <drepper@redhat.com>
 
+	* string/Makefile (strop-tests): Add strcasecmp.
+	* sysdeps/x86_64/Makefile [subdir=string] (sysdep_routines): Add
+	strcasecmp_l-nonascii.
+	(gen-as-const-headers): Add locale-defines.sym.
+	* sysdeps/x86_64/strcmp.S: Add support for strcasecmp implementation.
+	* sysdeps/x86_64/strcasecmp.S: New file.
+	* sysdeps/x86_64/strcasecmp_l.S: New file.
+	* sysdeps/x86_64/strcasecmp_l-nonascii.c: New file.
+	* sysdeps/x86_64/locale-defines.sym: New file.
+	* string/test-strcasecmp.c: New file.
+
 	* string/test-strcasestr.c: Test both ends of the range of characters.
 	* sysdeps/x86_64/multiarch/strstr.c: Fix UCHIGH definition.
 
diff --git a/NEWS b/NEWS
index 8358f62efe..8d9bb43ec3 100644
--- a/NEWS
+++ b/NEWS
@@ -13,7 +13,7 @@ Version 2.13
 
 * POWER7 optimizations: memset, memcmp, strncmp
 
-* New optimized string functions for x86-64: strnlen
+* New optimized string functions for x86-64: strnlen, strcasecmp
   Implemented by Ulrich Drepper.
 
 Version 2.12
diff --git a/string/Makefile b/string/Makefile
index e8eb514155..4c160e9d2d 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -49,7 +49,7 @@ o-objects.ob	:= memcpy.o memset.o memchr.o
 strop-tests	:= memchr memcmp memcpy memmove mempcpy memset memccpy	\
 		   stpcpy stpncpy strcat strchr strcmp strcpy strcspn	\
 		   strlen strncmp strncpy strpbrk strrchr strspn memmem	\
-		   strstr strcasestr strnlen
+		   strstr strcasestr strnlen strcasecmp
 tests		:= tester inl-tester noinl-tester testcopy test-ffs	\
 		   tst-strlen stratcliff tst-svc tst-inlcall		\
 		   bug-strncat1 bug-strspn1 bug-strpbrk1 tst-bswap	\
diff --git a/string/test-strcasecmp.c b/string/test-strcasecmp.c
new file mode 100644
index 0000000000..7d1d110148
--- /dev/null
+++ b/string/test-strcasecmp.c
@@ -0,0 +1,276 @@
+/* Test and measure strcasecmp functions.
+   Copyright (C) 1999, 2002, 2003, 2005, 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Jakub Jelinek <jakub@redhat.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <ctype.h>
+#define TEST_MAIN
+#include "test-string.h"
+
+typedef int (*proto_t) (const char *, const char *);	/* Common signature of the three implementations registered below.  */
+static int simple_strcasecmp (const char *, const char *);
+static int stupid_strcasecmp (const char *, const char *);
+
+IMPL (stupid_strcasecmp, 0)	/* NOTE(review): the second IMPL argument presumably selects the implementations visited by FOR_EACH_IMPL (impl, 1) -- confirm in test-string.h.  */
+IMPL (simple_strcasecmp, 0)
+IMPL (strcasecmp, 1)
+
+static int	/* Reference strcasecmp: walk both strings, comparing the lowercased bytes.  */
+simple_strcasecmp (const char *s1, const char *s2)
+{
+  int ret;
+
+  while ((ret = ((unsigned char) tolower (*s1)	/* Difference of the two lowercased bytes...  */
+		 - (unsigned char) tolower (*s2))) == 0
+	 && *s1++)	/* ...loop ends at the first mismatch or once the NUL of s1 has been compared.  */
+    ++s2;
+  return ret;	/* 0 when equal ignoring case, otherwise the difference at the first mismatch.  */
+}
+
+static int	/* Reference strcasecmp that first measures both strings with strlen.  */
+stupid_strcasecmp (const char *s1, const char *s2)
+{
+  size_t ns1 = strlen (s1) + 1, ns2 = strlen (s2) + 1;	/* Lengths including the terminating NUL.  */
+  size_t n = ns1 < ns2 ? ns1 : ns2;	/* Compare no further than the NUL of the shorter string.  */
+  int ret = 0;
+
+  while (n--)
+    {
+      if ((ret = ((unsigned char) tolower (*s1)
+		  - (unsigned char) tolower (*s2))) != 0)
+	break;	/* The first case-insensitive mismatch decides the result.  */
+      ++s1;
+      ++s2;
+    }
+  return ret;	/* Still 0 if no mismatch was found.  */
+}
+
+static void	/* Check one implementation against the sign of EXP_RESULT, then time it.  */
+do_one_test (impl_t *impl, const char *s1, const char *s2, int exp_result)
+{
+  int result = CALL (impl, s1, s2);
+  if ((exp_result == 0 && result != 0)	/* Only the sign of the result is checked, not its magnitude.  */
+      || (exp_result < 0 && result >= 0)
+      || (exp_result > 0 && result <= 0))
+    {
+      error (0, 0, "Wrong result in function %s %d %d", impl->name,
+	     result, exp_result);
+      ret = 1;	/* Record the failure; test_main returns ret.  */
+      return;
+    }
+
+  if (HP_TIMING_AVAIL)
+    {
+      hp_timing_t start __attribute ((unused));
+      hp_timing_t stop __attribute ((unused));
+      hp_timing_t best_time = ~ (hp_timing_t) 0;	/* Start from the largest representable time.  */
+      size_t i;
+
+      for (i = 0; i < 32; ++i)	/* 32 timed calls; HP_TIMING_BEST presumably keeps the minimum.  */
+	{
+	  HP_TIMING_NOW (start);
+	  CALL (impl, s1, s2);
+	  HP_TIMING_NOW (stop);
+	  HP_TIMING_BEST (best_time, start, stop);
+	}
+
+      printf ("\t%zu", (size_t) best_time);	/* size_t is unsigned, so %zu rather than %zd.  */
+    }
+}
+
+static void	/* Build one deterministic string pair and run every implementation on it.  */
+do_test (size_t align1, size_t align2, size_t len, int max_char,
+	 int exp_result)
+{
+  size_t i;
+  char *s1, *s2;
+
+  if (len == 0)	/* The last-byte adjustment below needs len >= 1.  */
+    return;
+
+  align1 &= 7;	/* Alignments are reduced to 0..7.  */
+  if (align1 + len + 1 >= page_size)	/* Index len + 1 is written below and must stay in range.  */
+    return;
+
+  align2 &= 7;
+  if (align2 + len + 1 >= page_size)
+    return;
+
+  s1 = (char *) (buf1 + align1);
+  s2 = (char *) (buf2 + align2);
+
+  for (i = 0; i < len; i++)
+    {
+      s1[i] = toupper (1 + 23 * i % max_char);	/* s1 holds the uppercase form...  */
+      s2[i] = tolower (s1[i]);	/* ...and s2 the lowercase form of the same byte.  */
+    }
+
+  s1[len] = s2[len] = 0;
+  s1[len + 1] = 23;	/* NOTE(review): bytes after the NUL, presumably to catch reads past the terminator.  */
+  s2[len + 1] = 24 + exp_result;
+  if ((s2[len - 1] == 'z' && exp_result == -1)	/* Adjust one last byte so the result has the sign of exp_result...  */
+      || (s2[len - 1] == 'a' && exp_result == 1))
+    s1[len - 1] += exp_result;	/* ...changing s1 instead when s2 would become 'z' + 1 or 'a' - 1.  */
+  else
+    s2[len - 1] -= exp_result;	/* No change when exp_result is 0.  */
+
+  if (HP_TIMING_AVAIL)
+    printf ("Length %4zu, alignment %2zu/%2zu:", len, align1, align2);	/* size_t arguments: %zu, not %zd.  */
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_test (impl, s1, s2, exp_result);
+
+  if (HP_TIMING_AVAIL)
+    putchar ('\n');
+}
+
+static void	/* Compare randomly generated lowercase/uppercase string pairs.  */
+do_random_tests (void)
+{
+  size_t i, j, n, align1, align2, pos, len1, len2;
+  int result;
+  long r;
+  unsigned char *p1 = buf1 + page_size - 512;	/* Work in the last 512 bytes of each buffer.  */
+  unsigned char *p2 = buf2 + page_size - 512;
+
+  for (n = 0; n < ITERATIONS; n++)
+    {
+      align1 = random () & 31;
+      if (random () & 1)
+	align2 = random () & 31;
+      else
+	align2 = align1 + (random () & 24);
+      pos = random () & 511;	/* Offset of the first byte that may differ.  */
+      j = align1 > align2 ? align1 : align2;
+      if (pos + j >= 511)
+	pos = 510 - j - (random () & 7);
+      len1 = random () & 511;
+      if (pos >= len1 && (random () & 1))
+	len1 = pos + (random () & 7);
+      if (len1 + j >= 512)
+	len1 = 511 - j - (random () & 7);
+      if (pos >= len1)
+	len2 = len1;
+      else
+	len2 = len1 + (len1 != 511 - j ? random () % (511 - j - len1) : 0);
+      j = (pos > len2 ? pos : len2) + align1 + 64;
+      if (j > 512)
+	j = 512;
+      for (i = 0; i < j; ++i)
+	{
+	  p1[i] = tolower (random () & 255);
+	  if (i < len1 + align1 && !p1[i])	/* No NUL inside the string proper.  */
+	    {
+	      p1[i] = tolower (random () & 255);
+	      if (!p1[i])
+		p1[i] = tolower (1 + (random () & 127));
+	    }
+	}
+      for (i = 0; i < j; ++i)
+	{
+	  p2[i] = toupper (random () & 255);
+	  if (i < len2 + align2 && !p2[i])
+	    {
+	      p2[i] = toupper (random () & 255);
+	      if (!p2[i])
+		p2[i] = toupper (1 + (random () & 127));	/* Store the upper-cased byte, as the p1 branch does with tolower.  */
+	    }
+	}
+
+      result = 0;	/* Expected sign; stays 0 when no mismatch is planted.  */
+      memcpy (p2 + align2, p1 + align1, pos);	/* Make the first pos bytes identical.  */
+      if (pos < len1)
+	{
+	  if (tolower (p2[align2 + pos]) == p1[align1 + pos])	/* Force a case-insensitive mismatch at pos.  */
+	    {
+	      p2[align2 + pos] = toupper (random () & 255);
+	      if (tolower (p2[align2 + pos]) == p1[align1 + pos])
+		p2[align2 + pos] = toupper (p1[align1 + pos]
+					    + 3 + (random () & 127));
+	    }
+
+	  if (p1[align1 + pos] < tolower (p2[align2 + pos]))
+	    result = -1;
+	  else
+	    result = 1;
+	}
+      p1[len1 + align1] = 0;
+      p2[len2 + align2] = 0;
+
+      FOR_EACH_IMPL (impl, 1)
+	{
+	  r = CALL (impl, (char *) (p1 + align1), (char *) (p2 + align2));
+	  /* On 64-bit architectures whose ABI requires the callee to promote
+	     the return value, check that the promotion has been done.  */
+	  asm ("" : "=g" (r) : "0" (r));
+	  if ((r == 0 && result)
+	      || (r < 0 && result >= 0)
+	      || (r > 0 && result <= 0))
+	    {
+	      error (0, 0, "Iteration %zu - wrong result in function %s (%zu, %zu, %zu, %zu, %zu) %ld != %d, p1 %p p2 %p",
+		     n, impl->name, align1, align2, len1, len2, pos, r, result, p1, p2);
+	      ret = 1;
+	    }
+	}
+    }
+}
+
+int	/* Test driver: run the fixed test matrix, then the random tests.  */
+test_main (void)
+{
+  size_t i;
+
+  test_init ();
+
+  printf ("%23s", "");	/* Header row: blank label column, then one name per implementation.  */
+  FOR_EACH_IMPL (impl, 0)
+    printf ("\t%s", impl->name);
+  putchar ('\n');
+
+  for (i = 1; i < 16; ++i)	/* Short strings: alignment arguments and length all equal to i.  */
+    {
+      do_test (i, i, i, 127, 0);
+      do_test (i, i, i, 127, 1);
+      do_test (i, i, i, 127, -1);
+    }
+
+  for (i = 1; i < 10; ++i)	/* Alignment 0, lengths 4..1024, with max_char 127 and 254.  */
+    {
+      do_test (0, 0, 2 << i, 127, 0);
+      do_test (0, 0, 2 << i, 254, 0);
+      do_test (0, 0, 2 << i, 127, 1);
+      do_test (0, 0, 2 << i, 254, 1);
+      do_test (0, 0, 2 << i, 127, -1);
+      do_test (0, 0, 2 << i, 254, -1);
+    }
+
+  for (i = 1; i < 8; ++i)	/* Differing alignment arguments, lengths 16..1024.  */
+    {
+      do_test (i, 2 * i, 8 << i, 127, 0);
+      do_test (2 * i, i, 8 << i, 254, 0);
+      do_test (i, 2 * i, 8 << i, 127, 1);
+      do_test (2 * i, i, 8 << i, 254, 1);
+      do_test (i, 2 * i, 8 << i, 127, -1);
+      do_test (2 * i, i, 8 << i, 254, -1);
+    }
+
+  do_random_tests ();
+  return ret;	/* ret is set to 1 by any check that reported an error.  */
+}
+
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index e8d0285e26..f7eeb155ed 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -12,7 +12,8 @@ sysdep_routines += _mcount
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += cacheinfo
+sysdep_routines += cacheinfo strcasecmp_l-nonascii
+gen-as-const-headers += locale-defines.sym
 endif
 
 ifeq ($(subdir),elf)
diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/sysdeps/x86_64/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/sysdeps/x86_64/strcasecmp.S b/sysdeps/x86_64/strcasecmp.S
new file mode 100644
index 0000000000..fe49e820f2
--- /dev/null
+++ b/sysdeps/x86_64/strcasecmp.S
@@ -0,0 +1 @@
+/* In strcasecmp_l.S.  */
diff --git a/sysdeps/x86_64/strcasecmp_l-nonascii.c b/sysdeps/x86_64/strcasecmp_l-nonascii.c
new file mode 100644
index 0000000000..7a0a04f345
--- /dev/null
+++ b/sysdeps/x86_64/strcasecmp_l-nonascii.c
@@ -0,0 +1,5 @@
+#include <string.h>
+
+#define __strcasecmp_l __strcasecmp_l_nonascii	/* Any __strcasecmp_l in the file included below is compiled under this name.  */
+#define USE_IN_EXTENDED_LOCALE_MODEL    1	/* NOTE(review): presumably selects the locale-argument variant in string/strcasecmp.c -- confirm.  */
+#include <string/strcasecmp.c>
diff --git a/sysdeps/x86_64/strcasecmp_l.S b/sysdeps/x86_64/strcasecmp_l.S
new file mode 100644
index 0000000000..5456b3a49e
--- /dev/null
+++ b/sysdeps/x86_64/strcasecmp_l.S
@@ -0,0 +1,6 @@
+#define STRCMP __strcasecmp_l	/* Name given to the ENTRY (BP_SYM (STRCMP)) entry point in strcmp.S.  */
+#define USE_AS_STRCASECMP_L	/* Enables the USE_AS_STRCASECMP_L sections of strcmp.S.  */
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
+libc_hidden_def (strcasecmp_l)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index ac3fe14679..7b2b246866 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -51,6 +51,15 @@
 	je	LABEL(strcmp_exitz);			\
 	mov	%r9, %r11
 
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+
+/* No support for strcasecmp outside libc so far since it is not needed.  */
+# ifdef NOT_IN_libc
+#  error "strcasecmp_l not implemented so far"
+# endif
+
+# define UPDATE_STRNCMP_COUNTER
 #else
 # define UPDATE_STRNCMP_COUNTER
 # ifndef STRCMP
@@ -64,6 +73,19 @@
 	.section .text.ssse3,"ax",@progbits
 #endif
 
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax	/* Thread-pointer offset of __libc_tsd_LOCALE.  */
+	movq	%fs:(%rax),%rdx	/* %rdx = its value; the code after the fall-through reads %rdx as the locale.  */
+
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+END (__strcasecmp)
+weak_alias (__strcasecmp, strcasecmp)
+libc_hidden_def (__strcasecmp)
+	/* FALLTHROUGH to strcasecmp_l.  */
+#endif
+
 ENTRY (BP_SYM (STRCMP))
 #ifdef NOT_IN_libc
 /* Simple version since we can't use SSE registers in ld.so.  */
@@ -84,6 +106,18 @@ L(neq):	movl	$1, %eax
 	ret
 END (BP_SYM (STRCMP))
 #else	/* NOT_IN_libc */
+# ifdef USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales
+	   with encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movq	LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
+#  else
+	movq	(%rdx), %rax
+#  endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)	/* Mask must be nonzero: TEST against $0 always sets ZF, so the jne below could never be taken.  */
+	jne	__strcasecmp_l_nonascii
+# endif
+
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
@@ -99,6 +133,26 @@ END (BP_SYM (STRCMP))
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
+# ifdef USE_AS_STRCASECMP_L
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:	/* 0x40 in every byte: one below 'A'.  */
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:	/* 0x5b in every byte: one above 'Z'.  */
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:	/* 0x20 in every byte: the bit TOLOWER ORs into bytes inside the range.  */
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+	movdqa	.Lbelowupper(%rip), %xmm5	/* The three constants stay in %xmm5-%xmm7.  */
+# define UCLOW_reg %xmm5
+	movdqa	.Ltopupper(%rip), %xmm6
+# define UCHIGH_reg %xmm6
+	movdqa	.Ltouppermask(%rip), %xmm7
+# define LCQWORD_reg %xmm7
+# endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
@@ -107,6 +161,26 @@ END (BP_SYM (STRCMP))
 	movlpd	(%rsi), %xmm2
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
+# ifdef USE_AS_STRCASECMP_L	/* TOLOWER ORs LCQWORD_reg into each byte of reg1/reg2 that is > UCLOW_reg and < UCHIGH_reg (signed byte compares); %xmm8-%xmm11 are scratch.  */
+#  define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm8;					\
+	movdqa	UCHIGH_reg, %xmm9;				\
+	movdqa	reg2, %xmm10;					\
+	movdqa	UCHIGH_reg, %xmm11;				\
+	pcmpgtb	UCLOW_reg, %xmm8;				\
+	pcmpgtb	reg1, %xmm9;					\
+	pcmpgtb	UCLOW_reg, %xmm10;				\
+	pcmpgtb	reg2, %xmm11;					\
+	pand	%xmm9, %xmm8;					\
+	pand	%xmm11, %xmm10;					\
+	pand	LCQWORD_reg, %xmm8;				\
+	pand	LCQWORD_reg, %xmm10;				\
+	por	%xmm8, reg1;					\
+	por	%xmm10, reg2
+	TOLOWER (%xmm1, %xmm2)	/* Fold the first 16 bytes of both strings before comparing.  */
+# else
+#  define TOLOWER(reg1, reg2)	/* Expands to nothing when not building strcasecmp_l.  */
+# endif
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -159,7 +233,13 @@ LABEL(ashr_0):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
+# ifndef USE_AS_STRCASECMP_L
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
+# else
+	movdqa	(%rdi), %xmm2
+	TOLOWER (%xmm1, %xmm2)
+	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
+# endif
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -183,6 +263,7 @@ LABEL(ashr_0):
 LABEL(loop_ashr_0):
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -198,6 +279,7 @@ LABEL(loop_ashr_0):
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -214,7 +296,7 @@ LABEL(loop_ashr_0):
 
 /*
  * The following cases will be handled by ashr_1
- * rcx(offset of rsi)  rax(offset of rdi)   relative offset   	corresponding case
+ * rcx(offset of rsi)  rax(offset of rdi)   relative offset	corresponding case
  *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
  */
 	.p2align 4
@@ -224,6 +306,7 @@ LABEL(ashr_1):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pslldq	$15, %xmm2		/* shift first string to align with second */
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
 	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
 	pmovmskb %xmm2, %r9d
@@ -263,6 +346,7 @@ LABEL(gobble_ashr_1):
 # else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -292,6 +376,7 @@ LABEL(gobble_ashr_1):
 # else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -351,6 +436,7 @@ LABEL(ashr_2):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$14, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -390,6 +476,7 @@ LABEL(gobble_ashr_2):
 # else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -420,6 +507,7 @@ LABEL(gobble_ashr_2):
 # else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -472,6 +560,7 @@ LABEL(ashr_3):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$13, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -512,6 +601,7 @@ LABEL(gobble_ashr_3):
 # else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -542,6 +632,7 @@ LABEL(gobble_ashr_3):
 # else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -594,6 +685,7 @@ LABEL(ashr_4):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$12, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -634,6 +726,7 @@ LABEL(gobble_ashr_4):
 # else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -664,6 +757,7 @@ LABEL(gobble_ashr_4):
 # else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -716,6 +810,7 @@ LABEL(ashr_5):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$11, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -756,6 +851,7 @@ LABEL(gobble_ashr_5):
 # else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -786,6 +882,7 @@ LABEL(gobble_ashr_5):
 # else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -838,6 +935,7 @@ LABEL(ashr_6):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$10, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -878,6 +976,7 @@ LABEL(gobble_ashr_6):
 # else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -908,6 +1007,7 @@ LABEL(gobble_ashr_6):
 # else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -960,6 +1060,7 @@ LABEL(ashr_7):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$9, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1000,6 +1101,7 @@ LABEL(gobble_ashr_7):
 # else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1030,6 +1132,7 @@ LABEL(gobble_ashr_7):
 # else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1082,6 +1185,7 @@ LABEL(ashr_8):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$8, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1122,6 +1226,7 @@ LABEL(gobble_ashr_8):
 # else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1152,6 +1257,7 @@ LABEL(gobble_ashr_8):
 # else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1204,6 +1310,7 @@ LABEL(ashr_9):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$7, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1244,6 +1351,7 @@ LABEL(gobble_ashr_9):
 # else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1274,6 +1382,7 @@ LABEL(gobble_ashr_9):
 # else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1326,6 +1435,7 @@ LABEL(ashr_10):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$6, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1366,6 +1476,7 @@ LABEL(gobble_ashr_10):
 # else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1396,6 +1507,7 @@ LABEL(gobble_ashr_10):
 # else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1448,6 +1560,7 @@ LABEL(ashr_11):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$5, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1488,6 +1601,7 @@ LABEL(gobble_ashr_11):
 # else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1518,6 +1632,7 @@ LABEL(gobble_ashr_11):
 # else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1570,6 +1685,7 @@ LABEL(ashr_12):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$4, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1610,6 +1726,7 @@ LABEL(gobble_ashr_12):
 # else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1640,6 +1757,7 @@ LABEL(gobble_ashr_12):
 # else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1692,6 +1810,7 @@ LABEL(ashr_13):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$3, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1732,6 +1851,7 @@ LABEL(gobble_ashr_13):
 # else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1762,6 +1882,7 @@ LABEL(gobble_ashr_13):
 # else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1814,6 +1935,7 @@ LABEL(ashr_14):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq  $2, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1854,6 +1976,7 @@ LABEL(gobble_ashr_14):
 # else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1884,6 +2007,7 @@ LABEL(gobble_ashr_14):
 # else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -1936,6 +2060,7 @@ LABEL(ashr_15):
 	movdqa	(%rsi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pslldq	$1, %xmm2
+	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm1, %xmm2
 	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
@@ -1978,6 +2103,7 @@ LABEL(gobble_ashr_15):
 # else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -2008,6 +2134,7 @@ LABEL(gobble_ashr_15):
 # else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
 # endif
+	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
 	pcmpeqb	%xmm2, %xmm1
@@ -2049,6 +2176,7 @@ LABEL(ashr_15_exittail):
 
 	.p2align 4
 LABEL(aftertail):
+	TOLOWER (%xmm1, %xmm3)
 	pcmpeqb	%xmm3, %xmm1
 	psubb	%xmm0, %xmm1
 	pmovmskb %xmm1, %edx
@@ -2076,6 +2204,12 @@ LABEL(less16bytes):
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
+# ifdef USE_AS_STRCASECMP_L
+	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx	/* NOTE(review): 128*4 presumably skips 128 four-byte entries for negative indices -- confirm.  */
+	movl	(%rdx,%rcx,4), %ecx	/* Replace each zero-extended byte by its 4-byte table entry...  */
+	movl	(%rdx,%rax,4), %eax	/* ...so the subtraction below compares the mapped values.  */
+# endif
+
 	sub	%ecx, %eax
 	ret