about summary refs log tree commit diff
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@gmail.com>2011-11-14 18:24:35 -0500
committerUlrich Drepper <drepper@gmail.com>2011-11-14 18:24:35 -0500
commit6abf346582ba678f4850a88b4a5950593841df1d (patch)
tree97c2ed051c43f3b1b8a08b71459fee856c25efc6
parent76e3966e9efc3808a9e7ad09121c5dfc1211c20b (diff)
downloadglibc-6abf346582ba678f4850a88b4a5950593841df1d.tar.gz
glibc-6abf346582ba678f4850a88b4a5950593841df1d.tar.xz
glibc-6abf346582ba678f4850a88b4a5950593841df1d.zip
Add SSE4.2 support for strcasecmp and strncasecmp on x86-32
-rw-r--r--ChangeLog14
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile3
-rw-r--r--sysdeps/i386/i686/multiarch/strcasecmp.S3
-rw-r--r--sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S2
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp-sse4.S534
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp-ssse3.S6
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp.S6
-rw-r--r--sysdeps/i386/i686/multiarch/strncase.S3
-rw-r--r--sysdeps/i386/i686/multiarch/strncase_l-sse4.S2
9 files changed, 484 insertions, 89 deletions
diff --git a/ChangeLog b/ChangeLog
index 739105f553..d5889fc5b1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2011-11-14  Ulrich Drepper  <drepper@gmail.com>
+
+	* sysdeps/i386/i686/multiarch/Makefile [subdir=string]
+	(sysdep_routines): Add strcasecmp_l-sse4 and strncase_l-sse4.
+	* sysdeps/i386/i686/multiarch/strcasecmp.S: Re-enable SSE4.2 code.
+	* sysdeps/i386/i686/multiarch/strcmp.S: Likewise.
+	* sysdeps/i386/i686/multiarch/strncase.S: Likewise.
+	* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Change to allow reuse
+	to compile strcasecmp and strncasecmp.
+	* sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S: New file.
+	* sysdeps/i386/i686/multiarch/strncase_l-sse4.S: New file.
+
+	* sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Use L macro consistently.
+
 2011-11-13  Ulrich Drepper  <drepper@gmail.com>
 
 	* sysdeps/i386/i686/multiarch/Makefile [subdir=string]: Add
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 05bd65f463..426b718e47 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -21,7 +21,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   rawmemchr-sse2 rawmemchr-sse2-bsf \
 		   strnlen-sse2 strnlen-c \
 		   strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
-		   strncase_l-c strncase-c strncase_l-ssse3
+		   strncase_l-c strncase-c strncase_l-ssse3 \
+		   strcasecmp_l-sse4 strncase_l-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/strcasecmp.S b/sysdeps/i386/i686/multiarch/strcasecmp.S
index 55f10ba6af..97603d884b 100644
--- a/sysdeps/i386/i686/multiarch/strcasecmp.S
+++ b/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -36,12 +36,9 @@ ENTRY(__strcasecmp)
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__strcasecmp_ssse3@GOTOFF(%ebx), %eax
-#if 0
-	// XXX Temporarily
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__strcasecmp_sse4_2@GOTOFF(%ebx), %eax
-#endif
 2:	popl	%ebx
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (ebx)
diff --git a/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
index 0de0a113c0..1df63e3156 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -1,5 +1,5 @@
 /* strcmp with SSE4.2
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -34,33 +34,156 @@
 #define PUSH(REG)	pushl REG; CFI_PUSH (REG)
 #define POP(REG)	popl REG; CFI_POP (REG)
 
-#ifndef USE_AS_STRNCMP
+#ifdef USE_AS_STRNCMP
 # ifndef STRCMP
-#  define STRCMP	__strcmp_sse4_2
+#  define STRCMP	__strncmp_sse4_2
 # endif
-# define STR1		4
+# define STR1		8
 # define STR2		STR1+4
-# define RETURN		ret; .p2align 4
-#else
+# define CNT		STR2+4
+# define RETURN		POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM		%ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
 # ifndef STRCMP
-#  define STRCMP	__strncmp_sse4_2
+#  define STRCMP	__strcasecmp_l_sse4_2
 # endif
-# define STR1		8
+# define STR1		12
+# define STR2		STR1+4
+# define LOCALE		12	/* Loaded before the adjustement.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (%ebx); ret; \
+			.p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII	__strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+#  define STRCMP	__strncasecmp_l_sse4_2
+# endif
+# define STR1		16
 # define STR2		STR1+4
 # define CNT		STR2+4
-# define RETURN		POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp)
+# define LOCALE		16	/* Loaded before the adjustement.  */
+# ifdef PIC
+#  define RETURN	POP (%edi); POP (REM); POP (%ebx); ret; \
+			.p2align 4; \
+			CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+#  define RETURN	POP (%edi); POP (REM); ret; \
+			.p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM		%ebp
+# define NONASCII	__strncasecmp_nonascii
+#else
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
 #endif
 
 	.section .text.sse4.2,"ax",@progbits
-ENTRY (STRCMP)
-#ifdef USE_AS_STRNCMP
-	PUSH	(%ebp)
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# else
+	movl	__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+	movl	%gs:(%eax), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	__strcasecmp_nonascii
+	jmp	L(ascii)
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)
+# ifdef PIC
+	PUSH	(%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	movl	__libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# else
+	movl	__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+	movl	%gs:(%eax), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	__strncasecmp_nonascii
+	jmp	L(ascii)
+END (__strncasecmp_sse4_2)
+#endif
+
+	ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movl	LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	movl	LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+	movl	(%eax), %eax
+# endif
+	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+	jne	NONASCII
+
+# ifdef PIC
+	PUSH	(%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+# endif
+L(ascii):
+	.section .rodata.cst16,"aM",@progbits,16
+	.align 16
+.Lbelowupper:
+	.quad	0x4040404040404040
+	.quad	0x4040404040404040
+.Ltopupper:
+	.quad	0x5b5b5b5b5b5b5b5b
+	.quad	0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+# ifdef PIC
+#  define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+#  define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+#  define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+#  define UCLOW_reg .Lbelowupper
+#  define UCHIGH_reg .Ltopupper
+#  define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	PUSH	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	PUSH	(%edi)
 #endif
 	mov	STR1(%esp), %edx
 	mov	STR2(%esp), %eax
-#ifdef USE_AS_STRNCMP
-	movl	CNT(%esp), %ebp
-	test	%ebp, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	movl	CNT(%esp), REM
+	test	REM, REM
 	je	L(eq)
 #endif
 	mov	%dx, %cx
@@ -72,10 +195,40 @@ ENTRY (STRCMP)
 	and	$0xfff, %ecx
 	cmp	$0xff0, %ecx
 	ja	L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+	movdqa	reg1, %xmm3;						      \
+	movdqa	UCHIGH_reg, %xmm4;					      \
+	movdqa	reg2, %xmm5;						      \
+	movdqa	UCHIGH_reg, %xmm6;					      \
+	pcmpgtb	UCLOW_reg, %xmm3;					      \
+	pcmpgtb	reg1, %xmm4;						      \
+	pcmpgtb	UCLOW_reg, %xmm5;					      \
+	pcmpgtb	reg2, %xmm6;						      \
+	pand	%xmm4, %xmm3;						      \
+	pand	%xmm6, %xmm5;						      \
+	pand	LCQWORD_reg, %xmm3;					      \
+	pand	LCQWORD_reg, %xmm5;					      \
+	por	%xmm3, reg1;						      \
+	por	%xmm5, reg2
+
+	movdqu	(%eax), %xmm1
+	TOLOWER (%xmm2, %xmm1)
+	movd	%xmm2, %ecx
+	movd	%xmm1, %edi
+	movdqa	%xmm2, %xmm3
+	movdqa	%xmm1, %xmm4
+	cmpl	%edi, %ecx
+#else
+# define TOLOWER(reg1, reg)
+
 	movd	%xmm2, %ecx
 	cmp	(%eax), %ecx
+#endif
 	jne	L(less4bytes)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	movdqu	(%eax), %xmm1
+#endif
 	pxor	%xmm2, %xmm1
 	pxor	%xmm0, %xmm0
 	ptest	%xmm1, %xmm0
@@ -84,113 +237,210 @@ ENTRY (STRCMP)
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
 
-#ifdef USE_AS_STRNCMP
-	sub	$16, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
 	jbe	L(eq)
 #endif
 	add	$16, %edx
 	add	$16, %eax
 L(first4bytes):
 	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, (%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$1, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
 	je	L(eq)
 #endif
 
 	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 1(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$2, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
 	je	L(eq)
 #endif
 	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 2(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$3, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
 	je	L(eq)
 #endif
 	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 3(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$4, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
 	je	L(eq)
 #endif
 	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 4(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$5, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
 	je	L(eq)
 #endif
 	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 5(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$6, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
 	je	L(eq)
 #endif
 	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 6(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$7, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
 	je	L(eq)
 #endif
 	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 7(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	sub	$8, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
 	je	L(eq)
 #endif
 	add	$8, %eax
 	add	$8, %edx
 
-	PUSH	(%ebx)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	PUSH	(%edi)
+#endif
 	PUSH	(%esi)
-#ifdef USE_AS_STRNCMP
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cfi_remember_state
 #endif
 	mov	%edx, %edi
 	mov	%eax, %esi
 	xorl	%eax, %eax
 L(check_offset):
-	movl	%edi, %ebx
+	movl	%edi, %edx
 	movl	%esi, %ecx
-	andl	$0xfff, %ebx
+	andl	$0xfff, %edx
 	andl	$0xfff, %ecx
-	cmpl	%ebx, %ecx
-	cmovl	%ebx, %ecx
+	cmpl	%edx, %ecx
+	cmovl	%edx, %ecx
 	lea	-0xff0(%ecx), %edx
 	sub	%edx, %edi
 	sub	%edx, %esi
@@ -199,11 +449,12 @@ L(check_offset):
 L(loop):
 	movdqu	(%esi,%edx), %xmm2
 	movdqu	(%edi,%edx), %xmm1
+	TOLOWER (%xmm2, %xmm1)
 	pcmpistri	$0x1a, %xmm2, %xmm1
 	jbe	L(end)
 
-#ifdef USE_AS_STRNCMP
-	sub	$16, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$16, REM
 	jbe	L(more16byteseq)
 #endif
 
@@ -211,13 +462,22 @@ L(loop):
 	jle	L(loop)
 L(crosspage):
 	movzbl	(%edi,%edx), %eax
-	movzbl	(%esi,%edx), %ebx
-	subl	%ebx, %eax
+	movzbl	(%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+	subl	%ecx, %eax
 	jne	L(ret)
-	testl	%ebx, %ebx
+	testl	%ecx, %ecx
 	je	L(ret)
-#ifdef USE_AS_STRNCMP
-	sub	$1, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$1, REM
 	jbe	L(more16byteseq)
 #endif
 	inc	%edx
@@ -230,30 +490,44 @@ L(crosspage):
 	.p2align 4
 L(end):
 	jnc	L(ret)
-#ifdef USE_AS_STRNCMP
-	sub	%ecx, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	%ecx, REM
 	jbe	L(more16byteseq)
 #endif
-	lea	(%ecx,%edx), %ebx
-	movzbl	(%edi,%ebx), %eax
-	movzbl	(%esi,%ebx), %ecx
+	lea	(%ecx,%edx), %ecx
+	movzbl	(%edi,%ecx), %eax
+	movzbl	(%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
 	subl	%ecx, %eax
 L(ret):
 	POP	(%esi)
 	POP	(%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	POP	(REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
 	POP	(%ebx)
-#ifdef USE_AS_STRNCMP
-	POP	(%ebp)
+# endif
 #endif
 	ret
 
 	.p2align 4
-#ifdef USE_AS_STRNCMP
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cfi_restore_state
 L(more16byteseq):
 	POP	(%esi)
+# ifdef USE_AS_STRNCMP
 	POP	(%edi)
-	POP	(%ebx)
+# endif
 #endif
 L(eq):
 	xorl	%eax, %eax
@@ -269,27 +543,45 @@ L(neq_bigger):
 L(less16bytes):
 	add	$0xfefefeff, %ecx
 	jnc	L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movd	%xmm3, %edi
+	xor	%edi, %ecx
+#else
 	xor	(%edx), %ecx
+#endif
 	or	$0xfefefeff, %ecx
 	add	$1, %ecx
 	jnz	L(less4bytes)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$4, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
 	jbe	L(eq)
 #endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	psrldq	$4, %xmm3
+	psrldq	$4, %xmm4
+	movd	%xmm3, %ecx
+	movd	%xmm4, %edi
+	cmp	%edi, %ecx
+	mov	%ecx, %edi
+#else
 	mov	4(%edx), %ecx
 	cmp	4(%eax), %ecx
+#endif
 	jne	L(more4bytes)
 	add	$0xfefefeff, %ecx
 	jnc	L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	xor	%edi, %ecx
+#else
 	xor	4(%edx), %ecx
+#endif
 	or	$0xfefefeff, %ecx
 	add	$1, %ecx
 	jnz	L(more4bytes)
 
-#ifdef USE_AS_STRNCMP
-	sub	$8, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	sub	$8, REM
 	jbe	L(eq)
 #endif
 
@@ -298,80 +590,176 @@ L(less16bytes):
 L(less4bytes):
 
 	movzbl	(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, (%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$1, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$1, REM
 	je	L(eq)
 #endif
 	movzbl	1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	1(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 1(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$2, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$2, REM
 	je	L(eq)
 #endif
 
 	movzbl	2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	2(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 2(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$3, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$3, REM
 	je	L(eq)
 #endif
 	movzbl	3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	3(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 3(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
 L(more4bytes):
-#ifdef USE_AS_STRNCMP
-	cmp	$4, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$4, REM
 	je	L(eq)
 #endif
 	movzbl	4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	4(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 4(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
 
-#ifdef USE_AS_STRNCMP
-	cmp	$5, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$5, REM
 	je	L(eq)
 #endif
 	movzbl	5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	5(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 5(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$6, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$6, REM
 	je	L(eq)
 #endif
 	movzbl	6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	6(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 6(%edx)
+#endif
 	jne	L(neq)
 	cmpl	$0, %ecx
 	je	L(eq)
 
-#ifdef USE_AS_STRNCMP
-	cmp	$7, %ebp
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+	cmp	$7, REM
 	je	L(eq)
 #endif
 	movzbl	7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+	movzbl	7(%edx), %edi
+# ifdef PIC
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+	movl	_nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+	cmpl	%ecx, %edi
+#else
 	cmpb	%cl, 7(%edx)
+#endif
 	jne	L(neq)
 	jmp	L(eq)
 
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
index f8a2c7de83..137596d6cf 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -123,7 +123,7 @@ ENTRY (__strcasecmp_ssse3)
 # endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
 	jne	__strcasecmp_nonascii
-	jmp	.Lascii
+	jmp	L(ascii)
 END (__strcasecmp_ssse3)
 #endif
 
@@ -145,7 +145,7 @@ ENTRY (__strncasecmp_ssse3)
 # endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
 	jne	__strncasecmp_nonascii
-	jmp	.Lascii
+	jmp	L(ascii)
 END (__strncasecmp_ssse3)
 #endif
 
@@ -165,7 +165,7 @@ ENTRY (STRCMP)
 	call	__i686.get_pc_thunk.bx
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 # endif
-.Lascii:
+L(ascii):
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper:
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
index 1838ea3ff5..28e2d6154c 100644
--- a/sysdeps/i386/i686/multiarch/strcmp.S
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -76,12 +76,9 @@ ENTRY(STRCMP)
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
-#if 0
-	// XXX Temporarily
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
-#endif
 2:	popl	%ebx
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (ebx)
@@ -98,12 +95,9 @@ ENTRY(STRCMP)
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
 	jz	2f
 	leal	__STRCMP_SSSE3, %eax
-#if 0
-	// XXX Temporarily
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
 	jz	2f
 	leal	__STRCMP_SSE4_2, %eax
-#endif
 2:	ret
 END(STRCMP)
 # endif
diff --git a/sysdeps/i386/i686/multiarch/strncase.S b/sysdeps/i386/i686/multiarch/strncase.S
index 9b697d1bbc..d20532f993 100644
--- a/sysdeps/i386/i686/multiarch/strncase.S
+++ b/sysdeps/i386/i686/multiarch/strncase.S
@@ -36,12 +36,9 @@ ENTRY(__strncasecmp)
 	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__strncasecmp_ssse3@GOTOFF(%ebx), %eax
-#if 0
-	// XXX Temporarily
 	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
 	jz	2f
 	leal	__strncasecmp_sse4_2@GOTOFF(%ebx), %eax
-#endif
 2:	popl	%ebx
 	cfi_adjust_cfa_offset (-4)
 	cfi_restore (ebx)
diff --git a/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"