author     Andreas Schwab <schwab@redhat.com>  2010-02-22 16:34:12 +0100
committer  Andreas Schwab <schwab@redhat.com>  2010-02-22 16:41:22 +0100
commit     2b6e16c5e2fa8ad12b02a218ba32d63a4dffda18 (patch)
tree       b931e143f212d5b656497bb0c3f3e3a413fc78e1 /sysdeps
parent     0c3f133ed106996172dd2c106b22e38ce695e63d (diff)
parent     199428c19774c12b3c4b6e6486ea9d4a021288af (diff)
Merge remote branch 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/i386/fpu/fegetenv.c | 1
-rw-r--r--  sysdeps/i386/i686/multiarch/Makefile | 4
-rw-r--r--  sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1004
-rw-r--r--  sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 1966
-rw-r--r--  sysdeps/i386/i686/multiarch/memcmp.S | 88
-rw-r--r--  sysdeps/i386/i686/multiarch/strcmp-sse4.S | 378
-rw-r--r--  sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2220
-rw-r--r--  sysdeps/i386/i686/multiarch/strcmp.S | 115
-rw-r--r--  sysdeps/i386/i686/multiarch/strncmp-c.c | 8
-rw-r--r--  sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5
-rw-r--r--  sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5
-rw-r--r--  sysdeps/i386/i686/multiarch/strncmp.S | 3
-rw-r--r--  sysdeps/i386/lshift.S | 4
-rw-r--r--  sysdeps/i386/rshift.S | 8
-rw-r--r--  sysdeps/ia64/fpu/fegetenv.c | 1
-rw-r--r--  sysdeps/powerpc/fpu/fegetenv.c | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/configure | 9
-rw-r--r--  sysdeps/powerpc/powerpc32/configure.in | 4
-rw-r--r--  sysdeps/powerpc/powerpc32/dl-machine.h | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/dl-start.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc32/elf/start.S | 10
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_ceil.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_ceilf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_floor.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_floorf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_lround.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_rint.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_rintf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_round.S | 6
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_roundf.S | 6
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_trunc.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/s_truncf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/fpu/setjmp-common.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc32/memset.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S | 6
-rw-r--r--  sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S | 89
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S | 88
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S | 92
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S | 68
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S | 71
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S | 69
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S | 1
-rw-r--r--  sysdeps/s390/fpu/fegetenv.c | 5
-rw-r--r--  sysdeps/s390/s390-64/utf16-utf32-z9.c | 11
-rw-r--r--  sysdeps/s390/s390-64/utf8-utf16-z9.c | 9
-rw-r--r--  sysdeps/sh/sh4/fpu/fegetenv.c | 1
-rw-r--r--  sysdeps/sparc/fpu/fegetenv.c | 1
-rw-r--r--  sysdeps/sparc/sparc32/dl-irel.h | 55
-rw-r--r--  sysdeps/sparc/sparc32/dl-machine.h | 81
-rw-r--r--  sysdeps/sparc/sparc32/dl-plt.h | 62
-rw-r--r--  sysdeps/sparc/sparc64/dl-irel.h | 58
-rw-r--r--  sysdeps/sparc/sparc64/dl-machine.h | 172
-rw-r--r--  sysdeps/sparc/sparc64/dl-plt.h | 144
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S | 7
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S | 7
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S | 5
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S | 5
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S | 5
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies | 1
-rw-r--r--  sysdeps/x86_64/fpu/fegetenv.c | 1
75 files changed, 6701 insertions, 389 deletions
diff --git a/sysdeps/i386/fpu/fegetenv.c b/sysdeps/i386/fpu/fegetenv.c
index fb955cf565..ddb67e5d83 100644
--- a/sysdeps/i386/fpu/fegetenv.c
+++ b/sysdeps/i386/fpu/fegetenv.c
@@ -40,4 +40,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index fbad9ae734..e8847d6fc4 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -7,7 +7,9 @@ ifeq ($(subdir),string)
 sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
 		   memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
-		   memset-sse2-rep bzero-sse2-rep
+		   memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+		   strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+		   memcmp-ssse3 memcmp-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..b1ed778f1f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1004 @@
+/* memcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_sse4_2
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		4
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
+#define RETURN		POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#ifdef SHARED
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */				\
+    call	__i686.get_pc_thunk.bx;				\
+    /* Get the address of the jump table.  */			\
+    addl	$(TABLE - .), %ebx;				\
+    /* Get the entry and convert the relative offset to the	\
+       absolute address.  */					\
+    addl	(%ebx,INDEX,SCALE), %ebx;			\
+    /* We loaded the jump table and adjusted EDX/ESI. Go.  */	\
+    jmp		*%ebx
+
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	ALIGN (4)
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+#else
+# define JMPTBL(I, B)	I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    jmp		*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+	PUSH (%ebx)
+	jb	L(less8bytes)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+L(nonzero):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(0bytes):
+	POP (%ebx)
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+
+	ALIGN (4)
+L(64bytesormore):
+	PUSH (%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+	ALIGN (4)
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(16bytes)
+
+	ALIGN (4)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	ALIGN (4)
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+	ALIGN (4)
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+END (MEMCMP)
+
+	.section .rodata.sse4.2,"a",@progbits
+	ALIGN (2)
+	.type	L(table_64bytes), @object
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.size	L(table_64bytes), .-L(table_64bytes)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..d2f852f726
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1966 @@
+/* memcmp with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP		__memcmp_ssse3
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		4
+#define BLK1		PARMS
+#define BLK2		BLK1+4
+#define LEN		BLK2+4
+#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	LEN(%esp), %ecx
+	movl	BLK1(%esp), %eax
+	cmp	$48, %ecx
+	movl	BLK2(%esp), %edx
+	jae	L(48bytesormore)
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+	PUSH (%ebx)
+	add	%ecx, %edx
+	add	%ecx, %eax
+	jmp	L(less48bytes)
+
+	ALIGN (4)
+	CFI_POP (%ebx)
+L(less1bytes):
+	jb	L(zero)
+	movb	(%eax), %cl
+	cmp	(%edx), %cl
+	je	L(zero)
+	mov	$1, %eax
+	ja	L(1bytesend)
+	neg	%eax
+L(1bytesend):
+	ret
+
+	ALIGN (4)
+L(zero):
+	mov	$0, %eax
+	ret
+
+	ALIGN (4)
+L(48bytesormore):
+	PUSH (%ebx)
+	PUSH (%esi)
+	PUSH (%edi)
+	cfi_remember_state
+	movdqu    (%eax), %xmm3
+	movdqu    (%edx), %xmm0
+	movl	%eax, %edi
+	movl	%edx, %esi
+	pcmpeqb   %xmm0, %xmm3
+	pmovmskb  %xmm3, %edx
+	lea	16(%edi), %edi
+
+	sub      $0xffff, %edx
+	lea	16(%esi), %esi
+	jnz	  L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%edx, %edi
+	sub	%edx, %esi
+	add	%edx, %ecx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%edx, %esi
+
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN (4)
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+
+	ALIGN (4)
+L(shr_0):
+	cmp	$80, %ecx
+	jae	L(shr_0_gobble)
+	lea	-48(%ecx), %ecx
+	xor	%eax, %eax
+	movaps	(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+	movaps	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	add	$32, %edi
+	add	$32, %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_0_gobble):
+	lea	-48(%ecx), %ecx
+	movdqa	(%esi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%edi), %xmm0
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm2
+	pcmpeqb	16(%edi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %ecx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%esi), %xmm0
+	movdqa	48(%esi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%edi), %xmm0
+	pcmpeqb	48(%edi), %xmm2
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %ecx
+	jge	L(shr_0_gobble_loop_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_0_gobble_loop_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_1):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_1_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$1,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$1,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$1,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$1,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_1_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	1(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_2):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_2_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$2,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$2,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$2,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$2,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_2_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	2(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_3):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_3_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$3,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$3,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$3,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$3,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_3_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	3(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_4):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_4_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$4,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$4,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$4,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$4,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_4_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	4(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_5):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_5_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$5,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$5,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$5,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$5,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_5_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	5(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_6):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_6_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$6,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$6,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$6,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$6,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_6_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_6_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	6(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_7):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_7_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$7,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$7,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$7,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$7,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_7_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_7_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	7(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_8):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_8_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$8,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$8,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$8,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$8,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_8_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_8_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	8(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_9):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9,(%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_9_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$9,(%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$9,16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$9,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$9,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_9_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_9_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	9(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_10):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10,%xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_10_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$10, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$10, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$10,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$10,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_10_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	10(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_11):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_11_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$11, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$11, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$11,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$11,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_11_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	11(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_12):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_12_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$12, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$12, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$12,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$12,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_12_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	12(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_13):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_13_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$13, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$13, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$13,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$13,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_13_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	13(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_14):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_14_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$14, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$14, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$14,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$14,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_14_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	14(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_15):
+	cmp	$80, %ecx
+	lea	-48(%ecx), %ecx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%esi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%esi), %xmm1
+	pcmpeqb	(%edi), %xmm1
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(shr_15_gobble):
+	sub	$32, %ecx
+	movdqa	16(%esi), %xmm0
+	palignr	$15, (%esi), %xmm0
+	pcmpeqb	(%edi), %xmm0
+
+	movdqa	32(%esi), %xmm3
+	palignr	$15, 16(%esi), %xmm3
+	pcmpeqb	16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %ecx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%esi), %xmm3
+	palignr	$15,48(%esi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%esi), %xmm0
+	palignr	$15,32(%esi), %xmm0
+	pcmpeqb	32(%edi), %xmm0
+	lea	32(%esi), %esi
+	pcmpeqb	48(%edi), %xmm3
+
+	lea	32(%edi), %edi
+	jz	L(shr_15_gobble_loop)
+
+	cmp	$0, %ecx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %ecx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%edi), %edi
+	lea	32(%esi), %esi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	(%ecx, %edi,1), %eax
+	lea	15(%ecx, %esi,1), %edx
+	POP (%edi)
+	POP (%esi)
+	jmp	L(less48bytes)
+
+	cfi_restore_state
+	cfi_remember_state
+	ALIGN (4)
+L(exit):
+	pmovmskb %xmm1, %ebx
+	sub	$0xffff, %ebx
+	jz	L(first16bytes)
+	lea	-16(%esi), %esi
+	lea	-16(%edi), %edi
+	mov	%ebx, %edx
+L(first16bytes):
+	add	%eax, %esi
+L(less16bytes):
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+L(Byte23):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte16):
+	movzbl	 -16(%edi), %eax
+	movzbl	 -16(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte17):
+	movzbl	 -15(%edi), %eax
+	movzbl	 -15(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte18):
+	movzbl	 -14(%edi), %eax
+	movzbl	 -14(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte19):
+	movzbl	 -13(%edi), %eax
+	movzbl	 -13(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte20):
+	movzbl	 -12(%edi), %eax
+	movzbl	 -12(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte21):
+	movzbl	 -11(%edi), %eax
+	movzbl	 -11(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(Byte22):
+	movzbl	 -10(%edi), %eax
+	movzbl	 -10(%esi), %edx
+	sub	%edx, %eax
+	RETURN
+
+	ALIGN (4)
+L(next_24_bytes):
+	lea	8(%edi), %edi
+	lea	8(%esi), %esi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	ALIGN (4)
+L(Byte31):
+	movzbl	 -9(%edi), %eax
+	movzbl	 -9(%esi), %edx
+	sub	%edx, %eax
+	RETURN_END
+
+	CFI_PUSH (%ebx)
+	ALIGN (4)
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+
+	ALIGN (4)
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+
+	ALIGN (4)
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+
+	ALIGN (4)
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+
+	ALIGN (4)
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	ALIGN (4)
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+
+	ALIGN (4)
+L(44bytes):
+	mov	-44(%eax), %ecx
+	mov	-44(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	mov	-40(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	mov	-36(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	mov	-32(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	mov	-28(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	mov	-24(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	mov	-20(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(45bytes):
+	mov	-45(%eax), %ecx
+	mov	-45(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(41bytes):
+	mov	-41(%eax), %ecx
+	mov	-41(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(37bytes):
+	mov	-37(%eax), %ecx
+	mov	-37(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(33bytes):
+	mov	-33(%eax), %ecx
+	mov	-33(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(29bytes):
+	mov	-29(%eax), %ecx
+	mov	-29(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(25bytes):
+	mov	-25(%eax), %ecx
+	mov	-25(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(21bytes):
+	mov	-21(%eax), %ecx
+	mov	-21(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(46bytes):
+	mov	-46(%eax), %ecx
+	mov	-46(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(42bytes):
+	mov	-42(%eax), %ecx
+	mov	-42(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(38bytes):
+	mov	-38(%eax), %ecx
+	mov	-38(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(34bytes):
+	mov	-34(%eax), %ecx
+	mov	-34(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(30bytes):
+	mov	-30(%eax), %ecx
+	mov	-30(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(26bytes):
+	mov	-26(%eax), %ecx
+	mov	-26(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(22bytes):
+	mov	-22(%eax), %ecx
+	mov	-22(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(47bytes):
+	movl	-47(%eax), %ecx
+	movl	-47(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%eax), %ecx
+	movl	-43(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%eax), %ecx
+	movl	-39(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%eax), %ecx
+	movl	-35(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%eax), %ecx
+	movl	-31(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%eax), %ecx
+	movl	-27(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%eax), %ecx
+	movl	-23(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+
+	ALIGN (4)
+L(find_diff):
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP (%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+
+END (MEMCMP)
+
+#endif
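
The L(find_diff)/L(end) tail above takes two unequal 32-bit words loaded
little-endian from the two buffers and decides the sign of the result from
the first differing byte in memory order.  A minimal C sketch of the
equivalent logic, assuming little-endian loads (illustrative only):

#include <stdint.h>

/* a corresponds to %ecx (loaded from the first buffer), b to %ebx
   (loaded from the second buffer).  Returns +1 or -1, matching the
   ja/neg sequence at L(end).  Sketch only.  */
static int
find_diff (uint32_t a, uint32_t b)
{
  for (int i = 0; i < 4; i++)
    {
      uint8_t ab = (uint8_t) (a >> (8 * i));	/* byte i in memory order */
      uint8_t bb = (uint8_t) (b >> (8 * i));
      if (ab != bb)
	return ab > bb ? 1 : -1;
    }
  return 0;	/* not reached when a != b */
}
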
diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..cf606a5959
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,88 @@
+/* Multiple versions of memcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__memcmp_sse4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(memcmp)
+# else
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__memcmp_ia32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__memcmp_ssse3, %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__memcmp_sse4_2, %eax
+2:	ret
+END(memcmp)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they will be called without setting up EBX, which is needed for the
+   PLT used by IFUNC.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
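
The IFUNC resolver above returns __memcmp_sse4_2 when both SSSE3 and SSE4.2
are reported, __memcmp_ssse3 when only SSSE3 is, and __memcmp_ia32 otherwise.
A minimal C sketch of that selection order, assuming two boolean feature
flags in place of the __cpu_features bit tests (the predicates are
illustrative, not a real glibc interface):

#include <stddef.h>

extern int __memcmp_ia32 (const void *, const void *, size_t);
extern int __memcmp_ssse3 (const void *, const void *, size_t);
extern int __memcmp_sse4_2 (const void *, const void *, size_t);

typedef int (*memcmp_fn) (const void *, const void *, size_t);

/* Mirrors the resolver's ladder; feature detection (__init_cpu_features)
   is assumed to have run already.  Sketch only.  */
static memcmp_fn
select_memcmp (int has_ssse3, int has_sse4_2)
{
  if (!has_ssse3)
    return __memcmp_ia32;	/* baseline i686 implementation */
  if (!has_sse4_2)
    return __memcmp_ssse3;
  return __memcmp_sse4_2;	/* only reached when SSSE3 is also set */
}
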
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..d5fd23e15c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,378 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp)
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	test	%ebp, %ebp
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+	jne	L(less4bytes)
+	movdqu	(%eax), %xmm1
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+	cfi_remember_state
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
+L(check_offset):
+	movl	%edi, %ebx
+	movl	%esi, %ecx
+	andl	$0xfff, %ebx
+	andl	$0xfff, %ecx
+	cmpl	%ebx, %ecx
+	cmovl	%ebx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ebx
+	subl	%ebx, %eax
+	jne	L(ret)
+	testl	%ebx, %ebx
+	je	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	$1, %ebp
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	$16, %edi
+	add	$16, %esi
+	jmp	L(check_offset)
+
+	.p2align 4
+L(end):
+	jnc	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	%ecx, %ebp
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ebx
+	movzbl	(%edi,%ebx), %eax
+	movzbl	(%esi,%ebx), %ecx
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+	.p2align 4
+	cfi_restore_state
+#ifdef USE_AS_STRNCMP
+L(more16byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+L(eq):
+	xorl	%eax, %eax
+	RETURN
+
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+	RETURN
+
+L(less16bytes):
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+	xor	(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+	xor	4(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	jmp	L(eq)
+
+END (STRCMP)
+
+#endif
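
The entry sequence above ("and $0xfff ... cmp $0xff0 ... ja L(first4bytes)")
falls back to byte-wise comparison whenever a 16-byte unaligned load from
either string could run past a 4 KiB page boundary.  A small C sketch of the
test, assuming 4 KiB pages (illustrative only):

#include <stdint.h>

/* True when bytes p..p+15 may straddle a page, i.e. when the offset of
   p within its page is greater than 0xff0.  Sketch only.  */
static int
load16_may_cross_page (const void *p)
{
  return ((uintptr_t) p & 0xfff) > 0xff0;
}

When this test is true for either pointer, the code compares bytes
individually until wide loads are safe again, which is what the
L(first4bytes) and L(crosspage) paths above do.
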
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..40994c05b1
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2220 @@
+/* strcmp with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_ssse3
+# endif
+# define STR1		4
+# define STR2		STR1+4
+# define RETURN		ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_ssse3
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+# define RETURN		POP (%ebp); ret; .p2align 4; CFI_PUSH (%ebp)
+# define UPDATE_STRNCMP_COUNTER				\
+	/* calculate the number of bytes left to compare */	\
+	mov	$16, %esi;				\
+	sub	%ecx, %esi;				\
+	cmp	%esi, %ebp;				\
+	jbe	L(more8byteseq);			\
+	sub	%esi, %ebp
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	movl	STR1(%esp), %edx
+	movl	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	cmp	$16, %ebp
+	jb	L(less16bytes_sncmp)
+	jmp	L(more16bytes)
+#endif
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+	add	$8, %edx
+	add	$8, %eax
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	je	L(eq)
+L(more16bytes):
+#endif
+	movl	%edx, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(crosspage)
+	pxor	%xmm0, %xmm0
+	movlpd	(%eax), %xmm1
+	movlpd	(%edx), %xmm2
+	movhpd	8(%eax), %xmm1
+	movhpd	8(%edx), %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %ecx
+	sub	$0xffff, %ecx
+	jnz	L(less16bytes)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %eax
+	add	$16, %edx
+
+L(crosspage):
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+#ifdef USE_AS_STRNCMP
+	cfi_remember_state
+#endif
+
+	movl	%edx, %edi
+	movl	%eax, %ecx
+	and	$0xf, %ecx
+	and	$0xf, %edi
+	xor	%ecx, %eax
+	xor	%edi, %edx
+	xor	%ebx, %ebx
+	cmp	%edi, %ecx
+	je	L(ashr_0)
+	ja	L(bigger)
+	or	$0x20, %ebx
+	xchg	%edx, %eax
+	xchg	%ecx, %edi
+L(bigger):
+	lea	15(%edi), %edi
+	sub	%ecx, %edi
+	cmp	$8, %edi
+	jle	L(ashr_less_8)
+	cmp	$14, %edi
+	je	L(ashr_15)
+	cmp	$13, %edi
+	je	L(ashr_14)
+	cmp	$12, %edi
+	je	L(ashr_13)
+	cmp	$11, %edi
+	je	L(ashr_12)
+	cmp	$10, %edi
+	je	L(ashr_11)
+	cmp	$9, %edi
+	je	L(ashr_10)
+L(ashr_less_8):
+	je	L(ashr_9)
+	cmp	$7, %edi
+	je	L(ashr_8)
+	cmp	$6, %edi
+	je	L(ashr_7)
+	cmp	$5, %edi
+	je	L(ashr_6)
+	cmp	$4, %edi
+	je	L(ashr_5)
+	cmp	$3, %edi
+	je	L(ashr_4)
+	cmp	$2, %edi
+	je	L(ashr_3)
+	cmp	$1, %edi
+	je	L(ashr_2)
+	cmp	$0, %edi
+	je	L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0
+ *  ecx(offset of esi)  eax(offset of edi)  relative offset  corresponding case
+ *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
+ */
+	.p2align 4
+L(ashr_0):
+	mov	$0xffff, %esi
+	movdqa	(%eax), %xmm1
+	pxor	%xmm0, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	(%edx), %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	mov	%ecx, %edi
+	jne	L(less32bytes)
+	UPDATE_STRNCMP_COUNTER
+	mov	$0x10, %ebx
+	mov	$0x10, %ecx
+	pxor	%xmm0, %xmm0
+	.p2align 4
+L(loop_ashr_0):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	jmp	L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
+ */
+	.p2align 4
+L(ashr_1):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$15, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-15(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$1, %ebx
+	lea	1(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_1):
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_1)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$1, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_1)
+
+	.p2align 4
+L(nibble_ashr_1):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffe, %esi
+	jnz	L(ashr_1_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$15, %ebp
+	jbe	L(ashr_1_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_1)
+
+	.p2align 4
+L(ashr_1_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$1, %xmm0
+	psrldq	$1, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(14~15)            n -14            1(15 +(n-14) - n)         ashr_2
+ */
+	.p2align 4
+L(ashr_2):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-14(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$2, %ebx
+	lea	2(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_2):
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_2)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$2, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_2)
+
+	.p2align 4
+L(nibble_ashr_2):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfffc, %esi
+	jnz	L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$14, %ebp
+	jbe	L(ashr_2_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_2)
+
+	.p2align 4
+L(ashr_2_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$2, %xmm0
+	psrldq	$2, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(13~15)            n -13            2(15 +(n-13) - n)         ashr_3
+ */
+	.p2align 4
+L(ashr_3):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-13(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$3, %ebx
+	lea	3(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_3):
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_3)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$3, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_3)
+
+	.p2align 4
+L(nibble_ashr_3):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff8, %esi
+	jnz	L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$13, %ebp
+	jbe	L(ashr_3_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_3)
+
+	.p2align 4
+L(ashr_3_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$3, %xmm0
+	psrldq	$3, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(12~15)            n -12            3(15 +(n-12) - n)         ashr_4
+ */
+	.p2align 4
+L(ashr_4):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-12(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$4, %ebx
+	lea	4(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_4):
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_4)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$4, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_4)
+
+	.p2align 4
+L(nibble_ashr_4):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfff0, %esi
+	jnz	L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$12, %ebp
+	jbe	L(ashr_4_exittail)
+#endif
+
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_4)
+
+	.p2align 4
+L(ashr_4_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$4, %xmm0
+	psrldq	$4, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(11~15)            n -11            4(15 +(n-11) - n)         ashr_5
+ */
+	.p2align 4
+L(ashr_5):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-11(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$5, %ebx
+	lea	5(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_5):
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_5)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$5, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_5)
+
+	.p2align 4
+L(nibble_ashr_5):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffe0, %esi
+	jnz	L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$11, %ebp
+	jbe	L(ashr_5_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_5)
+
+	.p2align 4
+L(ashr_5_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$5, %xmm0
+	psrldq	$5, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(10~15)            n -10            5(15 +(n-10) - n)         ashr_6
+ */
+
+	.p2align 4
+L(ashr_6):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-10(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$6, %ebx
+	lea	6(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_6):
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_6)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$6, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_6)
+
+	.p2align 4
+L(nibble_ashr_6):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xffc0, %esi
+	jnz	L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$10, %ebp
+	jbe	L(ashr_6_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_6)
+
+	.p2align 4
+L(ashr_6_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$6, %xmm0
+	psrldq	$6, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(9~15)            n - 9            6(15 +(n-9) - n)         ashr_7
+ */
+
+	.p2align 4
+L(ashr_7):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-9(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$7, %ebx
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_7):
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_7)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$7, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_7)
+
+	.p2align 4
+L(nibble_ashr_7):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff80, %esi
+	jnz	L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$9, %ebp
+	jbe	L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_7)
+
+	.p2align 4
+L(ashr_7_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$7, %xmm0
+	psrldq	$7, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(8~15)            n - 8            7(15 +(n-8) - n)         ashr_8
+ */
+	.p2align 4
+L(ashr_8):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-8(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$8, %ebx
+	lea	8(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_8):
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_8)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$8, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_8)
+
+	.p2align 4
+L(nibble_ashr_8):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xff00, %esi
+	jnz	L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	jbe	L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_8)
+
+	.p2align 4
+L(ashr_8_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$8, %xmm0
+	psrldq	$8, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(7~15)            n - 7            8(15 +(n-7) - n)         ashr_9
+ */
+	.p2align 4
+L(ashr_9):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-7(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$9, %ebx
+	lea	9(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_9):
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_9)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$9, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_9)
+
+	.p2align 4
+L(nibble_ashr_9):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfe00, %esi
+	jnz	L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(ashr_9_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_9)
+
+	.p2align 4
+L(ashr_9_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$9, %xmm0
+	psrldq	$9, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(6~15)            n - 6            9(15 +(n-6) - n)         ashr_10
+ */
+	.p2align 4
+L(ashr_10):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-6(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$10, %ebx
+	lea	10(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_10):
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_10)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$10, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_10)
+
+	.p2align 4
+L(nibble_ashr_10):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xfc00, %esi
+	jnz	L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(ashr_10_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_10)
+
+	.p2align 4
+L(ashr_10_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$10, %xmm0
+	psrldq	$10, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(5~15)            n - 5            10(15 +(n-5) - n)         ashr_11
+ */
+	.p2align 4
+L(ashr_11):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-5(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$11, %ebx
+	lea	11(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_11):
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_11)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$11, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_11)
+
+	.p2align 4
+L(nibble_ashr_11):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf800, %esi
+	jnz	L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(ashr_11_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_11)
+
+	.p2align 4
+L(ashr_11_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$11, %xmm0
+	psrldq	$11, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(4~15)            n - 4            11(15 +(n-4) - n)         ashr_12
+ */
+	.p2align 4
+L(ashr_12):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-4(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$12, %ebx
+	lea	12(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_12):
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_12)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$12, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_12)
+
+	.p2align 4
+L(nibble_ashr_12):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xf000, %esi
+	jnz	L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(ashr_12_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_12)
+
+	.p2align 4
+L(ashr_12_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$12, %xmm0
+	psrldq	$12, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(3~15)            n - 3            12(15 +(n-3) - n)         ashr_13
+ */
+	.p2align 4
+L(ashr_13):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-3(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$13, %ebx
+	lea	13(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_13):
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_13)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$13, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_13)
+
+	.p2align 4
+L(nibble_ashr_13):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xe000, %esi
+	jnz	L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(ashr_13_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_13)
+
+	.p2align 4
+L(ashr_13_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$13, %xmm0
+	psrldq	$13, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(2~15)            n - 2            13(15 +(n-2) - n)         ashr_14
+ */
+	.p2align 4
+L(ashr_14):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$2, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-2(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$14, %ebx
+	lea	14(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_14):
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_14)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$14, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_14)
+
+	.p2align 4
+L(nibble_ashr_14):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0xc000, %esi
+	jnz	L(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(ashr_14_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_14)
+
+	.p2align 4
+L(ashr_14_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$14, %xmm0
+	psrldq	$14, %xmm3
+	jmp	L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi)  eax(offset of edi)   relative offset	corresponding case
+ *        n(1~15)            n - 1            14(15 +(n-1) - n)         ashr_15
+ */
+
+	.p2align 4
+L(ashr_15):
+	mov	$0xffff, %esi
+	pxor	%xmm0, %xmm0
+	movdqa	(%edx), %xmm2
+	movdqa	(%eax), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %edi
+	shr	%cl, %esi
+	shr	%cl, %edi
+	sub	%edi, %esi
+	lea	-1(%ecx), %edi
+	jnz	L(less32bytes)
+
+	UPDATE_STRNCMP_COUNTER
+
+	movdqa	(%edx), %xmm3
+	pxor	%xmm0, %xmm0
+	mov	$16, %ecx
+	or	$15, %ebx
+	lea	15(%edx), %edi
+	and	$0xfff, %edi
+	sub	$0x1000, %edi
+
+	.p2align 4
+L(loop_ashr_15):
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %edi
+	jg	L(nibble_ashr_15)
+
+	movdqa	(%eax, %ecx), %xmm1
+	movdqa	(%edx, %ecx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	palignr	$15, %xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	sub	$0xffff, %esi
+	jnz	L(exit)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$16, %ebp
+	lea	-16(%ebp), %ebp
+	jbe	L(more8byteseq)
+#endif
+	add	$16, %ecx
+	movdqa	%xmm4, %xmm3
+	jmp	L(loop_ashr_15)
+
+	.p2align 4
+L(nibble_ashr_15):
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %esi
+	test	$0x8000, %esi
+	jnz	L(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(ashr_15_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %edi
+	jmp	L(gobble_ashr_15)
+
+	.p2align 4
+L(ashr_15_exittail):
+	movdqa	(%eax, %ecx), %xmm1
+	psrldq	$15, %xmm0
+	psrldq	$15, %xmm3
+	jmp	L(aftertail)
+
+	.p2align 4
+L(aftertail):
+	pcmpeqb	%xmm3, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	not	%esi
+L(exit):
+	mov	%ebx, %edi
+	and	$0x1f, %edi
+	lea	-16(%edi, %ecx), %edi
+L(less32bytes):
+	add	%edi, %edx
+	add	%ecx, %eax
+	test	$0x20, %ebx
+	jz	L(ret2)
+	xchg	%eax, %edx
+
+	.p2align 4
+L(ret2):
+	mov	%esi, %ecx
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+L(less16bytes):
+	test	%cl, %cl
+	jz	L(2next_8_bytes)
+
+	test	$0x01, %cl
+	jnz	L(Byte0)
+
+	test	$0x02, %cl
+	jnz	L(Byte1)
+
+	test	$0x04, %cl
+	jnz	L(Byte2)
+
+	test	$0x08, %cl
+	jnz	L(Byte3)
+
+	test	$0x10, %cl
+	jnz	L(Byte4)
+
+	test	$0x20, %cl
+	jnz	L(Byte5)
+
+	test	$0x40, %cl
+	jnz	L(Byte6)
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(eq)
+#endif
+
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte0):
+#ifdef USE_AS_STRNCMP
+	cmp	$0, %ebp
+	jbe	L(eq)
+#endif
+	movzx	(%eax), %ecx
+	movzx	(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte1):
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	jbe	L(eq)
+#endif
+	movzx	1(%eax), %ecx
+	movzx	1(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte2):
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	jbe	L(eq)
+#endif
+	movzx	2(%eax), %ecx
+	movzx	2(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte3):
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	jbe	L(eq)
+#endif
+	movzx	3(%eax), %ecx
+	movzx	3(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte4):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	movzx	4(%eax), %ecx
+	movzx	4(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte5):
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	jbe	L(eq)
+#endif
+	movzx	5(%eax), %ecx
+	movzx	5(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(Byte6):
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	jbe	L(eq)
+#endif
+	movzx	6(%eax), %ecx
+	movzx	6(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(2next_8_bytes):
+	add	$8, %eax
+	add	$8, %edx
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %ebp
+	lea	-8(%ebp), %ebp
+	jbe	L(eq)
+#endif
+
+	test	$0x01, %ch
+	jnz	L(Byte0)
+
+	test	$0x02, %ch
+	jnz	L(Byte1)
+
+	test	$0x04, %ch
+	jnz	L(Byte2)
+
+	test	$0x08, %ch
+	jnz	L(Byte3)
+
+	test	$0x10, %ch
+	jnz	L(Byte4)
+
+	test	$0x20, %ch
+	jnz	L(Byte5)
+
+	test	$0x40, %ch
+	jnz	L(Byte6)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	jbe	L(eq)
+#endif
+	movzx	7(%eax), %ecx
+	movzx	7(%edx), %eax
+
+	sub	%ecx, %eax
+	RETURN
+
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+#ifdef USE_AS_STRNCMP
+	.p2align 4
+	cfi_restore_state
+L(more8byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+
+L(eq):
+
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	xorl	%eax, %eax
+	ret
+
+#ifdef USE_AS_STRNCMP
+	.p2align 4
+	CFI_PUSH (%ebp)
+L(less16bytes_sncmp):
+	test	%ebp, %ebp
+	jz	L(eq)
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$1, %ebp
+	je	L(eq)
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$2, %ebp
+	je	L(eq)
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$3, %ebp
+	je	L(eq)
+
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$4, %ebp
+	je	L(eq)
+
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$5, %ebp
+	je	L(eq)
+
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$6, %ebp
+	je	L(eq)
+
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$7, %ebp
+	je	L(eq)
+
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$8, %ebp
+	je	L(eq)
+
+	movzbl	8(%eax), %ecx
+	cmpb	%cl, 8(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$9, %ebp
+	je	L(eq)
+
+	movzbl	9(%eax), %ecx
+	cmpb	%cl, 9(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$10, %ebp
+	je	L(eq)
+
+	movzbl	10(%eax), %ecx
+	cmpb	%cl, 10(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$11, %ebp
+	je	L(eq)
+
+	movzbl	11(%eax), %ecx
+	cmpb	%cl, 11(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+
+	cmp	$12, %ebp
+	je	L(eq)
+
+	movzbl	12(%eax), %ecx
+	cmpb	%cl, 12(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$13, %ebp
+	je	L(eq)
+
+	movzbl	13(%eax), %ecx
+	cmpb	%cl, 13(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$14, %ebp
+	je	L(eq)
+
+	movzbl	14(%eax), %ecx
+	cmpb	%cl, 14(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	cmp	$15, %ebp
+	je	L(eq)
+
+	movzbl	15(%eax), %ecx
+	cmpb	%cl, 15(%edx)
+	jne	L(neq)
+	test	%cl, %cl
+	je	L(eq)
+
+	POP	(%ebp)
+	xor	%eax, %eax
+	ret
+#endif
+
+END (STRCMP)
+
+#endif
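
After both pointers are rounded down to 16-byte alignment, the code above
derives one of the L(ashr_0)..L(ashr_15) cases from the difference between
the two in-vector offsets: equal offsets go to L(ashr_0), and a difference d
of 1..15 goes to L(ashr_N) with N = 16 - d, which is also the immediate used
by palignr in that case.  A hedged C sketch of the mapping (argument names
are illustrative; bit 0x20 of %ebx separately records whether the operands
were swapped so the result sign can be fixed up later):

/* off_a and off_b are the low four address bits of the two strings.
   Returns the ashr case index; sketch only.  */
static int
ashr_case (unsigned off_a, unsigned off_b)
{
  unsigned lo = off_a < off_b ? off_a : off_b;
  unsigned hi = off_a < off_b ? off_b : off_a;
  if (hi == lo)
    return 0;			/* L(ashr_0): no realignment needed */
  return 16 - (hi - lo);	/* L(ashr_N) realigns with palignr $N */
}
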
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..7136d47e85
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,115 @@
+/* Multiple versions of strcmp
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCMP
+# define STRCMP			strcmp
+# define __GI_STRCMP		__GI_strcmp
+# define __STRCMP_IA32		__strcmp_ia32
+# define __STRCMP_SSSE3		__strcmp_ssse3
+# define __STRCMP_SSE4_2	__strcmp_sse4_2
+#else
+# define STRCMP			strncmp
+# define __GI_STRCMP		__GI_strncmp
+# define __STRCMP_IA32		__strncmp_ia32
+# define __STRCMP_SSSE3		__strncmp_ssse3
+# define __STRCMP_SSE4_2	__strncmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncmp in the static library, since
+   strncmp is needed before the initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
+# ifdef SHARED
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(STRCMP)
+# else
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__STRCMP_IA32, %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSSE3, %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	jz	2f
+	leal	__STRCMP_SSE4_2, %eax
+2:	ret
+END(STRCMP)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __STRCMP_IA32, @function; \
+	.p2align 4; \
+	__STRCMP_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT that IFUNC relies on.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#ifndef USE_AS_STRNCMP
+# include "../strcmp.S"
+#endif
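
The IFUNC resolver above picks among __strcmp_ia32, __strcmp_ssse3 and __strcmp_sse4_2 based on the __cpu_features bits.  A minimal C sketch of the same selection logic follows; the stub implementations and the cpu_has_* helpers are invented stand-ins for the real SSE routines and CPUID-derived feature bits:

#include <stdio.h>
#include <string.h>

typedef int (*strcmp_fn) (const char *, const char *);

/* Invented stand-ins for the three implementations.  */
static int strcmp_ia32 (const char *a, const char *b)   { return strcmp (a, b); }
static int strcmp_ssse3 (const char *a, const char *b)  { return strcmp (a, b); }
static int strcmp_sse4_2 (const char *a, const char *b) { return strcmp (a, b); }

static int cpu_has_ssse3 (void)  { return 1; }	/* pretend CPUID said yes */
static int cpu_has_sse4_2 (void) { return 0; }	/* pretend CPUID said no  */

/* Mirrors the resolver: start from the baseline; SSE4.2 is only
   considered once SSSE3 is known to be present.  */
static strcmp_fn
select_strcmp (void)
{
  strcmp_fn fn = strcmp_ia32;
  if (cpu_has_ssse3 ())
    {
      fn = strcmp_ssse3;
      if (cpu_has_sse4_2 ())
	fn = strcmp_sse4_2;
    }
  return fn;
}

int
main (void)
{
  strcmp_fn cmp = select_strcmp ();
  printf ("%d\n", cmp ("abc", "abd") < 0);	/* -> 1 */
  return 0;
}
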
diff --git a/sysdeps/i386/i686/multiarch/strncmp-c.c b/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)  \
+    __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP	__strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncmp.S b/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..b6814315fb
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCMP
+#define STRCMP	strncmp
+#include "strcmp.S"
diff --git a/sysdeps/i386/lshift.S b/sysdeps/i386/lshift.S
index 536d9878eb..398cf038c7 100644
--- a/sysdeps/i386/lshift.S
+++ b/sysdeps/i386/lshift.S
@@ -1,5 +1,5 @@
 /* i80386 __mpn_lshift --
-   Copyright (C) 1992, 1994, 1997-2000, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -55,6 +55,7 @@ ENTRY (BP_SYM (__mpn_lshift))
 
 	movl	(%esi,%edx,4),%ebx	/* read most significant limb */
 	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
 	xorl	%eax,%eax
 	shldl	%cl,%ebx,%eax		/* compute carry limb */
 	decl	%edx
@@ -95,6 +96,7 @@ L(1):	movl	(%esi,%edx,4),%eax
 	LEAVE
 	ret
 
+	cfi_restore_state
 L(end):	shll	%cl,%ebx		/* compute least significant limb */
 	movl	%ebx,(%edi)		/* store it */
 
diff --git a/sysdeps/i386/rshift.S b/sysdeps/i386/rshift.S
index 3fd0afe822..332c4d09e7 100644
--- a/sysdeps/i386/rshift.S
+++ b/sysdeps/i386/rshift.S
@@ -1,5 +1,5 @@
 /* i80386 __mpn_rshift --
-   Copyright (C) 1992,1994,1997-2000,2005 Free Software Foundation, Inc.
+   Copyright (C) 1992,1994,1997-2000,2005,2010 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -57,6 +57,7 @@ ENTRY (BP_SYM (__mpn_rshift))
 
 	movl	(%esi,%edx,4),%ebx	/* read least significant limb */
 	cfi_rel_offset (ebx, 0)
+	cfi_remember_state
 	xorl	%eax,%eax
 	shrdl	%cl,%ebx,%eax		/* compute carry limb */
 	incl	%edx
@@ -97,10 +98,7 @@ L(1):	movl	(%esi,%edx,4),%eax
 	LEAVE
 	ret
 
-	cfi_adjust_cfa_offset (12)
-	cfi_rel_offset (edi, 8)
-	cfi_rel_offset (esi, 4)
-	cfi_rel_offset (ebx, 0)
+	cfi_restore_state
 L(end):	shrl	%cl,%ebx		/* compute most significant limb */
 	movl	%ebx,(%edi)		/* store it */
 
diff --git a/sysdeps/ia64/fpu/fegetenv.c b/sysdeps/ia64/fpu/fegetenv.c
index 5446b16494..e240f75e43 100644
--- a/sysdeps/ia64/fpu/fegetenv.c
+++ b/sysdeps/ia64/fpu/fegetenv.c
@@ -27,3 +27,4 @@ fegetenv (fenv_t *envp)
 
   return 0;
 }
+libm_hidden_def (fegetenv)
diff --git a/sysdeps/powerpc/fpu/fegetenv.c b/sysdeps/powerpc/fpu/fegetenv.c
index 53953454cb..3d21abb529 100644
--- a/sysdeps/powerpc/fpu/fegetenv.c
+++ b/sysdeps/powerpc/fpu/fegetenv.c
@@ -35,4 +35,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, BP_SYM (__old_fegetenv), BP_SYM (fegetenv), GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, BP_SYM (__fegetenv), BP_SYM (fegetenv), GLIBC_2_2);
diff --git a/sysdeps/powerpc/powerpc32/configure b/sysdeps/powerpc/powerpc32/configure
index 9b76c57886..da8ec0b87c 100644
--- a/sysdeps/powerpc/powerpc32/configure
+++ b/sysdeps/powerpc/powerpc32/configure
@@ -25,11 +25,10 @@ rm -f conftest*
 fi
 { $as_echo "$as_me:$LINENO: result: $libc_cv_ppc_rel16" >&5
 $as_echo "$libc_cv_ppc_rel16" >&6; }
-if test $libc_cv_ppc_rel16 = yes; then
-  cat >>confdefs.h <<\_ACEOF
-#define HAVE_ASM_PPC_REL16 1
-_ACEOF
-
+if test $libc_cv_ppc_rel16 = no; then
+  { { $as_echo "$as_me:$LINENO: error: R_PPC_REL16 is not supported. Binutils is too old." >&5
+$as_echo "$as_me: error: R_PPC_REL16 is not supported. Binutils is too old." >&2;}
+   { (exit 1); exit 1; }; }
 fi
 
 # See whether GCC uses -msecure-plt.
diff --git a/sysdeps/powerpc/powerpc32/configure.in b/sysdeps/powerpc/powerpc32/configure.in
index 7219ad993e..21d3f5ee5b 100644
--- a/sysdeps/powerpc/powerpc32/configure.in
+++ b/sysdeps/powerpc/powerpc32/configure.in
@@ -13,8 +13,8 @@ else
   libc_cv_ppc_rel16=no
 fi
 rm -f conftest*])
-if test $libc_cv_ppc_rel16 = yes; then
-  AC_DEFINE(HAVE_ASM_PPC_REL16)
+if test $libc_cv_ppc_rel16 = no; then
+  AC_MSG_ERROR(R_PPC_REL16 is not supported. Binutils is too old.)
 fi
 
 # See whether GCC uses -msecure-plt.
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.h b/sysdeps/powerpc/powerpc32/dl-machine.h
index 6f8d0f506e..5351d9691d 100644
--- a/sysdeps/powerpc/powerpc32/dl-machine.h
+++ b/sysdeps/powerpc/powerpc32/dl-machine.h
@@ -41,16 +41,13 @@ static inline Elf32_Addr * __attribute__ ((const))
 ppc_got (void)
 {
   Elf32_Addr *got;
-#ifdef HAVE_ASM_PPC_REL16
+
   asm ("bcl 20,31,1f\n"
        "1:	mflr %0\n"
        "	addis %0,%0,_GLOBAL_OFFSET_TABLE_-1b@ha\n"
        "	addi %0,%0,_GLOBAL_OFFSET_TABLE_-1b@l\n"
        : "=b" (got) : : "lr");
-#else
-  asm (" bl _GLOBAL_OFFSET_TABLE_-4@local"
-       : "=l" (got));
-#endif
+
   return got;
 }
 
diff --git a/sysdeps/powerpc/powerpc32/dl-start.S b/sysdeps/powerpc/powerpc32/dl-start.S
index c77c4de198..ae41f47ede 100644
--- a/sysdeps/powerpc/powerpc32/dl-start.S
+++ b/sysdeps/powerpc/powerpc32/dl-start.S
@@ -47,15 +47,10 @@ _dl_start_user:
    passed by value!).  */
 
 /*  Put our GOT pointer in r31, */
-#ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r31
 	addis	r31,r31,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r31,r31,_GLOBAL_OFFSET_TABLE_-1b@l
-#else
-	bl	_GLOBAL_OFFSET_TABLE_-4@local
-	mflr	r31
-#endif
 /*  the address of _start in r30, */
 	mr	r30,r3
 /*  &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28.  */
diff --git a/sysdeps/powerpc/powerpc32/elf/start.S b/sysdeps/powerpc/powerpc32/elf/start.S
index a8abdca0c6..dc89a5e109 100644
--- a/sysdeps/powerpc/powerpc32/elf/start.S
+++ b/sysdeps/powerpc/powerpc32/elf/start.S
@@ -53,10 +53,6 @@ L(start_addresses):
 	ASM_SIZE_DIRECTIVE(L(start_addresses))
 
 	.section ".text"
-#if defined PIC && !defined HAVE_ASM_PPC_REL16
-L(start_addressesp):
-	.long	L(start_addresses)-L(branch)
-#endif
 ENTRY(_start)
  /* Save the stack pointer, in case we're statically linked under Linux.  */
 	mr	r9,r1
@@ -77,16 +73,10 @@ L(branch):
     start_addresses in r8.  Also load the GOT pointer so that new PLT
     calls work, like the one to __libc_start_main.  */
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 	addis	r30,r13,_GLOBAL_OFFSET_TABLE_-L(branch)@ha
 	addis	r8,r13,L(start_addresses)-L(branch)@ha
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-L(branch)@l
 	lwzu	r13,L(start_addresses)-L(branch)@l(r8)
-# else
-	lwz	r8,L(start_addressesp)-L(branch)(r13)
-	add	r8,r13,r8
-	lwz	r13,0(r8)
-# endif
 #else
 	lis	r8,L(start_addresses)@ha
 	lwzu	r13,L(start_addresses)@l(r8)
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 04ed6da68b..e1ac064a59 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -34,15 +34,10 @@ ENTRY (BP_SYM (__longjmp))
 # ifdef PIC
 	mflr    r6
 	cfi_register (lr,r6)
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-#  endif
 #  ifdef SHARED
 	lwz     r5,_rtld_global_ro@got(r5)
 	mtlr    r6
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
index bc74d302fb..80e72ca2bd 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_ceil.S
@@ -31,17 +31,10 @@ ENTRY (__ceil)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
index 47a75ec0c3..ce6d71e4f8 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_ceilf.S
@@ -30,17 +30,10 @@ ENTRY (__ceilf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floor.S b/sysdeps/powerpc/powerpc32/fpu/s_floor.S
index a29e4791ea..0dd0dbe6c0 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_floor.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_floor.S
@@ -31,17 +31,10 @@ ENTRY (__floor)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
index 99fbdc5f86..98a47458bc 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_floorf.S
@@ -30,17 +30,10 @@ ENTRY (__floorf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index d73749e134..3bf1ffaea1 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -45,17 +45,10 @@ ENTRY (__lround)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp10,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp10,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rint.S b/sysdeps/powerpc/powerpc32/fpu/s_rint.S
index c8dca313ae..93133718ad 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_rint.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_rint.S
@@ -33,17 +33,10 @@ ENTRY (__rint)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
index 7771cb2bc8..1e0fbb1f0d 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_rintf.S
@@ -29,17 +29,10 @@ ENTRY (__rintf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_round.S b/sysdeps/powerpc/powerpc32/fpu/s_round.S
index 590c87ad8c..48b346e651 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_round.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_round.S
@@ -43,16 +43,10 @@ ENTRY (__round)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfs	fp13,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 7e99bca315..88125aad06 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -42,16 +42,10 @@ ENTRY (__roundf )
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfs	fp13,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
index 5bc0856b9f..c3c021716a 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_trunc.S
@@ -38,17 +38,10 @@ ENTRY (__trunc)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
index e2e3bd6740..eddef070cd 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_truncf.S
@@ -37,17 +37,10 @@ ENTRY (__truncf)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	lfs	fp13,.LC0-1b@l(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-	lfs	fp13,0(r9)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 #else
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index b7d1abc00d..131e7a332e 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -85,15 +85,10 @@ ENTRY (BP_SYM (__sigsetjmp))
 # ifdef PIC
 	mflr    r6
 	cfi_register(lr,r6)
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r5,r5,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-#  endif
 	mtlr	r6
 	cfi_same_value (lr)
 #  ifdef SHARED
diff --git a/sysdeps/powerpc/powerpc32/memset.S b/sysdeps/powerpc/powerpc32/memset.S
index 454abb2b65..b4ce218e24 100644
--- a/sysdeps/powerpc/powerpc32/memset.S
+++ b/sysdeps/powerpc/powerpc32/memset.S
@@ -256,17 +256,10 @@ L(checklinesize):
 	beq	L(medium)
 /* Establishes GOT addressability so we can load __cache_line_size
    from static. This value was set from the aux vector during startup.  */
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	rGOT
 	addis	rGOT,rGOT,__cache_line_size-1b@ha
 	lwz	rCLS,__cache_line_size-1b@l(rGOT)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	rGOT
-	lwz	rGOT,__cache_line_size@got(rGOT)
-	lwz	rCLS,0(rGOT)
-# endif
 	mtlr	rTMP
 #else
 /* Load __cache_line_size from static. This value was set from the
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index e10a37977a..b03e041d8a 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -53,16 +53,10 @@ ENTRY (__llround)
 #ifdef SHARED
 	mflr	r11
 	cfi_register(lr,r11)
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r9
 	addis	r9,r9,.LC0-1b@ha
 	addi	r9,r9,.LC0-1b@l
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r10
-	lwz	r9,.LC0@got(10)
-# endif
 	mtlr	r11
 	cfi_same_value (lr)
 	lfd	fp9,0(r9)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
index 95a0b3915d..8be3cf1848 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S
@@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
index c31555194b..9fa282c162 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S
@@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
index 105b5912a1..27a1a0dcbb 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S
@@ -63,7 +63,6 @@ EALIGN (__sqrt, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrt, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
index 14bc0a2ceb..8914855542 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S
@@ -63,7 +63,6 @@ EALIGN (__sqrtf, 5, 0)
 	cfi_offset(lr,20-16)
 	cfi_offset(r30,8-16)
 #ifdef SHARED
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,.LCF1
 .LCF1:
 	mflr	r30
@@ -71,12 +70,6 @@ EALIGN (__sqrtf, 5, 0)
 	addi	r30,r30,_GLOBAL_OFFSET_TABLE_-.LCF1@l
 	lwz	r9,_LIB_VERSION@got(30)
 	lwz	r0,0(r9)
-# else
-	bl	_GLOBAL_OFFSET_TABLE_@local-4
-	mflr	r30
-	lwz	r9,_LIB_VERSION@got(30)
-	lwz	r0,0(r9)
-# endif
 #else
 	lis	r9,_LIB_VERSION@ha
 	lwz	r0,_LIB_VERSION@l(r9)
diff --git a/sysdeps/powerpc/powerpc32/power7/Implies b/sysdeps/powerpc/powerpc32/power7/Implies
deleted file mode 100644
index 03899d8a3c..0000000000
--- a/sysdeps/powerpc/powerpc32/power7/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power5
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/powerpc/powerpc32/power7/fpu/Implies
deleted file mode 100644
index 819a7d7979..0000000000
--- a/sysdeps/powerpc/powerpc32/power7/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..5b0d950c74
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -0,0 +1,89 @@
+/* finite().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+ENTRY (__finite)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,1
+	bflr	30
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
+
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz     r0,8(r1)      /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	clrlwi	r0,r0,17      /* r0 = abs(r0).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r0,0x7ff0 /* r0 == 0x7ff0?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	li	r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__finite, __finitel)
+weak_alias (__finite, finitel)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
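
__finite above first probes the value with ftdiv against 1.0 and only falls back to inspecting the raw bits when that test flags a possible Inf/NaN/denormal.  A rough C equivalent of the slow-path exponent test (the ftdiv fast path has no portable C counterpart; this only illustrates the bit-level criterion):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A double is finite iff its biased exponent field is not all-ones.  */
static int
finite_by_bits (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);		/* Reinterpret the IEEE-754 bits.  */
  return ((bits >> 52) & 0x7ff) != 0x7ff;	/* 0x7ff => Inf or NaN.  */
}

int
main (void)
{
  printf ("%d %d %d\n",
	  finite_by_bits (1.0),		/* -> 1 */
	  finite_by_bits (INFINITY),	/* -> 0 */
	  finite_by_bits (NAN));	/* -> 0 */
  return 0;
}
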
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..2979534911
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -0,0 +1,88 @@
+/* isinf().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+ENTRY (__isinf)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* Either we have -INF/+INF or a denormal.  */
+
+	stwu    r1,-16(r1)    /* Allocate stack space.  */
+	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,8(r1)      /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means +INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
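
Once the slow path above knows the value is an infinity, the upper halfword distinguishes +INF (0x7ff0) from -INF (0xfff0), and __isinf returns 1 or -1 accordingly.  A small C sketch of that sign handling, assuming IEEE-754 binary64 layout:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
isinf_by_bits (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if ((bits & ~(1ULL << 63)) != 0x7ff0000000000000ULL)
    return 0;				/* Not an infinity.  */
  return (bits >> 63) ? -1 : 1;		/* Sign bit selects +1/-1.  */
}

int
main (void)
{
  printf ("%d %d %d\n",
	  isinf_by_bits (INFINITY),	/* ->  1 */
	  isinf_by_bits (-INFINITY),	/* -> -1 */
	  isinf_by_bits (3.5));		/* ->  0 */
  return 0;
}
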
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..852539f24b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -0,0 +1,92 @@
+/* isnan().  PowerPC32/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    .rodata.cst8,"aM",@progbits,8
+	.align 3
+.LC0:   /* 1.0 */
+	.quad	    0x3ff0000000000000
+
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+ENTRY (__isnan)
+#ifdef SHARED
+	mflr	r11
+	cfi_register(lr,r11)
+
+	bcl	20,31,1f
+1:	mflr	r9
+	addis	r9,r9,.LC0-1b@ha
+	lfd	fp0,.LC0-1b@l(r9)
+
+	mtlr	r11
+	cfi_same_value (lr)
+#else
+	lis	r9,.LC0@ha
+	lfd	fp0,.LC0@l(r9)
+#endif
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN or Inf, finish. */
+
+	/* We have -INF/+INF/NaN or a denormal.  */
+
+	stwu	r1,-16(r1)    /* Allocate stack space.  */
+	stfd	fp1,8(r1)     /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lwz     r4,8(r1)      /* Load the upper half of the FP value.  */
+	lwz     r5,12(r1)     /* Load the lower half of the FP value.  */
+	addi	r1,r1,16      /* Reset the stack pointer.  */
+	lis     r0,0x7ff0     /* Load the upper portion for an INF/NaN.  */
+	clrlwi  r4,r4,1	      /* r4 = abs(r4).  */
+	cmpw    cr7,r4,r0     /* if (abs(r4) <= inf).  */
+	cmpwi   cr6,r5,0      /* r5 == 0x00000000?  */
+	bltlr	cr7	      /* LT means we have a denormal.  */
+	bgt	cr7,L(NaN)    /* GT means we have a NaN.  */
+	beqlr	cr6	      /* EQ means we have +/-INF.  */
+L(NaN):
+	li      r3,1	      /* x is a NaN.  */
+	blr
+	END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies
deleted file mode 100644
index 13b03309fb..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
deleted file mode 100644
index 13b03309fb..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..6763d1adc8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,68 @@
+/* finite().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+EALIGN (__finite, 4, 0)
+	CALL_MCOUNT 0
+	lfd     fp0,.LC0@toc(r2)
+	ftdiv   cr7,fp1,fp0
+	li	r3,1
+	bflr    30
+
+	/* If we are here, we have +/-INF, NaN, or a denormal.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+
+	lhz     r4,-16(r1)    /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	clrlwi  r4,r4,17      /* r4 = abs(r4).  */
+	cmpwi   cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	bltlr   cr7	      /* LT means finite, otherwise non-finite.  */
+	li      r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..f896d38026
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -0,0 +1,71 @@
+/* isinf().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+EALIGN (__isinf, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* Either we have -INF/+INF or a denormal.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,-16(r1)    /* Fetch the upper portion of the high word of
+			      the FP value (where the exponent and sign bits
+			      are).  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means +INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..8877012598
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -0,0 +1,69 @@
+/* isnan().  PowerPC64/POWER7 version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+EALIGN (__isnan, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN, finish.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	ld	r4,-16(r1)    /* Load FP into GPR.  */
+	lis     r0,0x7ff0
+	sldi	r0,r0,32      /* const long r0 0x7ff00000 00000000.  */
+	clrldi	r4,r4,1	      /* x = fabs(x)  */
+	cmpd	cr7,r4,r0     /* if (fabs(x) <= inf)  */
+	blelr	cr7	      /* LE means not NaN.  */
+	li	r3,1	      /* else return 1  */
+	blr
+	END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
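
The 64-bit test above clears the sign bit and compares the remaining bits against the +Inf pattern 0x7ff0000000000000; anything strictly greater is a NaN.  An illustrative C version of the same bit test:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
isnan_by_bits (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= ~(1ULL << 63);			/* fabs() at the bit level.  */
  return bits > 0x7ff0000000000000ULL;		/* Above +Inf => NaN.  */
}

int
main (void)
{
  printf ("%d %d %d\n",
	  isnan_by_bits (NAN),		/* -> 1 */
	  isnan_by_bits (INFINITY),	/* -> 0 */
	  isnan_by_bits (42.0));	/* -> 0 */
  return 0;
}
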
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/sysdeps/s390/fpu/fegetenv.c b/sysdeps/s390/fpu/fegetenv.c
index a244f2ca8b..04da54c94c 100644
--- a/sysdeps/s390/fpu/fegetenv.c
+++ b/sysdeps/s390/fpu/fegetenv.c
@@ -20,10 +20,6 @@
 
 #include <fenv_libc.h>
 #include <fpu_control.h>
-#include <stddef.h>
-#include <asm/ptrace.h>
-#include <sys/ptrace.h>
-#include <unistd.h>
 
 int
 fegetenv (fenv_t *envp)
@@ -33,3 +29,4 @@ fegetenv (fenv_t *envp)
   /* Success.  */
   return 0;
 }
+libm_hidden_def (fegetenv)
diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c
index 868dea68ca..14daf2118f 100644
--- a/sysdeps/s390/s390-64/utf16-utf32-z9.c
+++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c
@@ -203,7 +203,10 @@ gconv_end (struct __gconv_step *data)
    swapping).  */
 #define BODY								\
   {									\
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
+    /* The hardware instruction currently fails to report an error for	\
+       isolated low surrogates so we have to disable the instruction	\
+       until this gets resolved.  */					\
+    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
       {									\
 	HARDWARE_CONVERT ("cu24 %0, %1, 1");				\
 	if (inptr != inend)						\
@@ -229,6 +232,12 @@ gconv_end (struct __gconv_step *data)
       }									\
     else								\
       {									\
+        /* An isolated low-surrogate was found.  This has to be         \
+	   considered ill-formed.  */					\
+        if (__builtin_expect (u1 >= 0xdc00, 0))				\
+	  {								\
+	    STANDARD_FROM_LOOP_ERR_HANDLER (2);				\
+	  }								\
 	/* It's a surrogate character.  At least the first word says	\
 	   it is.  */							\
 	if (__builtin_expect (inptr + 4 > inend, 0))			\
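
Well-formed UTF-16 never begins a code point with a low surrogate: a unit in 0xD800-0xDBFF must be followed by one in 0xDC00-0xDFFF, so a leading unit >= 0xDC00 is ill-formed, which is exactly what the added u1 >= 0xdc00 check rejects before the pair is combined.  A minimal C sketch of that classification (classify_utf16_lead is an illustrative helper, not part of the gconv module, which reports the error via STANDARD_FROM_LOOP_ERR_HANDLER instead):

#include <stdint.h>
#include <stdio.h>

/* Classify the first UTF-16 code unit of a sequence.  */
static int
classify_utf16_lead (uint16_t u1)
{
  if (u1 < 0xd800 || u1 > 0xdfff)
    return 0;			/* Ordinary BMP code unit.  */
  if (u1 >= 0xdc00)
    return -1;			/* Isolated low surrogate: ill-formed.  */
  return 1;			/* High surrogate: a low one must follow.  */
}

int
main (void)
{
  printf ("%d %d %d\n",
	  classify_utf16_lead (0x0041),		/* 'A'            ->  0 */
	  classify_utf16_lead (0xd83d),		/* high surrogate ->  1 */
	  classify_utf16_lead (0xdc00));	/* low surrogate  -> -1 */
  return 0;
}
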
diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c
index 531d3ebd4b..5f73f3c59e 100644
--- a/sysdeps/s390/s390-64/utf8-utf16-z9.c
+++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c
@@ -345,9 +345,12 @@ gconv_end (struct __gconv_step *data)
    Operation.  */
 #define BODY								\
   {									\
-    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH)				\
+    /* The hardware instruction currently fails to report an error for	\
+       isolated low surrogates so we have to disable the instruction	\
+       until this gets resolved.  */					\
+    if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */			\
       {									\
-	HARDWARE_CONVERT ("cu21 %0, %1");				\
+	HARDWARE_CONVERT ("cu21 %0, %1, 1");				\
 	if (inptr != inend)						\
 	  {								\
 	    /* Check if the third byte is				\
@@ -388,7 +391,7 @@ gconv_end (struct __gconv_step *data)
 									\
 	outptr += 2;							\
       }									\
-    else if (c >= 0x0800 && c <= 0xd7ff)				\
+    else if ((c >= 0x0800 && c <= 0xd7ff) || c > 0xdfff)		\
       {									\
 	/* Three byte UTF-8 char.  */					\
 									\
diff --git a/sysdeps/sh/sh4/fpu/fegetenv.c b/sysdeps/sh/sh4/fpu/fegetenv.c
index c07b32af30..683939b52d 100644
--- a/sysdeps/sh/sh4/fpu/fegetenv.c
+++ b/sysdeps/sh/sh4/fpu/fegetenv.c
@@ -29,3 +29,4 @@ fegetenv (fenv_t *envp)
 
   return 0;
 }
+libm_hidden_def (fegetenv)
diff --git a/sysdeps/sparc/fpu/fegetenv.c b/sysdeps/sparc/fpu/fegetenv.c
index 36486f5973..c606a9cac0 100644
--- a/sysdeps/sparc/fpu/fegetenv.c
+++ b/sysdeps/sparc/fpu/fegetenv.c
@@ -34,4 +34,5 @@ strong_alias (__fegetenv, __old_fegetenv)
 compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
 #endif
 
+libm_hidden_ver (__fegetenv, fegetenv)
 versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/sysdeps/sparc/sparc32/dl-irel.h b/sysdeps/sparc/sparc32/dl-irel.h
new file mode 100644
index 0000000000..1891938d6d
--- /dev/null
+++ b/sysdeps/sparc/sparc32/dl-irel.h
@@ -0,0 +1,55 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   SPARC 32-bit version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+#include <dl-plt.h>
+
+#define ELF_MACHINE_IRELA	1
+
+static inline void
+__attribute ((always_inline))
+elf_irela (const Elf32_Rela *reloc)
+{
+  unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1))
+    {
+      Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) ();
+      *reloc_addr = value;
+    }
+  else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1))
+    {
+      Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf32_Addr value = ((Elf32_Addr (*) (void)) reloc->r_addend) ();
+
+      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
+    }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
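
For both relocation types handled above, the addend is the address of the IFUNC resolver; elf_irela calls it once at startup and installs the address it returns, either directly into the relocated word (R_SPARC_IRELATIVE) or through the PLT fixup (R_SPARC_JMP_IREL).  Schematic C for the IRELATIVE case, using simplified stand-in types rather than the real ELF structures:

#include <stdint.h>
#include <stdio.h>

typedef uintptr_t (*ifunc_resolver) (void);

/* Simplified stand-in for Elf32_Rela/Elf64_Rela.  */
struct rela { uintptr_t r_offset; uintptr_t r_info; uintptr_t r_addend; };

static void
apply_irelative (const struct rela *reloc)
{
  uintptr_t *slot = (uintptr_t *) reloc->r_offset;	/* Word to patch.  */
  ifunc_resolver resolve = (ifunc_resolver) reloc->r_addend;
  *slot = resolve ();					/* Chosen implementation.  */
}

/* Demo: a resolver that "selects" a dummy address.  */
static uintptr_t fake_impl;
static uintptr_t resolve_fake (void) { return (uintptr_t) &fake_impl; }

int
main (void)
{
  uintptr_t slot = 0;
  struct rela r = { (uintptr_t) &slot, 0, (uintptr_t) &resolve_fake };
  apply_irelative (&r);
  printf ("%d\n", slot == (uintptr_t) &fake_impl);	/* -> 1 */
  return 0;
}
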
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index b3b7852d87..e1385f7aca 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -1,5 +1,5 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  SPARC version.
-   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007
+   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -27,20 +27,13 @@
 #include <sys/param.h>
 #include <ldsodefs.h>
 #include <tls.h>
+#include <dl-plt.h>
 
 #ifndef VALIDX
 # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \
 		      + DT_EXTRANUM + DT_VALTAGIDX (tag))
 #endif
 
-/* Some SPARC opcodes we need to use for self-modifying code.  */
-#define OPCODE_NOP	0x01000000 /* nop */
-#define OPCODE_CALL	0x40000000 /* call ?; add PC-rel word address */
-#define OPCODE_SETHI_G1	0x03000000 /* sethi ?, %g1; add value>>10 */
-#define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
-#define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
-#define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
-
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int
 elf_machine_matches_host (const Elf32_Ehdr *ehdr)
@@ -312,41 +305,6 @@ _dl_start_user:\n\
 	.size   _dl_start_user, . - _dl_start_user\n\
 	.previous");
 
-static inline __attribute__ ((always_inline)) Elf32_Addr
-sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
-		 Elf32_Addr value, int t, int do_flush)
-{
-  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
-
-  if (0 && disp >= -0x800000 && disp < 0x800000)
-    {
-      /* Don't need to worry about thread safety. We're writing just one
-	 instruction.  */
-
-      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
-      if (do_flush)
-	__asm __volatile ("flush %0" : : "r"(reloc_addr));
-    }
-  else
-    {
-      /* For thread safety, write the instructions from the bottom and
-	 flush before we overwrite the critical "b,a".  This of course
-	 need not be done during bootstrapping, since there are no threads.
-	 But we also can't tell if we _can_ use flush, so don't. */
-
-      reloc_addr += t;
-      reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
-      if (do_flush)
-	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
-
-      reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10);
-      if (do_flush)
-	__asm __volatile ("flush %0" : : "r"(reloc_addr));
-    }
-
-  return value;
-}
-
 static inline Elf32_Addr
 elf_machine_fixup_plt (struct link_map *map, lookup_t t,
 		       const Elf32_Rela *reloc,
@@ -433,6 +391,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 
   value += reloc->r_addend;	/* Assume copy relocs have zero addend.  */
 
+  if (sym != NULL
+      && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+      && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+    {
+      value = ((Elf32_Addr (*) (void)) value) ();
+    }
+
   switch (r_type)
     {
 #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP
@@ -460,6 +425,13 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
     case R_SPARC_32:
       *reloc_addr = value;
       break;
+    case R_SPARC_IRELATIVE:
+      value = ((Elf32_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+      break;
+    case R_SPARC_JMP_IREL:
+      value = ((Elf32_Addr (*) (void)) value) ();
+      /* Fall thru */
     case R_SPARC_JMP_SLOT:
       {
 #if !defined RTLD_BOOTSTRAP && !defined __sparc_v9__
@@ -578,16 +550,21 @@ __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map,
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc)
 {
-  switch (ELF32_R_TYPE (reloc->r_info))
+  Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1))
+    ;
+  else if (r_type == R_SPARC_JMP_IREL)
     {
-    case R_SPARC_NONE:
-      break;
-    case R_SPARC_JMP_SLOT:
-      break;
-    default:
-      _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1);
-      break;
+      Elf32_Addr value = map->l_addr + reloc->r_addend;
+      value = ((Elf32_Addr (*) (void)) value) ();
+      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
     }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
 }
 
 #endif	/* RESOLVE_MAP */
diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h
new file mode 100644
index 0000000000..edcc5c1374
--- /dev/null
+++ b/sysdeps/sparc/sparc32/dl-plt.h
@@ -0,0 +1,62 @@
+/* PLT fixups.  Sparc 32-bit version.
+   Copyright (C) 1996-2003, 2004, 2005, 2006, 2007, 2010
+   Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* Some SPARC opcodes we need to use for self-modifying code.  */
+#define OPCODE_NOP	0x01000000 /* nop */
+#define OPCODE_CALL	0x40000000 /* call ?; add PC-rel word address */
+#define OPCODE_SETHI_G1	0x03000000 /* sethi ?, %g1; add value>>10 */
+#define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
+#define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
+#define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
+
+static inline __attribute__ ((always_inline)) Elf32_Addr
+sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
+		 Elf32_Addr value, int t, int do_flush)
+{
+  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
+
+  if (0 && disp >= -0x800000 && disp < 0x800000)
+    {
+      /* Don't need to worry about thread safety. We're writing just one
+	 instruction.  */
+
+      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+      if (do_flush)
+	__asm __volatile ("flush %0" : : "r"(reloc_addr));
+    }
+  else
+    {
+      /* For thread safety, write the instructions from the bottom and
+	 flush before we overwrite the critical "b,a".  This of course
+	 need not be done during bootstrapping, since there are no threads.
+	 But we also can't tell if we _can_ use flush, so don't. */
+
+      reloc_addr += t;
+      reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
+      if (do_flush)
+	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
+
+      reloc_addr[0] = OPCODE_SETHI_G1 | (value >> 10);
+      if (do_flush)
+	__asm __volatile ("flush %0" : : "r"(reloc_addr));
+    }
+
+  return value;
+}
diff --git a/sysdeps/sparc/sparc64/dl-irel.h b/sysdeps/sparc/sparc64/dl-irel.h
new file mode 100644
index 0000000000..1a2a0a3dd5
--- /dev/null
+++ b/sysdeps/sparc/sparc64/dl-irel.h
@@ -0,0 +1,58 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+   SPARC 64-bit version.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+#include <dl-plt.h>
+
+#define ELF_MACHINE_IRELA	1
+
+static inline void
+__attribute ((always_inline))
+elf_irela (const Elf64_Rela *reloc)
+{
+  unsigned int r_type = (reloc->r_info & 0xff);
+
+  if (__builtin_expect (r_type == R_SPARC_IRELATIVE, 1))
+    {
+      Elf64_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) ();
+      *reloc_addr = value;
+    }
+  else if (__builtin_expect (r_type == R_SPARC_JMP_IREL, 1))
+    {
+      Elf64_Addr *const reloc_addr = (void *) reloc->r_offset;
+      Elf64_Addr value = ((Elf64_Addr (*) (void)) reloc->r_addend) ();
+      struct link_map map = { .l_addr = 0 };
+
+      /* 'high' is always zero; for large PLT entries the linker
+	 emits an R_SPARC_IRELATIVE instead.  */
+      sparc64_fixup_plt (&map, reloc, reloc_addr, value, 0, 0);
+    }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 3eee672912..b4f43e9cf5 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -1,6 +1,6 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  Sparc64 version.
-   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
-	Free Software Foundation, Inc.
+   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+	2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -27,6 +27,7 @@
 #include <sys/param.h>
 #include <ldsodefs.h>
 #include <sysdep.h>
+#include <dl-plt.h>
 
 #ifndef VALIDX
 # define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM \
@@ -89,132 +90,6 @@ elf_machine_load_address (void)
   return (Elf64_Addr) got - *got + (Elf32_Sword) ((pc[2] - pc[3]) * 4) - 4;
 }
 
-/* We have 4 cases to handle.  And we code different code sequences
-   for each one.  I love V9 code models...  */
-static inline void __attribute__ ((always_inline))
-sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
-		   Elf64_Addr *reloc_addr, Elf64_Addr value,
-		   Elf64_Addr high, int t)
-{
-  unsigned int *insns = (unsigned int *) reloc_addr;
-  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
-  Elf64_Sxword disp = value - plt_vaddr;
-
-  /* Now move plt_vaddr up to the call instruction.  */
-  plt_vaddr += ((t + 1) * 4);
-
-  /* PLT entries .PLT32768 and above look always the same.  */
-  if (__builtin_expect (high, 0) != 0)
-    {
-      *reloc_addr = value - map->l_addr;
-    }
-  /* Near destination.  */
-  else if (disp >= -0x800000 && disp < 0x800000)
-    {
-      /* As this is just one instruction, it is thread safe and so
-	 we can avoid the unnecessary sethi FOO, %g1.
-	 b,a target  */
-      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* 32-bit Sparc style, the target is in the lower 32-bits of
-     address space.  */
-  else if (insns += t, (value >> 32) == 0)
-    {
-      /* sethi	%hi(target), %g1
-	 jmpl	%g1 + %lo(target), %g0  */
-
-      insns[1] = 0x81c06000 | (value & 0x3ff);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* We can also get somewhat simple sequences if the distance between
-     the target and the PLT entry is within +/- 2GB.  */
-  else if ((plt_vaddr > value
-	    && ((plt_vaddr - value) >> 31) == 0)
-	   || (value > plt_vaddr
-	       && ((value - plt_vaddr) >> 31) == 0))
-    {
-      unsigned int displacement;
-
-      if (plt_vaddr > value)
-	displacement = (0 - (plt_vaddr - value));
-      else
-	displacement = value - plt_vaddr;
-
-      /* mov	%o7, %g1
-	 call	displacement
-	  mov	%g1, %o7  */
-
-      insns[2] = 0x9e100001;
-      __asm __volatile ("flush %0 + 8" : : "r" (insns));
-
-      insns[1] = 0x40000000 | (displacement >> 2);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x8210000f;
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-  /* Worst case, ho hum...  */
-  else
-    {
-      unsigned int high32 = (value >> 32);
-      unsigned int low32 = (unsigned int) value;
-
-      /* ??? Some tricks can be stolen from the sparc64 egcs backend
-	     constant formation code I wrote.  -DaveM  */
-
-      if (__builtin_expect (high32 & 0x3ff, 0))
-	{
-	  /* sethi	%hh(value), %g1
-	     sethi	%lm(value), %g5
-	     or		%g1, %hm(value), %g1
-	     or		%g5, %lo(value), %g5
-	     sllx	%g1, 32, %g1
-	     jmpl	%g1 + %g5, %g0
-	      nop  */
-
-	  insns[5] = 0x81c04005;
-	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
-
-	  insns[4] = 0x83287020;
-	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
-
-	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
-	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
-
-	  insns[2] = 0x82106000 | (high32 & 0x3ff);
-	}
-      else
-	{
-	  /* sethi	%hh(value), %g1
-	     sethi	%lm(value), %g5
-	     sllx	%g1, 32, %g1
-	     or		%g5, %lo(value), %g5
-	     jmpl	%g1 + %g5, %g0
-	      nop  */
-
-	  insns[4] = 0x81c04005;
-	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
-
-	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
-	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
-
-	  insns[2] = 0x83287020;
-	}
-
-      __asm __volatile ("flush %0 + 8" : : "r" (insns));
-
-      insns[1] = 0x0b000000 | (low32 >> 10);
-      __asm __volatile ("flush %0 + 4" : : "r" (insns));
-
-      insns[0] = 0x03000000 | (high32 >> 10);
-      __asm __volatile ("flush %0" : : "r" (insns));
-    }
-}
-
 static inline Elf64_Addr __attribute__ ((always_inline))
 elf_machine_fixup_plt (struct link_map *map, lookup_t t,
 		       const Elf64_Rela *reloc,
@@ -549,6 +424,11 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
 
   value += reloc->r_addend;	/* Assume copy relocs have zero addend.  */
 
+  if (sym != NULL
+      && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+      && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+    value = ((Elf64_Addr (*) (void)) value) ();
+
   switch (r_type)
     {
 #if !defined RTLD_BOOTSTRAP && !defined RESOLVE_CONFLICT_FIND_MAP
@@ -576,6 +456,13 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc,
     case R_SPARC_GLOB_DAT:
       *reloc_addr = value;
       break;
+    case R_SPARC_IRELATIVE:
+      value = ((Elf64_Addr (*) (void)) value) ();
+      *reloc_addr = value;
+      break;
+    case R_SPARC_JMP_IREL:
+      value = ((Elf64_Addr (*) (void)) value) ();
+      /* Fall thru */
     case R_SPARC_JMP_SLOT:
 #ifdef RESOLVE_CONFLICT_FIND_MAP
       /* R_SPARC_JMP_SLOT conflicts against .plt[32768+]
@@ -757,16 +644,29 @@ __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map,
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc)
 {
-  switch (ELF64_R_TYPE (reloc->r_info))
+  Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+  const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+
+  if (__builtin_expect (r_type == R_SPARC_JMP_SLOT, 1))
+    ;
+  else if (r_type == R_SPARC_JMP_IREL
+	   || r_type == R_SPARC_IRELATIVE)
     {
-    case R_SPARC_NONE:
-      break;
-    case R_SPARC_JMP_SLOT:
-      break;
-    default:
-      _dl_reloc_bad_type (map, ELFW(R_TYPE) (reloc->r_info), 1);
-      break;
+      Elf64_Addr value = map->l_addr + reloc->r_addend;
+      value = ((Elf64_Addr (*) (void)) value) ();
+      if (r_type == R_SPARC_JMP_IREL)
+	{
+	  /* 'high' is always zero, for large PLT entries the linker
+	     emits an R_SPARC_IRELATIVE.  */
+	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
+	}
+      else
+	*reloc_addr = value;
     }
+  else if (r_type == R_SPARC_NONE)
+    ;
+  else
+    _dl_reloc_bad_type (map, r_type, 1);
 }
 
 #endif	/* RESOLVE_MAP */
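
Reduced to its effect, the IRELATIVE handling added to elf_machine_rela and elf_machine_lazy_rel above amounts to the sketch below. It is an illustration only, not glibc code; l_addr and rela stand for the link-map load bias and the relocation record that those functions already have in hand, and R_SPARC_JMP_IREL differs only in that the result is installed by patching the PLT entry through sparc64_fixup_plt instead of a plain store.

#include <elf.h>

/* Illustration of how the lazy-relocation path above applies an
   R_SPARC_IRELATIVE entry.  */
void
apply_irelative (Elf64_Addr l_addr, const Elf64_Rela *rela)
{
  /* r_addend holds the (unrelocated) address of the IFUNC resolver.  */
  Elf64_Addr resolver = l_addr + rela->r_addend;

  /* Call the resolver exactly once; it returns the real target.  */
  Elf64_Addr target = ((Elf64_Addr (*) (void)) resolver) ();

  /* Store the result into the slot named by r_offset.  */
  *(Elf64_Addr *) (l_addr + rela->r_offset) = target;
}
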
diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h
new file mode 100644
index 0000000000..e06be43a0a
--- /dev/null
+++ b/sysdeps/sparc/sparc64/dl-plt.h
@@ -0,0 +1,144 @@
+/* PLT fixups.  Sparc 64-bit version.
+   Copyright (C) 1997-2006, 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* We have four cases to handle, and we emit a different code sequence
+   for each one.  I love V9 code models...  */
+static inline void __attribute__ ((always_inline))
+sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
+		   Elf64_Addr *reloc_addr, Elf64_Addr value,
+		   Elf64_Addr high, int t)
+{
+  unsigned int *insns = (unsigned int *) reloc_addr;
+  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
+  Elf64_Sxword disp = value - plt_vaddr;
+
+  /* Now move plt_vaddr up to the call instruction.  */
+  plt_vaddr += ((t + 1) * 4);
+
+  /* PLT entries .PLT32768 and above look always the same.  */
+  if (__builtin_expect (high, 0) != 0)
+    {
+      *reloc_addr = value - map->l_addr;
+    }
+  /* Near destination.  */
+  else if (disp >= -0x800000 && disp < 0x800000)
+    {
+      /* As this is just one instruction, it is thread safe and so
+	 we can avoid the unnecessary sethi FOO, %g1.
+	 b,a target  */
+      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* 32-bit Sparc style, the target is in the lower 32-bits of
+     address space.  */
+  else if (insns += t, (value >> 32) == 0)
+    {
+      /* sethi	%hi(target), %g1
+	 jmpl	%g1 + %lo(target), %g0  */
+
+      insns[1] = 0x81c06000 | (value & 0x3ff);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | ((unsigned int)(value >> 10));
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* We can also get somewhat simple sequences if the distance between
+     the target and the PLT entry is within +/- 2GB.  */
+  else if ((plt_vaddr > value
+	    && ((plt_vaddr - value) >> 31) == 0)
+	   || (value > plt_vaddr
+	       && ((value - plt_vaddr) >> 31) == 0))
+    {
+      unsigned int displacement;
+
+      if (plt_vaddr > value)
+	displacement = (0 - (plt_vaddr - value));
+      else
+	displacement = value - plt_vaddr;
+
+      /* mov	%o7, %g1
+	 call	displacement
+	  mov	%g1, %o7  */
+
+      insns[2] = 0x9e100001;
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x40000000 | (displacement >> 2);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x8210000f;
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+  /* Worst case, ho hum...  */
+  else
+    {
+      unsigned int high32 = (value >> 32);
+      unsigned int low32 = (unsigned int) value;
+
+      /* ??? Some tricks can be stolen from the sparc64 egcs backend
+	     constant formation code I wrote.  -DaveM  */
+
+      if (__builtin_expect (high32 & 0x3ff, 0))
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     or		%g1, %hm(value), %g1
+	     or		%g5, %lo(value), %g5
+	     sllx	%g1, 32, %g1
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[5] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 20" : : "r" (insns));
+
+	  insns[4] = 0x83287020;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x82106000 | (high32 & 0x3ff);
+	}
+      else
+	{
+	  /* sethi	%hh(value), %g1
+	     sethi	%lm(value), %g5
+	     sllx	%g1, 32, %g1
+	     or		%g5, %lo(value), %g5
+	     jmpl	%g1 + %g5, %g0
+	      nop  */
+
+	  insns[4] = 0x81c04005;
+	  __asm __volatile ("flush %0 + 16" : : "r" (insns));
+
+	  insns[3] = 0x8a116000 | (low32 & 0x3ff);
+	  __asm __volatile ("flush %0 + 12" : : "r" (insns));
+
+	  insns[2] = 0x83287020;
+	}
+
+      __asm __volatile ("flush %0 + 8" : : "r" (insns));
+
+      insns[1] = 0x0b000000 | (low32 >> 10);
+      __asm __volatile ("flush %0 + 4" : : "r" (insns));
+
+      insns[0] = 0x03000000 | (high32 >> 10);
+      __asm __volatile ("flush %0" : : "r" (insns));
+    }
+}
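
For the "near destination" case of sparc64_fixup_plt above, the entire fixup is a single SPARC "b,a" (branch-always, annulled) instruction: opcode 0x30800000 with the signed word displacement in the low 22 bits. The toy program below only demonstrates that encoding; the sample addresses are invented, and on real hardware the store must still be followed by the flush the glibc code issues to synchronize the instruction cache.

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t plt_entry = 0x100000UL;	/* address being patched (made up) */
  uint64_t target    = 0x104000UL;	/* resolved function (made up) */
  int64_t  disp      = (int64_t) (target - plt_entry);

  if (disp >= -0x800000 && disp < 0x800000)
    {
      /* Same encoding as the near-destination branch above.  */
      uint32_t insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
      printf ("b,a encoding: 0x%08x\n", (unsigned int) insn);
    }
  return 0;
}
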
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
index 4cb968505d..cfd9864f63 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/____longjmp_chk.S
@@ -28,19 +28,12 @@
 #define __longjmp ____longjmp_chk
 
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 #  define LOAD_ARG \
 	bcl	20,31,1f;				\
 1:	mflr	r3;					\
 	addis	r3,r3,_GLOBAL_OFFSET_TABLE_-1b@ha;	\
 	addi	r3,r3,_GLOBAL_OFFSET_TABLE_-1b@l;	\
 	lwz	r3,.LC0@got(r3)
-# else
-#  define LOAD_ARG \
-	bl	_GLOBAL_OFFSET_TABLE_-4@local;		\
-	mflr	r3;					\
-	lwz	r3,.LC0@got(r3)
-# endif
 #else
 # define LOAD_ARG \
 	lis	r3,.LC0@ha;				\
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
index e945834945..4c8c6b433b 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/brk.S
@@ -36,17 +36,10 @@ ENTRY (BP_SYM (__brk))
 	DO_CALL(SYS_ify(brk))
 	lwz     r6,8(r1)
 #ifdef PIC
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r5
 	addis	r5,r5,__curbrk-1b@ha
 	stw	r3,__curbrk-1b@l(r5)
-# else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r5
-	lwz     r5,__curbrk@got(r5)
-	stw     r3,0(r5)
-# endif
 #else
 	lis     r4,__curbrk@ha
 	stw     r3,__curbrk@l(r4)
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
index 63e1773e22..27285ed4a5 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
@@ -145,15 +145,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 # ifdef __CONTEXT_ENABLE_VRS
 #  ifdef PIC
 	mflr    r8
-#   ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-#   else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-#   endif
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
index d379a2dd12..af946119aa 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
@@ -1,3 +1,4 @@
 # Make sure this comes before the powerpc/powerpc32/fpu that's
 # listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
+powerpc/powerpc32/power7/fpu
 powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
index 127c9e4581..f304090868 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
@@ -73,15 +73,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 
 #ifdef PIC
 	mflr    r8
-# ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-# else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-# endif
 # ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
index 89b1a61954..62efee2dce 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
@@ -146,15 +146,10 @@ ENTRY(__CONTEXT_FUNC_NAME)
 
 # ifdef PIC
 	mflr    r8
-#  ifdef HAVE_ASM_PPC_REL16
 	bcl	20,31,1f
 1:	mflr	r7
 	addis	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@ha
 	addi	r7,r7,_GLOBAL_OFFSET_TABLE_-1b@l
-#  else
-	bl      _GLOBAL_OFFSET_TABLE_@local-4
-	mflr    r7
-#  endif
 #  ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
index c46b3d42af..ca112208d1 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
@@ -1,3 +1,4 @@
 # Make sure this comes before the powerpc/powerpc64/fpu that's
 # listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
+powerpc/powerpc64/power7/fpu
 powerpc/powerpc64/power5/fpu
diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c
index fa5a8dadcb..2159a1fab1 100644
--- a/sysdeps/x86_64/fpu/fegetenv.c
+++ b/sysdeps/x86_64/fpu/fegetenv.c
@@ -28,3 +28,4 @@ fegetenv (fenv_t *envp)
   /* Success.  */
   return 0;
 }
+libm_hidden_def (fegetenv)
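
The one-line libm_hidden_def (fegetenv) addition gives internal callers a hidden alias so that calls from inside the library bind locally instead of going through the PLT. The snippet below illustrates the general hidden-alias technique with made-up names (exported_fn, internal_fn); it is not the actual expansion of the glibc macro.

#include <stdio.h>

/* Public symbol, as a shared library would export it.  */
int
exported_fn (void)
{
  return 42;
}

/* Hidden alias in the same translation unit; intra-library calls can
   bind to it directly, bypassing the PLT and symbol interposition.  */
extern __typeof (exported_fn) internal_fn
     __attribute__ ((alias ("exported_fn"), visibility ("hidden")));

int
main (void)
{
  printf ("%d %d\n", exported_fn (), internal_fn ());
  return 0;
}
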