about summary refs log tree commit diff
path: root/sysdeps/i386/i686/multiarch/strcmp-sse4.S
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2010-02-15 11:17:50 -0800
committerUlrich Drepper <drepper@redhat.com>2010-02-15 11:17:50 -0800
commit904057bc17fb3e3127a35ebf35fcac8d5bc8269b (patch)
treede5ec58dcca85fcc063a43a92e0d1f957eecebdb /sysdeps/i386/i686/multiarch/strcmp-sse4.S
parent0ab85ce4298875d0dce8bfd4fe2cecd9cda840e3 (diff)
downloadglibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.tar.gz
glibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.tar.xz
glibc-904057bc17fb3e3127a35ebf35fcac8d5bc8269b.zip
32bit memcmp/strcmp/strncmp optimized for SSSE3/SSE4.2
Diffstat (limited to 'sysdeps/i386/i686/multiarch/strcmp-sse4.S')
-rw-r--r--sysdeps/i386/i686/multiarch/strcmp-sse4.S378
1 file changed, 378 insertions, 0 deletions
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..977647203f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,378 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifndef USE_AS_STRNCMP
+# ifndef STRCMP
+#  define STRCMP	__strcmp_sse4_2
+# endif
+# define STR1		4
+# define STR2		STR1+4
+#else
+# ifndef STRCMP
+#  define STRCMP	__strncmp_sse4_2
+# endif
+# define STR1		8
+# define STR2		STR1+4
+# define CNT		STR2+4
+#endif
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (STRCMP)
+#ifdef USE_AS_STRNCMP
+	PUSH	(%ebp)
+#endif
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %eax
+#ifdef USE_AS_STRNCMP
+	movl	CNT(%esp), %ebp
+	test	%ebp, %ebp
+	je	L(eq)
+#endif
+	mov	%dx, %cx
+	and	$0xfff, %cx
+	cmp	$0xff0, %cx
+	ja	L(first4bytes)
+	movdqu	(%edx), %xmm2
+	mov	%eax, %ecx
+	and	$0xfff, %ecx
+	cmp	$0xff0, %ecx
+	ja	L(first4bytes)
+	movd	%xmm2, %ecx
+	cmp	(%eax), %ecx
+	jne	L(less4bytes)
+	movdqu	(%eax), %xmm1
+	pxor	%xmm2, %xmm1
+	pxor	%xmm0, %xmm0
+	ptest	%xmm1, %xmm0
+	jnc	L(less16bytes)
+	pcmpeqb	%xmm0, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(eq)
+#endif
+	add	$16, %edx
+	add	$16, %eax
+L(first4bytes):
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	je	L(eq)
+#endif
+	add	$8, %eax
+	add	$8, %edx
+
+	PUSH	(%ebx)
+	PUSH	(%edi)
+	PUSH	(%esi)
+	mov	%edx, %edi
+	mov	%eax, %esi
+	xorl	%eax, %eax
+L(check_offset):
+	movl	%edi, %ebx
+	movl	%esi, %ecx
+	andl	$0xfff, %ebx
+	andl	$0xfff, %ecx
+	cmpl	%ebx, %ecx
+	cmovl	%ebx, %ecx
+	lea	-0xff0(%ecx), %edx
+	sub	%edx, %edi
+	sub	%edx, %esi
+	testl	%edx, %edx
+	jg	L(crosspage)
+L(loop):
+	movdqu	(%esi,%edx), %xmm2
+	movdqu	(%edi,%edx), %xmm1
+	pcmpistri	$0x1a, %xmm2, %xmm1
+	jbe	L(end)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %ebp
+	jbe	L(more16byteseq)
+#endif
+
+	add	$16, %edx
+	jle	L(loop)
+L(crosspage):
+	movzbl	(%edi,%edx), %eax
+	movzbl	(%esi,%edx), %ebx
+	subl	%ebx, %eax
+	jne	L(ret)
+	testl	%ebx, %ebx
+	je	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	$1, %ebp
+	jbe	L(more16byteseq)
+#endif
+	inc	%edx
+	cmp	$15, %edx
+	jle	L(crosspage)
+	add	$16, %edi
+	add	$16, %esi
+	jmp	L(check_offset)
+
+L(end):
+	jnc	L(ret)
+#ifdef USE_AS_STRNCMP
+	sub	%ecx, %ebp
+	jbe	L(more16byteseq)
+#endif
+	lea	(%ecx,%edx), %ebx
+	movzbl	(%edi,%ebx), %eax
+	movzbl	(%esi,%ebx), %ecx
+	subl	%ecx, %eax
+L(ret):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+
+#ifdef USE_AS_STRNCMP
+L(more16byteseq):
+	POP	(%esi)
+	POP	(%edi)
+	POP	(%ebx)
+#endif
+L(eq):
+	xorl	%eax, %eax
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+L(neq):
+	mov	$1, %eax
+	ja	L(neq_bigger)
+	neg	%eax
+L(neq_bigger):
+#ifdef USE_AS_STRNCMP
+	POP	(%ebp)
+#endif
+	ret
+	.p2align 4
+L(less16bytes):
+	add	$0xfefefeff, %ecx
+	jnc	L(less4bytes)
+	xor	(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(less4bytes)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	jbe	L(eq)
+#endif
+	mov	4(%edx), %ecx
+	cmp	4(%eax), %ecx
+	jne	L(more4bytes)
+	add	$0xfefefeff, %ecx
+	jnc	L(more4bytes)
+	xor	4(%edx), %ecx
+	or	$0xfefefeff, %ecx
+	add	$1, %ecx
+	jnz	L(more4bytes)
+
+#ifdef USE_AS_STRNCMP
+	sub	$8, %ebp
+	jbe	L(eq)
+#endif
+
+	add	$8, %edx
+	add	$8, %eax
+L(less4bytes):
+
+	movzbl	(%eax), %ecx
+	cmpb	%cl, (%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %ebp
+	je	L(eq)
+#endif
+	movzbl	1(%eax), %ecx
+	cmpb	%cl, 1(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %ebp
+	je	L(eq)
+#endif
+
+	movzbl	2(%eax), %ecx
+	cmpb	%cl, 2(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %ebp
+	je	L(eq)
+#endif
+	movzbl	3(%eax), %ecx
+	cmpb	%cl, 3(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+L(more4bytes):
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %ebp
+	je	L(eq)
+#endif
+	movzbl	4(%eax), %ecx
+	cmpb	%cl, 4(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %ebp
+	je	L(eq)
+#endif
+	movzbl	5(%eax), %ecx
+	cmpb	%cl, 5(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %ebp
+	je	L(eq)
+#endif
+	movzbl	6(%eax), %ecx
+	cmpb	%cl, 6(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %ebp
+	je	L(eq)
+#endif
+	movzbl	7(%eax), %ecx
+	cmpb	%cl, 7(%edx)
+	jne	L(neq)
+	cmpl	$0, %ecx
+	je	L(eq)
+
+END (STRCMP)
+
+#endif