about summary refs log tree commit diff
path: root/sysdeps/x86_64
diff options
context:
space:
mode:
author: H.J. Lu <hongjiu.lu@intel.com>, 2010-01-13 07:51:48 -0800
committer: Ulrich Drepper <drepper@redhat.com>, 2010-01-13 07:51:48 -0800
commit: 5a7af22fbb661e351473bfde639beeac16fc32f7 (patch)
tree: 70b559065a13a15cf87e1907c35cda904f10a64f /sysdeps/x86_64
parent: 52e96a8092e4c0bfef5bd9748b35861bba6f6b88 (diff)
download: glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.gz
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.xz
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.zip
Unroll the loop in x86-64 SSE4.2 strlen.
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- sysdeps/x86_64/multiarch/strlen.S 60
1 files changed, 45 insertions, 15 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 509f9c9605..f9641131fa 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
 __strlen_sse42:
 	cfi_startproc
 	CALL_MCOUNT
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
+	pxor	%xmm1, %xmm1
+	movl	%edi, %ecx
 	movq	%rdi, %r8
 	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %esi
-	subq	%rdi, %rcx
-	shll	%cl, %esi
-	pmovmskb %xmm2, %edx
-	andl	%esi, %edx
-	jnz	1f
-
-2:	pcmpistri $0x08, 16(%rdi), %xmm1
-	leaq	16(%rdi), %rdi
-	jnz	2b
+	xor	%edi, %ecx
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	pxor	%xmm1, %xmm1
 
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
 	leaq	(%rdi,%rcx), %rax
 	subq	%r8, %rax
 	ret
 
-1:	subq	%r8, %rdi
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
 	bsfl	%edx, %eax
 	addq	%rdi, %rax
 	ret