summary refs log tree commit diff
diff options
context:
space:
mode:
author: H.J. Lu <hongjiu.lu@intel.com>, 2010-01-13 07:51:48 -0800
committer: Ulrich Drepper <drepper@redhat.com>, 2010-01-13 07:51:48 -0800
commit: 5a7af22fbb661e351473bfde639beeac16fc32f7 (patch)
tree: 70b559065a13a15cf87e1907c35cda904f10a64f
parent: 52e96a8092e4c0bfef5bd9748b35861bba6f6b88 (diff)
download: glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.gz
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.xz
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.zip
Unroll the loop x86-64 SSE4.2 strlen.
-rw-r--r--  ChangeLog  4
-rw-r--r--  sysdeps/x86_64/multiarch/strlen.S  60
2 files changed, 49 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 736bd5d778..9080030d1e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2010-01-12  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/strlen.S: Unroll the loop.
+
 2010-01-13  Ulrich Drepper  <drepper@redhat.com>
 
 	* stdlib/stdlib.h: Be a bit more relaxed about obsoleted mktemp symbol.
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 509f9c9605..f9641131fa 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
 __strlen_sse42:
 	cfi_startproc
 	CALL_MCOUNT
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
+	pxor	%xmm1, %xmm1
+	movl	%edi, %ecx
 	movq	%rdi, %r8
 	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %esi
-	subq	%rdi, %rcx
-	shll	%cl, %esi
-	pmovmskb %xmm2, %edx
-	andl	%esi, %edx
-	jnz	1f
-
-2:	pcmpistri $0x08, 16(%rdi), %xmm1
-	leaq	16(%rdi), %rdi
-	jnz	2b
+	xor	%edi, %ecx
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	pxor	%xmm1, %xmm1
 
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
 	leaq	(%rdi,%rcx), %rax
 	subq	%r8, %rax
 	ret
 
-1:	subq	%r8, %rdi
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
 	bsfl	%edx, %eax
 	addq	%rdi, %rax
 	ret