summary refs log tree commit diff
path: root/sysdeps/x86_64/strlen.S
diff options
context:
space:
mode:
author: H.J. Lu <hongjiu.lu@intel.com>, 2010-08-26 22:09:34 -0700
committer: Ulrich Drepper <drepper@redhat.com>, 2010-08-26 22:09:34 -0700
commit: 623aac7f84dfddee9bcf9d51f23612479cf672ec (patch)
tree: 355c57e1d98cff706ead0832461b060bc24ffc7c /sysdeps/x86_64/strlen.S
parent: b416a900856ff871c06b08fa2c9c943fd86597da (diff)
download: glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.gz
download: glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.xz
download: glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.zip
Unroll x86-64 strlen
Diffstat (limited to 'sysdeps/x86_64/strlen.S')
-rw-r--r-- sysdeps/x86_64/strlen.S 97
1 files changed, 76 insertions, 21 deletions
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 93aee6bef1..7880c1d5e5 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,6 +1,7 @@
 /* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2009 Free Software Foundation, Inc.
+   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
    Contributed by Ulrich Drepper <drepper@redhat.com>.
+   Modified by Intel Corporation.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -23,29 +24,83 @@
 
 	.text
 ENTRY(strlen)
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
-	movq	%rdi, %r8
-	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %esi
-	subq	%rdi, %rcx
-	shll	%cl, %esi
-	pmovmskb %xmm2, %edx
-	andl	%esi, %edx
-	jnz	1f
-
-2:	movdqa	16(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%rdi), %xmm1
 	pcmpeqb	%xmm1, %xmm0
 	pmovmskb %xmm0, %edx
-	testl	%edx, %edx
-	jz	2b
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %esi
+	sub	%rax, %rcx
+	shl	%cl, %esi
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
 
-1:	subq	%r8, %rdi
-	bsfl	%edx, %eax
-	addq	%rdi, %rax
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%rax), %rax
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	ret
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	ret
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	ret
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
 	ret
 END(strlen)
 libc_hidden_builtin_def (strlen)