about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strlen-sse4.S
diff options
context:
space:
mode:
author H.J. Lu <hongjiu.lu@intel.com> 2010-08-26 22:09:34 -0700
committer Ulrich Drepper <drepper@redhat.com> 2010-08-26 22:09:34 -0700
commit 623aac7f84dfddee9bcf9d51f23612479cf672ec (patch)
tree 355c57e1d98cff706ead0832461b060bc24ffc7c /sysdeps/x86_64/multiarch/strlen-sse4.S
parent b416a900856ff871c06b08fa2c9c943fd86597da (diff)
download glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.gz
glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.xz
glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.zip
Unroll x86-64 strlen
Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen-sse4.S')
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-sse4.S 85
1 files changed, 85 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000000..6b16ea7fa6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,85 @@
+/* strlen with SSE4
+   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)	/* In: %rdi = string start.  Out: %rax = byte count before the first NUL.  */
+	pxor	%xmm1, %xmm1	/* xmm1 = 0, the NUL pattern to compare against.  */
+	movl	%edi, %ecx	/* ecx = low 32 bits of the start pointer.  */
+	movq	%rdi, %r8	/* r8 = start pointer, kept for the final subtraction.  */
+	andq	$~15, %rdi	/* Round rdi down to a 16-byte boundary.  */
+	xor	%edi, %ecx	/* ecx = start & 15: bytes in this block that precede the string.  */
+	pcmpeqb	(%rdi), %xmm1	/* Each xmm1 byte = 0xff where the aligned block holds a NUL.  */
+	pmovmskb %xmm1, %edx	/* edx = one mask bit per byte of the block.  */
+	shrl	%cl, %edx	/* Discard mask bits for bytes before the string start ...  */
+	shll	%cl, %edx	/* ... and put the rest back at block-relative positions.  */
+	andl	%edx, %edx	/* Set ZF from edx; a shift by a zero count leaves the flags untouched.  */
+	jnz	L(less16bytes)	/* NUL found inside the first aligned block.  */
+	pxor	%xmm1, %xmm1	/* xmm1 = 0 again; pcmpeqb overwrote it with the compare result.  */
+
+	.p2align 4
+L(more64bytes_loop):	/* Scan 64 bytes per iteration as four 16-byte blocks.  */
+	pcmpistri $0x08, 16(%rdi), %xmm1	/* ZF = block holds a NUL; ecx = index of the first one.  */
+	jz	L(more32bytes)	/* NUL is in the block at rdi + 16.  */
+
+	pcmpistri $0x08, 32(%rdi), %xmm1	/* Same check for the block at rdi + 32.  */
+	jz	L(more48bytes)	/* NUL is in the block at rdi + 32.  */
+
+	pcmpistri $0x08, 48(%rdi), %xmm1	/* Same check for the block at rdi + 48.  */
+	jz	L(more64bytes)	/* NUL is in the block at rdi + 48.  */
+
+	add	$64, %rdi	/* Step to the next group of four blocks.  */
+	pcmpistri $0x08, (%rdi), %xmm1	/* Check the block at the new rdi; the jnz below tests this ZF, not add's.  */
+	jnz	L(more64bytes_loop)	/* No NUL yet: keep scanning.  */
+	leaq	(%rdi,%rcx), %rax	/* rax = address of the NUL (block base + index).  */
+	subq	%r8, %rax	/* rax = NUL address - start = length.  */
+	ret
+
+	.p2align 4
+L(more32bytes):	/* Reached when the block at rdi + 16 holds the NUL.  */
+	leaq	16(%rdi,%rcx, 1), %rax	/* rax = address of the NUL.  */
+	subq	%r8, %rax	/* rax = NUL address - start = length.  */
+	ret
+
+	.p2align 4
+L(more48bytes):	/* Reached when the block at rdi + 32 holds the NUL.  */
+	leaq	32(%rdi,%rcx, 1), %rax	/* rax = address of the NUL.  */
+	subq	%r8, %rax	/* rax = NUL address - start = length.  */
+	ret
+
+	.p2align 4
+L(more64bytes):	/* Reached when the block at rdi + 48 holds the NUL.  */
+	leaq	48(%rdi,%rcx, 1), %rax	/* rax = address of the NUL.  */
+	subq	%r8, %rax	/* rax = NUL address - start = length.  */
+	ret
+
+	.p2align 4
+L(less16bytes):	/* Reached with edx != 0: the NUL is in the first aligned block.  */
+	subq	%r8, %rdi	/* rdi = aligned base - start (zero or negative).  */
+	bsfl	%edx, %eax	/* eax = block-relative offset of the first NUL.  */
+	addq	%rdi, %rax	/* rax = NUL offset - misalignment = length.  */
+	ret
+
+END (__strlen_sse42)
+
+#endif