about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strchr.S
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2009-10-22 22:47:12 -0700
committerUlrich Drepper <drepper@redhat.com>2009-10-22 22:47:12 -0700
commit001659f4d59d675c48a7748594e784fb35a5678d (patch)
tree5e4beef73cbcd1100c7ca505d04c5a6305fd7041 /sysdeps/x86_64/multiarch/strchr.S
parent31ba3e7a7d0f0614122f03f4350904415d94877b (diff)
downloadglibc-001659f4d59d675c48a7748594e784fb35a5678d.tar.gz
glibc-001659f4d59d675c48a7748594e784fb35a5678d.tar.xz
glibc-001659f4d59d675c48a7748594e784fb35a5678d.zip
Implement SSE4.2 optimized strchr and strrchr.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strchr.S')
-rw-r--r--sysdeps/x86_64/multiarch/strchr.S177
1 files changed, 177 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
new file mode 100644
index 0000000000..a77cc1c844
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -0,0 +1,177 @@
+/* strchr with SSE4.2
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(strchr)
+	.type	strchr, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strchr_sse2(%rip), %rax
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	__strchr_sse42(%rip), %rax
+2:	ret
+END(strchr)
+
+
+/*
+   This implementation uses SSE4 instructions to compare up to 16 bytes
+   at a time looking for the first occurrence of the character c in the
+   string s:
+
+   char *strchr (const char *s, int c);
+
+   We use 0xa:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_EACH
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   C C C C C C C C C C C C C C C C
+
+   to find out if the first 16byte data element has a byte C and the
+   offset of the first byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte C at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte C.
+   3. The first 16byte data element is valid and doesn't have the byte C.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  1	  0
+    3		16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
+
+	.section .text.sse4.2,"ax",@progbits
+	.align 	16
+	.type	__strchr_sse42, @function
+__strchr_sse42:
+	cfi_startproc
+	CALL_MCOUNT
+	testb	%sil, %sil
+	je	__strend_sse4
+	pxor	%xmm2, %xmm2
+	movd	%esi, %xmm1
+	movl	%edi, %ecx
+	andl	$15, %ecx
+	movq	%rdi, %r8
+	je	L(aligned_start)
+
+/* Handle unaligned string.  */
+	andq	$-16, %r8
+	pshufb  %xmm2, %xmm1
+	movdqa	(%r8), %xmm0
+	pcmpeqb	 %xmm0, %xmm2
+	pcmpeqb	 %xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %esi
+	/* Remove the leading  bytes.  */
+	sarl	%cl, %edx
+	sarl	%cl, %esi
+	testl	%esi, %esi
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsfl	%esi, %eax
+	/* Is there a NULL? */
+	testl	%edx, %edx
+	je      L(unaligned_match)
+	bsfl	%edx, %esi
+	cmpl	%esi, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	testl	%edx, %edx
+	jne	L(return_null)
+
+/* Loop start on aligned string.  */
+L(loop):
+	addq	$16, %r8
+L(aligned_start):
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe 	L(wrap)
+	addq	$16, %r8
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe 	L(wrap)
+	addq 	$16, %r8
+        pcmpistri       $0x2, (%r8), %xmm1
+  	jbe     L(wrap)
+	addq	$16, %r8
+	pcmpistri 	$0x2, (%r8), %xmm1
+	jbe 	L(wrap)
+	jmp	L(loop)
+L(wrap):
+	jc	L(loop_exit)
+
+/* Return NULL.  */
+L(return_null):
+	xorl	%eax, %eax
+	ret
+
+/* Loop exit.  */
+	.p2align 4
+L(loop_exit):
+	leaq	(%r8,%rcx), %rax
+	ret
+	cfi_endproc
+	.size	__strchr_sse42, .-__strchr_sse42
+
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_sse2, @function; \
+	.align 16; \
+	__strchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strchr calls through a PLT.
+   The speedup we get from using SSE4.2 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_sse2
+#endif
+
+#include "../strchr.S"