about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strlen-avx2.S
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2017-06-09 05:18:03 -0700
committerH.J. Lu <hjl.tools@gmail.com>2017-06-09 05:18:18 -0700
commitdc485ceb2ac596d27294cc1942adf3181f15e8bf (patch)
tree54a2e09bd5996b759565efbc216f6b05e2b77d1a /sysdeps/x86_64/multiarch/strlen-avx2.S
parent2f5d20ac99b9434a634629282cbb46e2a8d56a1c (diff)
downloadglibc-dc485ceb2ac596d27294cc1942adf3181f15e8bf.tar.gz
glibc-dc485ceb2ac596d27294cc1942adf3181f15e8bf.tar.xz
glibc-dc485ceb2ac596d27294cc1942adf3181f15e8bf.zip
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction.  It is as fast as SSE2 versions for
size <= 16 bytes and up to 1X faster for or size > 16 bytes on Haswell.
Select AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.

NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input.  TZCNT is faster than BSF and is executed
as BSF if machine doesn't support TZCNT.

	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
	wcslen-sse2, wcslen-avx2 and wcsnlen-avx2.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
	__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
	__wcslen_sse2 and __wcsnlen_avx2.
	* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
	* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
	* sysdeps/x86_64/multiarch/strlen.c: Likewise.
	* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
	* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
	* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
	* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
	* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
	* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
	* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
	* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New.
	(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where
	vzeroupper is preferred and AVX unaligned load is fast.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen-avx2.S')
-rw-r--r--sysdeps/x86_64/multiarch/strlen-avx2.S394
1 files changed, 394 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
new file mode 100644
index 0000000000..1dc823af0a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -0,0 +1,394 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_avx2
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+	/* Check for zero length.  */
+	testq	%rsi, %rsi
+	jz	L(zero)
+#  ifdef USE_AS_WCSLEN
+	shl	$2, %rsi
+#  endif
+	movq	%rsi, %r8
+# endif
+	movl	%edi, %ecx
+	movq	%rdi, %rdx
+	vpxor	%xmm0, %xmm0, %xmm0
+
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+# ifdef USE_AS_STRNLEN
+	jnz	L(first_vec_x0_check)
+	/* Adjust length and check the end of data.  */
+	subq	$VEC_SIZE, %rsi
+	jbe	L(max)
+# else
+	jnz	L(first_vec_x0)
+# endif
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRNLEN
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+# endif
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+	    to void possible addition overflow.  */
+	negq	%rcx
+	addq	$VEC_SIZE, %rcx
+
+	/* Check the end of data.  */
+	subq	%rcx, %rsi
+	jbe	L(max)
+# endif
+
+	addq	$VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa (%rdi), %ymm1
+	vmovdqa	VEC_SIZE(%rdi), %ymm2
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+	jmp	L(loop_4x_vec)
+# else
+	subq	$(VEC_SIZE * 4), %rsi
+	ja	L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+	addl	$(VEC_SIZE * 2), %esi
+	jle	L(last_2x_vec)
+
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x2_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x3_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	addl	$(VEC_SIZE * 2), %esi
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x0_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x0_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x3_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(max):
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	VPCMPEQ %ymm2, %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	VPCMPEQ %ymm3, %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	VPCMPEQ %ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+END (STRLEN)
+#endif