 sysdeps/x86_64/multiarch/strchr-evex.S | 335 ++++++++++++++++++++++++++++++++
 1 file changed, 335 insertions(+), 0 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
new file mode 100644
index 0000000000..ddc86a7058
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -0,0 +1,335 @@
+/* strchr/strchrnul optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# ifdef USE_AS_WCSCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define CHAR_REG	esi
+#  define SHIFT_REG	r8d
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define CHAR_REG	sil
+#  define SHIFT_REG	ecx
+# endif
+
+# define XMMZERO	xmm16
+
+# define YMMZERO	ymm16
+# define YMM0		ymm17
+# define YMM1		ymm18
+# define YMM2		ymm19
+# define YMM3		ymm20
+# define YMM4		ymm21
+# define YMM5		ymm22
+# define YMM6		ymm23
+# define YMM7		ymm24
+# define YMM8		ymm25
+
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
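+
+/* The core test: a vector of data X holds CHAR or a null byte in a
+   given position exactly when MIN (X ^ CHAR, X) is zero there -- the
+   XOR zeroes the positions equal to CHAR, and the unsigned minimum
+   with the original data also zeroes the positions holding the null
+   terminator.  The first load is unaligned unless it would cross a
+   page boundary; up to four aligned vectors are then checked one at a
+   time, and the main loop handles 4 * VEC_SIZE bytes per iteration.
+   For strchr (but not strchrnul) the character at the reported
+   position is compared with CHAR once more so that NULL is returned
+   when the null terminator was found first.  */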
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRCHR)
+	movl	%edi, %ecx
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+
+	/* Broadcast CHAR to YMM0.	*/
+	VPBROADCAST %esi, %YMM0
+
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Check if the first unaligned VEC_SIZE-byte load would cross a
+	   page boundary: it stays within one page only if the offset
+	   within the page is at most PAGE_SIZE - VEC_SIZE.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja  L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+	   null bytes.  */
+	VMOVU	(%rdi), %YMM1
+
+	/* The XOR leaves zero only in the positions where YMM1 matches
+	   CHAR; the unsigned minimum with the original data then also
+	   zeroes the positions holding a null byte.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	ktestd	%k0, %k0
+	jz	L(more_vecs)
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.	 */
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(%rdi, %rax, 4), %rax
+# else
+	addq	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
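+	/* If the character at the match position is not CHAR, the search
+	   stopped on the null terminator, so return NULL (RDX was zeroed
+	   at entry).  */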
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 4
+L(more_vecs):
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
+L(aligned_more):
+
+	/* Check the next 4 * VEC_SIZE bytes, one VEC_SIZE at a time, since
+	   the data is only aligned to VEC_SIZE.  */
+	VMOVA	VEC_SIZE(%rdi), %YMM1
+	addq	$VEC_SIZE, %rdi
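+	/* RDI now points at the aligned vector just loaded, so the
+	   L(first_vec_x*) return paths compute the result relative to it.  */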
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VMOVA	VEC_SIZE(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	ktestd	%k0, %k0
+	jz	L(prep_loop_4x)
+
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.	 */
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.	 */
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(%rdi, %rax, 4), %rax
+# else
+	addq	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.	 */
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+	leaq	VEC_SIZE(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.	 */
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.	*/
+	andq	$-(VEC_SIZE * 4), %rdi
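+	/* RDI can move back by up to 3 * VEC_SIZE bytes here; the loop's
+	   first iteration then re-examines that many bytes which are
+	   already known to contain neither CHAR nor a null byte, so the
+	   only cost is a little redundant work.  */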
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM5
+	vpxorq	%YMM2, %YMM0, %YMM6
+	vpxorq	%YMM3, %YMM0, %YMM7
+	vpxorq	%YMM4, %YMM0, %YMM8
+
+	VPMINU	%YMM5, %YMM1, %YMM5
+	VPMINU	%YMM6, %YMM2, %YMM6
+	VPMINU	%YMM7, %YMM3, %YMM7
+	VPMINU	%YMM8, %YMM4, %YMM8
+
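+	/* Fold the four per-vector results together with unsigned minima:
+	   a position in the final result is zero iff at least one of the
+	   four data vectors holds CHAR or a null byte there.  */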
+	VPMINU	%YMM5, %YMM6, %YMM1
+	VPMINU	%YMM7, %YMM8, %YMM2
+
+	VPMINU	%YMM1, %YMM2, %YMM1
+
+	/* Each bit in K0 represents a CHAR or a null byte.  */
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+	ktestd	%k0, %k0
+	jz	L(loop_4x_vec)
+
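+	/* At least one of the four vectors contains CHAR or a null byte.
+	   YMM5-YMM8 still hold the per-vector results; test them in order
+	   to locate the first match.  */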
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+	VPCMP	$0, %YMMZERO, %YMM6, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+	VPCMP	$0, %YMMZERO, %YMM7, %k2
+	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+	VPCMP	$0, %YMMZERO, %YMM8, %k3
+
+# ifdef USE_AS_WCSCHR
+	/* NB: Each bit in K2/K3 represents a 4-byte element.  */
+	kshiftlw $8, %k3, %k1
+# else
+	kshiftlq $32, %k3, %k1
+# endif
+
+	/* Each bit in K1 represents a CHAR or a null byte in YMM3 or YMM4,
+	   counted from the start of YMM3's data at (VEC_SIZE * 2)(%rdi).  */
+	korq	%k1, %k2, %k1
+	kmovq	%k1, %rax
+
+	tzcntq  %rax, %rax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
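+	/* RDI is now rounded down to VEC_SIZE and ECX holds the original
+	   misalignment, so the aligned load below cannot cross the page.
+	   Mask bits for bytes before the start of the string are shifted
+	   out further down.  */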
+	VMOVA	(%rdi), %YMM1
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+
+# ifdef USE_AS_WCSCHR
+	/* NB: Divide the shift count by 4 since each bit in the mask
+	   (from K0) represents a 4-byte element.  */
+	movl	%ecx, %SHIFT_REG
+	sarl    $2, %SHIFT_REG
+# endif
+
+	/* Remove the leading bits.	 */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rcx, %rdi
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(%rdi, %rax, 4), %rax
+# else
+	addq	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+END (STRCHR)
+# endif