From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:24:52 -0800
Subject: x86-64: Add ifunc-avx2.h functions with 256-bit EVEX

Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to
select the function optimized with 256-bit EVEX instructions using
YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW
and BMI2 since VZEROUPPER isn't needed at function exit.

For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP
is set.
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++++++++++++++++++++++++++++
 1 file changed, 381 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S

(limited to 'sysdeps/x86_64/multiarch/memchr-evex.S')

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
new file mode 100644
index 0000000000..6dd5d67b90
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -0,0 +1,381 @@
+/* memchr/wmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+#  define MEMCHR	__memchr_evex
+# endif
+
+# ifdef USE_AS_WMEMCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define SHIFT_REG	r8d
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define SHIFT_REG	ecx
+# endif
+
+# define XMMMATCH	xmm16
+# define YMMMATCH	ymm16
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# define VEC_SIZE 32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+	/* Check for zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	L(zero)
+# endif
+	movl	%edi, %ecx
+# ifdef USE_AS_WMEMCHR
+	shl	$2, %RDX_LP
+# else
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+# endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	VPBROADCAST %esi, %YMMMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  */
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(first_vec_x0_check)
+	/* Adjust length and check the end of data.  */
+	subq	$VEC_SIZE, %rdx
+	jbe	L(zero)
+# else
+	jnz	L(first_vec_x0)
+# endif
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	   bytes.  */
+	movl	%ecx, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+# endif
+	andq	$-VEC_SIZE, %rdi
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	/* Remove the leading bytes.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+# endif
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4
+L(aligned_more):
+# ifndef USE_AS_RAWMEMCHR
+        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+	   overflow.  */
+	negq	%rcx
+	addq	$VEC_SIZE, %rcx
+
+	/* Check the end of data.  */
+	subq	%rcx, %rdx
+	jbe	L(zero)
+# endif
+
+	addq	$VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+	kord	%k1, %k2, %k5
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+
+	kord	%k3, %k4, %k6
+	kortestd %k5, %k6
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_RAWMEMCHR
+	jmp	L(loop_4x_vec)
+# else
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
+
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x2_check)
+	subl	$VEC_SIZE, %edx
+	jle	L(zero)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x3_check)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	addl	$(VEC_SIZE * 2), %edx
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x0_check)
+	subl	$VEC_SIZE, %edx
+	jle	L(zero)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(first_vec_x0_check):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(first_vec_x2_check):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(first_vec_x3_check):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(%rdi, %rax, 4), %rax
+# else
+	addq	%rdi, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+# else
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+# endif
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+# else
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+# endif
+	ret
+
+END (MEMCHR)
+#endif
-- 
cgit 1.4.1