x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions

Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch.
author: Noah Goldstein <goldstein.w.n@gmail.com> 2022-11-08 17:38:38 -0800
committer: Noah Goldstein <goldstein.w.n@gmail.com> 2022-11-08 19:22:33 -0800
commit: f049f52dfeed8129c11ab1641a815705d09ff7e8 (patch)
tree: a6c13dc462411b308467b26a3a0f1062e0597bbd /sysdeps/x86_64/multiarch/strncat-evex.S
parent: d44e116428fefa0c2d01151af11f7a41fb525536 (diff)
download: glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.gz
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.xz
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip
1 files changed, 519 insertions, 6 deletions
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..bced4e8944 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT	__strncat_evex
-#endif
+/* {wcs|str}ncat  with 256/512-bit EVEX.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use evex-masked stores for small sizes (length <=
+	   CHAR_PER_VEC). Turned off at the moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY	/* wcsncat: 4-byte CHARs.  */
+#  define MOVCHAR	movl
+#  define VMOVU_MASK	vmovdqu32
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+
+#  define REP_MOVS	rep movsd
+
+#  define VMASK_REG	VR10
+#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+#  define USE_WIDE_CHAR
+# else	/* strncat: 1-byte CHARs.  */
+#  define MOVCHAR	movb
+#  define VMOVU_MASK	vmovdqu8
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+
+#  define REP_MOVS	rep movsb
+
+#  define VMASK_REG	VRCX
+#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst
+
+# endif	/* USE_AS_WCSCPY  */
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	movq	%rdi, %rax	/* Return value: rax = dst.  */
+
+	/* NB: It's safe to filter out zero-length strings WITHOUT
+	   setting null-term. Destination MUST be a null-terminated
+	   string so essentially the work is already done.  */
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shrq	$56, %rcx
+	jnz	L(zero_len)	/* n == 0 or n - 1 >= 2^56.  */
+# else
+	test	%rdx, %rdx
+	jle	L(zero_len)	/* n == 0 or n has its sign bit set.  */
+# endif
+
+# include "strcat-strlen-evex.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)	/* VEC_SIZE load from rsi would cross a page.  */
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
+	   length <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	KMOV	%k0, %VRCX
+	FIND_FIRST_ONE (VRCX, VR8)
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+
+	blsmsk	%VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+	ret
+
+L(less_1x_vec):
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+	ret
+# else
+	KMOV	%k0, %VMASK_REG
+	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+	   %VMASK_REG, %VRCX` for wcsncat.  */
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpq	%rcx, %rdx
+	jbe	L(less_1x_vec)
+
+	/* If there were no zero-CHARs (VMASK_REG was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	je	L(more_1x_vec)
+
+	movl	%ecx, %edx	/* Copy length = index of zero-CHAR.  */
+
+L(less_1x_vec):
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+
+#  ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+#  endif
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 6
+L(copy_32_63):
+	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+	.p2align 4,, 6
+L(copy_16_31):
+	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
+	   and will save code size.  */
+	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 2
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 12
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  endif
+
+# endif
+	.p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+	test	%rdx, %rdx
+# endif
+	jne	OVERFLOW_STRCAT	/* n != 0: length was filtered as huge.  */
+	ret
 
-#define USE_AS_STRNCAT
-#define STRCAT	STRNCAT
-#include "strcat-evex.S"
+	.p2align 4,, 8
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi
+	andq	$-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+
+	/* Will need this regardless.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len)
+
+	/* If there were no zero-CHARs (VMASK_REG was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x2_len)
+L(ret_vec_x2):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x2_len):
+	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	movl	%edx, %ecx
+L(ret_vec_x1):
+	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	addl	$-(CHAR_PER_VEC * 4), %edx
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VMASK_REG
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VMASK_REG
+
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(more_4x_vec)
+
+	/* Adjust length before going to L(ret_vec_x3_len) or
+	   L(ret_vec_x3).  */
+	addl	$(CHAR_PER_VEC * -2), %edx
+
+	FIND_FIRST_ONE (VMASK_REG, VRCX)
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len)
+
+	/* If there were no zero-CHARs (VMASK_REG was zero before
+	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
+	cmpl	$CHAR_PER_VEC, %ecx
+	jne	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	addl	$-CHAR_PER_VEC, %edx
+	bzhi	%VRDX, %VRCX, %VR8
+	jz	L(ret_vec_x4_len)
+L(ret_vec_x4):
+	bsf	%VRCX, %VRDX
+L(ret_vec_x4_len):
+	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 4
+L(ret_vec_x3_len):
+	movl	%edx, %ecx
+L(ret_vec_x3):
+	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+	xorl	%ecx, %ecx
+# endif
+	bsf	%VMASK_REG, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+
+	/* Check if we are near the end before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	jbe	L(last_4x_vec)
+
+
+	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
+	   filtered out huge lengths this cannot overflow.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+
+	/* Subtract rsi from rdi before aligning (add back will have
+	   correct rdi for aligned rsi).  */
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+
+	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+	   test with bsf.  */
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	bsf	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+	KMOV	%k4, %VRCX
+	bsf	%VRCX, %VRCX
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+	ret
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+	VPCMPEQ	(%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+	KMOV	%k0, %VR9
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+	shrx	%VRCX, %VR9, %VRCX
+# else
+	KMOV	%k0, %VRCX
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %r8d
+# endif
+	cmpq	%r8, %rdx
+	jbe	L(page_cross_small)
+	/* Optimizing more for space as this is very cold code. This
+	   saves 2x cache lines.  */
+
+	/* This adds one to the later result which will get correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX as
+	   to be in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.  */
+	shl	%VRCX
+	jz	L(page_cross_continue)
+	bsf	%VRCX, %VRCX
+	REP_MOVS
+	ret
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	jz	L(page_cross_setz)
+	cmpl	%edx, %ecx
+	cmova	%edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+	rep	movsd
+# else
+	rep	movsb
+# endif
+L(page_cross_setz):
+	MOVCHAR	$0, (%rdi)
+	ret
+END(STRNCAT)
+#endif
author	Noah Goldstein <goldstein.w.n@gmail.com>	2022-11-08 17:38:38 -0800
committer	Noah Goldstein <goldstein.w.n@gmail.com>	2022-11-08 19:22:33 -0800
commit	f049f52dfeed8129c11ab1641a815705d09ff7e8 (patch)
tree	a6c13dc462411b308467b26a3a0f1062e0597bbd /sysdeps/x86_64/multiarch/strncat-evex.S
parent	d44e116428fefa0c2d01151af11f7a41fb525536 (diff)
download	glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.gz glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.xz glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip