Diffstat (limited to 'sysdeps/x86_64/multiarch/strncpy-avx2.S')
 sysdeps/x86_64/multiarch/strncpy-avx2.S | 740 +++++++++++++++++++++++++++++++-
 1 file changed, 734 insertions(+), 6 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
index ce634e94fa..e9afd8fbed 100644
--- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_avx2
-#endif
+/* strncpy with AVX2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return.  Very long strings are handled by
+	   running rep stos{b|l} to zero-fill the destination (which
+	   will almost certainly segfault); if that succeeds then
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) finishes the
+	   copy.  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx
+	movq	%rdx, %rax
+	/* Bit 56 and above are past the end of the max supported
+	   address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	decq	%rdx
+	/* `dec` can macro-fuse with `jl`.  If the branch needs to
+	   become `jb`, replace `dec` with `sub` (which sets CF).  */
+	jl	L(zero_len)
+# endif
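+
+	/* From here on rdx is the buffer length minus CHAR_SIZE, in
+	   bytes (the wide-char length was scaled to bytes above).  */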
+
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
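+	/* If loading VEC_SIZE bytes from src would cross a page
+	   boundary, take the cold page-cross path which loads from a
+	   VEC_SIZE-aligned address instead.  */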
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* If not STPCPY, just save the return value (dst) ahead of
+	   time.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# elif defined USE_AS_WCSCPY
+	/* Clear rax to break the dependency, as nearly all return paths
+	   for wcpncpy use `setc %al`.  */
+	xorl	%eax, %eax
+# endif
+
+	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
+	/* `jbe` because rdx is now length - CHAR_SIZE.  */
+	jbe	L(less_1x_vec)
+
+	/* This may store past the string end, but that's fine because
+	   we still need to zero fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+	testl	%ecx, %ecx
+	jnz	L(zfill)
+
+	/* Align.  */
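+	/* Turn rdx into an end pointer (src + len - CHAR_SIZE), bias
+	   rdi by -src so the offset can be re-applied after aligning,
+	   and advance rsi to the next VEC_SIZE boundary (the first VEC
+	   was already copied unaligned above).  */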
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	orq	$(VEC_SIZE - 1), %rsi
+	incq	%rsi
+L(last_4x_vec):
+	addq	%rsi, %rdi
+L(loop_last_4x_vec):
+	subq	%rsi, %rdx
+
+
+	VMOVA	0(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(more_2x_vec)
+
+	cmpl	$(VEC_SIZE), %edx
+	jb	L(ret_vec_x1_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+
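+	/* The first vector has no null.  Store it, test the second
+	   vector and shift its zero mask up by VEC_SIZE so the tzcntq
+	   below yields the null offset relative to rsi (64 if there is
+	   no null in either vector).  */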
+	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(1), (%rdi)
+	vpmovmskb %VMM(6), %ecx
+	shlq	$VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+	tzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 6
+L(ret_vec_x1):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	subl	%ecx, %edx
+	/* Check if we need to reload/store.  */
+	cmpl	$VEC_SIZE, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Otherwise safe to just store directly.  */
+	VMOVU	%VMM(1), (%rdi)
+	VMOVU	%VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 12
+L(more_2x_vec):
+	VMOVU	%VMM(1), (%rdi)
+	testl	%ecx, %ecx
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	VEC_SIZE(%rsi), %VMM(2)
+	VMOVU	%VMM(2), VEC_SIZE(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
+	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	/* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is
+	   len - CHAR_SIZE.  */
+	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(VEC_SIZE * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	vpmovmskb %VMM(6), %ecx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+	addl	$(VEC_SIZE * 1), %edx
+	tzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* clear flags.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsfl	%ecx, %ecx
+	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_4x_vec):
+
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
+	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec4)
+
+	movq	%rdx, %rcx
+	addq	%rsi, %rdx
+	subq	%rsi, %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	/* Recheck length before aligning.  */
+	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+	jbe	L(last_4x_vec)
+
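+	/* Round rsi down to a 4 * VEC_SIZE boundary for the main loop.
+	   The dst - src bias in rdi is re-applied once the first 4x
+	   block has been checked below.  */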
+	andq	$(VEC_SIZE * -4), %rsi
+
+	/* Do first half of loop ahead of time so loop can just start by
+	   storing.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
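+	/* An element of the folded minimum is zero iff at least one of
+	   the four vectors has a null at that position, so a single
+	   compare + movmsk detects a null anywhere in the 4x VEC
+	   block.  */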
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 as end register.  */
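+	/* r9 = end pointer - (VEC_SIZE * 4 - CHAR_SIZE).  Once rsi
+	   reaches it, at most 4x VEC bytes remain and the loop exits to
+	   L(loop_last_4x_vec).  */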
+	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-avx2.S"
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	subq	%rsi, %rdx
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	testl	%ecx, %ecx
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+	movl	%r8d, %ecx
+
+	/* The null is in the fourth vector, so the combined mask in
+	   r8d is its zero mask; fall through to zero-fill.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
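+	/* The mask is for the second vector of its pair, so shift it up
+	   by VEC_SIZE before the bsf below computes the null offset
+	   from rdi.  */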
+	shlq	$VEC_SIZE, %rcx
+L(zfill):
+	bsfq	%rcx, %rcx
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
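+	/* rdi now points at the null terminator in dst and rdx is the
+	   number of bytes that still need zero-filling after it.  */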
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_from_page_cross):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
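+	/* Zero-fill with two potentially overlapping unaligned stores:
+	   one starting just past the null terminator and one ending at
+	   the last byte of the buffer.  */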
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jae	L(zfill_more_2x_vec)
+L(zfill_done0):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(zfill_vec3):
+	addq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfl	%ecx, %ecx
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	/* zfill from vec1/vec3 must set at least 2x VECs.  */
+
+	VMOVU	%VZERO, CHAR_SIZE(%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+	jbe	L(zfill_done)
+
+	addq	%rdi, %rdx
+	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	andq	$-(VEC_SIZE), %rdi
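+	/* The 4x VEC just past the null terminator and the last 4x VEC
+	   of the buffer are already zeroed by the unaligned stores
+	   above, so loop over the aligned middle 4x VEC at a time.  */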
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 8
+L(copy_1x):
+	VMOVU	%VMM(0), (%rdi)
+	testl	%ecx, %ecx
+	jz	L(ret_32_32)
+L(zfill_less_vec):
+	bsfl	%ecx, %ecx
+L(zfill_less_vec_no_bsf):
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+	COND_VZEROUPPER
+	/* We are taking advantage of the fact that to get here the null
+	   terminator must already have been stored at (%rdi), so we
+	   have leeway to overwrite it when zero-filling.  */
+	cmpl	$16, %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+	leaq	CHAR_SIZE(%rdi, %rdx), %rax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+	vmovq	%xmm0, (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+	.p2align 4,, 8
+L(zfill_less_16):
+	xorl	%ecx, %ecx
+	cmpl	$8, %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	/* Reuse the flags from the `cmp $(VEC_SIZE - CHAR_SIZE), %rdx`
+	   above.  The idea is that many buffer sizes are conventionally
+	   aligned to VEC_SIZE.  */
+	je	L(copy_1x)
+
+	tzcntl	%ecx, %ecx
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+
+	COND_VZEROUPPER
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, (%rdi, %rdx)
+
+#  ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx
+L(ret_8_15):
+	setc	%al
+	addq	%rdx, %rdi
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx)
+	ret
+
+# else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%xmm0, (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#  endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#  ifdef USE_AS_STPCPY
+	ret
+#  endif
+
+L(ret_4_7):
+#  ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%xmm0, %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#  endif
+
+L(copy_1):
+#  ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#  endif
+#  ifdef USE_AS_WCSCPY
+	vmovd	%xmm0, (%rdi)
+#  else
+	movb	%r8b, (%rdi, %rdx)
+#  endif
+	ret
+# endif
+
+	.p2align 4,, 2
+L(zero_len):
+	movq	%rdi, %rax
+	ret
+# ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#  ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#  endif
+	movw	$0, -1(%rdi, %rdx)
+	ret
+# endif
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+
+	VPCMPEQ	(%rax), %VZERO, %VMM(6)
+
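+	/* The load above started at the VEC_SIZE-aligned address at or
+	   below src.  Shift the mask right by src's offset within the
+	   vector (shrxl masks the count to 5 bits) so that bit 0
+	   corresponds to the byte at src.  */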
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
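+	/* eax = number of bytes from src up to the next VEC_SIZE
+	   boundary.  If the buffer ends before it, use the small
+	   page-cross path.  */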
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* Shift so that the bsf below yields the number of bytes to
+	   copy including the null terminator.  If the shifted mask is
+	   zero there is no null before the end of the vector, so
+	   continue on the normal path.  */
+	shl	$CHAR_SIZE, %ecx
+	jz	L(page_cross_continue)
+	bsf	%ecx, %ecx
+
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	rep	movsb
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcntl	%ecx, %ecx
+	xorl	%eax, %eax
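+	/* eax = 0 serves both as the rep stosb fill byte and as the
+	   base for the setc/adc return-value computation below.  */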
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9
+	movl	%ecx, %r8d
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
+	movl	%edx, %ecx
+	rep	stosb
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
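+	/* Copy rdx + CHAR_SIZE bytes: the full buffer length if no null
+	   was found within it, otherwise the string including its null
+	   terminator.  */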
+	leal	CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	setc	%al
+	addq	%rdi, %rdx
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	rep	movsb
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx
+	xorl	%eax, %eax
+	movq	%rdi, %r8
+	/* The length is >= 2^63.  We fully expect to segfault at rep
+	   stos.  If that doesn't happen then just strcpy to finish.  */
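+	/* rep stos{b|l} takes the count in rcx and the fill value in
+	   eax (zeroed above); r8 preserves dst across it.  */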
+# ifdef USE_AS_WCSCPY
+	rep	stosl
+# else
+	rep	stosb
+# endif
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif