From 26b2478322db94edc9e0e8f577b2f71d291e5acb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 14 Apr 2022 11:47:40 -0500
Subject: x86: Reduce code size of mem{move|pcpy|cpy}-ssse3

The goal is to remove most SSSE3 function as SSE4, AVX2, and EVEX are
generally preferable. memcpy/memmove is one exception where avoiding
unaligned loads with `palignr` is important for some targets.

This commit replaces memmove-ssse3 with a better optimized are lower
code footprint verion. As well it aliases memcpy to memmove.

Aside from this function all other SSSE3 functions should be safe to
remove.

The performance is not changed drastically although shows overall
improvements without any major regressions or gains.

bench-memcpy geometric_mean(N=50) New / Original: 0.957

bench-memcpy-random geometric_mean(N=50) New / Original: 0.912

bench-memcpy-large geometric_mean(N=50) New / Original: 0.892

Benchmarks where run on Zhaoxin KX-6840@2000MHz See attached numbers
for all results.

More important this saves 7246 bytes of code size in memmove an
additional 10741 bytes by reusing memmove code for memcpy (total 17987
bytes saves). As well an additional 896 bytes of rodata for the jump
table entries.
---
 sysdeps/x86_64/multiarch/memmove-ssse3.S | 384 ++++++++++++++++++++++++++++++-
 1 file changed, 380 insertions(+), 4 deletions(-)

(limited to 'sysdeps/x86_64/multiarch/memmove-ssse3.S')

diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..215583e7bd 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE	__memmove_ssse3
+# define MEMMOVE_CHK	__memmove_chk_ssse3
+# define MEMCPY	__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+	.section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+	jmp	L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+	movq	%rdi, %rax
+L(start):
+	cmpq	$16, %rdx
+	jb	L(copy_0_15)
+
+	/* These loads are always useful.  */
+	movups	0(%rsi), %xmm0
+	movups	-16(%rsi, %rdx), %xmm7
+	cmpq	$32, %rdx
+	ja	L(more_2x_vec)
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm7, -16(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_15):
+	cmpl	$4, %edx
+	jb	L(copy_0_3)
+	cmpl	$8, %edx
+	jb	L(copy_4_7)
+	movq	0(%rsi), %rcx
+	movq	-8(%rsi, %rdx), %rsi
+	movq	%rcx, 0(%rdi)
+	movq	%rsi, -8(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_4_7):
+	movl	0(%rsi), %ecx
+	movl	-4(%rsi, %rdx), %esi
+	movl	%ecx, 0(%rdi)
+	movl	%esi, -4(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_3):
+	decl	%edx
+	jl	L(copy_0_0)
+	movb	(%rsi), %cl
+	je	L(copy_1_1)
+
+	movzwl	-1(%rsi, %rdx), %esi
+	movw	%si, -1(%rdi, %rdx)
+L(copy_1_1):
+	movb	%cl, (%rdi)
+L(copy_0_0):
+	ret
+
+	.p2align 4,, 4
+L(copy_4x_vec):
+	movups	16(%rsi), %xmm1
+	movups	-32(%rsi, %rdx), %xmm2
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm1, 16(%rdi)
+	movups	%xmm2, -32(%rdi, %rdx)
+	movups	%xmm7, -16(%rdi, %rdx)
+L(nop):
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	cmpq	$64, %rdx
+	jbe	L(copy_4x_vec)
+
+	/* We use rcx later to get alignr value.  */
+	movq	%rdi, %rcx
+
+	/* Backward copy for overlap + dst > src for memmove safety.  */
+	subq	%rsi, %rcx
+	cmpq	%rdx, %rcx
+	jb	L(copy_backward)
+
+	/* Load tail.  */
+
+	/* -16(%rsi, %rdx) already loaded into xmm7.  */
+	movups	-32(%rsi, %rdx), %xmm8
+	movups	-48(%rsi, %rdx), %xmm9
+
+	/* Get misalignment.  */
+	andl	$0xf, %ecx
+
+	movq	%rsi, %r9
+	addq	%rcx, %rsi
+	andq	$-16, %rsi
+	/* Get first vec for `palignr`.  */
+	movaps	(%rsi), %xmm1
+
+	/* We have loaded (%rsi) so safe to do this store before the
+	   loop.  */
+	movups	%xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+#endif
+	ja	L(large_memcpy)
+
+	leaq	-64(%rdi, %rdx), %r8
+	andq	$-16, %rdi
+	movl	$48, %edx
+
+	leaq	L(loop_fwd_start)(%rip), %r9
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(copy_backward):
+	testq	%rcx, %rcx
+	jz	L(nop)
+
+	/* Preload tail.  */
+
+	/* (%rsi) already loaded into xmm0.  */
+	movups	16(%rsi), %xmm4
+	movups	32(%rsi), %xmm5
+
+	movq	%rdi, %r8
+	subq	%rdi, %rsi
+	leaq	-49(%rdi, %rdx), %rdi
+	andq	$-16, %rdi
+	addq	%rdi, %rsi
+	andq	$-16, %rsi
+
+	movaps	48(%rsi), %xmm6
+
+
+	leaq	L(loop_bkwd_start)(%rip), %r9
+	andl	$0xf, %ecx
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(large_memcpy):
+	movups	-64(%r9, %rdx), %xmm10
+	movups	-80(%r9, %rdx), %xmm11
+
+	sall	$5, %ecx
+	leal	(%rcx, %rcx, 2), %r8d
+	leaq	-96(%rdi, %rdx), %rcx
+	andq	$-16, %rdi
+	leaq	L(large_loop_fwd_start)(%rip), %rdx
+	addq	%r8, %rdx
+	jmp	* %rdx
+
+
+	/* Instead of a typical jump table all 16 loops are exactly
+	   64-bytes in size. So, we can just jump to first loop + r8 *
+	   64. Before modifying any loop ensure all their sizes match!
+	 */
+	.p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	addq	%rdx, %rdi
+	addq	%rdx, %rsi
+	cmpq	%rdi, %r8
+	ja	L(loop_fwd_0x0)
+L(end_loop_fwd):
+	movups	%xmm9, 16(%r8)
+	movups	%xmm8, 32(%r8)
+	movups	%xmm7, 48(%r8)
+	ret
+
+	/* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_FWD(align_by);	\
+	.p2align 6;	\
+L(loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	%xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm4, %xmm1;	\
+	movaps	%xmm0, 16(%rdi);	\
+	movaps	%xmm2, 32(%rdi);	\
+	movaps	%xmm3, 48(%rdi);	\
+	addq	%rdx, %rdi;	\
+	addq	%rdx, %rsi;	\
+	cmpq	%rdi, %r8;	\
+	ja	L(loop_fwd_ ## align_by);	\
+	jmp	L(end_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_FWD (0xf)
+	ALIGNED_LOOP_FWD (0xe)
+	ALIGNED_LOOP_FWD (0xd)
+	ALIGNED_LOOP_FWD (0xc)
+	ALIGNED_LOOP_FWD (0xb)
+	ALIGNED_LOOP_FWD (0xa)
+	ALIGNED_LOOP_FWD (0x9)
+	ALIGNED_LOOP_FWD (0x8)
+	ALIGNED_LOOP_FWD (0x7)
+	ALIGNED_LOOP_FWD (0x6)
+	ALIGNED_LOOP_FWD (0x5)
+	ALIGNED_LOOP_FWD (0x4)
+	ALIGNED_LOOP_FWD (0x3)
+	ALIGNED_LOOP_FWD (0x2)
+	ALIGNED_LOOP_FWD (0x1)
+
+	.p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	64(%rsi), %xmm4
+	movaps	80(%rsi), %xmm5
+	movntps	%xmm1, 16(%rdi)
+	movntps	%xmm2, 32(%rdi)
+	movntps	%xmm3, 48(%rdi)
+	movntps	%xmm4, 64(%rdi)
+	movntps	%xmm5, 80(%rdi)
+	addq	$80, %rdi
+	addq	$80, %rsi
+	cmpq	%rdi, %rcx
+	ja	L(large_loop_fwd_0x0)
+
+	/* Ensure no icache line split on tail.  */
+	.p2align 4
+L(end_large_loop_fwd):
+	sfence
+	movups	%xmm11, 16(%rcx)
+	movups	%xmm10, 32(%rcx)
+	movups	%xmm9, 48(%rcx)
+	movups	%xmm8, 64(%rcx)
+	movups	%xmm7, 80(%rcx)
+	ret
+
+
+	/* Size > 64 bytes and <= 96 bytes. 32-byte align between ensure
+	   96-byte spacing between each.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by);	\
+	.p2align 5;	\
+L(large_loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	64(%rsi), %xmm4;	\
+	movaps	80(%rsi), %xmm5;	\
+	movaps	%xmm5, %xmm6;	\
+	palignr	$align_by, %xmm4, %xmm5;	\
+	palignr	$align_by, %xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm6, %xmm1;	\
+	movntps	%xmm0, 16(%rdi);	\
+	movntps	%xmm2, 32(%rdi);	\
+	movntps	%xmm3, 48(%rdi);	\
+	movntps	%xmm4, 64(%rdi);	\
+	movntps	%xmm5, 80(%rdi);	\
+	addq	$80, %rdi;	\
+	addq	$80, %rsi;	\
+	cmpq	%rdi, %rcx;	\
+	ja	L(large_loop_fwd_ ## align_by);	\
+	jmp	L(end_large_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LARGE_LOOP_FWD (0xf)
+	ALIGNED_LARGE_LOOP_FWD (0xe)
+	ALIGNED_LARGE_LOOP_FWD (0xd)
+	ALIGNED_LARGE_LOOP_FWD (0xc)
+	ALIGNED_LARGE_LOOP_FWD (0xb)
+	ALIGNED_LARGE_LOOP_FWD (0xa)
+	ALIGNED_LARGE_LOOP_FWD (0x9)
+	ALIGNED_LARGE_LOOP_FWD (0x8)
+	ALIGNED_LARGE_LOOP_FWD (0x7)
+	ALIGNED_LARGE_LOOP_FWD (0x6)
+	ALIGNED_LARGE_LOOP_FWD (0x5)
+	ALIGNED_LARGE_LOOP_FWD (0x4)
+	ALIGNED_LARGE_LOOP_FWD (0x3)
+	ALIGNED_LARGE_LOOP_FWD (0x2)
+	ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+	.p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+	movaps	32(%rsi), %xmm1
+	movaps	16(%rsi), %xmm2
+	movaps	0(%rsi), %xmm3
+	movaps	%xmm1, 32(%rdi)
+	movaps	%xmm2, 16(%rdi)
+	movaps	%xmm3, 0(%rdi)
+	subq	$48, %rdi
+	subq	$48, %rsi
+	cmpq	%rdi, %r8
+	jb	L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+	movups	%xmm7, -16(%r8, %rdx)
+	movups	%xmm0, 0(%r8)
+	movups	%xmm4, 16(%r8)
+	movups	%xmm5, 32(%r8)
+
+	ret
+
+
+	/* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by);	\
+	.p2align 6;	\
+L(loop_bkwd_ ## align_by):	\
+	movaps	32(%rsi), %xmm1;	\
+	movaps	16(%rsi), %xmm2;	\
+	movaps	0(%rsi), %xmm3;	\
+	palignr	$align_by, %xmm1, %xmm6;	\
+	palignr	$align_by, %xmm2, %xmm1;	\
+	palignr	$align_by, %xmm3, %xmm2;	\
+	movaps	%xmm6, 32(%rdi);	\
+	movaps	%xmm1, 16(%rdi);	\
+	movaps	%xmm2, 0(%rdi);	\
+	subq	$48, %rdi;	\
+	subq	$48, %rsi;	\
+	movaps	%xmm3, %xmm6;	\
+	cmpq	%rdi, %r8;	\
+	jb	L(loop_bkwd_ ## align_by);	\
+	jmp	L(end_loop_bkwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_BKWD (0xf)
+	ALIGNED_LOOP_BKWD (0xe)
+	ALIGNED_LOOP_BKWD (0xd)
+	ALIGNED_LOOP_BKWD (0xc)
+	ALIGNED_LOOP_BKWD (0xb)
+	ALIGNED_LOOP_BKWD (0xa)
+	ALIGNED_LOOP_BKWD (0x9)
+	ALIGNED_LOOP_BKWD (0x8)
+	ALIGNED_LOOP_BKWD (0x7)
+	ALIGNED_LOOP_BKWD (0x6)
+	ALIGNED_LOOP_BKWD (0x5)
+	ALIGNED_LOOP_BKWD (0x4)
+	ALIGNED_LOOP_BKWD (0x3)
+	ALIGNED_LOOP_BKWD (0x2)
+	ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
-- 
cgit 1.4.1