Diffstat (limited to 'sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S')
-rw-r--r--	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S	553
1 file changed, 0 insertions, 553 deletions
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
deleted file mode 100644
index dee3ec529c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ /dev/null
@@ -1,553 +0,0 @@
-/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
-   Copyright (C) 2016-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* memmove/memcpy/mempcpy is implemented as:
-   1. Use overlapping load and store to avoid branches.
-   2. Load all sources into registers and store them together to avoid
-      possible address overlap between source and destination.
-   3. If size is 8 * VEC_SIZE or less, load all sources into registers
-      and store them together.
-   4. If address of destination > address of source, backward copy
-      4 * VEC_SIZE at a time with unaligned load and aligned store.
-      Load the first 4 * VEC and last VEC before the loop and store
-      them after the loop to support overlapping addresses.
-   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
-      load and aligned store.  Load the last 4 * VEC and first VEC
-      before the loop and store them after the loop to support
-      overlapping addresses.
-   6. If size >= __x86_shared_non_temporal_threshold and there is no
-      overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
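
A minimal C sketch of the strategy described above, with byte loops and a temporary buffer standing in for the 4 * VEC loops and register loads (VEC is an assumed stand-in for VEC_SIZE; this is an illustration, not the implementation):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC 32   /* assumed stand-in for VEC_SIZE */

void *
sketch_memmove (void *dstp, const void *srcp, size_t n)
{
  unsigned char *d = dstp;
  const unsigned char *s = srcp;
  uintptr_t di = (uintptr_t) d, si = (uintptr_t) s;

  if (n <= 8 * VEC)
    {
      /* Steps 1-3: read all of the source before writing anything,
         so overlapping buffers need no direction check.  */
      unsigned char tmp[8 * VEC];
      memcpy (tmp, s, n);
      memcpy (d, tmp, n);
    }
  else if (di > si && di - si < n)
    {
      /* Step 4: destination overlaps the tail of the source;
         copy backward (the real code moves 4 * VEC per iteration).  */
      for (size_t i = n; i-- > 0; )
        d[i] = s[i];
    }
  else
    {
      /* Steps 5-6: copy forward; above the non-temporal threshold and
         with no overlap the real code switches to streaming stores.  */
      for (size_t i = 0; i < n; i++)
        d[i] = s[i];
    }
  return dstp;
}
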
-
-#include <sysdep.h>
-
-#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-#  define VZEROUPPER vzeroupper
-# else
-#  define VZEROUPPER
-# endif
-#endif
-
-/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
-   up a REP MOVSB operation, REP MOVSB isn't faster on short data.  The
-   memcpy micro benchmark in glibc shows that 2KB is the approximate
-   value above which REP MOVSB becomes faster than the SSE2
-   optimization on processors with Enhanced REP MOVSB.  Since a larger
-   register size can move more data with a single load and store, the
-   threshold is higher with larger register sizes.  */
-#ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
-#endif
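
As a worked example of the default: with VEC_SIZE == 16 the threshold is 2048 * (16 / 16) = 2048 bytes, with VEC_SIZE == 32 it is 4096 bytes, and with VEC_SIZE == 64 it is 8192 bytes (the three vector sizes this file accepts; see the VEC_SIZE check under L(less_vec) below).
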
-
-#ifndef PREFETCH
-# define PREFETCH(addr) prefetcht0 addr
-#endif
-
-/* Assume 64-byte prefetch size.  */
-#ifndef PREFETCH_SIZE
-# define PREFETCH_SIZE 64
-#endif
-
-#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
-
-#if PREFETCH_SIZE == 64
-# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base)
-# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
-# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
-# else
-#   error Unsupported PREFETCHED_LOAD_SIZE!
-# endif
-#else
-# error Unsupported PREFETCH_SIZE!
-#endif
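
For example, with VEC_SIZE == 32 the prefetched block is 4 * 32 = 128 bytes, so PREFETCH_ONE_SET issues two prefetcht0 instructions, one per 64-byte line.  A rough C analogue, with GCC's __builtin_prefetch standing in for prefetcht0 (names here are illustrative, not from the file):

#include <stddef.h>

#define PREFETCH_SIZE 64          /* assumed 64-byte prefetch granularity */
#define PREFETCHED_LOAD_SIZE 128  /* 4 * VEC_SIZE with 32-byte vectors    */

/* One "set" of prefetches per loop iteration: two cache lines, walking
   in direction dir (+1 forward, -1 backward) from base + offset.  */
static inline void
prefetch_one_set (int dir, const char *base, ptrdiff_t offset)
{
  __builtin_prefetch (base + offset, 0, 3);
  __builtin_prefetch (base + offset + dir * PREFETCH_SIZE, 0, 3);
}
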
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
-	.section SECTION(.text),"ax",@progbits
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-#endif
-
-#if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-#endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
-	movq	%rdi, %rax
-L(start):
-	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-	VZEROUPPER
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-#endif
-	ret
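
The four VMOVU instructions above are the branch-free head/tail trick from step 1: both chunks are loaded before either is stored, and when size == VEC_SIZE the two chunks coincide.  A C sketch under the same assumptions (VEC standing in for VEC_SIZE, local buffers standing in for %VEC(0)/%VEC(1)):

#include <stddef.h>
#include <string.h>

#define VEC 32   /* assumed stand-in for VEC_SIZE */

/* Handles any VEC <= n <= 2 * VEC: the two chunks overlap in the middle
   when n < 2 * VEC, so no size branch is needed, and loading both
   before storing either keeps overlapping src/dst safe.  */
static void
copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src, size_t n)
{
  unsigned char lo[VEC], hi[VEC];
  memcpy (lo, src, VEC);             /* VMOVU (%rsi), %VEC(0)             */
  memcpy (hi, src + n - VEC, VEC);   /* VMOVU -VEC_SIZE(%rsi,%rdx), ...   */
  memcpy (dst, lo, VEC);             /* VMOVU %VEC(0), (%rdi)             */
  memcpy (dst + n - VEC, hi, VEC);   /* VMOVU %VEC(1), -VEC_SIZE(%rdi,..) */
}
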
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
-# if VEC_SIZE == 16
-#  if defined SHARED
-/* Only used to measure performance of REP MOVSB.  */
-ENTRY (__mempcpy_erms)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start_movsb)
-END (__mempcpy_erms)
-#  endif
-
-ENTRY (__memmove_erms)
-	movq	%rdi, %rax
-L(start_movsb):
-	movq	%rdx, %rcx
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	2f
-	leaq	(%rsi,%rcx), %rdx
-	cmpq	%rdx, %rdi
-	jb	L(movsb_backward)
-1:
-	rep movsb
-2:
-	ret
-L(movsb_backward):
-	leaq	-1(%rdi,%rcx), %rdi
-	leaq	-1(%rsi,%rcx), %rsi
-	std
-	rep movsb
-	cld
-	ret
-END (__memmove_erms)
-#  if defined SHARED
-strong_alias (__memmove_erms, __memcpy_erms)
-#  endif
-# endif
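
__memmove_erms above reduces to: do nothing if source equals destination, copy backward with the direction flag set if the destination starts inside the source, otherwise copy forward with a single REP MOVSB.  A byte-at-a-time C sketch of that decision (illustrative only):

#include <stddef.h>
#include <stdint.h>

void *
sketch_memmove_erms (void *dstp, const void *srcp, size_t n)
{
  unsigned char *d = dstp;
  const unsigned char *s = srcp;
  uintptr_t di = (uintptr_t) d, si = (uintptr_t) s;

  if (di == si)
    return dstp;                      /* je 2f: nothing to move        */
  if (di > si && di < si + n)
    {
      for (size_t i = n; i-- > 0; )   /* std; rep movsb; cld           */
        d[i] = s[i];
    }
  else
    {
      for (size_t i = 0; i < n; i++)  /* forward rep movsb             */
        d[i] = s[i];
    }
  return dstp;
}
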
-
-# ifdef SHARED
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start_erms)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-	movq	%rdi, %rax
-L(start_erms):
-	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-L(return):
-	VZEROUPPER
-	ret
-
-L(movsb):
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
-#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
-# endif
-	jb	L(more_8x_vec_backward)
-1:
-	movq	%rdx, %rcx
-	rep movsb
-L(nop):
-	ret
-#endif
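
In other words, L(movsb) only issues a forward REP MOVSB when the size is below __x86_shared_non_temporal_threshold and the destination does not start strictly inside the source; otherwise it takes the 8 * VEC paths.  A sketch of that predicate (names are illustrative; the dst == src case is separately turned into a no-op at L(nop)):

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

/* true  -> forward REP MOVSB
   false -> take L(more_8x_vec) / L(more_8x_vec_backward) instead.  */
static bool
use_rep_movsb (uintptr_t dst, uintptr_t src, size_t n, size_t nt_threshold)
{
  if (n >= nt_threshold)
    return false;          /* big enough for the non-temporal-store path */
  if (dst > src && dst < src + n)
    return false;          /* would need a slow backward REP MOVSB       */
  return true;
}
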
-
-L(less_vec):
-	/* Less than 1 VEC.  */
-#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-#endif
-#if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
-#endif
-#if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-#endif
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
-	movb	%cl, (%rdi)
-1:
-	ret
-#if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	-32(%rsi,%rdx), %ymm1
-	vmovdqu	%ymm0, (%rdi)
-	vmovdqu	%ymm1, -32(%rdi,%rdx)
-	VZEROUPPER
-	ret
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	-16(%rsi,%rdx), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, -16(%rdi,%rdx)
-	ret
-#endif
-L(between_8_15):
-	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
-	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
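
Each of these small-size cases uses the same two-overlapping-chunks idea at a scalar width; for example, the 4-7 byte case in C (illustrative helper, not from the file):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* The two 4-byte chunks overlap in the middle (and coincide when
   n == 4), and both loads happen before either store, so overlapping
   src/dst still work.  The 2-3, 8-15, 16-31 and 32-63 byte cases are
   the same pattern with wider chunks.  */
static void
copy_4_to_7 (unsigned char *dst, const unsigned char *src, size_t n)
{
  uint32_t head, tail;
  memcpy (&tail, src + n - 4, 4);   /* movl -4(%rsi,%rdx), %ecx  */
  memcpy (&head, src, 4);           /* movl (%rsi), %esi         */
  memcpy (dst + n - 4, &tail, 4);   /* movl %ecx, -4(%rdi,%rdx)  */
  memcpy (dst, &head, 4);           /* movl %esi, (%rdi)         */
}
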
-
-#if defined USE_MULTIARCH && IS_IN (libc)
-L(movsb_more_2x_vec):
-	cmpq	$REP_MOVSB_THRESHOLD, %rdx
-	ja	L(movsb)
-#endif
-L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
-	cmpq	$(VEC_SIZE * 8), %rdx
-	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER
-	ret
-L(last_4x_vec):
-	/* Copy from 2 * VEC to 4 * VEC. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VZEROUPPER
-	ret
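
For example, with VEC_SIZE == 32 the eight-register block above covers 128-256 bytes and L(last_4x_vec) covers 65-127 bytes; in both cases every chunk is loaded before any chunk is stored, so overlapping buffers remain safe without a direction check.
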
-
-L(more_8x_vec):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	ja	L(large_forward)
-#endif
-L(loop_4x_vec_forward):
-	/* Copy 4 * VEC at a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_forward)
-	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
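
The %r8 arithmetic above computes a negative adjustment: after the first VEC and the last 4 * VEC have been saved in registers, the destination is bumped forward to the next VEC_SIZE boundary (by a full VEC_SIZE if it was already aligned), and the source and length move by the same amount.  In C (VEC again standing in for VEC_SIZE; a sketch, not the implementation):

#include <stddef.h>
#include <stdint.h>

#define VEC 32   /* assumed stand-in for VEC_SIZE */

/* Mirrors the setup before L(loop_4x_vec_forward).  */
static void
align_dst_forward (unsigned char **dst, const unsigned char **src,
                   size_t *len)
{
  intptr_t r8 = (intptr_t) ((uintptr_t) *dst & (VEC - 1)) - VEC;
                        /* r8 is in [-VEC, -1]                       */
  *src -= r8;           /* subq %r8, %rsi                            */
  *dst -= r8;           /* subq %r8, %rdi: now VEC-aligned           */
  *len += r8;           /* addq %r8, %rdx: drops 1..VEC bytes        */
}

L(more_8x_vec_backward) below does the mirrored computation on the last store address in %r9, rounding it down to a VEC_SIZE boundary so that the backward loop's stores are aligned instead.
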
-
-L(more_8x_vec_backward):
-	/* Load the first 4 * VEC and last VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	ja	L(large_backward)
-#endif
-L(loop_4x_vec_backward):
-	/* Copy 4 * VEC at a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
-
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
-	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
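
The guard at L(large_forward) only allows streaming stores when the (already adjusted) source begins at or after the end of the destination range, i.e. when the two buffers cannot overlap and loading the source will not have pulled the destination into cache.  A sketch of the test (illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

/* leaq (%rdi,%rdx), %r10; cmpq %r10, %rsi; jb L(loop_4x_vec_forward)  */
static bool
nt_stores_safe_forward (uintptr_t dst, uintptr_t src, size_t len)
{
  return src >= dst + len;   /* disjoint: bypassing the cache is worthwhile */
}

L(large_backward) applies the mirrored test before its loop, and both paths issue sfence after the streaming loop so the non-temporal stores are ordered before the trailing unaligned stores of the saved head/tail vectors.
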
-
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
-	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
-#endif
-END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-
-#ifdef SHARED
-# if IS_IN (libc)
-#  ifdef USE_MULTIARCH
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
-	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
-	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-#  endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
-	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
-# endif
-#endif
-#if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
-	      MEMCPY_SYMBOL (__memcpy, unaligned))
-#endif