about summary refs log tree commit diff
path: root/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/aarch64/multiarch/memcpy_thunderx2.S')
-rw-r--r--sysdeps/aarch64/multiarch/memcpy_thunderx2.S572
1 files changed, 211 insertions, 361 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index 45e9a29495..058db48b95 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -31,8 +31,8 @@
 #define dst	x3
 #define srcend	x4
 #define dstend	x5
-#define tmp2    x6
-#define tmp3    x7
+#define tmp2	x6
+#define tmp3	x7
 #define tmp3w   w7
 #define A_l	x6
 #define A_lw	w6
@@ -53,26 +53,26 @@
 #define G_h	dst
 #define tmp1	x14
 
-#define A_q     q0
-#define B_q     q1
-#define C_q     q2
-#define D_q     q3
-#define E_q     q4
-#define F_q     q5
-#define G_q     q6
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
 #define H_q	q7
 #define I_q	q16
 #define J_q	q17
 
-#define A_v     v0
-#define B_v     v1
-#define C_v     v2
-#define D_v     v3
-#define E_v     v4
-#define F_v     v5
-#define G_v     v6
-#define H_v     v7
-#define I_v     v16
+#define A_v	v0
+#define B_v	v1
+#define C_v	v2
+#define D_v	v3
+#define E_v	v4
+#define F_v	v5
+#define G_v	v6
+#define H_v	v7
+#define I_v	v16
 #define J_v	v17
 
 #ifndef MEMMOVE
@@ -85,16 +85,14 @@
 #if IS_IN (libc)
 
 #undef MEMCPY
-#undef MEMMOVE
 #define MEMCPY __memcpy_thunderx2
+#undef MEMMOVE
 #define MEMMOVE __memmove_thunderx2
 
 
-/* Moves are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled. Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   Overlapping large forward memmoves use a loop that copies backwards.
+/* Overlapping large forward memmoves use a loop that copies backwards.
+   Otherwise memcpy is used. Small moves branch to memcopy16 directly.
+   The longer memcpy cases fall through to the memcpy head.
 */
 
 ENTRY_ALIGN (MEMMOVE, 6)
@@ -103,188 +101,14 @@ ENTRY_ALIGN (MEMMOVE, 6)
 	DELOUSE (1)
 	DELOUSE (2)
 
+	add	srcend, src, count
+	cmp	count, 16
+	b.ls	L(memcopy16)
 	sub	tmp1, dstin, src
 	cmp	count, 96
 	ccmp	tmp1, count, 2, hi
 	b.lo	L(move_long)
 
-	prfm	PLDL1KEEP, [src]
-	add	srcend, src, count
-	add	dstend, dstin, count
-	cmp	count, 16
-	b.ls	L(copy16)
-	cmp	count, 96
-	b.hi	L(copy_long)
-
-	/* Medium copies: 17..96 bytes.  */
-	sub	tmp1, count, 1
-	ldp	A_l, A_h, [src]
-	tbnz	tmp1, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	tmp1, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
-L(copy16):
-	cmp	count, 8
-	b.lo	1f
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
-	ret
-	.p2align 4
-1:
-	tbz	count, 2, 1f
-	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
-	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
-	ret
-
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
-	ldrb	B_lw, [src, tmp1]
-	strb	A_lw, [dstin]
-	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
-
-	.p2align 4
-	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
-	   32 bytes from the end.  */
-L(copy96):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [src, 32]
-	ldp	D_l, D_h, [src, 48]
-	ldp	E_l, E_h, [srcend, -32]
-	ldp	F_l, F_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin, 32]
-	stp	D_l, D_h, [dstin, 48]
-	stp	E_l, E_h, [dstend, -32]
-	stp	F_l, F_h, [dstend, -16]
-	ret
-
-	/* Align DST to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 96 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-	.p2align 4
-L(copy_long):
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
-	sub	src, src, tmp1
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	L(last64)
-L(loop64):
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]!
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 64
-	b.hi	L(loop64)
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
-L(last64):
-	ldp	E_l, E_h, [srcend, -64]
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [srcend, -16]
-	stp	D_l, D_h, [dst, 64]
-	stp	E_l, E_h, [dstend, -64]
-	stp	A_l, A_h, [dstend, -48]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
-	ret
-
-	.p2align 4
-L(move_long):
-	cbz	tmp1, 3f
-
-	add	srcend, src, count
-	add	dstend, dstin, count
-
-	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 96 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-	and	tmp1, dstend, 15
-	ldp	D_l, D_h, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]
-	stp	D_l, D_h, [dstend, -16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -48]
-	ldp	D_l, D_h, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-
-	nop
-1:
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -48]
-	stp	D_l, D_h, [dstend, -64]!
-	ldp	D_l, D_h, [srcend, -64]!
-	subs	count, count, 64
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldp	G_l, G_h, [src, 48]
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [src, 32]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [src, 16]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [src]
-	stp	D_l, D_h, [dstend, -64]
-	stp	G_l, G_h, [dstin, 48]
-	stp	A_l, A_h, [dstin, 32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin]
-3:	ret
-
 END (MEMMOVE)
 libc_hidden_builtin_def (MEMMOVE)
 
@@ -293,227 +117,217 @@ libc_hidden_builtin_def (MEMMOVE)
    medium copies of 17..96 bytes which are fully unrolled. Large copies
    of more than 96 bytes align the destination and use load-and-merge
    approach in the case src and dst addresses are unaligned not evenly,
-   so that, loads and stores are always aligned.
-   Large copies use an unrolled loop processing 64 bytes per iteration.
-   The current optimized memcpy implementation is not compatible with
-   memmove and is separated from it completely.
-
-   memcpy implementation below is not compatible with memmove
-   because of pipelined loads/stores, which are faster, but they
-   can't be used in the case of overlapping memmove arrays */
+   so that, actual loads and stores are always aligned.
+   Large copies use the loops processing 64 bytes per iteration for
+   unaligned case and 128 bytes per iteration for aligned ones.
+*/
 
 #define MEMCPY_PREFETCH_LDR 640
 
+	.p2align 4
 ENTRY (MEMCPY)
+
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
 
-	add     srcend, src, count
-	cmp     count, 16
-	b.ls    L(memcopy16)
-	ldr     A_q, [src], #16
-	add     dstend, dstin, count
-	and     tmp1, src, 15
-	cmp     count, 96
-	b.hi    L(memcopy_long)
+	add	srcend, src, count
+	cmp	count, 16
+	b.ls	L(memcopy16)
+	ldr	A_q, [src], #16
+	add	dstend, dstin, count
+	and	tmp1, src, 15
+	cmp	count, 96
+	b.hi	L(memcopy_long)
 
 	/* Medium copies: 17..96 bytes.  */
-	ldr     E_q, [srcend, -16]
-	cmp     count, 64
-	b.gt    L(memcpy_copy96)
-	cmp     count, 48
-	b.le    L(bytes_17_to_48)
+	ldr	E_q, [srcend, -16]
+	cmp	count, 64
+	b.gt	L(memcpy_copy96)
+	cmp	count, 48
+	b.le	L(bytes_17_to_48)
 	/* 49..64 bytes */
-	ldp     B_q, C_q, [src]
-	str     E_q, [dstend, -16]
-	stp     A_q, B_q, [dstin]
-	str     C_q, [dstin, 32]
+	ldp	B_q, C_q, [src]
+	str	E_q, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	str	C_q, [dstin, 32]
 	ret
 
 L(bytes_17_to_48):
 	/* 17..48 bytes*/
-	cmp     count, 32
-	b.gt    L(bytes_32_to_48)
+	cmp	count, 32
+	b.gt	L(bytes_32_to_48)
 	/* 17..32 bytes*/
-	str     A_q, [dstin]
-	str     E_q, [dstend, -16]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
 	ret
 
 L(bytes_32_to_48):
 	/* 32..48 */
-	ldr     B_q, [src]
-	str     A_q, [dstin]
-	str     E_q, [dstend, -16]
-	str     B_q, [dstin, 16]
+	ldr	B_q, [src]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
+	str	B_q, [dstin, 16]
 	ret
 
 	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(memcopy16):
-	cmp     count, 8
-	b.lo    L(bytes_0_to_8)
-	ldr     A_l, [src]
-	ldr     A_h, [srcend, -8]
-	add     dstend, dstin, count
-	str     A_l, [dstin]
-	str     A_h, [dstend, -8]
+	cmp	count, 8
+	b.lo	L(bytes_0_to_8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	add	dstend, dstin, count
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
 	ret
 	.p2align 4
 
 L(bytes_0_to_8):
-	tbz     count, 2, L(bytes_0_to_3)
-	ldr     A_lw, [src]
-	ldr     A_hw, [srcend, -4]
-	add     dstend, dstin, count
-	str     A_lw, [dstin]
-	str     A_hw, [dstend, -4]
+	tbz	count, 2, L(bytes_0_to_3)
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	add	dstend, dstin, count
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
 	ret
 
 	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 L(bytes_0_to_3):
-	cbz     count, L(end)
-	lsr     tmp1, count, 1
-	ldrb    A_lw, [src]
-	ldrb    A_hw, [srcend, -1]
-	add     dstend, dstin, count
-	ldrb    B_lw, [src, tmp1]
-	strb    A_lw, [dstin]
-	strb    B_lw, [dstin, tmp1]
-	strb    A_hw, [dstend, -1]
-L(end):
+	cbz	count, 1f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	add	dstend, dstin, count
+	ldrb	B_lw, [src, tmp1]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+	strb	A_lw, [dstin]
+1:
 	ret
 
 	.p2align 4
 
 L(memcpy_copy96):
 	/* Copying 65..96 bytes. A_q (first 16 bytes) and
-	   E_q(last 16 bytes) are already loaded.
-
-	   The size is large enough to benefit from aligned
-	   loads */
-	bic     src, src, 15
-	ldp     B_q, C_q, [src]
-	str     A_q, [dstin]
+	   E_q(last 16 bytes) are already loaded. The size
+	   is large enough to benefit from aligned loads */
+	bic	src, src, 15
+	ldp	B_q, C_q, [src]
 	/* Loaded 64 bytes, second 16-bytes chunk can be
 	   overlapping with the first chunk by tmp1 bytes.
 	   Stored 16 bytes. */
-	sub     dst, dstin, tmp1
-	add     count, count, tmp1
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1
 	/* The range of count being [65..96] becomes [65..111]
 	   after tmp [0..15] gets added to it,
 	   count now is <bytes-left-to-load>+48 */
-	cmp     count, 80
-	b.gt    L(copy96_medium)
-	ldr     D_q, [src, 32]
-	stp     B_q, C_q, [dst, 16]
-	str     E_q, [dstend, -16]
-	str     D_q, [dst, 48]
+	cmp	count, 80
+	b.gt	L(copy96_medium)
+	ldr	D_q, [src, 32]
+	stp	B_q, C_q, [dst, 16]
+	str	D_q, [dst, 48]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
 	ret
 
 	.p2align 4
 L(copy96_medium):
-	ldp     D_q, A_q, [src, 32]
-	str     B_q, [dst, 16]
-	cmp     count, 96
-	b.gt    L(copy96_large)
-	str     E_q, [dstend, -16]
-	stp     C_q, D_q, [dst, 32]
-	str     A_q, [dst, 64]
+	ldp	D_q, G_q, [src, 32]
+	cmp	count, 96
+	b.gt	L(copy96_large)
+	stp	B_q, C_q, [dst, 16]
+	stp	D_q, G_q, [dst, 48]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
 	ret
 
 L(copy96_large):
-	ldr     F_q, [src, 64]
-	stp     C_q, D_q, [dst, 32]
-	str     E_q, [dstend, -16]
-	stp     A_q, F_q, [dst, 64]
+	ldr	F_q, [src, 64]
+	str	B_q, [dst, 16]
+	stp	C_q, D_q, [dst, 32]
+	stp	G_q, F_q, [dst, 64]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
 	ret
 
 	.p2align 4
 L(memcopy_long):
-	bic     src, src, 15
-	ldp     B_q, C_q, [src], #32
-	str     A_q, [dstin]
-	sub     dst, dstin, tmp1
-	add     count, count, tmp1
-	add     dst, dst, 16
+	bic	src, src, 15
+	ldp	B_q, C_q, [src], #32
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1
+	add	dst, dst, 16
 	and	tmp1, dst, 15
-	ldp     D_q, E_q, [src], #32
-	str     B_q, [dst], #16
+	ldp	D_q, E_q, [src], #32
+	str	A_q, [dstin]
 
 	/* Already loaded 64+16 bytes. Check if at
 	   least 64 more bytes left */
-	subs    count, count, 64+64+16
-	b.lt    L(loop128_exit2)
-	cmp     count, MEMCPY_PREFETCH_LDR + 64 + 32
-	b.lt    L(loop128)
+	subs	count, count, 64+64+16
+	b.lt	L(loop128_exit0)
+	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
+	b.lt	L(loop128)
 	cbnz	tmp1, L(dst_unaligned)
-	sub     count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
 
 	.p2align 4
 
 L(loop128_prefetch):
-	str     C_q, [dst], #16
-	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	str     D_q, [dst], #16
-	ldp     F_q, G_q, [src], #32
-	str	E_q, [dst], #16
-	ldp     H_q, A_q, [src], #32
-	str     F_q, [dst], #16
-	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	str     G_q, [dst], #16
-	ldp     B_q, C_q, [src], #32
-	str	H_q, [dst], #16
-	ldp     D_q, E_q, [src], #32
-	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	F_q, G_q, [src], #32
+	stp	B_q, C_q, [dst], #32
+	ldp	H_q, I_q, [src], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	B_q, C_q, [src], #32
+	stp	D_q, E_q, [dst], #32
+	ldp	D_q, E_q, [src], #32
+	stp	F_q, G_q, [dst], #32
+	stp	H_q, I_q, [dst], #32
 	subs	count, count, 128
-	b.ge    L(loop128_prefetch)
+	b.ge	L(loop128_prefetch)
 
-L(preloop128):
 	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
 	.p2align 4
 L(loop128):
-	ldp     F_q, G_q, [src], #32
-	str     C_q, [dst], #16
-	ldp     B_q, A_q, [src], #32
-	str     D_q, [dst], #16
-	stp     E_q, F_q, [dst], #32
-	stp     G_q, B_q, [dst], #32
-	subs    count, count, 64
-	b.lt    L(loop128_exit1)
-L(loop128_proceed):
-	ldp     B_q, C_q, [src], #32
-	str     A_q, [dst], #16
-	ldp     D_q, E_q, [src], #32
-	str     B_q, [dst], #16
-	subs    count, count, 64
-	b.ge    L(loop128)
-
-	.p2align 4
-L(loop128_exit2):
-	stp     C_q, D_q, [dst], #32
-	str     E_q, [dst], #16
-	b       L(copy_long_check32);
-
+	ldp	F_q, G_q, [src], #32
+	ldp	H_q, I_q, [src], #32
+	stp	B_q, C_q, [dst], #32
+	stp	D_q, E_q, [dst], #32
+	subs	count, count, 64
+	b.lt	L(loop128_exit1)
+	ldp	B_q, C_q, [src], #32
+	ldp	D_q, E_q, [src], #32
+	stp	F_q, G_q, [dst], #32
+	stp	H_q, I_q, [dst], #32
+	subs	count, count, 64
+	b.ge	L(loop128)
+L(loop128_exit0):
+	ldp	F_q, G_q, [srcend, -64]
+	ldp	H_q, I_q, [srcend, -32]
+	stp	B_q, C_q, [dst], #32
+	stp	D_q, E_q, [dst]
+	stp	F_q, G_q, [dstend, -64]
+	stp	H_q, I_q, [dstend, -32]
+	ret
 L(loop128_exit1):
-	/* A_q is still not stored and 0..63 bytes left,
-	   so, count is -64..-1.
-	   Check if less than 32 bytes left (count < -32) */
-	str     A_q, [dst], #16
-L(copy_long_check32):
-	cmn     count, 64
-	b.eq    L(copy_long_done)
-	cmn     count, 32
-	b.le    L(copy_long_last32)
-	ldp     B_q, C_q, [src]
-	stp     B_q, C_q, [dst]
-
-L(copy_long_last32):
-	ldp     F_q, G_q, [srcend, -32]
-	stp     F_q, G_q, [dstend, -32]
-
-L(copy_long_done):
+	ldp	B_q, C_q, [srcend, -64]
+	ldp	D_q, E_q, [srcend, -32]
+	stp	F_q, G_q, [dst], #32
+	stp	H_q, I_q, [dst]
+	stp	B_q, C_q, [dstend, -64]
+	stp	D_q, E_q, [dstend, -32]
+	ret
+
+L(dst_unaligned_tail):
+	ldp	C_q, D_q, [srcend, -64]
+	ldp	E_q, F_q, [srcend, -32]
+	stp	A_q, B_q, [dst], #32
+	stp	H_q, I_q, [dst], #16
+	str	G_q, [dst, tmp1]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstend, -32]
 	ret
 
 L(dst_unaligned):
@@ -542,17 +356,20 @@ L(dst_unaligned):
 	/* Store the 16 bytes to dst and align dst for further
 	   operations, several bytes will be stored at this
 	   address once more */
-	str     C_q, [dst], #16
-	ldp     F_q, G_q, [src], #32
+
+	ldp	F_q, G_q, [src], #32
+	stp	B_q, C_q, [dst], #32
 	bic	dst, dst, 15
-	subs    count, count, 32
+	sub	count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
 	ldr	tmp3w, [tmp2]
 	add	tmp2, tmp2, tmp3w, SXTW
 	br	tmp2
-.p2align 4 ;\
+
+.p2align 4
+	/* to make the loop in each chunk 16-bytes aligned */
 	nop
 #define EXT_CHUNK(shft) \
 L(ext_size_ ## shft):;\
@@ -573,7 +390,7 @@ L(ext_size_ ## shft):;\
 	b.ge    1b;\
 2:;\
 	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	b	L(ext_tail);
+	b	L(dst_unaligned_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -591,12 +408,45 @@ EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
 
-L(ext_tail):
-	stp     A_q, B_q, [dst], #32
-	stp     H_q, I_q, [dst], #16
-	add     dst, dst, tmp1
-	str     G_q, [dst], #16
-	b       L(copy_long_check32)
+L(move_long):
+	.p2align 4
+1:
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	and	tmp1, srcend, 15
+	ldr	D_q, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	.p2align 4
+1:
+	subs	count, count, 64
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -64]!
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	E_q, F_q, [src, 32]
+	ldp	G_q, H_q, [src]
+	stp	A_q, B_q, [dstend, -32]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	G_q, H_q, [dstin]
+3:	ret
 
 
 END (MEMCPY)