/* Optimized memmove implementation for POWER10.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'src' and 'dest' overlap.  If they
   do not, or if 'src' is ahead of 'dest', it copies forward.  Otherwise,
   an optimized backward copy is used.  */

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power9
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	.p2align 5
	/* Check for overlap; if there is any, branch to the backward copy.  */
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	/* Fast path for length shorter than 16 bytes.  */
	sldi	r7,r5,56
	lxvl	32+v2,r4,r7
	stxvl	32+v2,r3,r7
	subic.	r8,r5,16
	blelr

	/* For shorter lengths, aligning the dest address to 16 bytes either
	   decreases performance or is irrelevant, so this comparison is also
	   used to skip the alignment step.  */
	cmpldi	cr6,r5,256
	bge	cr6,L(ge_256)

	/* Account for the first 16-byte copy.  */
	addi	r4,r4,16
	addi	r11,r3,16	/* Use r11; r3 must be preserved as the return value.  */
	subi	r5,r5,16
	b	L(loop_head)

	.p2align 5
L(ge_256):
	/* Account for the first copy of up to 16 bytes.  This is necessary
	   for memmove because at this point the src address can still be
	   ahead of the dest address.  */
	clrldi	r9,r5,56
	li	r8,16
	cmpldi	r9,16
	iselgt	r9,r8,r9
	add	r4,r4,r9
	add	r11,r3,r9	/* Use r11; r3 must be preserved as the return value.  */
	sub	r5,r5,r9

	/* Align dest to 16 bytes.  */
	neg	r7,r3
	clrldi.	r9,r7,60
	beq	L(loop_head)

	.p2align 5
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6

	sub	r5,r5,r9
	add	r4,r4,r9
	add	r11,r11,r9

L(loop_head):
	cmpldi	r5,63
	ble	L(final_64)

	srdi.	r7,r5,7
	beq	L(loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes each iteration.  */
	.p2align 5
L(loop):
	addi	r9,r4,64
	addi	r10,r11,64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,128
	addi	r11,r11,128

	lxv	32+v4,0(r9)
	lxv	32+v5,16(r9)
	lxv	32+v6,32(r9)
	lxv	32+v7,48(r9)

	stxv	32+v4,0(r10)
	stxv	32+v5,16(r10)
	stxv	32+v6,32(r10)
	stxv	32+v7,48(r10)

	bdnz	L(loop)

	clrldi.	r5,r5,57
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(final_64)

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1)

	cmpldi	cr5,r5,32
	lxv	32+v0,0(r4)
	blt	cr5,L(tail2)

	cmpldi	cr6,r5,48
	lxv	32+v1,16(r4)
	blt	cr6,L(tail3)

	.p2align 5
	lxv	32+v2,32(r4)
	stxv	32+v2,32(r11)

L(tail3):
	stxv	32+v1,16(r11)

L(tail2):
	stxv	32+v0,0(r11)
	sub	r5,r5,r8
	add	r4,r4,r8
	add	r11,r11,r8

	.p2align 5
L(tail1):
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr
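
/* A note on the dispatch at the top of L(_memmove): the subf/cmpld/blt
   sequence implements the overlap test with a single unsigned compare.
   A rough C model of that decision is sketched below (illustrative only,
   not part of the build; copy_forward and copy_backward are placeholder
   names for the two paths in this file):

	if ((uintptr_t) dest - (uintptr_t) src < len)
	  copy_backward (dest, src, len);	// L(memmove_bwd) below
	else
	  copy_forward (dest, src, len);	// fall-through path above

   The unsigned difference wraps around when dest < src, so the comparison
   is true only when dest lies inside [src, src + len), i.e. exactly when
   a forward copy would overwrite source bytes before reading them.  */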
/* If dest and src overlap, we should copy backwards.  */
L(memmove_bwd):
	add	r11,r3,r5
	add	r4,r4,r5

	/* Optimization for length smaller than 16 bytes.  */
	cmpldi	cr5,r5,15
	ble	cr5,L(tail1_bwd)

	/* For shorter lengths the alignment either slows things down or is
	   irrelevant.  The forward copy already needed a comparison against
	   256 for this; here 128 is used instead, since it reduces code size
	   and improves readability.  */
	cmpldi	cr7,r5,128
	blt	cr7,L(bwd_loop_tail)

	/* Align dest address to 16 bytes.  */
	.p2align 5
	clrldi.	r9,r11,60
	beq	L(bwd_loop_head)
	sub	r4,r4,r9
	sub	r11,r11,r9
	lxv	32+v0,0(r4)
	sldi	r6,r9,56
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9

L(bwd_loop_head):
	srdi.	r7,r5,7
	beq	L(bwd_loop_tail)

	mtctr	r7

/* Main loop that copies 128 bytes every iteration.  */
	.p2align 5
L(bwd_loop):
	addi	r9,r4,-64
	addi	r10,r11,-64

	lxv	32+v0,-16(r4)
	lxv	32+v1,-32(r4)
	lxv	32+v2,-48(r4)
	lxv	32+v3,-64(r4)

	stxv	32+v0,-16(r11)
	stxv	32+v1,-32(r11)
	stxv	32+v2,-48(r11)
	stxv	32+v3,-64(r11)

	addi	r4,r4,-128
	addi	r11,r11,-128

	lxv	32+v0,-16(r9)
	lxv	32+v1,-32(r9)
	lxv	32+v2,-48(r9)
	lxv	32+v3,-64(r9)

	stxv	32+v0,-16(r10)
	stxv	32+v1,-32(r10)
	stxv	32+v2,-48(r10)
	stxv	32+v3,-64(r10)

	bdnz	L(bwd_loop)

	clrldi.	r5,r5,57
	beqlr

/* Copy 64 bytes.  */
	.p2align 5
L(bwd_loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(bwd_final_64)

	addi	r4,r4,-64
	addi	r11,r11,-64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	subi	r5,r5,64

/* Copies the last 1-63 bytes.  */
	.p2align 5
L(bwd_final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1_bwd)

	cmpldi	cr5,r5,32
	lxv	32+v2,-16(r4)
	blt	cr5,L(tail2_bwd)

	cmpldi	cr6,r5,48
	lxv	32+v1,-32(r4)
	blt	cr6,L(tail3_bwd)

	.p2align 5
	lxv	32+v0,-48(r4)
	stxv	32+v0,-48(r11)

L(tail3_bwd):
	stxv	32+v1,-32(r11)

L(tail2_bwd):
	stxv	32+v2,-16(r11)
	sub	r4,r4,r5
	sub	r11,r11,r5
	sub	r5,r5,r8
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr

/* Copy the remaining bytes (fewer than 16).  */
	.p2align 5
L(tail1_bwd):
	sub	r4,r4,r5
	sub	r11,r11,r5
	sldi	r6,r5,56
	lxvl	v4,r4,r6
	stxvl	v4,r11,r6
	blr
END_GEN_TB (MEMMOVE,TB_TOCLESS)
libc_hidden_builtin_def (memmove)
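
/* A note on the short tails: the lxvl/stxvl instructions used above take
   the byte count from the high-order byte of the length register and
   transfer at most 16 bytes, which is why every variable-length copy is
   preceded by 'sldi rN,r5,56'.  A rough C model of one such tail copy is
   sketched below (illustrative only, not part of the build; dst, src and
   len are placeholder names):

	// Equivalent effect of one lxvl/stxvl pair for len in [0,16]:
	// copy len bytes without touching anything past the end.
	for (size_t i = 0; i < len && i < 16; i++)
	  ((unsigned char *) dst)[i] = ((const unsigned char *) src)[i];
   */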