1 files changed, 435 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
new file mode 100644
index 0000000000..99d07ec895
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -0,0 +1,435 @@
+/* Optimized memset implementation for PowerPC32/POWER7.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+   02110-1301 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+   Returns 's'.  */
+
+	.machine  power7
+EALIGN (BP_SYM (memset), 5, 0)
+	CALL_MCOUNT
+
+	.align	4
+L(_memset):
+	cmplwi	cr7,5,31
+	cmplwi	cr6,5,8
+	mr	10,3		/* Save original argument for later.  */
+	mr	7,1		/* Save original r1 for later.  */
+	cfi_offset(31,-8)
+
+	/* Replicate byte to word.  */
+	rlwimi	4,4,8,16,23
+	rlwimi	4,4,16,0,15
+
+	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
+
+	neg	0,3
+	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
+
+	/* Save our word twice to create a doubleword that we will later
+	   copy to a FPR.  */
+	stwu	1,-32(1)
+	andi.	11,10,7		/* Check alignment of DST.  */
+	mr	12,5
+	stw	4,24(1)
+	stw	4,28(1)
+	beq	L(big_aligned)
+
+	clrlwi	0,0,29
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	bf	30,4f
+
+	sth	4,0(10)
+	addi	10,10,2
+4:	bf	29,L(big_aligned)
+
+	stw	4,0(10)
+	addi	10,10,4
+
+	.align	4
+L(big_aligned):
+	cmplwi	cr5,5,255
+	li	0,32
+	cmplwi	cr1,5,160
+	dcbtst	0,10
+	cmplwi	cr6,4,0
+	srwi	9,5,3		/* Number of full doublewords remaining.  */
+	crand	27,26,21
+	mtocrf	0x01,9
+	bt	27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srwi	8,5,5
+	clrlwi	11,5,29
+	cmplwi	cr6,11,0
+	cmplwi	cr1,9,4
+	mtctr	8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+	bf	31,L(big_loop)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+	mr	12,10
+	blt	cr1,L(tail_bytes)
+
+	b	L(big_loop)
+
+	.align	4
+1:	/* Copy 1 doubleword.  */
+	bf	31,L(big_loop)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+
+	/* First use a 32-bytes loop with stw's to try and avoid the LHS due
+	   to the lfd we will do next.  Also, ping-pong through r10 and r12
+	   to avoid AGEN delays.  */
+	.align	4
+L(big_loop):
+	addi	12,10,32
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	stw	4,16(10)
+	stw	4,20(10)
+	stw	4,24(10)
+	stw	4,28(10)
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	stw	4,0(12)
+	stw	4,4(12)
+	stw	4,8(12)
+	stw	4,12(12)
+	stw	4,16(12)
+	stw	4,20(12)
+	stw	4,24(12)
+	stw	4,28(12)
+	bdnz	L(big_loop_fast_setup)
+
+	mr	12,10
+	b	L(tail_bytes)
+
+	/* Now that we're probably past the LHS window, use the VSX to
+	   speed up the loop.  */
+L(big_loop_fast_setup):
+	li	0,0
+	li	11,24
+	li	6,16
+	lxvdsx	4,1,11
+
+	.align	4
+L(big_loop_fast):
+	addi	12,10,32
+	stxvd2x	4,10,0
+	stxvd2x	4,10,6
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	stxvd2x	4,12,0
+	stxvd2x	4,12,6
+	bdnz	L(big_loop_fast)
+
+	mr	12,10
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	mr	1,7		/* Restore r1.  */
+	beqlr	cr6
+
+	clrlwi	0,5,29
+	mtocrf	0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(12)
+	addi	12,12,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	sth	4,0(12)
+	addi	12,12,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(12)
+	blr
+
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-bytes aligned.  */
+	.align	4
+L(huge):
+	lfd	4,24(1)
+	andi.	11,10,127
+	neg	0,10
+	beq	L(huge_aligned)
+
+	clrlwi	0,0,25
+	subf	5,0,5
+	srwi	0,0,3
+	mtocrf  0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	stfd	4,32(10)
+	stfd	4,40(10)
+	stfd	4,48(10)
+	stfd	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	stfd	4,0(10)
+	addi	10,10,8
+
+L(huge_aligned):
+	srwi	8,5,7
+	clrlwi	11,5,25
+	cmplwi	cr6,11,0
+	mtctr	8
+
+	/* Copies 128-bytes at a time.  */
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi	10,10,128
+	bdnz	L(huge_loop)
+
+	/* We have a tail of 0~127 bytes to handle.  */
+	mr	1,7		/* Restore r1.  */
+	beqlr	cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srwi	8,5,3
+	cmplwi	cr6,8,0
+	mtocrf	0x01,8
+
+	/* We have a tail o 1~127 bytes. Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf	28,4f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	stfd	4,32(10)
+	stfd	4,40(10)
+	stfd	4,48(10)
+	stfd	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	stfd	4,16(10)
+	stfd	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	stfd	4,0(10)
+	stfd	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(tail)
+
+	stfd	4,0(10)
+	addi	10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align	4
+4:	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+	.align	4
+2:	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+	.align	4
+1:	bflr	31
+
+	stb	4,0(10)
+	blr
+
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb	4,2(10)
+	blr
+
+	.align	4
+L(TFX):	bflr	31
+
+	stb	4,4(10)
+	blr
+
+	.align	4
+L(FFX):	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align	4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrlwi	0,0,30
+	beq	L(medium_aligned)
+
+	/* Force 4-bytes alignment for DST.  */
+	mtocrf	0x01,0
+	subf	5,0,5
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi	10,10,2
+
+	.align	4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmplwi	cr1,5,16
+	mtocrf	0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(small):
+	mtocrf	0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	blr
+
+END (BP_SYM (memset))
+libc_hidden_builtin_def (memset)