about summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64/power7/memset.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/memset.S')
-rw-r--r--sysdeps/powerpc/powerpc64/power7/memset.S398
1 files changed, 398 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
new file mode 100644
index 0000000000..02a9eedd6b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -0,0 +1,398 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+   Returns 's'.  */
+
+	.machine power7
+EALIGN (BP_SYM (memset), 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,5,31
+	cmpldi	cr6,5,8
+	mr	10,3
+
+	/* Replicate byte to word.  */
+	rlwimi	4,4,8,16,23
+	rlwimi	4,4,16,0,15
+	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
+
+	neg	0,3
+	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
+
+	andi.	11,10,7		/* Check alignment of SRC.  */
+	insrdi	4,4,32,0	/* Replicate word to double word.  */
+
+	mr	12,5
+	beq	L(big_aligned)
+
+	clrldi	0,0,61
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	bf	30,4f
+
+	sth	4,0(10)
+	addi	10,10,2
+4:	bf	29,L(big_aligned)
+
+	stw	4,0(10)
+	addi	10,10,4
+
+	.align	4
+L(big_aligned):
+
+	cmpldi	cr5,5,255
+	li	0,32
+	dcbtst	0,10
+	cmpldi	cr6,4,0
+	srdi	9,5,3	/* Number of full doublewords remaining.  */
+	crand	27,26,21
+	mtocrf	0x01,9
+	bt	27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srdi	8,5,5
+	clrldi	11,5,61
+	cmpldi	cr6,11,0
+	cmpldi	cr1,9,4
+	mtctr	8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	bf	31,L(big_loop)
+
+	std	4,0(10)
+	addi	10,10,8
+	mr	12,10
+	blt	cr1,L(tail_bytes)
+	b	L(big_loop)
+
+	.align	4
+1:	/* Copy 1 doubleword.  */
+	bf	31,L(big_loop)
+
+	std	4,0(10)
+	addi	10,10,8
+
+	/* Main aligned copy loop.  Copies 32-bytes at a time and
+	   ping-pong through r10 and r12 to avoid AGEN delays.  */
+	.align	4
+L(big_loop):
+	addi	12,10,32
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	std	4,0(12)
+	std	4,8(12)
+	std	4,16(12)
+	std	4,24(12)
+	bdnz	L(big_loop)
+
+	mr	12,10
+	b	L(tail_bytes)
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	beqlr	cr6
+
+	clrldi	0,5,61
+	mtocrf	0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(12)
+	addi	12,12,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	sth	4,0(12)
+	addi	12,12,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(12)
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-bytes aligned.  */
+	.align	4
+L(huge):
+	andi.	11,10,127
+	neg	0,10
+	beq	L(huge_aligned)
+
+	clrldi	0,0,57
+	subf	5,0,5
+	srdi	0,0,3
+	mtocrf	0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	std	4,0(10)
+	addi	10,10,8
+
+
+L(huge_aligned):
+	srdi	8,5,7
+	clrldi	11,5,57
+	cmpldi	cr6,11,0
+	mtctr	8
+
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi	10,10,128
+	bdnz	L(huge_loop)
+
+	/* Check how many bytes are still left.  */
+	beqlr	cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srdi	8,5,3
+	cmpldi	cr6,8,0
+	mtocrf	0x01,8
+
+	/* We have a tail o 1~127 bytes.  Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(tail)
+
+	std	4,0(10)
+	addi	10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align	4
+4:	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+	.align	4
+2:	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+	.align	4
+1:	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb	4,2(10)
+	blr
+
+	.align	4
+L(TFX):	bflr	31
+
+	stb	4,4(10)
+	blr
+
+	.align	4
+L(FFX):	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align	4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrldi	0,0,62
+	beq	L(medium_aligned)
+
+	/* Force 4-bytes alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	5,0,5
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi	10,10,2
+
+	.align	4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmpldi	cr1,5,16
+	mtocrf	0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(small):
+	mtocrf	0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	blr
+
+END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (BP_SYM (__bzero))
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)
+
+weak_alias (BP_SYM (__bzero), BP_SYM (bzero))