diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/memset.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/memset.S | 458 |
1 files changed, 0 insertions, 458 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S deleted file mode 100644 index bc734c9f4f..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/memset.S +++ /dev/null @@ -1,458 +0,0 @@ -/* Optimized memset implementation for PowerPC64/POWER8. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */ - -/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); - Returns 's'. */ - -#ifndef MEMSET -# define MEMSET memset -#endif - - /* No need to use .machine power8 since mtvsrd is already - handled by the define. It avoid breakage on binutils - that does not support this machine specifier. */ - .machine power7 -EALIGN (MEMSET, 5, 0) - CALL_MCOUNT 3 - -L(_memset): - cmpldi cr7,r5,31 - neg r0,r3 - mr r10,r3 - - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 /* Replicate byte to word. */ - ble cr7,L(write_LT_32) - - andi. r11,r10,15 /* Check alignment of DST. */ - insrdi r4,r4,32,0 /* Replicate word to double word. */ - - beq L(big_aligned) - - mtocrf 0x01,r0 - clrldi r0,r0,60 - - /* Get DST aligned to 16 bytes. */ -1: bf 31,2f - stb r4,0(r10) - addi r10,r10,1 - -2: bf 30,4f - sth r4,0(r10) - addi r10,r10,2 - -4: bf 29,8f - stw r4,0(r10) - addi r10,r10,4 - -8: bf 28,16f - std r4,0(r10) - addi r10,r10,8 - -16: subf r5,r0,r5 - - .align 4 -L(big_aligned): - /* For sizes larger than 255 two possible paths: - - if constant is '0', zero full cache lines with dcbz - - otherwise uses vector instructions. */ - cmpldi cr5,r5,255 - dcbtst 0,r10 - cmpldi cr6,r4,0 - crand 27,26,21 - bt 27,L(huge_dcbz) - bge cr5,L(huge_vector) - - - /* Size between 32 and 255 bytes with constant different than 0, use - doubleword store instruction to achieve best throughput. */ - srdi r8,r5,5 - clrldi r11,r5,59 - cmpldi cr6,r11,0 - cmpdi r8,0 - beq L(tail_bytes) - mtctr r8 - - /* Main aligned write loop, writes 32-bytes at a time. */ - .align 4 -L(big_loop): - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - bdz L(tail_bytes) - - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,10,32 - bdnz L(big_loop) - - b L(tail_bytes) - - /* Write remaining 1~31 bytes. */ - .align 4 -L(tail_bytes): - beqlr cr6 - - srdi r7,r11,4 - clrldi r8,r11,60 - mtocrf 0x01,r7 - - .align 4 - bf 31,8f - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - - .align 4 -8: mtocrf 0x1,r8 - bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - .align 4 -4: bf 29,2f - stw 4,0(10) - addi 10,10,4 - - .align 4 -2: bf 30,1f - sth 4,0(10) - addi 10,10,2 - - .align 4 -1: bflr 31 - stb 4,0(10) - blr - - /* Size larger than 255 bytes with constant different than 0, use - vector instruction to achieve best throughput. */ -L(huge_vector): - /* Replicate set byte to quadword in VMX register. */ - MTVSRD_V1_R4 - xxpermdi 32,v0,v1,0 - vspltb v2,v0,15 - - /* Main aligned write loop: 128 bytes at a time. */ - li r6,16 - li r7,32 - li r8,48 - mtocrf 0x02,r5 - srdi r12,r5,7 - cmpdi r12,0 - beq L(aligned_tail) - mtctr r12 - b L(aligned_128loop) - - .align 4 -L(aligned_128loop): - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - bdnz L(aligned_128loop) - - /* Write remaining 1~127 bytes. */ -L(aligned_tail): - mtocrf 0x01,r5 - bf 25,32f - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - -32: bf 26,16f - stvx v2,0,r10 - stvx v2,r10,r6 - addi r10,r10,32 - -16: bf 27,8f - stvx v2,0,r10 - addi r10,r10,16 - -8: bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - /* Copies 4~7 bytes. */ -4: bf 29,L(tail2) - stw r4,0(r10) - bf 30,L(tail5) - sth r4,4(r10) - bflr 31 - stb r4,6(r10) - /* Return original DST pointer. */ - blr - - /* Special case when value is 0 and we have a long length to deal - with. Use dcbz to zero out a full cacheline of 128 bytes at a time. - Before using dcbz though, we need to get the destination 128-byte - aligned. */ - .align 4 -L(huge_dcbz): - andi. r11,r10,127 - neg r0,r10 - beq L(huge_dcbz_aligned) - - clrldi r0,r0,57 - subf r5,r0,r5 - srdi r0,r0,3 - mtocrf 0x01,r0 - - /* Write 1~128 bytes until DST is aligned to 128 bytes. */ -8: bf 28,4f - - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - std r4,32(r10) - std r4,40(r10) - std r4,48(r10) - std r4,56(r10) - addi r10,r10,64 - - .align 4 -4: bf 29,2f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - - .align 4 -2: bf 30,1f - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - - .align 4 -1: bf 31,L(huge_dcbz_aligned) - std r4,0(r10) - addi r10,r10,8 - -L(huge_dcbz_aligned): - /* Setup dcbz unroll offsets and count numbers. */ - srdi r8,r5,9 - clrldi r11,r5,55 - cmpldi cr6,r11,0 - li r9,128 - cmpdi r8,0 - beq L(huge_tail) - li r7,256 - li r6,384 - mtctr r8 - - .align 4 -L(huge_loop): - /* Sets 512 bytes to zero in each iteration, the loop unrolling shows - a throughput boost for large sizes (2048 bytes or higher). */ - dcbz 0,r10 - dcbz r9,r10 - dcbz r7,r10 - dcbz r6,r10 - addi r10,r10,512 - bdnz L(huge_loop) - - beqlr cr6 - -L(huge_tail): - srdi r6,r11,8 - srdi r7,r11,4 - clrldi r8,r11,4 - cmpldi cr6,r8,0 - mtocrf 0x01,r6 - - beq cr6,L(tail) - - /* We have 1~511 bytes remaining. */ - .align 4 -32: bf 31,16f - dcbz 0,r10 - dcbz r9,r10 - addi r10,r10,256 - - .align 4 -16: mtocrf 0x01,r7 - bf 28,8f - dcbz 0,r10 - addi r10,r10,128 - - .align 4 -8: bf 29,4f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - std r4,32(r10) - std r4,40(r10) - std r4,48(r10) - std r4,56(r10) - addi r10,r10,64 - - .align 4 -4: bf 30,2f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - - .align 4 -2: bf 31,L(tail) - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - .align 4 - - /* Remaining 1~15 bytes. */ -L(tail): - mtocrf 0x01,r8 - - .align -8: bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - .align 4 -4: bf 29,2f - stw r4,0(r10) - addi r10,r10,4 - - .align 4 -2: bf 30,1f - sth r4,0(r10) - addi r10,r10,2 - - .align 4 -1: bflr 31 - stb r4,0(r10) - blr - - /* Handle short copies of 0~31 bytes. Best throughput is achieved - by just unrolling all operations. */ - .align 4 -L(write_LT_32): - cmpldi cr6,5,8 - mtocrf 0x01,r5 - ble cr6,L(write_LE_8) - - /* At least 9 bytes to go. */ - neg r8,r4 - andi. r0,r8,3 - cmpldi cr1,r5,16 - beq L(write_LT_32_aligned) - - /* Force 4-byte alignment for SRC. */ - mtocrf 0x01,r0 - subf r5,r0,r5 - -2: bf 30,1f - sth r4,0(r10) - addi r10,r10,2 - -1: bf 31,L(end_4bytes_alignment) - stb r4,0(r10) - addi r10,r10,1 - - .align 4 -L(end_4bytes_alignment): - cmpldi cr1,r5,16 - mtocrf 0x01,r5 - -L(write_LT_32_aligned): - blt cr1,8f - - stw r4,0(r10) - stw r4,4(r10) - stw r4,8(r10) - stw r4,12(r10) - addi r10,r10,16 - -8: bf 28,L(tail4) - stw r4,0(r10) - stw r4,4(r10) - addi r10,r10,8 - - .align 4 - /* Copies 4~7 bytes. */ -L(tail4): - bf 29,L(tail2) - stw r4,0(r10) - bf 30,L(tail5) - sth r4,4(r10) - bflr 31 - stb r4,6(r10) - blr - - .align 4 - /* Copies 2~3 bytes. */ -L(tail2): - bf 30,1f - sth r4,0(r10) - bflr 31 - stb r4,2(r10) - blr - - .align 4 -L(tail5): - bflr 31 - stb r4,4(r10) - blr - - .align 4 -1: bflr 31 - stb r4,0(r10) - blr - - /* Handles copies of 0~8 bytes. */ - .align 4 -L(write_LE_8): - bne cr6,L(tail4) - - stw r4,0(r10) - stw r4,4(r10) - blr -END_GEN_TB (MEMSET,TB_TOCLESS) -libc_hidden_builtin_def (memset) - -/* Copied from bzero.S to prevent the linker from inserting a stub - between bzero and memset. */ -ENTRY (__bzero) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b L(_memset) -END (__bzero) -#ifndef __bzero -weak_alias (__bzero, bzero) -#endif |