diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc32')
-rw-r--r-- | sysdeps/powerpc/powerpc32/power7/memset.S | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S new file mode 100644 index 0000000000..99d07ec895 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/memset.S @@ -0,0 +1,435 @@ +/* Optimized memset implementation for PowerPC32/POWER7. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); + Returns 's'. + + Fills N bytes at S with the low byte of C. r3 is never written below, + so it still holds S on every return path. The code dispatches on N: + N <= 8 goes to L(small), 9 <= N <= 31 goes to L(medium), and anything + longer first aligns the destination to 8 bytes and continues at + L(big_aligned). From there the dcbz-based L(huge) path is taken only + when the fill word is 0 and more than 255 bytes remain. + + Register roles as used below: + r10 = running destination pointer (starts as a copy of r3) + r4 = fill byte replicated into all four bytes of the word + r5 = length, reduced by each alignment prologue + r7 = copy of the caller's r1, used to restore r1 before returning + r12 = original length; on the non-dcbz path it is later reused as a + second store pointer + f4 / VSR 4 = fill pattern as a doubleword, loaded from the stack + + Idiom used throughout: 'mtocrf 0x01,rX' copies the low four bits of + rX into cr7, so CR bits 28/29/30/31 then stand for the values 8/4/2/1 + and the 'bf' chains that follow skip the store of the matching size + when that bit is clear. */ + + .machine power7 +EALIGN (BP_SYM (memset), 5, 0) + CALL_MCOUNT + + .align 4 +L(_memset): + cmplwi cr7,5,31 /* cr7: compare n with 31 (used by the 'ble cr7' below). */ + cmplwi cr6,5,8 /* cr6: compare n with 8 (used by 'ble cr6' and by L(small)). */ + mr 10,3 /* r10 = working destination pointer; r3 is kept as the return value. */ + mr 7,1 /* Save original r1 for later. */ + cfi_offset(31,-8) /* NOTE(review): r31 is neither used nor stored anywhere in this routine - confirm this directive is intended. */ + + /* Replicate byte to word: the first rlwimi copies the low byte into + the next byte up, the second copies the low halfword into the high + halfword. */ + rlwimi 4,4,8,16,23 + rlwimi 4,4,16,0,15 + + ble cr6,L(small) /* If length <= 8, use short copy code. */ + + neg 0,3 /* r0 = -s; its low bits are the byte count needed to align the destination. */ + ble cr7,L(medium) /* If length < 32, use medium copy code. */ + + /* Save our word twice to create a doubleword that we will later + copy to a FPR. The stwu opens a 32-byte frame below r1; r1 is put + back from r7 before every return on this path. */ + stwu 1,-32(1) + andi. 11,10,7 /* Check alignment of DST. 
*/ + mr 12,5 /* Keep the original length in r12 (L(huge) needs it for its tail). */ + stw 4,24(1) + stw 4,28(1) + beq L(big_aligned) + + clrlwi 0,0,29 /* r0 = (-s) & 7: bytes needed to reach 8-byte alignment. */ + mtocrf 0x01,0 /* CR bits 29/30/31 = 4/2/1 bytes to store. */ + subf 5,0,5 /* Take the alignment bytes off the length. */ + + /* Get DST aligned to 8 bytes. */ +1: bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: bf 30,4f + + sth 4,0(10) + addi 10,10,2 +4: bf 29,L(big_aligned) + + stw 4,0(10) + addi 10,10,4 + + .align 4 +L(big_aligned): + cmplwi cr5,5,255 + li 0,32 /* NOTE(review): r0 = 32 and cr1 (length vs 160, set just below) do not appear to be consumed before they are overwritten - confirm. */ + cmplwi cr1,5,160 + dcbtst 0,10 + cmplwi cr6,4,0 + srwi 9,5,3 /* Number of full doublewords remaining. */ + crand 27,26,21 /* CR bit 27 = cr6[EQ] AND cr5[GT], i.e. fill word == 0 and length > 255. */ + mtocrf 0x01,9 /* Low bits of the doubleword count -> cr7 (CR bit 27 lives in cr6 and is unaffected). */ + bt 27,L(huge) + + /* From this point on we have at least 25 bytes left (more than 31 on + entry, at most 7 used for alignment) and either the value isn't 0 + or the length is <= 255, so the dcbz path is not used. */ + + srwi 8,5,5 /* r8 = number of 32-byte blocks; becomes the loop count in ctr. */ + clrlwi 11,5,29 /* r11 = length & 7, the tail bytes left after all doublewords. */ + cmplwi cr6,11,0 /* cr6[EQ] = no tail bytes; tested by 'beqlr cr6' in L(tail_bytes). */ + cmplwi cr1,9,4 /* cr1[LT] = fewer than 4 doublewords, i.e. no full 32-byte block. */ + mtctr 8 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. CR bit 30 selects two doublewords and + CR bit 31 selects one. */ + + bf 30,1f + + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 + bf 31,L(big_loop) + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 + mr 12,10 /* L(tail_bytes) stores through r12. */ + blt cr1,L(tail_bytes) /* Exactly 3 doublewords: nothing left for the 32-byte loop. */ + + b L(big_loop) + + .align 4 +1: /* Copy 1 doubleword. */ + bf 31,L(big_loop) + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 + + /* First use a 32-bytes loop with stw's to try and avoid the LHS due + to the lfd we will do next. Also, ping-pong through r10 and r12 + to avoid AGEN delays. Each bdz/bdnz consumes one 32-byte block + from ctr; whichever pointer is current when ctr runs out is left + in r12 for L(tail_bytes). */ + .align 4 +L(big_loop): + addi 12,10,32 + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + stw 4,16(10) + stw 4,20(10) + stw 4,24(10) + stw 4,28(10) + bdz L(tail_bytes) + + addi 10,10,64 + stw 4,0(12) + stw 4,4(12) + stw 4,8(12) + stw 4,12(12) + stw 4,16(12) + stw 4,20(12) + stw 4,24(12) + stw 4,28(12) + bdnz L(big_loop_fast_setup) + + mr 12,10 + b L(tail_bytes) + + /* Now that we're probably past the LHS window, use the VSX to + speed up the loop. 
*/ +L(big_loop_fast_setup): + li 0,0 /* r0 = 0 and r6 = 16 are the index registers for the two 16-byte stores per block. */ + li 11,24 + li 6,16 + lxvdsx 4,1,11 /* VSR 4 = the doubleword saved at 24(r1), replicated into both halves. */ + + .align 4 +L(big_loop_fast): + addi 12,10,32 + stxvd2x 4,10,0 + stxvd2x 4,10,6 + bdz L(tail_bytes) + + addi 10,10,64 + stxvd2x 4,12,0 + stxvd2x 4,12,6 + bdnz L(big_loop_fast) + + mr 12,10 + + .align 4 +L(tail_bytes): + + /* Check for tail bytes. r12 points at the first byte not yet + written; cr6[EQ] was set earlier from length & 7. */ + mr 1,7 /* Restore r1. */ + beqlr cr6 + + clrlwi 0,5,29 + mtocrf 0x01,0 + + /* At this point we have a tail of 0-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(12) + addi 12,12,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(12) + addi 12,12,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(12) + blr + + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out 128-bytes at a time. Before using + dcbz though, we need to get the destination 128-bytes aligned. */ + .align 4 +L(huge): + lfd 4,24(1) /* f4 = the fill doubleword saved on the stack earlier. */ + andi. 11,10,127 + neg 0,10 + beq L(huge_aligned) + + clrlwi 0,0,25 /* r0 = bytes needed to reach 128-byte alignment (a multiple of 8 here). */ + subf 5,0,5 + srwi 0,0,3 /* Convert to a doubleword count, 1..15. */ + mtocrf 0x01,0 /* CR bits 28/29/30/31 = 8/4/2/1 doublewords to store. */ + + /* Get DST aligned to 128 bytes. */ +8: bf 28,4f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + stfd 4,32(10) + stfd 4,40(10) + stfd 4,48(10) + stfd 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + stfd 4,0(10) + stfd 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(huge_aligned) + + stfd 4,0(10) + addi 10,10,8 + +L(huge_aligned): + srwi 8,5,7 /* r8 = number of 128-byte blocks; becomes the loop count in ctr. */ + clrlwi 11,5,25 /* r11 = length & 127, the bytes left after the dcbz loop. */ + cmplwi cr6,11,0 + mtctr 8 + + /* Copies 128-bytes at a time. */ + .align 4 +L(huge_loop): + dcbz 0,10 + addi 10,10,128 + bdnz L(huge_loop) + + /* We have a tail of 0~127 bytes to handle. */ + mr 1,7 /* Restore r1. */ + beqlr cr6 + + subf 9,3,10 /* r9 = bytes written so far (r10 - s). */ + subf 5,9,12 /* r5 = original length (r12) - r9 = bytes still to write. */ + srwi 8,5,3 + cmplwi cr6,8,0 + mtocrf 0x01,8 /* CR bits 28/29/30/31 = 8/4/2/1 doublewords to store. */ + + /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for + speed. We'll handle the resulting tail bytes later. 
*/ + beq cr6,L(tail) + +8: bf 28,4f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + stfd 4,32(10) + stfd 4,40(10) + stfd 4,48(10) + stfd 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + stfd 4,0(10) + stfd 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(tail) + + stfd 4,0(10) + addi 10,10,8 + + /* Handle the rest of the tail bytes here. Only CR bits 29/30/31 + (4/2/1 bytes) of the remaining length are examined; the multiples + of 8 were stored as doublewords above. */ +L(tail): + mtocrf 0x01,5 + + .align 4 +4: bf 29,2f + + stw 4,0(10) + addi 10,10,4 + .align 4 +2: bf 30,1f + + sth 4,0(10) + addi 10,10,2 + .align 4 +1: bflr 31 + + stb 4,0(10) + blr + + + /* Expanded tree to copy tail bytes without increments. Reached from + L(small) with the low bits of n already in cr7; the label names + spell the state of CR bits 29/30 (T = set, F = clear, X = not yet + tested). */ + .align 4 +L(copy_tail): + bf 29,L(FXX) + + stw 4,0(10) + bf 30,L(TFX) + + sth 4,4(10) + bflr 31 + + stb 4,6(10) + blr + + .align 4 +L(FXX): bf 30,L(FFX) + + sth 4,0(10) + bflr 31 + + stb 4,2(10) + blr + + .align 4 +L(TFX): bflr 31 + + stb 4,4(10) + blr + + .align 4 +L(FFX): bflr 31 + + stb 4,0(10) + blr + + /* Handle copies of 9~31 bytes. */ + .align 4 +L(medium): + /* At least 9 bytes to go. r0 still holds -s from the entry code. */ + andi. 11,10,3 + clrlwi 0,0,30 /* r0 = (-s) & 3: bytes needed to reach 4-byte alignment. */ + beq L(medium_aligned) + + /* Force 4-bytes alignment for DST. */ + mtocrf 0x01,0 + subf 5,0,5 +1: /* Copy 1 byte. */ + bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: /* Copy 2 bytes. */ + bf 30,L(medium_aligned) + + sth 4,0(10) + addi 10,10,2 + + .align 4 +L(medium_aligned): + /* At least 6 bytes to go, and DST is word-aligned. */ + cmplwi cr1,5,16 + mtocrf 0x01,5 /* CR bits 28/29/30/31 = 8/4/2/1 bytes; the 16 is handled via cr1. */ + blt cr1,8f + + /* Copy 16 bytes. */ + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(10) + addi 10,10,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(10) + addi 10,10,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(10) + blr + + /* Handles copies of 0~8 bytes. cr6 still holds the comparison of n + with 8 from the entry code: exactly 8 bytes are written with two + word stores, anything shorter goes through L(copy_tail). 
*/ + .align 4 +L(small): + mtocrf 0x01,5 + bne cr6,L(copy_tail) + + stw 4,0(10) + stw 4,4(10) + blr + +END (BP_SYM (memset)) +libc_hidden_builtin_def (memset) |