From 0a89b6a6fac08b42533075d90d8693ec825bdac1 Mon Sep 17 00:00:00 2001 From: Luis Machado Date: Fri, 14 May 2010 14:15:22 -0500 Subject: Power7 memset powerpc32 and powerpc64 .S optimizations. --- ChangeLog | 6 + sysdeps/powerpc/powerpc32/power7/memset.S | 434 ++++++++++++++++++++++++++++++ sysdeps/powerpc/powerpc64/power7/memset.S | 398 +++++++++++++++++++++++++++ 3 files changed, 838 insertions(+) create mode 100644 sysdeps/powerpc/powerpc32/power7/memset.S create mode 100644 sysdeps/powerpc/powerpc64/power7/memset.S diff --git a/ChangeLog b/ChangeLog index a16a1fba85..4fb7a2979f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2010-05-15 Luis Machado + * sysdeps/powerpc/powerpc64/power7/memset.S: New POWER7-optimized + 64-bit memset. + * sysdeps/powerpc/powerpc32/power7/memset.S: New POWER7-optimized + 32-bit memset. + 2010-05-01 Alan Modra * sysdeps/powerpc/powerpc32/power4/memcmp.S: Correct cfi for r24. * sysdeps/powerpc/powerpc64/bsd-_setjmp.S: Move contents.. diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S new file mode 100644 index 0000000000..990faa1452 --- /dev/null +++ b/sysdeps/powerpc/powerpc32/power7/memset.S @@ -0,0 +1,434 @@ +/* Optimized memset implementation for PowerPC32/POWER7. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); + Returns 's'. r3 is never written; r10 and r12 walk the destination. */ + + .machine power7 +EALIGN (BP_SYM (memset), 5, 0) + CALL_MCOUNT + + .align 4 +L(_memset): + cmplwi cr7,5,31 + cmplwi cr6,5,8 + mr 10,3 /* Save original argument for later. */ + mr 7,1 /* Save original r1 for later. */ + cfi_offset(31,-8) /* NOTE(review): no store of r31 is visible here; confirm this CFI note is intended. */ + + /* Replicate byte to word. */ + rlwimi 4,4,8,16,23 + rlwimi 4,4,16,0,15 + + ble cr6,L(small) /* If length <= 8, use short copy code. */ + + neg 0,3 /* r0 = -DST; its low bits give the distance to alignment. */ + ble cr7,L(medium) /* If length < 32, use medium copy code. */ + + /* Save our word twice to create a doubleword that we will later + copy to a FPR. */ + stwu 1,-32(1) /* Open a 32-byte scratch frame; r1 is put back from r7 on exit. */ + andi. 11,10,7 /* Check alignment of DST. */ + mr 12,5 /* Keep the original length for L(huge). */ + stw 4,24(1) + stw 4,28(1) + beq L(big_aligned) + + clrlwi 0,0,29 /* r0 = 1..7 bytes needed to reach 8-byte alignment. */ + mtocrf 0x01,0 /* Low 4 bits of r0 go to cr7 for the bit tests below. */ + subf 5,0,5 + + /* Get DST aligned to 8 bytes. */ +1: bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: bf 30,4f + + sth 4,0(10) + addi 10,10,2 +4: bf 29,L(big_aligned) + + stw 4,0(10) + addi 10,10,4 + + .align 4 +L(big_aligned): + cmplwi cr5,5,255 + li 0,32 /* NOTE(review): r0 looks dead here; it is overwritten before any later read. */ + cmplwi cr1,5,160 /* NOTE(review): this cr1 result looks unused. */ + dcbtst 0,10 + cmplwi cr6,4,0 + srwi 9,5,3 /* Number of full doublewords remaining. */ + crand 27,26,21 /* CR bit 27 = (value == 0) && (length > 255). */ + mtocrf 0x01,9 + bt 27,L(huge) + + /* From this point on the dcbz path is ruled out: either the value + isn't 0 or at most 255 bytes remain. */ + + srwi 8,5,5 /* Number of 32-byte blocks for CTR. */ + clrlwi 11,5,29 /* Tail bytes (length mod 8). */ + cmplwi cr6,11,0 + cmplwi cr1,9,4 + mtctr 8 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 + bf 31,L(big_loop) + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 + mr 12,10 + blt cr1,L(tail_bytes) /* Only 3 doublewords in total, so CTR is 0: skip the loop. */ + + b L(big_loop) + + .align 4 +1: /* Copy 1 doubleword. 
*/ + bf 31,L(big_loop) + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 + + /* First use a 32-bytes loop with stw's to try and avoid the LHS due + to the lxvdsx we will do next. Also, ping-pong through r10 and r12 + to avoid AGEN delays. */ + .align 4 +L(big_loop): + addi 12,10,32 + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + stw 4,16(10) + stw 4,20(10) + stw 4,24(10) + stw 4,28(10) + bdz L(tail_bytes) + + addi 10,10,64 + stw 4,0(12) + stw 4,4(12) + stw 4,8(12) + stw 4,12(12) + stw 4,16(12) + stw 4,20(12) + stw 4,24(12) + stw 4,28(12) + bdnz L(big_loop_fast_setup) + + mr 12,10 + b L(tail_bytes) + + /* Now that we're probably past the LHS window, use the VSX to + speed up the loop. */ +L(big_loop_fast_setup): + li 11,24 + li 6,16 + lxvdsx 4,1,11 /* Splat the doubleword saved at 24(r1) into VSR 4. */ + + .align 4 +L(big_loop_fast): + addi 12,10,32 + stxvd2x 4,0,10 /* 16 bytes at 0(r10). RA must be 0 here: used as RB, r0 would add the 32 loaded in L(big_aligned). */ + stxvd2x 4,10,6 /* 16 bytes at 16(r10). */ + bdz L(tail_bytes) + + addi 10,10,64 + stxvd2x 4,0,12 /* 16 bytes at 0(r12). */ + stxvd2x 4,12,6 /* 16 bytes at 16(r12). */ + bdnz L(big_loop_fast) + + mr 12,10 + + .align 4 +L(tail_bytes): + + /* Check for tail bytes. */ + mr 1,7 /* Restore r1. */ + beqlr cr6 + + clrlwi 0,5,29 + mtocrf 0x01,0 + + /* At this point we have a tail of 1-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(12) + addi 12,12,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(12) + addi 12,12,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(12) + blr + + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out 128-bytes at a time. Before using + dcbz though, we need to get the destination 128-bytes aligned. */ + .align 4 +L(huge): + lfd 4,24(1) /* f4 = the doubleword saved on the stack (all zero on this path). */ + andi. 11,10,127 + neg 0,10 + beq L(huge_aligned) + + clrlwi 0,0,25 /* Bytes needed to reach 128-byte alignment. */ + subf 5,0,5 + srwi 0,0,3 /* The same distance counted in doublewords. */ + mtocrf 0x01,0 + + /* Get DST aligned to 128 bytes. 
*/ +8: bf 28,4f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + stfd 4,32(10) + stfd 4,40(10) + stfd 4,48(10) + stfd 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + stfd 4,0(10) + stfd 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(huge_aligned) + + stfd 4,0(10) + addi 10,10,8 + +L(huge_aligned): + srwi 8,5,7 /* Number of 128-byte blocks for CTR. */ + clrlwi 11,5,25 /* Bytes left over after the dcbz loop. */ + cmplwi cr6,11,0 + mtctr 8 + + /* Zeroes 128 bytes per dcbz. */ + .align 4 +L(huge_loop): + dcbz 0,10 + addi 10,10,128 + bdnz L(huge_loop) + + /* We have a tail of 0~127 bytes to handle. */ + mr 1,7 /* Restore r1. */ + beqlr cr6 + + subf 9,3,10 /* r9 = bytes written so far. */ + subf 5,9,12 /* r5 = original length minus bytes written. */ + srwi 8,5,3 + cmplwi cr6,8,0 + mtocrf 0x01,8 + + /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for + speed. We'll handle the resulting tail bytes later. */ + beq cr6,L(tail) + +8: bf 28,4f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + stfd 4,32(10) + stfd 4,40(10) + stfd 4,48(10) + stfd 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + stfd 4,0(10) + stfd 4,8(10) + stfd 4,16(10) + stfd 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + stfd 4,0(10) + stfd 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(tail) + + stfd 4,0(10) + addi 10,10,8 + + /* Handle the rest of the tail bytes here. */ +L(tail): + mtocrf 0x01,5 + + .align 4 +4: bf 29,2f + + stw 4,0(10) + addi 10,10,4 + .align 4 +2: bf 30,1f + + sth 4,0(10) + addi 10,10,2 + .align 4 +1: bflr 31 + + stb 4,0(10) + blr + + + /* Expanded tree to copy tail bytes without increments. */ + .align 4 +L(copy_tail): + bf 29,L(FXX) + + stw 4,0(10) + bf 30,L(TFX) + + sth 4,4(10) + bflr 31 + + stb 4,6(10) + blr + + .align 4 +L(FXX): bf 30,L(FFX) + + sth 4,0(10) + bflr 31 + + stb 4,2(10) + blr + + .align 4 +L(TFX): bflr 31 + + stb 4,4(10) + blr + + .align 4 +L(FFX): bflr 31 + + stb 4,0(10) + blr + + /* Handle copies of 9~31 bytes. */ + .align 4 +L(medium): + /* At least 9 bytes to go. 
*/ + andi. 11,10,3 + clrlwi 0,0,30 /* r0 = bytes needed to reach 4-byte alignment. */ + beq L(medium_aligned) + + /* Force 4-bytes alignment for DST. */ + mtocrf 0x01,0 + subf 5,0,5 +1: /* Copy 1 byte. */ + bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: /* Copy 2 bytes. */ + bf 30,L(medium_aligned) + + sth 4,0(10) + addi 10,10,2 + + .align 4 +L(medium_aligned): + /* At least 6 bytes to go, and DST is word-aligned. */ + cmplwi cr1,5,16 + mtocrf 0x01,5 + blt cr1,8f + + /* Copy 16 bytes. */ + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(10) + addi 10,10,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(10) + addi 10,10,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(10) + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(small): + mtocrf 0x01,5 + bne cr6,L(copy_tail) /* Fewer than 8 bytes: cr7 bits select the stores. */ + + stw 4,0(10) + stw 4,4(10) + blr + +END (BP_SYM (memset)) +libc_hidden_builtin_def (memset) diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S new file mode 100644 index 0000000000..0f726d4f37 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/memset.S @@ -0,0 +1,398 @@ +/* Optimized memset implementation for PowerPC64/POWER7. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); + Returns 's'. r3 is never written; r10 and r12 walk the destination. */ + + .machine power7 +EALIGN (BP_SYM (memset), 5, 0) + CALL_MCOUNT 3 + +L(_memset): + cmpldi cr7,5,31 + cmpldi cr6,5,8 + mr 10,3 + + /* Replicate byte to word. */ + rlwimi 4,4,8,16,23 + rlwimi 4,4,16,0,15 + ble cr6, L(small) /* If length <= 8, use short copy code. */ + + neg 0,3 /* r0 = -DST; its low bits give the distance to alignment. */ + ble cr7, L(medium) /* If length < 32, use medium copy code. */ + + andi. 11,10,7 /* Check alignment of DST. */ + insrdi 4,4,32,0 /* Replicate word to double word. */ + + mr 12,5 /* Keep the original length for L(huge). */ + beq L(big_aligned) + + clrldi 0,0,61 /* r0 = 1..7 bytes needed to reach 8-byte alignment. */ + mtocrf 0x01,0 /* Low 4 bits of r0 go to cr7 for the bit tests below. */ + subf 5,0,5 + + /* Get DST aligned to 8 bytes. */ +1: bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: bf 30,4f + + sth 4,0(10) + addi 10,10,2 +4: bf 29,L(big_aligned) + + stw 4,0(10) + addi 10,10,4 + + .align 4 +L(big_aligned): + + cmpldi cr5,5,255 + li 0,32 /* NOTE(review): r0 looks dead here; it is overwritten before any later read. */ + dcbtst 0,10 + cmpldi cr6,4,0 + srdi 9,5,3 /* Number of full doublewords remaining. */ + crand 27,26,21 /* CR bit 27 = (value == 0) && (length > 255). */ + mtocrf 0x01,9 + bt 27,L(huge) + + /* From this point on the dcbz path is ruled out: either the value + isn't 0 or at most 255 bytes remain. */ + + srdi 8,5,5 /* Number of 32-byte blocks for CTR. */ + clrldi 11,5,61 /* Tail bytes (length mod 8). */ + cmpldi cr6,11,0 + cmpldi cr1,9,4 + mtctr 8 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + mr 12,10 + blt cr1,L(tail_bytes) /* Only 3 doublewords in total, so CTR is 0: skip the loop. */ + b L(big_loop) + + .align 4 +1: /* Copy 1 doubleword. */ + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + + /* Main aligned copy loop. Copies 32-bytes at a time and + ping-pong through r10 and r12 to avoid AGEN delays. 
*/ + .align 4 +L(big_loop): + addi 12,10,32 + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + bdz L(tail_bytes) + + addi 10,10,64 + std 4,0(12) + std 4,8(12) + std 4,16(12) + std 4,24(12) + bdnz L(big_loop) + + mr 12,10 + b L(tail_bytes) + + .align 4 +L(tail_bytes): + + /* Check for tail bytes. */ + beqlr cr6 + + clrldi 0,5,61 + mtocrf 0x01,0 + + /* At this point we have a tail of 1-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(12) + addi 12,12,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(12) + addi 12,12,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(12) + blr + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out 128-bytes at a time. Before using + dcbz though, we need to get the destination 128-bytes aligned. */ + .align 4 +L(huge): + andi. 11,10,127 + neg 0,10 + beq L(huge_aligned) + + clrldi 0,0,57 /* Bytes needed to reach 128-byte alignment. */ + subf 5,0,5 + srdi 0,0,3 /* The same distance counted in doublewords. */ + mtocrf 0x01,0 + + /* Get DST aligned to 128 bytes. */ +8: bf 28,4f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + std 4,32(10) + std 4,40(10) + std 4,48(10) + std 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(huge_aligned) + + std 4,0(10) + addi 10,10,8 + + +L(huge_aligned): + srdi 8,5,7 /* Number of 128-byte blocks for CTR. */ + clrldi 11,5,57 /* Bytes left over after the dcbz loop. */ + cmpldi cr6,11,0 + mtctr 8 + + .align 4 +L(huge_loop): + dcbz 0,10 + addi 10,10,128 + bdnz L(huge_loop) + + /* Check how many bytes are still left. */ + beqlr cr6 + + subf 9,3,10 /* r9 = bytes written so far. */ + subf 5,9,12 /* r5 = original length minus bytes written. */ + srdi 8,5,3 + cmpldi cr6,8,0 + mtocrf 0x01,8 + + /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for + speed. We'll handle the resulting tail bytes later. 
*/ + beq cr6,L(tail) + +8: bf 28,4f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + std 4,32(10) + std 4,40(10) + std 4,48(10) + std 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(tail) + + std 4,0(10) + addi 10,10,8 + + /* Handle the rest of the tail bytes here. */ +L(tail): + mtocrf 0x01,5 + + .align 4 +4: bf 29,2f + + stw 4,0(10) + addi 10,10,4 + .align 4 +2: bf 30,1f + + sth 4,0(10) + addi 10,10,2 + .align 4 +1: bflr 31 + + stb 4,0(10) + blr + + /* Expanded tree to copy tail bytes without increments. */ + .align 4 +L(copy_tail): + bf 29,L(FXX) + + stw 4,0(10) + bf 30,L(TFX) + + sth 4,4(10) + bflr 31 + + stb 4,6(10) + blr + + .align 4 +L(FXX): bf 30,L(FFX) + + sth 4,0(10) + bflr 31 + + stb 4,2(10) + blr + + .align 4 +L(TFX): bflr 31 + + stb 4,4(10) + blr + + .align 4 +L(FFX): bflr 31 + + stb 4,0(10) + blr + + /* Handle copies of 9~31 bytes. */ + .align 4 +L(medium): + /* At least 9 bytes to go. */ + andi. 11,10,3 + clrldi 0,0,62 /* r0 = bytes needed to reach 4-byte alignment. */ + beq L(medium_aligned) + + /* Force 4-bytes alignment for DST. */ + mtocrf 0x01,0 + subf 5,0,5 +1: /* Copy 1 byte. */ + bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: /* Copy 2 bytes. */ + bf 30,L(medium_aligned) + + sth 4,0(10) + addi 10,10,2 + + .align 4 +L(medium_aligned): + /* At least 6 bytes to go, and DST is word-aligned. */ + cmpldi cr1,5,16 + mtocrf 0x01,5 + blt cr1,8f + + /* Copy 16 bytes. */ + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(10) + addi 10,10,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(10) + addi 10,10,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(10) + blr + + /* Handles copies of 0~8 bytes. 
*/ + .align 4 +L(small): + mtocrf 0x01,5 + bne cr6,L(copy_tail) /* Fewer than 8 bytes: cr7 bits select the stores. */ + + stw 4,0(10) + stw 4,4(10) + blr + +END_GEN_TB (BP_SYM (memset),TB_TOCLESS) +libc_hidden_builtin_def (memset) + +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. */ +ENTRY (BP_SYM (__bzero)) + CALL_MCOUNT 3 + mr r5,r4 /* bzero's length becomes memset's n. */ + li r4,0 /* Fill value is 0. */ + b L(_memset) +END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS) + +weak_alias (BP_SYM (__bzero), BP_SYM (bzero)) -- cgit 1.4.1