diff options
author | Ryan Arnold <ryanarn@etna.rchland.ibm.com> | 2010-05-20 17:05:06 -0500 |
---|---|---|
committer | Ryan Arnold <ryanarn@etna.rchland.ibm.com> | 2010-05-20 17:05:06 -0500 |
commit | 5fe0b279e4cb7b05c870a8903694a710b650acd7 (patch) | |
tree | 9aa1304d21116a3b513652513c3d8e0d505708e0 | |
parent | 0a89b6a6fac08b42533075d90d8693ec825bdac1 (diff) | |
download | glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.tar.gz glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.tar.xz glibc-5fe0b279e4cb7b05c870a8903694a710b650acd7.zip |
Revert "Power7 memset powerpc32 and powerpc64 .S optimizations."
This reverts commit 0a89b6a6fac08b42533075d90d8693ec825bdac1. A corrected version has been pushed upstream and that will be cherry-picked.
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc32/power7/memset.S | 434 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/memset.S | 398 |
3 files changed, 0 insertions, 838 deletions
diff --git a/ChangeLog b/ChangeLog index 4fb7a2979f..a16a1fba85 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,3 @@ -2010-05-15 Luis Machado <luisgpm@br.ibm.com> - * sysdeps/powerpc/powerpc64/power7/memset.S: New POWER7-optimized - 64-bit memset. - * sysdeps/powerpc/powerpc32/power7/memset.S: New POWER7-optimized - 32-bit memset. - 2010-05-01 Alan Modra <amodra@gmail.com> * sysdeps/powerpc/powerpc32/power4/memcmp.S: Correct cfi for r24. * sysdeps/powerpc/powerpc64/bsd-_setjmp.S: Move contents.. diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S deleted file mode 100644 index 990faa1452..0000000000 --- a/sysdeps/powerpc/powerpc32/power7/memset.S +++ /dev/null @@ -1,434 +0,0 @@ -/* Optimized memset implementation for PowerPC32/POWER7. - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA - 02110-1301 USA. */ - -#include <sysdep.h> -#include <bp-sym.h> -#include <bp-asm.h> - -/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); - Returns 's'. */ - - .machine power7 -EALIGN (BP_SYM (memset), 5, 0) - CALL_MCOUNT - - .align 4 -L(_memset): - cmplwi cr7,5,31 - cmplwi cr6,5,8 - mr 10,3 /* Save original argument for later. */ - mr 7,1 /* Save original r1 for later. */ - cfi_offset(31,-8) - - /* Replicate byte to word. */ - rlwimi 4,4,8,16,23 - rlwimi 4,4,16,0,15 - - ble cr6,L(small) /* If length <= 8, use short copy code. */ - - neg 0,3 - ble cr7,L(medium) /* If length < 32, use medium copy code. */ - - /* Save our word twice to create a doubleword that we will later - copy to a FPR. */ - stwu 1,-32(1) - andi. 11,10,7 /* Check alignment of DST. */ - mr 12,5 - stw 4,24(1) - stw 4,28(1) - beq L(big_aligned) - - clrlwi 0,0,29 - mtocrf 0x01,0 - subf 5,0,5 - - /* Get DST aligned to 8 bytes. */ -1: bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: bf 30,4f - - sth 4,0(10) - addi 10,10,2 -4: bf 29,L(big_aligned) - - stw 4,0(10) - addi 10,10,4 - - .align 4 -L(big_aligned): - cmplwi cr5,5,255 - li 0,32 - cmplwi cr1,5,160 - dcbtst 0,10 - cmplwi cr6,4,0 - srwi 9,5,3 /* Number of full doublewords remaining. */ - crand 27,26,21 - mtocrf 0x01,9 - bt 27,L(huge) - - /* From this point on, we'll copy 32+ bytes and the value - isn't 0 (so we can't use dcbz). */ - - srwi 8,5,5 - clrlwi 11,5,29 - cmplwi cr6,11,0 - cmplwi cr1,9,4 - mtctr 8 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. */ - - bf 30,1f - - stw 4,0(10) - stw 4,4(10) - stw 4,8(10) - stw 4,12(10) - addi 10,10,16 - bf 31,L(big_loop) - - stw 4,0(10) - stw 4,4(10) - addi 10,10,8 - mr 12,10 - blt cr1,L(tail_bytes) - - b L(big_loop) - - .align 4 -1: /* Copy 1 doubleword. */ - bf 31,L(big_loop) - - stw 4,0(10) - stw 4,4(10) - addi 10,10,8 - - /* First use a 32-bytes loop with stw's to try and avoid the LHS due - to the lfd we will do next. Also, ping-pong through r10 and r12 - to avoid AGEN delays. */ - .align 4 -L(big_loop): - addi 12,10,32 - stw 4,0(10) - stw 4,4(10) - stw 4,8(10) - stw 4,12(10) - stw 4,16(10) - stw 4,20(10) - stw 4,24(10) - stw 4,28(10) - bdz L(tail_bytes) - - addi 10,10,64 - stw 4,0(12) - stw 4,4(12) - stw 4,8(12) - stw 4,12(12) - stw 4,16(12) - stw 4,20(12) - stw 4,24(12) - stw 4,28(12) - bdnz L(big_loop_fast_setup) - - mr 12,10 - b L(tail_bytes) - - /* Now that we're probably past the LHS window, use the VSX to - speed up the loop. */ -L(big_loop_fast_setup): - li 11,24 - li 6,16 - lxvdsx 4,1,11 - - .align 4 -L(big_loop_fast): - addi 12,10,32 - stxvd2x 4,10,0 - stxvd2x 4,10,6 - bdz L(tail_bytes) - - addi 10,10,64 - stxvd2x 4,12,0 - stxvd2x 4,12,6 - bdnz L(big_loop_fast) - - mr 12,10 - - .align 4 -L(tail_bytes): - - /* Check for tail bytes. */ - mr 1,7 /* Restore r1. */ - beqlr cr6 - - clrlwi 0,5,29 - mtocrf 0x01,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(12) - addi 12,12,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - sth 4,0(12) - addi 12,12,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(12) - blr - - - /* Special case when value is 0 and we have a long length to deal - with. Use dcbz to zero out 128-bytes at a time. Before using - dcbz though, we need to get the destination 128-bytes aligned. */ - .align 4 -L(huge): - lfd 4,24(1) - andi. 11,10,127 - neg 0,10 - beq L(huge_aligned) - - clrlwi 0,0,25 - subf 5,0,5 - srwi 0,0,3 - mtocrf 0x01,0 - - /* Get DST aligned to 128 bytes. */ -8: bf 28,4f - - stfd 4,0(10) - stfd 4,8(10) - stfd 4,16(10) - stfd 4,24(10) - stfd 4,32(10) - stfd 4,40(10) - stfd 4,48(10) - stfd 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - stfd 4,0(10) - stfd 4,8(10) - stfd 4,16(10) - stfd 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - stfd 4,0(10) - stfd 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(huge_aligned) - - stfd 4,0(10) - addi 10,10,8 - -L(huge_aligned): - srwi 8,5,7 - clrlwi 11,5,25 - cmplwi cr6,11,0 - mtctr 8 - - /* Copies 128-bytes at a time. */ - .align 4 -L(huge_loop): - dcbz 0,10 - addi 10,10,128 - bdnz L(huge_loop) - - /* We have a tail of 0~127 bytes to handle. */ - mr 1,7 /* Restore r1. */ - beqlr cr6 - - subf 9,3,10 - subf 5,9,12 - srwi 8,5,3 - cmplwi cr6,8,0 - mtocrf 0x01,8 - - /* We have a tail o 1~127 bytes. Copy up to 15 doublewords for - speed. We'll handle the resulting tail bytes later. */ - beq cr6,L(tail) - -8: bf 28,4f - - stfd 4,0(10) - stfd 4,8(10) - stfd 4,16(10) - stfd 4,24(10) - stfd 4,32(10) - stfd 4,40(10) - stfd 4,48(10) - stfd 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - stfd 4,0(10) - stfd 4,8(10) - stfd 4,16(10) - stfd 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - stfd 4,0(10) - stfd 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(tail) - - stfd 4,0(10) - addi 10,10,8 - - /* Handle the rest of the tail bytes here. */ -L(tail): - mtocrf 0x01,5 - - .align 4 -4: bf 29,2f - - stw 4,0(10) - addi 10,10,4 - .align 4 -2: bf 30,1f - - sth 4,0(10) - addi 10,10,2 - .align 4 -1: bflr 31 - - stb 4,0(10) - blr - - - /* Expanded tree to copy tail bytes without increments. */ - .align 4 -L(copy_tail): - bf 29,L(FXX) - - stw 4,0(10) - bf 30,L(TFX) - - sth 4,4(10) - bflr 31 - - stb 4,6(10) - blr - - .align 4 -L(FXX): bf 30,L(FFX) - - sth 4,0(10) - bflr 31 - - stb 4,2(10) - blr - - .align 4 -L(TFX): bflr 31 - - stb 4,4(10) - blr - - .align 4 -L(FFX): bflr 31 - - stb 4,0(10) - blr - - /* Handle copies of 9~31 bytes. */ - .align 4 -L(medium): - /* At least 9 bytes to go. */ - andi. 11,10,3 - clrlwi 0,0,30 - beq L(medium_aligned) - - /* Force 4-bytes alignment for DST. */ - mtocrf 0x01,0 - subf 5,0,5 -1: /* Copy 1 byte. */ - bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: /* Copy 2 bytes. */ - bf 30,L(medium_aligned) - - sth 4,0(10) - addi 10,10,2 - - .align 4 -L(medium_aligned): - /* At least 6 bytes to go, and DST is word-aligned. */ - cmplwi cr1,5,16 - mtocrf 0x01,5 - blt cr1,8f - - /* Copy 16 bytes. */ - stw 4,0(10) - stw 4,4(10) - stw 4,8(10) - stw 4,12(10) - addi 10,10,16 -8: /* Copy 8 bytes. */ - bf 28,4f - - stw 4,0(10) - stw 4,4(10) - addi 10,10,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(10) - addi 10,10,4 -2: /* Copy 2-3 bytes. */ - bf 30,1f - - sth 4,0(10) - addi 10,10,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(10) - blr - - /* Handles copies of 0~8 bytes. */ - .align 4 -L(small): - mtocrf 0x01,5 - bne cr6,L(copy_tail) - - stw 4,0(10) - stw 4,4(10) - blr - -END (BP_SYM (memset)) -libc_hidden_builtin_def (memset) diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S deleted file mode 100644 index 0f726d4f37..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memset.S +++ /dev/null @@ -1,398 +0,0 @@ -/* Optimized memset implementation for PowerPC64/POWER7. - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <bp-sym.h> -#include <bp-asm.h> - -/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); - Returns 's'. */ - - .machine power7 -EALIGN (BP_SYM (memset), 5, 0) - CALL_MCOUNT 3 - -L(_memset): - cmpldi cr7,5,31 - cmpldi cr6,5,8 - mr 10,3 - - /* Replicate byte to word. */ - rlwimi 4,4,8,16,23 - rlwimi 4,4,16,0,15 - ble cr6, L(small) /* If length <= 8, use short copy code. */ - - neg 0,3 - ble cr7, L(medium) /* If length < 32, use medium copy code. */ - - andi. 11,10,7 /* Check alignment of SRC. */ - insrdi 4,4,32,0 /* Replicate word to double word. */ - - mr 12,5 - beq L(big_aligned) - - clrldi 0,0,61 - mtocrf 0x01,0 - subf 5,0,5 - - /* Get DST aligned to 8 bytes. */ -1: bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: bf 30,4f - - sth 4,0(10) - addi 10,10,2 -4: bf 29,L(big_aligned) - - stw 4,0(10) - addi 10,10,4 - - .align 4 -L(big_aligned): - - cmpldi cr5,5,255 - li 0,32 - dcbtst 0,10 - cmpldi cr6,4,0 - srdi 9,5,3 /* Number of full doublewords remaining. */ - crand 27,26,21 - mtocrf 0x01,9 - bt 27,L(huge) - - /* From this point on, we'll copy 32+ bytes and the value - isn't 0 (so we can't use dcbz). */ - - srdi 8,5,5 - clrldi 11,5,61 - cmpldi cr6,11,0 - cmpldi cr1,9,4 - mtctr 8 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. */ - - bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - bf 31,L(big_loop) - - std 4,0(10) - addi 10,10,8 - mr 12,10 - blt cr1,L(tail_bytes) - b L(big_loop) - - .align 4 -1: /* Copy 1 doubleword. */ - bf 31,L(big_loop) - - std 4,0(10) - addi 10,10,8 - - /* Main aligned copy loop. Copies 32-bytes at a time and - ping-pong through r10 and r12 to avoid AGEN delays. */ - .align 4 -L(big_loop): - addi 12,10,32 - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - bdz L(tail_bytes) - - addi 10,10,64 - std 4,0(12) - std 4,8(12) - std 4,16(12) - std 4,24(12) - bdnz L(big_loop) - - mr 12,10 - b L(tail_bytes) - - .align 4 -L(tail_bytes): - - /* Check for tail bytes. */ - beqlr cr6 - - clrldi 0,5,61 - mtocrf 0x01,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(12) - addi 12,12,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - sth 4,0(12) - addi 12,12,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(12) - blr - - /* Special case when value is 0 and we have a long length to deal - with. Use dcbz to zero out 128-bytes at a time. Before using - dcbz though, we need to get the destination 128-bytes aligned. */ - .align 4 -L(huge): - andi. 11,10,127 - neg 0,10 - beq L(huge_aligned) - - clrldi 0,0,57 - subf 5,0,5 - srdi 0,0,3 - mtocrf 0x01,0 - - /* Get DST aligned to 128 bytes. */ -8: bf 28,4f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - std 4,32(10) - std 4,40(10) - std 4,48(10) - std 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(huge_aligned) - - std 4,0(10) - addi 10,10,8 - - -L(huge_aligned): - srdi 8,5,7 - clrldi 11,5,57 - cmpldi cr6,11,0 - mtctr 8 - - .align 4 -L(huge_loop): - dcbz 0,10 - addi 10,10,128 - bdnz L(huge_loop) - - /* Check how many bytes are still left. */ - beqlr cr6 - - subf 9,3,10 - subf 5,9,12 - srdi 8,5,3 - cmpldi cr6,8,0 - mtocrf 0x01,8 - - /* We have a tail o 1~127 bytes. Copy up to 15 doublewords for - speed. We'll handle the resulting tail bytes later. */ - beq cr6,L(tail) - -8: bf 28,4f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - std 4,32(10) - std 4,40(10) - std 4,48(10) - std 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(tail) - - std 4,0(10) - addi 10,10,8 - - /* Handle the rest of the tail bytes here. */ -L(tail): - mtocrf 0x01,5 - - .align 4 -4: bf 29,2f - - stw 4,0(10) - addi 10,10,4 - .align 4 -2: bf 30,1f - - sth 4,0(10) - addi 10,10,2 - .align 4 -1: bflr 31 - - stb 4,0(10) - blr - - /* Expanded tree to copy tail bytes without increments. */ - .align 4 -L(copy_tail): - bf 29,L(FXX) - - stw 4,0(10) - bf 30,L(TFX) - - sth 4,4(10) - bflr 31 - - stb 4,6(10) - blr - - .align 4 -L(FXX): bf 30,L(FFX) - - sth 4,0(10) - bflr 31 - - stb 4,2(10) - blr - - .align 4 -L(TFX): bflr 31 - - stb 4,4(10) - blr - - .align 4 -L(FFX): bflr 31 - - stb 4,0(10) - blr - - /* Handle copies of 9~31 bytes. */ - .align 4 -L(medium): - /* At least 9 bytes to go. */ - andi. 11,10,3 - clrldi 0,0,62 - beq L(medium_aligned) - - /* Force 4-bytes alignment for SRC. */ - mtocrf 0x01,0 - subf 5,0,5 -1: /* Copy 1 byte. */ - bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: /* Copy 2 bytes. */ - bf 30,L(medium_aligned) - - sth 4,0(10) - addi 10,10,2 - - .align 4 -L(medium_aligned): - /* At least 6 bytes to go, and DST is word-aligned. */ - cmpldi cr1,5,16 - mtocrf 0x01,5 - blt cr1,8f - - /* Copy 16 bytes. */ - stw 4,0(10) - stw 4,4(10) - stw 4,8(10) - stw 4,12(10) - addi 10,10,16 -8: /* Copy 8 bytes. */ - bf 28,4f - - stw 4,0(10) - stw 4,4(10) - addi 10,10,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(10) - addi 10,10,4 -2: /* Copy 2-3 bytes. */ - bf 30,1f - - sth 4,0(10) - addi 10,10,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(10) - blr - - /* Handles copies of 0~8 bytes. */ - .align 4 -L(small): - mtocrf 0x01,5 - bne cr6,L(copy_tail) - - stw 4,0(10) - stw 4,4(10) - blr - -END_GEN_TB (BP_SYM (memset),TB_TOCLESS) -libc_hidden_builtin_def (memset) - -/* Copied from bzero.S to prevent the linker from inserting a stub - between bzero and memset. */ -ENTRY (BP_SYM (__bzero)) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b L(_memset) -END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS) - -weak_alias (BP_SYM (__bzero), BP_SYM (bzero)) |