Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7/memcpy.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power7/memcpy.S | 430
1 file changed, 0 insertions, 430 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
deleted file mode 100644
index e08993cbc3..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Optimized memcpy implementation for PowerPC64/POWER7.
-   Copyright (C) 2010-2017 Free Software Foundation, Inc.
-   Contributed by Luis Machado <luisgpm@br.ibm.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
-   Returns 'dst'.  */
-
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#define dst 11          /* Use r11 so r3 kept unchanged.  */
-#define src 4
-#define cnt 5
-
-        .machine power7
-EALIGN (MEMCPY, 5, 0)
-        CALL_MCOUNT 3
-
-        cmpldi  cr1,cnt,31
-        neg     0,3
-        ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
-                                       code.  */
-
-/* Align copies using VSX instructions to quadword. It is to avoid alignment
-   traps when memcpy is used on non-cacheable memory (for instance, memory
-   mapped I/O).  */
-        andi.   10,3,15
-        clrldi  11,4,60
-        cmpld   cr6,10,11       /* SRC and DST alignments match?  */
-
-        mr      dst,3
-        bne     cr6,L(copy_GE_32_unaligned)
-        beq     L(aligned_copy)
-
-        mtocrf  0x01,0
-        clrldi  0,0,60
-
-/* Get the DST and SRC aligned to 16 bytes.  */
-1:
-        bf      31,2f
-        lbz     6,0(src)
-        addi    src,src,1
-        stb     6,0(dst)
-        addi    dst,dst,1
-2:
-        bf      30,4f
-        lhz     6,0(src)
-        addi    src,src,2
-        sth     6,0(dst)
-        addi    dst,dst,2
-4:
-        bf      29,8f
-        lwz     6,0(src)
-        addi    src,src,4
-        stw     6,0(dst)
-        addi    dst,dst,4
-8:
-        bf      28,16f
-        ld      6,0(src)
-        addi    src,src,8
-        std     6,0(dst)
-        addi    dst,dst,8
-16:
-        subf    cnt,0,cnt
-
-/* Main aligned copy loop. Copies 128 bytes at a time. */
-L(aligned_copy):
-        li      6,16
-        li      7,32
-        li      8,48
-        mtocrf  0x02,cnt
-        srdi    12,cnt,7
-        cmpdi   12,0
-        beq     L(aligned_tail)
-        lxvd2x  6,0,src
-        lxvd2x  7,src,6
-        mtctr   12
-        b       L(aligned_128loop)
-
-        .align  4
-L(aligned_128head):
-        /* for the 2nd + iteration of this loop. */
-        lxvd2x  6,0,src
-        lxvd2x  7,src,6
-L(aligned_128loop):
-        lxvd2x  8,src,7
-        lxvd2x  9,src,8
-        stxvd2x 6,0,dst
-        addi    src,src,64
-        stxvd2x 7,dst,6
-        stxvd2x 8,dst,7
-        stxvd2x 9,dst,8
-        lxvd2x  6,0,src
-        lxvd2x  7,src,6
-        addi    dst,dst,64
-        lxvd2x  8,src,7
-        lxvd2x  9,src,8
-        addi    src,src,64
-        stxvd2x 6,0,dst
-        stxvd2x 7,dst,6
-        stxvd2x 8,dst,7
-        stxvd2x 9,dst,8
-        addi    dst,dst,64
-        bdnz    L(aligned_128head)
-
-L(aligned_tail):
-        mtocrf  0x01,cnt
-        bf      25,32f
-        lxvd2x  6,0,src
-        lxvd2x  7,src,6
-        lxvd2x  8,src,7
-        lxvd2x  9,src,8
-        addi    src,src,64
-        stxvd2x 6,0,dst
-        stxvd2x 7,dst,6
-        stxvd2x 8,dst,7
-        stxvd2x 9,dst,8
-        addi    dst,dst,64
-32:
-        bf      26,16f
-        lxvd2x  6,0,src
-        lxvd2x  7,src,6
-        addi    src,src,32
-        stxvd2x 6,0,dst
-        stxvd2x 7,dst,6
-        addi    dst,dst,32
-16:
-        bf      27,8f
-        lxvd2x  6,0,src
-        addi    src,src,16
-        stxvd2x 6,0,dst
-        addi    dst,dst,16
-8:
-        bf      28,4f
-        ld      6,0(src)
-        addi    src,src,8
-        std     6,0(dst)
-        addi    dst,dst,8
-4:      /* Copies 4~7 bytes.  */
-        bf      29,L(tail2)
-        lwz     6,0(src)
-        stw     6,0(dst)
-        bf      30,L(tail5)
-        lhz     7,4(src)
-        sth     7,4(dst)
-        bflr    31
-        lbz     8,6(src)
-        stb     8,6(dst)
-        /* Return original DST pointer.  */
-        blr
-
-
-/* Handle copies of 0~31 bytes.  */
-        .align  4
-L(copy_LT_32):
-        mr      dst,3
-        cmpldi  cr6,cnt,8
-        mtocrf  0x01,cnt
-        ble     cr6,L(copy_LE_8)
-
-        /* At least 9 bytes to go.  */
-        neg     8,4
-        andi.   0,8,3
-        cmpldi  cr1,cnt,16
-        beq     L(copy_LT_32_aligned)
-
-        /* Force 4-byte alignment for SRC.  */
-        mtocrf  0x01,0
-        subf    cnt,0,cnt
-2:
-        bf      30,1f
-        lhz     6,0(src)
-        addi    src,src,2
-        sth     6,0(dst)
-        addi    dst,dst,2
-1:
-        bf      31,L(end_4bytes_alignment)
-        lbz     6,0(src)
-        addi    src,src,1
-        stb     6,0(dst)
-        addi    dst,dst,1
-
-        .align  4
-L(end_4bytes_alignment):
-        cmpldi  cr1,cnt,16
-        mtocrf  0x01,cnt
-
-L(copy_LT_32_aligned):
-        /* At least 6 bytes to go, and SRC is word-aligned.  */
-        blt     cr1,8f
-
-        /* Copy 16 bytes.  */
-        lwz     6,0(src)
-        lwz     7,4(src)
-        stw     6,0(dst)
-        lwz     8,8(src)
-        stw     7,4(dst)
-        lwz     6,12(src)
-        addi    src,src,16
-        stw     8,8(dst)
-        stw     6,12(dst)
-        addi    dst,dst,16
-8:      /* Copy 8 bytes.  */
-        bf      28,L(tail4)
-        lwz     6,0(src)
-        lwz     7,4(src)
-        addi    src,src,8
-        stw     6,0(dst)
-        stw     7,4(dst)
-        addi    dst,dst,8
-
-        .align  4
-/* Copies 4~7 bytes.  */
-L(tail4):
-        bf      29,L(tail2)
-        lwz     6,0(src)
-        stw     6,0(dst)
-        bf      30,L(tail5)
-        lhz     7,4(src)
-        sth     7,4(dst)
-        bflr    31
-        lbz     8,6(src)
-        stb     8,6(dst)
-        /* Return original DST pointer.  */
-        blr
-
-        .align  4
-/* Copies 2~3 bytes.  */
-L(tail2):
-        bf      30,1f
-        lhz     6,0(src)
-        sth     6,0(dst)
-        bflr    31
-        lbz     7,2(src)
-        stb     7,2(dst)
-        blr
-
-        .align  4
-L(tail5):
-        bflr    31
-        lbz     6,4(src)
-        stb     6,4(dst)
-        blr
-
-        .align  4
-1:
-        bflr    31
-        lbz     6,0(src)
-        stb     6,0(dst)
-        /* Return original DST pointer.  */
-        blr
-
-
-/* Handles copies of 0~8 bytes.  */
-        .align  4
-L(copy_LE_8):
-        bne     cr6,L(tail4)
-
-        /* Though we could've used ld/std here, they are still
-           slow for unaligned cases.  */
-
-        lwz     6,0(src)
-        lwz     7,4(src)
-        stw     6,0(dst)
-        stw     7,4(dst)
-        blr
-
-
-/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
-   SRC is not. Use aligned quadword loads from SRC, shifted to realign
-   the data, allowing for aligned DST stores.  */
-        .align  4
-L(copy_GE_32_unaligned):
-        clrldi  0,0,60        /* Number of bytes until the 1st dst quadword.  */
-        srdi    9,cnt,4       /* Number of full quadwords remaining.  */
-
-        beq     L(copy_GE_32_unaligned_cont)
-
-        /* DST is not quadword aligned, get it aligned.  */
-
-        mtocrf  0x01,0
-        subf    cnt,0,cnt
-
-        /* Vector instructions work best when proper alignment (16-bytes)
-           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
-1:
-        bf      31,2f
-        lbz     6,0(src)
-        addi    src,src,1
-        stb     6,0(dst)
-        addi    dst,dst,1
-2:
-        bf      30,4f
-        lhz     6,0(src)
-        addi    src,src,2
-        sth     6,0(dst)
-        addi    dst,dst,2
-4:
-        bf      29,8f
-        lwz     6,0(src)
-        addi    src,src,4
-        stw     6,0(dst)
-        addi    dst,dst,4
-8:
-        bf      28,0f
-        ld      6,0(src)
-        addi    src,src,8
-        std     6,0(dst)
-        addi    dst,dst,8
-0:
-        srdi    9,cnt,4       /* Number of full quadwords remaining.  */
-
-        /* The proper alignment is present, it is OK to copy the bytes now.  */
-L(copy_GE_32_unaligned_cont):
-
-        /* Setup two indexes to speed up the indexed vector operations.  */
-        clrldi  10,cnt,60
-        li      6,16          /* Index for 16-bytes offsets.  */
-        li      7,32          /* Index for 32-bytes offsets.  */
-        cmpldi  cr1,10,0
-        srdi    8,cnt,5       /* Setup the loop counter.  */
-        mtocrf  0x01,9
-        cmpldi  cr6,9,1
-#ifdef __LITTLE_ENDIAN__
-        lvsr    5,0,src
-#else
-        lvsl    5,0,src
-#endif
-        lvx     3,0,src
-        li      0,0
-        bf      31,L(setup_unaligned_loop)
-
-        /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
-        lvx     4,src,6
-#ifdef __LITTLE_ENDIAN__
-        vperm   6,4,3,5
-#else
-        vperm   6,3,4,5
-#endif
-        addi    src,src,16
-        stvx    6,0,dst
-        addi    dst,dst,16
-        vor     3,4,4
-        clrrdi  0,src,60
-
-L(setup_unaligned_loop):
-        mtctr   8
-        ble     cr6,L(end_unaligned_loop)
-
-        /* Copy 32 bytes at a time using vector instructions.  */
-        .align  4
-L(unaligned_loop):
-
-        /* Note: vr6/vr10 may contain data that was already copied,
-           but in order to get proper alignment, we may have to copy
-           some portions again. This is faster than having unaligned
-           vector instructions though.  */
-
-        lvx     4,src,6
-#ifdef __LITTLE_ENDIAN__
-        vperm   6,4,3,5
-#else
-        vperm   6,3,4,5
-#endif
-        lvx     3,src,7
-#ifdef __LITTLE_ENDIAN__
-        vperm   10,3,4,5
-#else
-        vperm   10,4,3,5
-#endif
-        addi    src,src,32
-        stvx    6,0,dst
-        stvx    10,dst,6
-        addi    dst,dst,32
-        bdnz    L(unaligned_loop)
-
-        clrrdi  0,src,60
-
-        .align  4
-L(end_unaligned_loop):
-
-        /* Check for tail bytes.  */
-        mtocrf  0x01,cnt
-        beqlr   cr1
-
-        add     src,src,0
-
-        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
-        /* Copy 8 bytes.  */
-        bf      28,4f
-        lwz     6,0(src)
-        lwz     7,4(src)
-        addi    src,src,8
-        stw     6,0(dst)
-        stw     7,4(dst)
-        addi    dst,dst,8
-4:      /* Copy 4~7 bytes.  */
-        bf      29,L(tail2)
-        lwz     6,0(src)
-        stw     6,0(dst)
-        bf      30,L(tail5)
-        lhz     7,4(src)
-        sth     7,4(dst)
-        bflr    31
-        lbz     8,6(src)
-        stb     8,6(dst)
-        /* Return original DST pointer.  */
-        blr
-
-END_GEN_TB (MEMCPY,TB_TOCLESS)
-libc_hidden_builtin_def (memcpy)
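
For context on what this change removes: at a high level the deleted routine dispatches on size and alignment, copies fewer than 32 bytes with scalar loads/stores, and otherwise aligns the destination to a quadword and streams 128 bytes per iteration with lxvd2x/stxvd2x pairs before finishing the tail. The C below is only a rough sketch of that control flow, not the glibc implementation: the name sketch_memcpy is hypothetical, the block moves are written with library memcpy where the assembly uses vector quadword accesses, and it omits the SRC/DST relative-alignment check that selects the L(copy_GE_32_unaligned) path (see the second sketch further down).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* sketch_memcpy: hypothetical outline of the deleted routine's control flow,
   not the glibc code.  Block moves use library memcpy where the assembly
   issues pairs of lxvd2x/stxvd2x quadword accesses.  */
void *
sketch_memcpy (void *dstp, const void *srcp, size_t len)
{
  unsigned char *d = dstp;
  const unsigned char *s = srcp;

  if (len < 32)                         /* L(copy_LT_32): short-move code.  */
    {
      while (len-- > 0)
        *d++ = *s++;
      return dstp;
    }

  /* Prologue: move 0..15 bytes so DST becomes quadword (16-byte) aligned,
     mirroring the byte/halfword/word/doubleword steps keyed off the CR bits.  */
  size_t head = (size_t) (-(uintptr_t) d & 15);
  memcpy (d, s, head);
  d += head;  s += head;  len -= head;

  /* L(aligned_copy): main loop moves 128 bytes per iteration.  */
  while (len >= 128)
    {
      memcpy (d, s, 128);
      d += 128;  s += 128;  len -= 128;
    }

  /* L(aligned_tail): remaining 0..127 bytes in 64/32/16/8/4/2/1 steps.  */
  memcpy (d, s, len);
  return dstp;
}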
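
The L(copy_GE_32_unaligned) path never issues misaligned vector accesses: it loads aligned quadwords from SRC and uses lvsl/lvsr plus vperm to shift two adjacent quadwords into the 16 bytes that actually start at the unaligned address, then stores them to the already-aligned DST. A rough C model of that one realignment step follows; the helper name load_shifted_quadword is made up for illustration, and the aligned over-read it performs is only safe in C if the caller guarantees the extra bytes are readable (the assembly relies on aligned 16-byte loads never crossing a page boundary).

#include <stdint.h>
#include <string.h>

/* Hypothetical model of one lvx/lvx/vperm realignment: produce the 16 bytes
   that start at an arbitrary SRC address using only 16-byte-aligned loads.  */
static void
load_shifted_quadword (unsigned char out[16], const unsigned char *src)
{
  const unsigned char *base =
    (const unsigned char *) ((uintptr_t) src & ~(uintptr_t) 15);
  size_t shift = (uintptr_t) src & 15;  /* what lvsl/lvsr encode as a permute mask */
  unsigned char lo[16], hi[16];

  memcpy (lo, base, 16);                /* stands in for lvx from the aligned base */
  memcpy (hi, base + 16, 16);           /* next aligned quadword; may over-read in C */

  for (size_t i = 0; i < 16; i++)       /* stands in for vperm selecting across lo||hi */
    out[i] = (i + shift < 16) ? lo[i + shift] : hi[i + shift - 16];
}

In the assembly's 32-byte loop the previous quadword is kept live in a vector register (the vor 3,4,4 before the loop and the reuse of vr3/vr4 inside it), so each iteration needs only the new aligned loads rather than reloading both halves as this sketch does.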