From 5025581e1c66a184a587ab1bd99cd168e8fb7770 Mon Sep 17 00:00:00 2001 From: Will Schmidt Date: Wed, 7 Sep 2011 21:54:41 -0400 Subject: power7 memcpy VSX optimizations --- sysdeps/powerpc/powerpc32/power7/memcpy.S | 88 +++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 15 deletions(-) (limited to 'sysdeps/powerpc/powerpc32/power7') diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S index f0c332f1ba..ec7055745c 100644 --- a/sysdeps/powerpc/powerpc32/power7/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S @@ -1,5 +1,5 @@ /* Optimized memcpy implementation for PowerPC32/POWER7. - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Luis Machado . This file is part of the GNU C Library. @@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont): stfd 6,0(3) addi 10,3,8 +L(aligned_copy): + /* Main aligned copy loop. Copies up to 128-bytes at a time. */ .align 4 -4: /* Main aligned copy loop. Copies 32-bytes at a time. */ - lfd 6,0(11) - lfd 7,8(11) - lfd 8,16(11) - lfd 0,24(11) - addi 11,11,32 +4: + /* check for any 32-byte or 64-byte lumps that are outside of a + nice 128-byte range. R8 contains the number of 32-byte + lumps, so drop this into the CR, and use the SO/EQ bits to help + handle the 32- or 64- byte lumps. Then handle the rest with an + unrolled 128-bytes-at-a-time copy loop. */ + mtocrf 1,8 + li 6,16 # 16() index + li 7,32 # 32() index + li 8,48 # 48() index + +L(aligned_32byte): + /* if the SO bit (indicating a 32-byte lump) is not set, move along. */ + bns cr7,L(aligned_64byte) + lxvd2x 6,0,11 + lxvd2x 7,11,6 + addi 11,11,32 + stxvd2x 6,0,10 + stxvd2x 7,10,6 + addi 10,10,32 + +L(aligned_64byte): + /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */ + bne cr7,L(aligned_128setup) + lxvd2x 6,0,11 + lxvd2x 7,11,6 + lxvd2x 8,11,7 + lxvd2x 9,11,8 + addi 11,11,64 + stxvd2x 6,0,10 + stxvd2x 7,10,6 + stxvd2x 8,10,7 + stxvd2x 9,10,8 + addi 10,10,64 + +L(aligned_128setup): + /* Set up for the 128-byte at a time copy loop. */ + srwi 8,31,7 + cmpwi 8,0 # Any 4x lumps left? + beq 3f # if not, move along. + lxvd2x 6,0,11 + lxvd2x 7,11,6 + mtctr 8 # otherwise, load the ctr and begin. + li 8,48 # 48() index + b L(aligned_128loop) + +L(aligned_128head): + /* for the 2nd + iteration of this loop. */ + lxvd2x 6,0,11 + lxvd2x 7,11,6 +L(aligned_128loop): + lxvd2x 8,11,7 + lxvd2x 9,11,8 + stxvd2x 6,0,10 + addi 11,11,64 + stxvd2x 7,10,6 + stxvd2x 8,10,7 + stxvd2x 9,10,8 + lxvd2x 6,0,11 + lxvd2x 7,11,6 + addi 10,10,64 + lxvd2x 8,11,7 + lxvd2x 9,11,8 + addi 11,11,64 + stxvd2x 6,0,10 + stxvd2x 7,10,6 + stxvd2x 8,10,7 + stxvd2x 9,10,8 + addi 10,10,64 + bdnz L(aligned_128head) - stfd 6,0(10) - stfd 7,8(10) - stfd 8,16(10) - stfd 0,24(10) - addi 10,10,32 - bdnz 4b 3: - /* Check for tail bytes. */ - clrrwi 0,31,3 mtcrf 0x01,31 beq cr6,0f -- cgit 1.4.1