From 057edf90e015117bcb7c7cf2e895359e7244dbf8 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Mon, 18 Jan 2010 12:40:29 -0800
Subject: memcpy for ppc/cell.

---
 sysdeps/powerpc/powerpc32/cell/memcpy.S | 245 ++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
 create mode 100644 sysdeps/powerpc/powerpc32/cell/memcpy.S

diff --git a/sysdeps/powerpc/powerpc32/cell/memcpy.S b/sysdeps/powerpc/powerpc32/cell/memcpy.S
new file mode 100644
index 0000000000..e6c076cbe1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD	6	/* number of cache lines of SRC to prefetch ahead  */
+#define ZERO_AHEAD	4	/* number of cache lines of DST to zero ahead  */
+
+/* memcpy routine optimized for CELL-BE-PPC	v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * With a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks,
+ * the latency to memory is >400 clocks.
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency.
+ * For best performance, instruction forms ending in "." like "andi."
+ * should be avoided, as they are implemented in microcode on CELL.
+ * The code below is loop unrolled for the CELL cache line of 128 bytes.
+ */
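The hunk that follows implements this scheme in assembly: prefetch SRC with dcbt several cache lines ahead, establish each DST line with dcbz so it is never fetched from memory, and copy in wide unrolled strides. As a rough C model of the per-line loop only — copy_lines and LINE are names invented here, dcbt is approximated by __builtin_prefetch, dcbz by inline asm, and all alignment and tail handling is omitted:

    #include <stddef.h>
    #include <stdint.h>

    #define LINE 128                /* CELL cache line size */
    #define PREFETCH_AHEAD 6        /* cache lines of SRC prefetched ahead */
    #define ZERO_AHEAD 4            /* cache lines of DST zeroed ahead */

    /* Copy nlines full cache lines; dst and src are assumed 128-byte
       aligned.  Model only: the real code uses FP doubleword loads and
       stores (lfd/stfd) with a 4-register stride.  */
    static void
    copy_lines (unsigned char *dst, const unsigned char *src, size_t nlines)
    {
      for (size_t i = 0; i < nlines; i++)
        {
          /* Touch SRC far ahead so the loads below hit the L1 cache.  */
          __builtin_prefetch (src + PREFETCH_AHEAD * LINE);
    #ifdef __powerpc__
          /* Establish a DST line ahead of the stores without fetching
             its old contents; never past the region being copied.  */
          if (i + ZERO_AHEAD < nlines)
            asm volatile ("dcbz 0,%0"
                          : : "r" (dst + ZERO_AHEAD * LINE) : "memory");
    #endif
          for (int j = 0; j < LINE; j += 8)   /* 16 doublewords per line */
            *(uint64_t *) (dst + j) = *(const uint64_t *) (src + j);
          src += LINE;
          dst += LINE;
        }
    }

The zeroing has to stay strictly inside the region the loop itself overwrites; the assembly guarantees this because ZERO_AHEAD < PREFETCH_AHEAD and the last PREFETCH_AHEAD lines are copied by the dcbz-free .Lloop2.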
+
+.align 7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+	CALL_MCOUNT
+
+	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
+	cmplwi	cr1,r5,16	/* is size < 16 ?  */
+	mr	r6,r3
+	blt+	cr1,.Lshortcopy
+
+.Lbigcopy:
+	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
+	clrlwi	r8,r8,32-4	/* align to 16-byte boundary  */
+	sub	r7,r4,r3
+	cmplwi	cr0,r8,0
+	beq+	.Ldst_aligned
+
+.Ldst_unaligned:
+	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
+	subf	r5,r8,r5
+
+	bf	cr7*4+3,1f
+	lbzx	r0,r7,r6	/* copy 1 byte  */
+	stb	r0,0(r6)
+	addi	r6,r6,1
+1:	bf	cr7*4+2,2f
+	lhzx	r0,r7,r6	/* copy 2 bytes  */
+	sth	r0,0(r6)
+	addi	r6,r6,2
+2:	bf	cr7*4+1,4f
+	lwzx	r0,r7,r6	/* copy 4 bytes  */
+	stw	r0,0(r6)
+	addi	r6,r6,4
+4:	bf	cr7*4+0,8f
+	lfdx	fp9,r7,r6	/* copy 8 bytes  */
+	stfd	fp9,0(r6)
+	addi	r6,r6,8
+8:
+	add	r4,r7,r6
+
+.Ldst_aligned:
+
+	cmpwi	cr5,r5,128-1
+
+	neg	r7,r6
+	addi	r6,r6,-8	/* prepare for stfdu  */
+	addi	r4,r4,-8	/* prepare for lfdu  */
+
+	clrlwi	r7,r7,32-7	/* align to cacheline boundary  */
+	ble+	cr5,.Llessthancacheline
+
+	cmplwi	cr6,r7,0
+	subf	r5,r7,r5
+	srwi	r7,r7,4		/* divide size by 16  */
+	srwi	r10,r5,7	/* number of cache lines to copy  */
+
+	cmplwi	r10,0
+	li	r11,0		/* number of cachelines to copy with prefetch  */
+	beq	.Lnocacheprefetch
+
+	cmplwi	r10,PREFETCH_AHEAD
+	li	r12,128+8	/* prefetch distance  */
+	ble	.Llessthanmaxprefetch
+
+	subi	r11,r10,PREFETCH_AHEAD
+	li	r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+	mtctr	r10
+
+.LprefetchSRC:
+	dcbt	r12,r4
+	addi	r12,r12,128
+	bdnz	.LprefetchSRC
+
+.Lnocacheprefetch:
+	mtctr	r7
+	cmplwi	cr1,r5,128
+	clrlwi	r5,r5,32-7
+	beq	cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+	lfd	fp9,0x08(r4)
+	lfdu	fp10,0x10(r4)
+	stfd	fp9,0x08(r6)
+	stfdu	fp10,0x10(r6)
+	bdnz	.Laligntocacheline
+
+
+.Lcachelinealigned:		/* copy whole cache lines  */
+
+	blt-	cr1,.Llessthancacheline	/* size < 128  */
+
+.Louterloop:
+	cmpwi	r11,0
+	mtctr	r11
+	beq-	.Lendloop
+
+	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */
+
+.align	4
+	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
+.Lloop:				/* Copy aligned body  */
+	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
+	lfd	fp9, 0x08(r4)
+	dcbz	r11,r6
+	lfd	fp10, 0x10(r4)	/* 4 register stride copy is optimal  */
+	lfd	fp11, 0x18(r4)	/* to hide 1st level cache latency.  */
+	lfd	fp12, 0x20(r4)
+	stfd	fp9, 0x08(r6)
+	stfd	fp10, 0x10(r6)
+	stfd	fp11, 0x18(r6)
+	stfd	fp12, 0x20(r6)
+	lfd	fp9, 0x28(r4)
+	lfd	fp10, 0x30(r4)
+	lfd	fp11, 0x38(r4)
+	lfd	fp12, 0x40(r4)
+	stfd	fp9, 0x28(r6)
+	stfd	fp10, 0x30(r6)
+	stfd	fp11, 0x38(r6)
+	stfd	fp12, 0x40(r6)
+	lfd	fp9, 0x48(r4)
+	lfd	fp10, 0x50(r4)
+	lfd	fp11, 0x58(r4)
+	lfd	fp12, 0x60(r4)
+	stfd	fp9, 0x48(r6)
+	stfd	fp10, 0x50(r6)
+	stfd	fp11, 0x58(r6)
+	stfd	fp12, 0x60(r6)
+	lfd	fp9, 0x68(r4)
+	lfd	fp10, 0x70(r4)
+	lfd	fp11, 0x78(r4)
+	lfdu	fp12, 0x80(r4)
+	stfd	fp9, 0x68(r6)
+	stfd	fp10, 0x70(r6)
+	stfd	fp11, 0x78(r6)
+	stfdu	fp12, 0x80(r6)
+
+	bdnz	.Lloop
+
+.Lendloop:
+	cmpwi	r10,0
+	slwi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
+	beq-	.Lendloop2
+	mtctr	r10
+
+.Lloop2:			/* Copy aligned body  */
+	lfd	fp9, 0x08(r4)
+	lfd	fp10, 0x10(r4)
+	lfd	fp11, 0x18(r4)
+	lfdu	fp12, 0x20(r4)
+	stfd	fp9, 0x08(r6)
+	stfd	fp10, 0x10(r6)
+	stfd	fp11, 0x18(r6)
+	stfdu	fp12, 0x20(r6)
+
+	bdnz	.Lloop2
+.Lendloop2:
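The trip counts driving mtctr above come from carving the byte count into head, whole-line, and tail pieces. A small C sketch of that bookkeeping — the function and variable names are invented; the comments map each value back to its register:

    #include <stddef.h>
    #include <stdint.h>

    #define PREFETCH_AHEAD 6   /* as in the patch */

    static void
    carve (const void *dst, size_t n)  /* n: size left after the 16B head */
    {
      size_t to_line = (-(uintptr_t) dst) & 127;  /* r7 (before srwi):
                                                     bytes to the 128-byte
                                                     boundary, copied first
                                                     in 16-byte chunks */
      n -= to_line;
      size_t lines  = n >> 7;                     /* r10: whole cache lines */
      size_t tail   = n & 127;                    /* r5: leftover, < 128    */
      size_t warmup = lines < PREFETCH_AHEAD ? lines : PREFETCH_AHEAD;
      size_t main_trips = lines - warmup;         /* r11: .Lloop iterations */
      /* warmup lines are prefetched up front (.LprefetchSRC); main_trips
         lines run through .Lloop with dcbt/dcbz; the last warmup lines run
         through .Lloop2 as 4*warmup 32-byte chunks; tail is copied 16 bytes
         at a time and the final 0-15 bytes via the cr7 bit trick.  */
      (void) tail;
      (void) main_trips;
    }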
+
+.Llessthancacheline:		/* less than a cache line to do ?  */
+	cmplwi	cr0,r5,16
+	srwi	r7,r5,4		/* divide size by 16  */
+	blt-	.Ldo_lt16
+	mtctr	r7
+
+.Lcopy_remaining:
+	lfd	fp9,0x08(r4)
+	lfdu	fp10,0x10(r4)
+	stfd	fp9,0x08(r6)
+	stfdu	fp10,0x10(r6)
+	bdnz	.Lcopy_remaining
+
+.Ldo_lt16:			/* less than 16 ?  */
+	cmplwi	cr0,r5,0	/* copy remaining bytes (0-15)  */
+	beqlr+			/* no rest to copy  */
+	addi	r4,r4,8
+	addi	r6,r6,8
+
+.Lshortcopy:			/* SIMPLE COPY to handle size <= 15 bytes  */
+	mtcrf	0x01,r5
+	sub	r7,r4,r6
+	bf-	cr7*4+0,8f
+	lfdx	fp9,r7,r6	/* copy 8 bytes  */
+	stfd	fp9,0(r6)
+	addi	r6,r6,8
+8:
+	bf	cr7*4+1,4f
+	lwzx	r0,r7,r6	/* copy 4 bytes  */
+	stw	r0,0(r6)
+	addi	r6,r6,4
+4:
+	bf	cr7*4+2,2f
+	lhzx	r0,r7,r6	/* copy 2 bytes  */
+	sth	r0,0(r6)
+	addi	r6,r6,2
+2:
+	bf	cr7*4+3,1f
+	lbzx	r0,r7,r6	/* copy 1 byte  */
+	stb	r0,0(r6)
+1:	blr
+
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)
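The sub-16-byte tails in .Ldst_unaligned and .Lshortcopy both use mtcrf to drop the low four bits of the remaining length into cr7, then test one bit per power of two, so every possible size is handled with straight-line code and no loop. An equivalent C sketch of the .Lshortcopy path — tail_copy is an invented name, and the fixed-size memcpy calls compile to single loads and stores:

    #include <stddef.h>
    #include <string.h>

    static void
    tail_copy (unsigned char *dst, const unsigned char *src, size_t n)
    {
      if (n & 8) { memcpy (dst, src, 8); dst += 8; src += 8; }  /* cr7*4+0 */
      if (n & 4) { memcpy (dst, src, 4); dst += 4; src += 4; }  /* cr7*4+1 */
      if (n & 2) { memcpy (dst, src, 2); dst += 2; src += 2; }  /* cr7*4+2 */
      if (n & 1) *dst = *src;                                   /* cr7*4+3 */
    }

In the unaligned-head path the same bits are tested in the opposite order (1, 2, 4, 8 bytes), so the destination pointer steps up to the next 16-byte boundary rather than down from the remaining length.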