diff options
author | Ulrich Drepper <drepper@redhat.com> | 2010-01-18 12:40:29 -0800 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-01-18 12:40:29 -0800 |
commit | 057edf90e015117bcb7c7cf2e895359e7244dbf8 (patch) | |
tree | 37852c217d90cd1ad5e745a4adc1494276562d32 /sysdeps/powerpc/powerpc64 | |
parent | f87d0dac8b79920b34f0a7878d2be711a7cdf537 (diff) | |
download | glibc-057edf90e015117bcb7c7cf2e895359e7244dbf8.tar.gz glibc-057edf90e015117bcb7c7cf2e895359e7244dbf8.tar.xz glibc-057edf90e015117bcb7c7cf2e895359e7244dbf8.zip |
memcpy for ppc/cell.
Diffstat (limited to 'sysdeps/powerpc/powerpc64')
-rw-r--r-- | sysdeps/powerpc/powerpc64/cell/memcpy.S | 245 |
1 files changed, 245 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/cell/memcpy.S b/sysdeps/powerpc/powerpc64/cell/memcpy.S new file mode 100644 index 0000000000..2a00a6ed52 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/cell/memcpy.S @@ -0,0 +1,245 @@ +/* Optimized memcpy implementation for CELL BE PowerPC. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +#define PREFETCH_AHEAD 6 /* no cache lines SRC prefetching ahead */ +#define ZERO_AHEAD 4 /* no cache lines DST zeroing ahead */ + +/* memcpy routine optimized for CELL-BE-PPC v2.0 + * + * The CELL PPC core has 1 integer unit and 1 load/store unit + * CELL: + * 1st level data cache = 32K + * 2nd level data cache = 512K + * 3rd level data cache = 0K + * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks, + * latency to memory is >400 clocks + * To improve copy performance we need to prefetch source data + * far ahead to hide this latency + * For best performance instructionforms ending in "." like "andi." + * should be avoided as the are implemented in microcode on CELL. + * The below code is loop unrolled for the CELL cache line of 128 bytes + */ + +.align 7 + +EALIGN (BP_SYM (memcpy), 5, 0) + CALL_MCOUNT 3 + + dcbt 0,r4 /* Prefetch ONE SRC cacheline */ + cmpldi cr1,r5,16 /* is size < 16 ? */ + mr r6,r3 + blt+ cr1,.Lshortcopy + +.Lbigcopy: + neg r8,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + clrldi r8,r8,64-4 /* aling to 16byte boundary */ + sub r7,r4,r3 + cmpldi cr0,r8,0 + beq+ .Ldst_aligned + +.Ldst_unaligned: + mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ + subf r5,r8,r5 + + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) + addi r6,r6,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: bf cr7*4+0,8f + ldx r0,r7,r6 /* copy 8 byte */ + std r0,0(r6) + addi r6,r6,8 +8: + add r4,r7,r6 + +.Ldst_aligned: + + cmpdi cr5,r5,128-1 + + neg r7,r6 + addi r6,r6,-8 /* prepare for stdu */ + addi r4,r4,-8 /* prepare for ldu */ + + clrldi r7,r7,64-7 /* align to cacheline boundary */ + ble+ cr5,.Llessthancacheline + + cmpldi cr6,r7,0 + subf r5,r7,r5 + srdi r7,r7,4 /* divide size by 16 */ + srdi r10,r5,7 /* number of cache lines to copy */ + + cmpldi r10,0 + li r11,0 /* number cachelines to copy with prefetch */ + beq .Lnocacheprefetch + + cmpldi r10,PREFETCH_AHEAD + li r12,128+8 /* prefetch distance */ + ble .Llessthanmaxprefetch + + subi r11,r10,PREFETCH_AHEAD + li r10,PREFETCH_AHEAD + +.Llessthanmaxprefetch: + mtctr r10 + +.LprefetchSRC: + dcbt r12,r4 + addi r12,r12,128 + bdnz .LprefetchSRC + +.Lnocacheprefetch: + mtctr r7 + cmpldi cr1,r5,128 + clrldi r5,r5,64-7 + beq cr6,.Lcachelinealigned + +.Laligntocacheline: + ld r9,0x08(r4) + ldu r7,0x10(r4) + std r9,0x08(r6) + stdu r7,0x10(r6) + bdnz .Laligntocacheline + + +.Lcachelinealigned: /* copy while cache lines */ + + blt- cr1,.Llessthancacheline /* size <128 */ + +.Louterloop: + cmpdi r11,0 + mtctr r11 + beq- .Lendloop + + li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ + +.align 4 + /* Copy whole cachelines, optimized by prefetching SRC cacheline */ +.Lloop: /* Copy aligned body */ + dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ + ld r9, 0x08(r4) + dcbz r11,r6 + ld r7, 0x10(r4) /* 4 register stride copy is optimal */ + ld r8, 0x18(r4) /* to hide 1st level cache lantency. */ + ld r0, 0x20(r4) + std r9, 0x08(r6) + std r7, 0x10(r6) + std r8, 0x18(r6) + std r0, 0x20(r6) + ld r9, 0x28(r4) + ld r7, 0x30(r4) + ld r8, 0x38(r4) + ld r0, 0x40(r4) + std r9, 0x28(r6) + std r7, 0x30(r6) + std r8, 0x38(r6) + std r0, 0x40(r6) + ld r9, 0x48(r4) + ld r7, 0x50(r4) + ld r8, 0x58(r4) + ld r0, 0x60(r4) + std r9, 0x48(r6) + std r7, 0x50(r6) + std r8, 0x58(r6) + std r0, 0x60(r6) + ld r9, 0x68(r4) + ld r7, 0x70(r4) + ld r8, 0x78(r4) + ldu r0, 0x80(r4) + std r9, 0x68(r6) + std r7, 0x70(r6) + std r8, 0x78(r6) + stdu r0, 0x80(r6) + + bdnz .Lloop + +.Lendloop: + cmpdi r10,0 + sldi r10,r10,2 /* adjust from 128 to 32 byte stride */ + beq- .Lendloop2 + mtctr r10 + +.Lloop2: /* Copy aligned body */ + ld r9, 0x08(r4) + ld r7, 0x10(r4) + ld r8, 0x18(r4) + ldu r0, 0x20(r4) + std r9, 0x08(r6) + std r7, 0x10(r6) + std r8, 0x18(r6) + stdu r0, 0x20(r6) + + bdnz .Lloop2 +.Lendloop2: + +.Llessthancacheline: /* less than cache to do ? */ + cmpldi cr0,r5,16 + srdi r7,r5,4 /* divide size by 16 */ + blt- .Ldo_lt16 + mtctr r7 + +.Lcopy_remaining: + ld r8,0x08(r4) + ldu r7,0x10(r4) + std r8,0x08(r6) + stdu r7,0x10(r6) + bdnz .Lcopy_remaining + +.Ldo_lt16: /* less than 16 ? */ + cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */ + beqlr+ /* no rest to copy */ + addi r4,r4,8 + addi r6,r6,8 + +.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */ + mtcrf 0x01,r5 + sub r7,r4,r6 + bf- cr7*4+0,8f + ldx r0,r7,r6 /* copy 8 byte */ + std r0,0(r6) + addi r6,r6,8 +8: + bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: + bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) +1: blr + +END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) +libc_hidden_builtin_def (memcpy) |