From 09dec6c37e3cd967f62795320703647f24545e3e Mon Sep 17 00:00:00 2001 From: "Ryan S. Arnold" Date: Tue, 30 Oct 2012 17:07:18 -0500 Subject: Correct cacheline size to 32-bytes for ppc405 memset.S (bug 14595). This patch also creates a version of memset.S for the ppc476 processor which uses a 128-byte cacheline size for dcbz insns. --- ports/ChangeLog.powerpc | 9 ++ ports/sysdeps/powerpc/powerpc32/405/memset.S | 12 +-- ports/sysdeps/powerpc/powerpc32/476/memset.S | 154 +++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 6 deletions(-) create mode 100644 ports/sysdeps/powerpc/powerpc32/476/memset.S (limited to 'ports') diff --git a/ports/ChangeLog.powerpc b/ports/ChangeLog.powerpc index 642e7165c6..e22a7333a3 100644 --- a/ports/ChangeLog.powerpc +++ b/ports/ChangeLog.powerpc @@ -1,3 +1,12 @@ +2012-09-25 Jason Gunthorpe + Ryan S. Arnold + + [BZ #14595] + * sysdeps/powerpc/powerpc32/476/memset.S: New file copied from + 405/memset.S to preserve 128-byte cacheline size. + * sysdeps/powerpc/powerpc32/405/memset.S (memset): Fix cacheline size + to 32-bytes for 405, 440, and 464 processors. + 2012-10-19 Roland McGrath * sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/nptl/libc.abilist diff --git a/ports/sysdeps/powerpc/powerpc32/405/memset.S b/ports/sysdeps/powerpc/powerpc32/405/memset.S index e132ce3652..c2ee6c593c 100644 --- a/ports/sysdeps/powerpc/powerpc32/405/memset.S +++ b/ports/sysdeps/powerpc/powerpc32/405/memset.S @@ -1,5 +1,5 @@ -/* Optimized memset implementation for PowerPC476. - Copyright (C) 2010 Free Software Foundation, Inc. +/* Optimized memset for PowerPC405,440,464 (32-byte cacheline). + Copyright (C) 2012 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -104,7 +104,7 @@ L(use_dcbz): add r3,r3,r7 L(skip_string_loop): - clrlwi r8,r6,25 + clrlwi r8,r6,27 srwi. 
r8,r8,4 beq L(dcbz_pre_loop) mtctr r8 @@ -119,14 +119,14 @@ L(word_loop): bdnz L(word_loop) L(dcbz_pre_loop): - srwi r6,r5,7 + srwi r6,r5,5 mtctr r6 addi r7,0,0 L(dcbz_loop): dcbz r3,r7 - addi r3,r3,0x80 - subi r5,r5,0x80 + addi r3,r3,0x20 + subi r5,r5,0x20 bdnz L(dcbz_loop) srwi. r6,r5,4 beq L(postword2_count_loop) diff --git a/ports/sysdeps/powerpc/powerpc32/476/memset.S b/ports/sysdeps/powerpc/powerpc32/476/memset.S new file mode 100644 index 0000000000..8b5750442b --- /dev/null +++ b/ports/sysdeps/powerpc/powerpc32/476/memset.S @@ -0,0 +1,154 @@ +/* Optimized memset for PowerPC476 (128-byte cacheline). + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +/* memset + + r3:destination address and return address + r4:source integer to copy + r5:byte count + r11:source integer to copy in all 32 bits of reg + r12:temp return address + + Save return address in r12 + If destination is unaligned and count is greater than 255 bytes + set 0-3 bytes to make destination aligned + If count is greater than 255 bytes and setting zero to memory + use dcbz to set memory when we can + otherwise do the following + If 16 or more bytes remain we use a 16-byte (4-word) store loop. + Finally we set 0-15 extra bytes with string store.
*/
+
+EALIGN (BP_SYM (memset), 5, 0)	/* memset: r3 = dest, r4 = fill value, r5 = byte count; returns dest in r3.  */
+	rlwinm	r11,r4,0,24,31	/* r11 = low 8 bits of r4 (the fill byte).  */
+	rlwimi	r11,r4,8,16,23	/* Insert the same byte into bits 16-23: low halfword = byte twice.  */
+	rlwimi	r11,r11,16,0,15	/* Copy the low halfword into the high one: r11 = fill byte x4.  */
+	addi	r12,r3,0	/* r12 = original dest, restored into r3 before returning.  */
+	cmpwi	r5,0x00FF	/* Signed compare of the count against 255.  */
+	ble	L(preword8_count_loop)	/* 255 bytes or fewer: no alignment step, no dcbz.  */
+	cmpwi	r4,0x00	/* Tests the whole of r4, not only its low byte.  */
+	beq	L(use_dcbz)	/* Large count and r4 == 0: take the dcbz path.  */
+	neg	r6,r3
+	clrlwi.	r6,r6,30	/* r6 = (-dest) & 3 = bytes up to the next 4-byte boundary.  */
+	beq	L(preword8_count_loop)	/* Already word aligned.  */
+	addi	r8,0,1	/* r8 = 1, the per-byte decrement applied to r5 below.  */
+	mtctr	r6	/* CTR = number of leading bytes to store.  */
+	subi	r3,r3,1	/* Bias dest by -1 for the update-form store.  */
+
+L(unaligned_bytecopy_loop):	/* Store 1-3 single bytes until dest is word aligned.  */
+	stbu	r11,0x1(r3)	/* Store one fill byte at r3+1 and advance r3 to it.  */
+	subf.	r5,r8,r5	/* r5 -= 1, setting CR0 for the test below.  */
+	beq	L(end_memset)	/* Count exhausted.  */
+	bdnz	L(unaligned_bytecopy_loop)
+	addi	r3,r3,1	/* Undo the -1 bias: r3 = next byte to set.  */
+
+L(preword8_count_loop):	/* Generic path: 16-byte chunks, then a 0-15 byte tail.  */
+	srwi.	r6,r5,4	/* r6 = r5 / 16 = number of 16-byte chunks.  */
+	beq	L(preword2_count_loop)	/* Fewer than 16 bytes left.  */
+	mtctr	r6
+	addi	r3,r3,-4	/* Bias dest by -4 for the stwu stores.  */
+	mr	r8,r11	/* r8, r9, r10 = copies of the fill word.  */
+	mr	r9,r11
+	mr	r10,r11
+
+L(word8_count_loop_no_dcbt):	/* Each iteration stores four words (16 bytes).  */
+	stwu	r8,4(r3)
+	stwu	r9,4(r3)
+	subi	r5,r5,0x10	/* r5 -= 16.  */
+	stwu	r10,4(r3)
+	stwu	r11,4(r3)
+	bdnz	L(word8_count_loop_no_dcbt)
+	addi	r3,r3,4	/* Undo the -4 bias: r3 = next byte to set.  */
+
+L(preword2_count_loop):	/* Tail of the generic path.  */
+	clrlwi.	r7,r5,28	/* r7 = r5 & 15 = trailing byte count.  */
+	beq	L(end_memset)	/* No tail.  */
+	mr	r8,r11	/* r8-r10 (followed by r11) supply the string-store data.  */
+	mr	r9,r11
+	mr	r10,r11
+	mtxer	r7	/* stswx takes its byte count from the XER.  */
+	stswx	r8,0,r3	/* Store r7 bytes at r3 from r8 and the registers after it.  */
+
+L(end_memset):	/* Common exit.  */
+	addi	r3,r12,0	/* Return value = original dest.  */
+	blr
+
+L(use_dcbz):	/* Zero fill of more than 255 bytes; r11 is 0 on this path.  */
+	neg	r6,r3	/* r6 = -dest; reused at L(skip_string_loop).  */
+	clrlwi.	r7,r6,28	/* r7 = (-dest) & 15 = bytes up to the next 16-byte boundary.  */
+	beq	L(skip_string_loop)	/* Already 16-byte aligned.  */
+	mr	r8,r11
+	mr	r9,r11
+	mr	r10,r11
+	subf	r5,r7,r5	/* r5 -= r7.  */
+	mtxer	r7	/* Byte count for the string store.  */
+	stswx	r8,0,r3	/* Store the r7 leading bytes.  */
+	add	r3,r3,r7	/* r3 is now 16-byte aligned.  */
+
+L(skip_string_loop):
+	clrlwi	r8,r6,25	/* r8 = (-original dest) & 127 = distance to a 128-byte boundary.  */
+	srwi.	r8,r8,4	/* r8 /= 16: 16-byte chunks still needed to reach that boundary.  */
+	beq	L(dcbz_pre_loop)	/* Already 128-byte aligned.  */
+	mtctr	r8
+
+L(word_loop):	/* Store 16 bytes per iteration up to the 128-byte boundary.  */
+	stw	r11,0(r3)
+	subi	r5,r5,0x10	/* r5 -= 16.  */
+	stw	r11,4(r3)
+	stw	r11,8(r3)
+	stw	r11,12(r3)
+	addi	r3,r3,0x10	/* r3 += 16.  */
+	bdnz	L(word_loop)
+
+L(dcbz_pre_loop):
+	srwi	r6,r5,7	/* r6 = r5 / 128 = number of whole 128-byte blocks.  */
+	mtctr	r6	/* At least 1: r5 exceeded 255 on entry and at most 127 bytes were used above.  */
+	addi	r7,0,0	/* r7 = 0, the index operand for dcbz.  */
+
+L(dcbz_loop):	/* One dcbz per 128-byte step.  */
+	dcbz	r3,r7	/* Zero the data cache block addressed by r3 + r7.  */
+	addi	r3,r3,0x80	/* r3 += 128.  */
+	subi	r5,r5,0x80	/* r5 -= 128.  */
+	bdnz	L(dcbz_loop)
+	srwi.	r6,r5,4	/* r6 = 16-byte chunks left after the dcbz loop.  */
+	beq	L(postword2_count_loop)	/* Fewer than 16 bytes left.  */
+	mtctr	r6
+
+L(postword8_count_loop):	/* Store 16 bytes per iteration after the dcbz loop.  */
+	stw	r11,0(r3)
+	subi	r5,r5,0x10	/* r5 -= 16.  */
+	stw	r11,4(r3)
+	stw	r11,8(r3)
+	stw	r11,12(r3)
+	addi	r3,r3,0x10	/* r3 += 16.  */
+	bdnz	L(postword8_count_loop)
+
+L(postword2_count_loop):	/* Tail of the dcbz path.  */
+	clrlwi.	r7,r5,28	/* r7 = r5 & 15 = trailing byte count.  */
+	beq	L(end_memset)	/* No tail.  */
+	mr	r8,r11
+	mr	r9,r11
+	mr	r10,r11
+	mtxer	r7	/* Byte count for the string store.  */
+	stswx	r8,0,r3	/* Store the last r7 bytes.  */
+	b	L(end_memset)
+END (BP_SYM (memset))
+libc_hidden_builtin_def (memset)
-- 
cgit 1.4.1