From a88f47a72f4ca65832584a3f5a591690f6675092 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 12 Jul 2007 18:38:01 +0000 Subject: * sysdeps/powerpc/powerpc32/power6/memset.S: Update comments. Specify .machine power6 to get ISA-V2.0 branch hints. Unroll loops and avoid branch misspredicts for > 31 bytes memset case. * sysdeps/powerpc/powerpc64/power6/memset.S: Likewise. Remove toc ref to __cache_line_size. * sysdeps/powerpc/powerpc32/power4/memcmp.S: Specify .machine power4 to get ISA-V2.0 branch hints. * sysdeps/powerpc/powerpc32/power4/memcpy.S: Likewise * sysdeps/powerpc/powerpc32/power4/memset.S: Likewise * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memcmp.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memset.S: Likewise. Remove toc ref to __cache_line_size. * sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S: Include math_ldbl_opt.h. --- sysdeps/powerpc/powerpc32/power4/memcmp.S | 1 + sysdeps/powerpc/powerpc32/power4/memcpy.S | 1 + sysdeps/powerpc/powerpc32/power4/memset.S | 1 + sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S | 1 + sysdeps/powerpc/powerpc32/power6/memcpy.S | 1 + sysdeps/powerpc/powerpc32/power6/memset.S | 449 ++++++++++++++++++++---- sysdeps/powerpc/powerpc64/power4/memcmp.S | 1 + sysdeps/powerpc/powerpc64/power4/memcpy.S | 1 + sysdeps/powerpc/powerpc64/power4/memset.S | 7 +- sysdeps/powerpc/powerpc64/power6/memset.S | 274 +++++++++++---- 10 files changed, 592 insertions(+), 145 deletions(-) (limited to 'sysdeps/powerpc') diff --git a/sysdeps/powerpc/powerpc32/power4/memcmp.S b/sysdeps/powerpc/powerpc32/power4/memcmp.S index 4715302739..75b328403a 100644 --- a/sysdeps/powerpc/powerpc32/power4/memcmp.S +++ b/sysdeps/powerpc/powerpc32/power4/memcmp.S @@ -23,6 +23,7 @@ /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */ + .machine power4 EALIGN (BP_SYM(memcmp), 4, 0) CALL_MCOUNT diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S index c48db2f3df..73020c6da8 100644 --- a/sysdeps/powerpc/powerpc32/power4/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S @@ -34,6 +34,7 @@ possible when both source and destination are word aligned. Each case has an optimized unrolled loop. */ + .machine power4 EALIGN (BP_SYM (memcpy), 5, 0) CALL_MCOUNT diff --git a/sysdeps/powerpc/powerpc32/power4/memset.S b/sysdeps/powerpc/powerpc32/power4/memset.S index b07ed3c2d3..5dd1d943cf 100644 --- a/sysdeps/powerpc/powerpc32/power4/memset.S +++ b/sysdeps/powerpc/powerpc32/power4/memset.S @@ -28,6 +28,7 @@ cache line (1024 bits). There is a special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ + .machine power4 EALIGN (BP_SYM (memset), 5, 0) CALL_MCOUNT diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S index bcbc1fc3eb..528607602d 100644 --- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S +++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S @@ -18,6 +18,7 @@ 02110-1301 USA. */ #include +#include /* long long int[r3, r4] __llrint (double x[fp1]) */ ENTRY (__llrint) diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S index e8d56eb135..ba45fd250c 100644 --- a/sysdeps/powerpc/powerpc32/power6/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S @@ -34,6 +34,7 @@ possible when both source and destination are word aligned. Each case has an optimized unrolled loop. */ + .machine power6 EALIGN (BP_SYM (memcpy), 5, 0) CALL_MCOUNT diff --git a/sysdeps/powerpc/powerpc32/power6/memset.S b/sysdeps/powerpc/powerpc32/power6/memset.S index 71c1209fa4..10fb7b9786 100644 --- a/sysdeps/powerpc/powerpc32/power6/memset.S +++ b/sysdeps/powerpc/powerpc32/power6/memset.S @@ -1,5 +1,5 @@ -/* Optimized memset implementation for PowerPC64. - Copyright (C) 1997,99, 2000,02,03, 2006 Free Software Foundation, Inc. +/* Optimized 32-bit memset implementation for POWER6. + Copyright (C) 1997,99, 2000,02,03,06,2007 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -28,7 +28,8 @@ cache line (1024 bits). There is a special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ -EALIGN (BP_SYM (memset), 5, 0) + .machine power6 +EALIGN (BP_SYM (memset), 7, 0) CALL_MCOUNT #define rTMP r0 @@ -41,15 +42,13 @@ EALIGN (BP_SYM (memset), 5, 0) #define rMEMP2 r8 #define rNEG64 r8 /* Constant -64 for clearing with dcbz. */ -#define rCLS r8 /* Cache line size (known to be 128). */ -#define rCLM r9 /* Cache line size mask to check for cache alignment. */ +#define rMEMP3 r9 /* Alt mem pointer. */ L(_memset): /* Take care of case for size <= 4. */ cmplwi cr1, rLEN, 4 andi. rALIGN, rMEMP0, 3 mr rMEMP, rMEMP0 ble- cr1, L(small) - /* Align to word boundary. */ cmplwi cr5, rLEN, 31 rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */ @@ -82,6 +81,7 @@ L(aligned): bf 28, L(a1) stw rCHR, -4(rMEMP2) stwu rCHR, -8(rMEMP2) + nop L(a1): blt cr1, L(a2) stw rCHR, -4(rMEMP2) stw rCHR, -8(rMEMP2) @@ -90,7 +90,7 @@ L(a1): blt cr1, L(a2) L(a2): bf 29, L(caligned) stw rCHR, -4(rMEMP2) - .align 4 + .align 3 /* Now aligned to a 32 byte boundary. */ L(caligned): cmplwi cr1, rCHR, 0 @@ -98,83 +98,394 @@ L(caligned): mtcrf 0x01, rLEN beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ L(nondcbz): - srwi rTMP, rALIGN, 5 - mtctr rTMP beq L(medium) /* We may not actually get to do a full line. */ - clrlwi. rLEN, rLEN, 27 - add rMEMP, rMEMP, rALIGN - li rNEG64, -0x40 - bdz L(cloopdone) + nop +/* Storing a non-zero "c" value. We are aligned at a sector (32-byte) + boundary may not be at cache line (128-byte) boundary. */ +L(nzloopstart): +/* memset in 32-byte chunks until we get to a cache line boundary. + If rLEN is less then the distance to the next cache-line boundary use + cacheAligned1 code to finish the tail. */ + cmplwi cr1,rLEN,128 + + andi. rTMP,rMEMP,127 + blt cr1,L(cacheAligned1) + addi rMEMP3,rMEMP,32 + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 + stw rCHR,0(rMEMP) + stw rCHR,4(rMEMP) + stw rCHR,8(rMEMP) + stw rCHR,12(rMEMP) + stw rCHR,16(rMEMP) + stw rCHR,20(rMEMP) + addi rMEMP,rMEMP,32 + andi. rTMP,rMEMP3,127 + stw rCHR,-8(rMEMP3) + stw rCHR,-4(rMEMP3) + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 + stw rCHR,0(rMEMP3) + stw rCHR,4(rMEMP3) + addi rMEMP,rMEMP,32 + stw rCHR,8(rMEMP3) + stw rCHR,12(rMEMP3) + andi. rTMP,rMEMP,127 + stw rCHR,16(rMEMP3) + stw rCHR,20(rMEMP3) + stw rCHR,24(rMEMP3) + stw rCHR,28(rMEMP3) + + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 +/* At this point we can overrun the store queue (pipe reject) so it is + time to slow things down. The store queue can merge two adjacent + stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. + So we add "group ending nops" to guarantee that we dispatch only two + stores every other cycle. */ + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,32(rMEMP3) + stw rCHR,36(rMEMP3) + addi rMEMP,rMEMP,32 + cmplwi cr1,rLEN,128 + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,40(rMEMP3) + stw rCHR,44(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,48(rMEMP3) + stw rCHR,52(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,56(rMEMP3) + stw rCHR,60(rMEMP3) + blt cr1,L(cacheAligned1) + b L(nzCacheAligned) + +/* Now we are aligned to the cache line and can use dcbtst. */ + .align 5 +L(nzCacheAligned): + cmplwi cr1,rLEN,128 + cmplwi cr6,rLEN,256 + blt cr1,L(cacheAligned1) + blt cr6,L(nzCacheAligned128) .align 4 -L(c3): dcbtst rNEG64, rMEMP - stw rCHR, -4(rMEMP) - stw rCHR, -8(rMEMP) - stw rCHR, -12(rMEMP) - stw rCHR, -16(rMEMP) - stw rCHR, -20(rMEMP) - stw rCHR, -24(rMEMP) - stw rCHR, -28(rMEMP) - stwu rCHR, -32(rMEMP) - bdnz L(c3) -L(cloopdone): - stw rCHR, -4(rMEMP) - stw rCHR, -8(rMEMP) - stw rCHR, -12(rMEMP) - stw rCHR, -16(rMEMP) - cmplwi cr1, rLEN, 16 - stw rCHR, -20(rMEMP) - stw rCHR, -24(rMEMP) - stw rCHR, -28(rMEMP) - stwu rCHR, -32(rMEMP) - beqlr - add rMEMP, rMEMP, rALIGN - b L(medium_tail2) +L(nzCacheAligned128): + nop + addi rMEMP3,rMEMP,64 + stw rCHR,0(rMEMP) + stw rCHR,4(rMEMP) + stw rCHR,8(rMEMP) + stw rCHR,12(rMEMP) + stw rCHR,16(rMEMP) + stw rCHR,20(rMEMP) + stw rCHR,24(rMEMP) + stw rCHR,28(rMEMP) + stw rCHR,32(rMEMP) + stw rCHR,36(rMEMP) + stw rCHR,40(rMEMP) + stw rCHR,44(rMEMP) + stw rCHR,48(rMEMP) + stw rCHR,52(rMEMP) + stw rCHR,56(rMEMP) + stw rCHR,60(rMEMP) + addi rMEMP,rMEMP3,64 + addi rLEN,rLEN,-128 +/* At this point we can overrun the store queue (pipe reject) so it is + time to slow things down. The store queue can merge two adjacent + stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. + So we add "group ending nops" to guarantee that we dispatch only one + store per cycle. */ + stw rCHR,0(rMEMP3) + ori r1,r1,0 + stw rCHR,4(rMEMP3) + ori r1,r1,0 + stw rCHR,8(rMEMP3) + ori r1,r1,0 + stw rCHR,12(rMEMP3) + ori r1,r1,0 + stw rCHR,16(rMEMP3) + ori r1,r1,0 + stw rCHR,20(rMEMP3) + ori r1,r1,0 + stw rCHR,24(rMEMP3) + ori r1,r1,0 + stw rCHR,28(rMEMP3) + ori r1,r1,0 + stw rCHR,32(rMEMP3) + ori r1,r1,0 + stw rCHR,36(rMEMP3) + ori r1,r1,0 + stw rCHR,40(rMEMP3) + ori r1,r1,0 + stw rCHR,44(rMEMP3) + ori r1,r1,0 + stw rCHR,48(rMEMP3) + ori r1,r1,0 + stw rCHR,52(rMEMP3) + ori r1,r1,0 + stw rCHR,56(rMEMP3) + ori r1,r1,0 + stw rCHR,60(rMEMP3) + blt cr6,L(cacheAligned1) +#ifndef NOT_IN_libc + lfd 0,-128(rMEMP) +#endif + b L(nzCacheAligned256) + .align 5 +L(nzCacheAligned256): + cmplwi cr1,rLEN,256 + addi rMEMP3,rMEMP,64 +#ifdef NOT_IN_libc +/* When we are not in libc we should use only GPRs to avoid the FPU lock + interrupt. */ + stw rCHR,0(rMEMP) + stw rCHR,4(rMEMP) + stw rCHR,8(rMEMP) + stw rCHR,12(rMEMP) + stw rCHR,16(rMEMP) + stw rCHR,20(rMEMP) + stw rCHR,24(rMEMP) + stw rCHR,28(rMEMP) + stw rCHR,32(rMEMP) + stw rCHR,36(rMEMP) + stw rCHR,40(rMEMP) + stw rCHR,44(rMEMP) + stw rCHR,48(rMEMP) + stw rCHR,52(rMEMP) + stw rCHR,56(rMEMP) + stw rCHR,60(rMEMP) + addi rMEMP,rMEMP3,64 + addi rLEN,rLEN,-128 + stw rCHR,0(rMEMP3) + stw rCHR,4(rMEMP3) + stw rCHR,8(rMEMP3) + stw rCHR,12(rMEMP3) + stw rCHR,16(rMEMP3) + stw rCHR,20(rMEMP3) + stw rCHR,24(rMEMP3) + stw rCHR,28(rMEMP3) + stw rCHR,32(rMEMP3) + stw rCHR,36(rMEMP3) + stw rCHR,40(rMEMP3) + stw rCHR,44(rMEMP3) + stw rCHR,48(rMEMP3) + stw rCHR,52(rMEMP3) + stw rCHR,56(rMEMP3) + stw rCHR,60(rMEMP3) +#else +/* We are in libc and this is a long memset so we can use FPRs and can afford + occasional FPU locked interrupts. */ + stfd 0,0(rMEMP) + stfd 0,8(rMEMP) + stfd 0,16(rMEMP) + stfd 0,24(rMEMP) + stfd 0,32(rMEMP) + stfd 0,40(rMEMP) + stfd 0,48(rMEMP) + stfd 0,56(rMEMP) + addi rMEMP,rMEMP3,64 + addi rLEN,rLEN,-128 + stfd 0,0(rMEMP3) + stfd 0,8(rMEMP3) + stfd 0,16(rMEMP3) + stfd 0,24(rMEMP3) + stfd 0,32(rMEMP3) + stfd 0,40(rMEMP3) + stfd 0,48(rMEMP3) + stfd 0,56(rMEMP3) +#endif + bge cr1,L(nzCacheAligned256) + dcbtst 0,rMEMP + b L(cacheAligned1) - .align 5 -/* Clear lines of memory in 128-byte chunks. */ + .align 4 +/* Storing a zero "c" value. We are aligned at a sector (32-byte) + boundary but may not be at cache line (128-byte) boundary. If the + remaining length spans a full cache line we can use the Data cache + block zero instruction. */ L(zloopstart): -/* If the remaining length is less the 32 bytes, don't bother getting - the cache line size. */ +/* memset in 32-byte chunks until we get to a cache line boundary. + If rLEN is less then the distance to the next cache-line boundary use + cacheAligned1 code to finish the tail. */ + cmplwi cr1,rLEN,128 beq L(medium) - li rCLS,128 /* cache line size is 128 */ - dcbt 0,rMEMP L(getCacheAligned): - cmplwi cr1,rLEN,32 andi. rTMP,rMEMP,127 - blt cr1,L(handletail32) + blt cr1,L(cacheAligned1) + addi rMEMP3,rMEMP,32 + beq L(cacheAligned) + addi rLEN,rLEN,-32 + stw rCHR,0(rMEMP) + stw rCHR,4(rMEMP) + stw rCHR,8(rMEMP) + stw rCHR,12(rMEMP) + stw rCHR,16(rMEMP) + stw rCHR,20(rMEMP) + addi rMEMP,rMEMP,32 + andi. rTMP,rMEMP3,127 + stw rCHR,-8(rMEMP3) + stw rCHR,-4(rMEMP3) +L(getCacheAligned2): beq L(cacheAligned) + addi rLEN,rLEN,-32 addi rMEMP,rMEMP,32 + stw rCHR,0(rMEMP3) + stw rCHR,4(rMEMP3) + stw rCHR,8(rMEMP3) + stw rCHR,12(rMEMP3) + andi. rTMP,rMEMP,127 + nop + stw rCHR,16(rMEMP3) + stw rCHR,20(rMEMP3) + stw rCHR,24(rMEMP3) + stw rCHR,28(rMEMP3) +L(getCacheAligned3): + beq L(cacheAligned) +/* At this point we can overrun the store queue (pipe reject) so it is + time to slow things down. The store queue can merge two adjacent + stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. + So we add "group ending nops" to guarantee that we dispatch only two + stores every other cycle. */ addi rLEN,rLEN,-32 - stw rCHR,-32(rMEMP) - stw rCHR,-28(rMEMP) - stw rCHR,-24(rMEMP) - stw rCHR,-20(rMEMP) - stw rCHR,-16(rMEMP) - stw rCHR,-12(rMEMP) - stw rCHR,-8(rMEMP) - stw rCHR,-4(rMEMP) - b L(getCacheAligned) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,32(rMEMP3) + stw rCHR,36(rMEMP3) + addi rMEMP,rMEMP,32 + cmplwi cr1,rLEN,128 + ori r1,r1,0 + stw rCHR,40(rMEMP3) + stw rCHR,44(rMEMP3) + cmplwi cr6,rLEN,256 + li rMEMP2,128 + ori r1,r1,0 + stw rCHR,48(rMEMP3) + stw rCHR,52(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,56(rMEMP3) + stw rCHR,60(rMEMP3) + blt cr1,L(cacheAligned1) + blt cr6,L(cacheAligned128) + b L(cacheAlignedx) /* Now we are aligned to the cache line and can use dcbz. */ .align 4 L(cacheAligned): - cmplw cr1,rLEN,rCLS - blt cr1,L(handletail32) + cmplwi cr1,rLEN,128 + cmplwi cr6,rLEN,256 + blt cr1,L(cacheAligned1) + li rMEMP2,128 +L(cacheAlignedx): + cmpldi cr5,rLEN,640 + blt cr6,L(cacheAligned128) + bgt cr5,L(cacheAligned512) + cmplwi cr6,rLEN,512 dcbz 0,rMEMP - subf rLEN,rCLS,rLEN - add rMEMP,rMEMP,rCLS - b L(cacheAligned) + cmplwi cr1,rLEN,384 + dcbz rMEMP2,rMEMP + addi rMEMP,rMEMP,256 + addi rLEN,rLEN,-256 + blt cr1,L(cacheAligned1) + blt cr6,L(cacheAligned128) + b L(cacheAligned256) + .align 5 +/* A simple loop for the longer (>640 bytes) lengths. This form limits + the branch miss-predicted to exactly 1 at loop exit.*/ +L(cacheAligned512): + cmpli cr1,rLEN,128 + blt cr1,L(cacheAligned1) + dcbz 0,rMEMP + addi rLEN,rLEN,-128 + addi rMEMP,rMEMP,128 + b L(cacheAligned512) + .align 5 +L(cacheAligned256): + cmplwi cr6,rLEN,512 + dcbz 0,rMEMP + cmplwi cr1,rLEN,384 + dcbz rMEMP2,rMEMP + addi rMEMP,rMEMP,256 + addi rLEN,rLEN,-256 + bge cr6,L(cacheAligned256) + blt cr1,L(cacheAligned1) + .align 4 +L(cacheAligned128): + dcbz 0,rMEMP + addi rMEMP,rMEMP,128 + addi rLEN,rLEN,-128 + .align 4 +L(cacheAligned1): + cmplwi cr1,rLEN,32 + blt cr1,L(handletail32) + addi rMEMP3,rMEMP,32 + addi rLEN,rLEN,-32 + stw rCHR,0(rMEMP) + stw rCHR,4(rMEMP) + stw rCHR,8(rMEMP) + stw rCHR,12(rMEMP) + stw rCHR,16(rMEMP) + stw rCHR,20(rMEMP) + addi rMEMP,rMEMP,32 + cmplwi cr1,rLEN,32 + stw rCHR,-8(rMEMP3) + stw rCHR,-4(rMEMP3) +L(cacheAligned2): + blt cr1,L(handletail32) + addi rLEN,rLEN,-32 + stw rCHR,0(rMEMP3) + stw rCHR,4(rMEMP3) + stw rCHR,8(rMEMP3) + stw rCHR,12(rMEMP3) + addi rMEMP,rMEMP,32 + cmplwi cr1,rLEN,32 + stw rCHR,16(rMEMP3) + stw rCHR,20(rMEMP3) + stw rCHR,24(rMEMP3) + stw rCHR,28(rMEMP3) + nop +L(cacheAligned3): + blt cr1,L(handletail32) +/* At this point we can overrun the store queue (pipe reject) so it is + time to slow things down. The store queue can merge two adjacent + stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU. + So we add "group ending nops" to guarantee that we dispatch only two + stores every other cycle. */ + ori r1,r1,0 + ori r1,r1,0 + addi rMEMP,rMEMP,32 + addi rLEN,rLEN,-32 + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,32(rMEMP3) + stw rCHR,36(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,40(rMEMP3) + stw rCHR,44(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,48(rMEMP3) + stw rCHR,52(rMEMP3) + ori r1,r1,0 + ori r1,r1,0 + stw rCHR,56(rMEMP3) + stw rCHR,60(rMEMP3) -/* We are here because the cache line size was set and the remainder - (rLEN) is less than the actual cache line size. - So set up the preconditions for L(nondcbz) and go there. */ +/* We are here because the length or remainder (rLEN) is less than the + cache line/sector size and does not justify aggressive loop unrolling. + So set up the preconditions for L(medium) and go there. */ .align 3 L(handletail32): - clrrwi. rALIGN, rLEN, 5 - b L(nondcbz) + cmplwi cr1,rLEN,0 + beqlr cr1 + b L(medium) - .align 5 + .align 4 L(small): /* Memset of 4 bytes or less. */ cmplwi cr5, rLEN, 1 @@ -199,10 +510,10 @@ L(medium_tail): bt- 31, L(medium_31t) bt- 30, L(medium_30t) L(medium_30f): - bt- 29, L(medium_29t) + bt 29, L(medium_29t) L(medium_29f): - bge- cr1, L(medium_27t) - bflr- 28 + bge cr1, L(medium_27t) + bflr 28 stw rCHR, -4(rMEMP) stw rCHR, -8(rMEMP) blr @@ -215,14 +526,14 @@ L(medium_30t): bf- 29, L(medium_29f) L(medium_29t): stwu rCHR, -4(rMEMP) - blt- cr1, L(medium_27f) + blt cr1, L(medium_27f) L(medium_27t): stw rCHR, -4(rMEMP) stw rCHR, -8(rMEMP) stw rCHR, -12(rMEMP) stwu rCHR, -16(rMEMP) L(medium_27f): - bflr- 28 + bflr 28 L(medium_28t): stw rCHR, -4(rMEMP) stw rCHR, -8(rMEMP) diff --git a/sysdeps/powerpc/powerpc64/power4/memcmp.S b/sysdeps/powerpc/powerpc64/power4/memcmp.S index 8f74ca7044..a5e0c758df 100644 --- a/sysdeps/powerpc/powerpc64/power4/memcmp.S +++ b/sysdeps/powerpc/powerpc64/power4/memcmp.S @@ -23,6 +23,7 @@ /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */ + .machine power4 EALIGN (BP_SYM(memcmp), 4, 0) CALL_MCOUNT 3 diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S index 9910ebda82..56f313b4b8 100644 --- a/sysdeps/powerpc/powerpc64/power4/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S @@ -36,6 +36,7 @@ posible when both source and destination are doubleword aligned. Each case has a optimized unrolled loop. */ + .machine power4 EALIGN (BP_SYM (memcpy), 5, 0) CALL_MCOUNT 3 diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S index 17b2d76950..e7a259acdd 100644 --- a/sysdeps/powerpc/powerpc64/power4/memset.S +++ b/sysdeps/powerpc/powerpc64/power4/memset.S @@ -22,12 +22,6 @@ #include #include - .section ".toc","aw" -.LC0: - .tc __cache_line_size[TC],__cache_line_size - .section ".text" - .align 2 - /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); Returns 's'. @@ -35,6 +29,7 @@ cache line (256 bits). There is a special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ + .machine power4 EALIGN (BP_SYM (memset), 5, 0) CALL_MCOUNT 3 diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S index 6daaa2c366..ea74c117dd 100644 --- a/sysdeps/powerpc/powerpc64/power6/memset.S +++ b/sysdeps/powerpc/powerpc64/power6/memset.S @@ -1,4 +1,4 @@ -/* Optimized memset implementation for PowerPC64. +/* Optimized 64-bit memset implementation for POWER6. Copyright (C) 1997, 1999, 2000, 2002, 2003, 2007 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -22,12 +22,6 @@ #include #include - .section ".toc","aw" -.LC0: - .tc __cache_line_size[TC],__cache_line_size - .section ".text" - .align 2 - /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); Returns 's'. @@ -35,7 +29,8 @@ cache line (256 bits). There is a special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ -EALIGN (BP_SYM (memset), 5, 0) + .machine power6 +EALIGN (BP_SYM (memset), 7, 0) CALL_MCOUNT 3 #define rTMP r0 @@ -53,10 +48,7 @@ EALIGN (BP_SYM (memset), 5, 0) #endif #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ #define rMEMP2 r8 - -#define rNEG64 r8 /* Constant -64 for clearing with dcbz. */ -#define rCLS r8 /* Cache line size obtained from static. */ -#define rCLM r9 /* Cache line size mask to check for cache alignment. */ +#define rMEMP3 r9 /* Alt mem pointer. */ L(_memset): #if __BOUNDED_POINTERS__ cmpldi cr1, rRTN, 0 @@ -70,7 +62,7 @@ L(b0): cmpldi cr1, rLEN, 8 andi. rALIGN, rMEMP0, 7 mr rMEMP, rMEMP0 - ble- cr1, L(small) + ble cr1, L(small) /* Align to doubleword boundary. */ cmpldi cr5, rLEN, 31 @@ -131,75 +123,217 @@ L(caligned): clrrdi. rALIGN, rLEN, 5 mtcrf 0x01, rLEN beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ -L(nondcbz): - srdi rTMP, rALIGN, 5 - mtctr rTMP beq L(medium) /* We may not actually get to do a full line. */ - clrldi. rLEN, rLEN, 59 - add rMEMP, rMEMP, rALIGN - li rNEG64, -0x40 - bdz L(cloopdone) + .align 4 +/* Storing a non-zero "c" value. We are aligned at a sector (32-byte) + boundary may not be at cache line (128-byte) boundary. */ +L(nzloopstart): +/* memset in 32-byte chunks until we get to a cache line boundary. + If rLEN is less then the distance to the next cache-line boundary use + cacheAligned1 code to finish the tail. */ + cmpldi cr1,rLEN,128 - .align 4 -L(c3): dcbtst rNEG64, rMEMP - std rCHR, -8(rMEMP) - std rCHR, -16(rMEMP) - std rCHR, -24(rMEMP) - stdu rCHR, -32(rMEMP) - bdnz L(c3) - .align 4 -L(cloopdone): - std rCHR, -8(rMEMP) - std rCHR, -16(rMEMP) - cmpldi cr1, rLEN, 16 - std rCHR, -24(rMEMP) - stdu rCHR, -32(rMEMP) - beqlr - add rMEMP, rMEMP, rALIGN - b L(medium_tail2) + andi. rTMP,rMEMP,127 + blt cr1,L(cacheAligned1) + addi rMEMP3,rMEMP,32 + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 + std rCHR,0(rMEMP) + std rCHR,8(rMEMP) + std rCHR,16(rMEMP) + addi rMEMP,rMEMP,32 + andi. rTMP,rMEMP3,127 + std rCHR,-8(rMEMP3) + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 + std rCHR,0(rMEMP3) + addi rMEMP,rMEMP,32 + std rCHR,8(rMEMP3) + andi. rTMP,rMEMP,127 + std rCHR,16(rMEMP3) + std rCHR,24(rMEMP3) + + beq L(nzCacheAligned) + addi rLEN,rLEN,-32 + std rCHR,32(rMEMP3) + addi rMEMP,rMEMP,32 + cmpldi cr1,rLEN,128 + std rCHR,40(rMEMP3) + cmpldi cr6,rLEN,256 + li rMEMP2,128 + std rCHR,48(rMEMP3) + std rCHR,56(rMEMP3) + blt cr1,L(cacheAligned1) + b L(nzCacheAligned128) + +/* Now we are aligned to the cache line and can use dcbtst. */ + .align 4 +L(nzCacheAligned): + cmpldi cr1,rLEN,128 + blt cr1,L(cacheAligned1) + b L(nzCacheAligned128) + .align 5 +L(nzCacheAligned128): + cmpldi cr1,rLEN,256 + addi rMEMP3,rMEMP,64 + std rCHR,0(rMEMP) + std rCHR,8(rMEMP) + std rCHR,16(rMEMP) + std rCHR,24(rMEMP) + std rCHR,32(rMEMP) + std rCHR,40(rMEMP) + std rCHR,48(rMEMP) + std rCHR,56(rMEMP) + addi rMEMP,rMEMP3,64 + addi rLEN,rLEN,-128 + std rCHR,0(rMEMP3) + std rCHR,8(rMEMP3) + std rCHR,16(rMEMP3) + std rCHR,24(rMEMP3) + std rCHR,32(rMEMP3) + std rCHR,40(rMEMP3) + std rCHR,48(rMEMP3) + std rCHR,56(rMEMP3) + bge cr1,L(nzCacheAligned128) + dcbtst 0,rMEMP + b L(cacheAligned1) .align 5 -/* Clear lines of memory in 128-byte chunks. */ +/* Storing a zero "c" value. We are aligned at a sector (32-byte) + boundary but may not be at cache line (128-byte) boundary. If the + remaining length spans a full cache line we can use the Data cache + block zero instruction. */ L(zloopstart): -/* If the remaining length is less the 32 bytes, don't bother getting - the cache line size. */ +/* memset in 32-byte chunks until we get to a cache line boundary. + If rLEN is less then the distance to the next cache-line boundary use + cacheAligned1 code to finish the tail. */ + cmpldi cr1,rLEN,128 beq L(medium) - li rCLS,128 /* cache line size is 128 */ - -/* Now we know the cache line size, and it is not 32-bytes, but - we may not yet be aligned to the cache line. May have a partial - line to fill, so touch it 1st. */ - dcbt 0,rMEMP L(getCacheAligned): - cmpldi cr1,rLEN,32 andi. rTMP,rMEMP,127 - blt cr1,L(handletail32) - beq L(cacheAligned) + nop + blt cr1,L(cacheAligned1) + addi rMEMP3,rMEMP,32 + beq L(cacheAligned) + addi rLEN,rLEN,-32 + std rCHR,0(rMEMP) + std rCHR,8(rMEMP) + std rCHR,16(rMEMP) addi rMEMP,rMEMP,32 + andi. rTMP,rMEMP3,127 + std rCHR,-8(rMEMP3) +L(getCacheAligned2): + beq L(cacheAligned) addi rLEN,rLEN,-32 - std rCHR,-32(rMEMP) - std rCHR,-24(rMEMP) - std rCHR,-16(rMEMP) - std rCHR,-8(rMEMP) - b L(getCacheAligned) + std rCHR,0(rMEMP3) + std rCHR,8(rMEMP3) + addi rMEMP,rMEMP,32 + andi. rTMP,rMEMP,127 + std rCHR,16(rMEMP3) + std rCHR,24(rMEMP3) +L(getCacheAligned3): + beq L(cacheAligned) + addi rLEN,rLEN,-32 + std rCHR,32(rMEMP3) + addi rMEMP,rMEMP,32 + cmpldi cr1,rLEN,128 + std rCHR,40(rMEMP3) + cmpldi cr6,rLEN,256 + li rMEMP2,128 + std rCHR,48(rMEMP3) + std rCHR,56(rMEMP3) + blt cr1,L(cacheAligned1) + blt cr6,L(cacheAligned128) + b L(cacheAlignedx) /* Now we are aligned to the cache line and can use dcbz. */ - .align 4 + .align 5 L(cacheAligned): - cmpld cr1,rLEN,rCLS - blt cr1,L(handletail32) + cmpldi cr1,rLEN,128 + cmpldi cr6,rLEN,256 + blt cr1,L(cacheAligned1) + li rMEMP2,128 +L(cacheAlignedx): + cmpldi cr5,rLEN,640 + blt cr6,L(cacheAligned128) + bgt cr5,L(cacheAligned512) + cmpldi cr6,rLEN,512 + dcbz 0,rMEMP + cmpldi cr1,rLEN,384 + dcbz rMEMP2,rMEMP + addi rMEMP,rMEMP,256 + addi rLEN,rLEN,-256 + blt cr1,L(cacheAligned1) + blt cr6,L(cacheAligned128) + b L(cacheAligned256) + .align 5 +/* A simple loop for the longer (>640 bytes) lengths. This form limits + the branch miss-predicted to exactly 1 at loop exit.*/ +L(cacheAligned512): + cmpli cr1,rLEN,128 + blt cr1,L(cacheAligned1) + dcbz 0,rMEMP + addi rLEN,rLEN,-128 + addi rMEMP,rMEMP,128 + b L(cacheAligned512) + .align 5 +L(cacheAligned256): + + cmpldi cr6,rLEN,512 + dcbz 0,rMEMP - subf rLEN,rCLS,rLEN - add rMEMP,rMEMP,rCLS - b L(cacheAligned) + cmpldi cr1,rLEN,384 + dcbz rMEMP2,rMEMP + addi rMEMP,rMEMP,256 + addi rLEN,rLEN,-256 + + bge cr6,L(cacheAligned256) + + blt cr1,L(cacheAligned1) + .align 4 +L(cacheAligned128): + dcbz 0,rMEMP + addi rMEMP,rMEMP,128 + addi rLEN,rLEN,-128 + nop +L(cacheAligned1): + cmpldi cr1,rLEN,32 + blt cr1,L(handletail32) + addi rMEMP3,rMEMP,32 + addi rLEN,rLEN,-32 + std rCHR,0(rMEMP) + std rCHR,8(rMEMP) + std rCHR,16(rMEMP) + addi rMEMP,rMEMP,32 + cmpldi cr1,rLEN,32 + std rCHR,-8(rMEMP3) +L(cacheAligned2): + blt cr1,L(handletail32) + addi rLEN,rLEN,-32 + std rCHR,0(rMEMP3) + std rCHR,8(rMEMP3) + addi rMEMP,rMEMP,32 + cmpldi cr1,rLEN,32 + std rCHR,16(rMEMP3) + std rCHR,24(rMEMP3) + nop +L(cacheAligned3): + blt cr1,L(handletail32) + addi rMEMP,rMEMP,32 + addi rLEN,rLEN,-32 + std rCHR,32(rMEMP3) + std rCHR,40(rMEMP3) + std rCHR,48(rMEMP3) + std rCHR,56(rMEMP3) -/* We are here because the cache line size was set and was not 32-bytes - and the remainder (rLEN) is less than the actual cache line size. - So set up the preconditions for L(nondcbz) and go there. */ +/* We are here because the length or remainder (rLEN) is less than the + cache line/sector size and does not justify aggressive loop unrolling. + So set up the preconditions for L(medium) and go there. */ .align 3 L(handletail32): - clrrwi. rALIGN, rLEN, 5 - b L(nondcbz) + cmpldi cr1,rLEN,0 + beqlr cr1 + b L(medium) .align 5 L(small): @@ -237,10 +371,10 @@ L(medium_tail): bt- 31, L(medium_31t) bt- 30, L(medium_30t) L(medium_30f): - bt- 29, L(medium_29t) + bt 29, L(medium_29t) L(medium_29f): - bge- cr1, L(medium_27t) - bflr- 28 + bge cr1, L(medium_27t) + bflr 28 std rCHR, -8(rMEMP) blr @@ -252,12 +386,12 @@ L(medium_30t): bf- 29, L(medium_29f) L(medium_29t): stwu rCHR, -4(rMEMP) - blt- cr1, L(medium_27f) + blt cr1, L(medium_27f) L(medium_27t): std rCHR, -8(rMEMP) stdu rCHR, -16(rMEMP) L(medium_27f): - bflr- 28 + bflr 28 L(medium_28t): std rCHR, -8(rMEMP) blr -- cgit 1.4.1