From 0eacdbad318c940ee4d45ed87155e73e990fe2bb Mon Sep 17 00:00:00 2001
From: Luis Machado
Date: Tue, 13 Jul 2010 16:39:26 -0300
Subject: powerpc: POWER7 optimizations

Add optimizations for classification functions (32-bit and 64-bit) and
string functions (32-bit and 64-bit).

powerpc: Re-work the Implies structure

This patch organizes the Implies files for ppc, since there are a number
of processors and most of them are backwards compatible with each other.

Bearing in mind that we start the search for processor-specific files in
the sysdeps/unix/sysv/linux tree
(sysdeps/unix/sysv/linux/powerpc/powerpc[32|64]/[processor]/fpu to be
exact), we would like to grab any Linux-specific code from that tree
before going through the other tree (sysdeps/powerpc/...).

For that, I removed the Implies files that were originally inside the fpu
directories and placed them in the non-fpu directories (still inside the
unix/sysv/linux tree). If no processor-specific/Linux-specific files can
be found, we "imply" the other tree's (sysdeps/powerpc/...) fpu directory
for that specific processor AND also the non-fpu directory for that same
tree. If, again, no processor-specific code is found, we read another
Implies file that points to the most compatible processor we should grab
code from, and so on, until we reach the power4 processor.

So, in summary, the Implies files now live inside these directories:

* sysdeps/unix/sysv/linux/powerpc/powerpc[32|64]/[processor]
* sysdeps/powerpc/powerpc[32|64]/[processor]

A practical example of the order we will use to pick power6-specific code
with the new structure:

sysdeps/unix/sysv/linux/powerpc/powerpc[32|64]/power6/fpu ->
sysdeps/unix/sysv/linux/powerpc/powerpc[32|64]/power6 ->
sysdeps/powerpc/powerpc[32|64]/power6/fpu ->
sysdeps/powerpc/powerpc[32|64]/power6 ->
sysdeps/powerpc/powerpc[32|64]/power5+/fpu ->
sysdeps/powerpc/powerpc[32|64]/power5+ ->
sysdeps/powerpc/powerpc[32|64]/power5/fpu ->
sysdeps/powerpc/powerpc[32|64]/power5 ->
sysdeps/powerpc/powerpc[32|64]/power4/fpu ->
sysdeps/powerpc/powerpc[32|64]/power4
(from here, it'll go to the generic path as usual)
---
 sysdeps/powerpc/powerpc64/power7/Implies         |   3 +-
 sysdeps/powerpc/powerpc64/power7/fpu/Implies     |   2 +-
 sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S  |  68 ++
 sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S |   1 +
 sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S   |  71 ++
 sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S  |   1 +
 sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S   |  69 ++
 sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S  |   1 +
 sysdeps/powerpc/powerpc64/power7/memcmp.S        | 984 +++++++++++++++++++++++
 sysdeps/powerpc/powerpc64/power7/memcpy.S        | 449 +++++++++
 sysdeps/powerpc/powerpc64/power7/memset.S        | 398 +++++++++
 sysdeps/powerpc/powerpc64/power7/strncmp.S       | 181 +++++
 12 files changed, 2226 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/memcmp.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/memcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/memset.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/strncmp.S
(limited to 
'sysdeps/powerpc/powerpc64/power7') diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies index 13b03309fb..9d68f39d22 100644 --- a/sysdeps/powerpc/powerpc64/power7/Implies +++ b/sysdeps/powerpc/powerpc64/power7/Implies @@ -1 +1,2 @@ -powerpc/powerpc64/power5 +powerpc/powerpc64/power6/fpu +powerpc/powerpc64/power6 diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies index 13b03309fb..f00c50fb49 100644 --- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies +++ b/sysdeps/powerpc/powerpc64/power7/fpu/Implies @@ -1 +1 @@ -powerpc/powerpc64/power5 +powerpc/powerpc64/power5/fpu diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S new file mode 100644 index 0000000000..6763d1adc8 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S @@ -0,0 +1,68 @@ +/* finite(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __finite(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __finite, @function + .machine power7 +EALIGN (__finite, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,1 + bflr 30 + + /* If we are here, we either have +/-INF, + NaN or denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + clrlwi r4,r4,17 /* r4 = abs(r4). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + bltlr cr7 /* LT means finite, other non-finite. */ + li r3,0 + blr + END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. 
*/ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S new file mode 100644 index 0000000000..f896d38026 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -0,0 +1,71 @@ +/* isinf(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isinf(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isinf, @function + .machine power7 +EALIGN (__isinf, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 29 /* If not INF, return. */ + + /* Either we have -INF/+INF or a denormal. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + lhz r4,-16(r1) /* Fetch the upper portion of the high word of + the FP value (where the exponent and sign bits + are). */ + cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ + li r3,1 + beqlr cr7 /* EQ means INF, otherwise -INF. */ + li r3,-1 + blr + END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S new file mode 100644 index 0000000000..8877012598 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S @@ -0,0 +1,69 @@ +/* isnan(). PowerPC64/POWER7 version. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* int __isnan(x) */ + .section ".toc","aw" +.LC0: /* 1.0 */ + .tc FD_ONE[TC],0x3ff0000000000000 + .section ".text" + .type __isnan, @function + .machine power7 +EALIGN (__isnan, 4, 0) + CALL_MCOUNT 0 + lfd fp0,.LC0@toc(r2) + ftdiv cr7,fp1,fp0 + li r3,0 + bflr 30 /* If not NaN, finish. */ + + stfd fp1,-16(r1) /* Transfer FP to GPR's. */ + ori 2,2,0 /* Force a new dispatch group. */ + ld r4,-16(r1) /* Load FP into GPR. */ + lis r0,0x7ff0 + sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */ + clrldi r4,r4,1 /* x = fabs(x) */ + cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */ + blelr cr7 /* LE means not NaN. */ + li r3,1 /* else return 1 */ + blr + END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#ifndef IS_IN_libm +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S new file mode 100644 index 0000000000..f1afffb4e7 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S @@ -0,0 +1,984 @@ +/* Optimized memcmp implementation for POWER7/PowerPC64. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include +#include +#include + +/* int [r3] memcmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + + .machine power7 +EALIGN (BP_SYM(memcmp),4,0) + CALL_MCOUNT 3 + +#define rTMP r0 +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and and that support was never completed. + Current PPC gcc does not support -fbounds-check or -fbounded-pointers. 
*/ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r8 /* next word in s1 */ +#define rWORD4 r9 /* next word in s2 */ +#define rWORD5 r10 /* next word in s1 */ +#define rWORD6 r11 /* next word in s2 */ +#define rBITDIF r12 /* bits that differ in s1 & s2 words */ +#define rWORD7 r30 /* next word in s1 */ +#define rWORD8 r31 /* next word in s2 */ + + xor rTMP,rSTR2,rSTR1 + cmpldi cr6,rN,0 + cmpldi cr1,rN,12 + clrldi. rTMP,rTMP,61 + clrldi rBITDIF,rSTR1,61 + cmpldi cr5,rBITDIF,0 + beq- cr6,L(zeroLength) + dcbt 0,rSTR1 + dcbt 0,rSTR2 +/* If less than 8 bytes or not aligned, use the unalligned + byte loop. */ + blt cr1,L(bytealigned) + std rWORD8,-8(r1) + cfi_offset(rWORD8,-8) + std rWORD7,-16(r1) + cfi_offset(rWORD7,-16) + bne L(unaligned) +/* At this point we know both strings have the same alignment and the + compare length is at least 8 bytes. rBITDIF containes the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of rBITDIF to 0. If rBITDIF == 0 then we are already double word + aligned and can perform the DWaligned loop. + + Otherwise we know the two strings have the same alignment (but not + yet DW). So we can force the string addresses to the next lower DW + boundary and special case this first DW word using shift left to + ellimiate bits preceeding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This insures that the loop count is + correct and the first DW (shifted) is in the expected resister pair. */ + .align 4 +L(samealignment): + clrrdi rSTR1,rSTR1,3 + clrrdi rSTR2,rSTR2,3 + beq cr5,L(DWaligned) + add rN,rN,rBITDIF + sldi r11,rBITDIF,3 + srdi rTMP,rN,5 /* Divide by 32 */ + andi. rBITDIF,rN,24 /* Get the DW remainder */ + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + beq L(dPs4) + mtctr rTMP + bgt cr1,L(dPs3) + beq cr1,L(dPs2) + +/* Remainder is 8 */ + .align 3 +L(dsP1): + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD2,r11 + cmpld cr5,rWORD5,rWORD6 + blt cr7,L(dP1x) +/* Do something useful in this cycle since we have to branch anyway. */ + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 + b L(dP1e) +/* Remainder is 16 */ + .align 4 +L(dPs2): + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD2,r11 + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP2x) +/* Do something useful in this cycle since we have to branch anyway. */ + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 + b L(dP2e) +/* Remainder is 24 */ + .align 4 +L(dPs3): + sld rWORD3,rWORD1,r11 + sld rWORD4,rWORD2,r11 + cmpld cr1,rWORD3,rWORD4 + b L(dP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dPs4): + mtctr rTMP + sld rWORD1,rWORD1,r11 + sld rWORD2,rWORD2,r11 + cmpld cr0,rWORD1,rWORD2 + b L(dP4e) + +/* At this point we know both strings are double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWaligned): + andi. rBITDIF,rN,24 /* Get the DW remainder */ + srdi rTMP,rN,5 /* Divide by 32 */ + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + beq L(dP4) + bgt cr1,L(dP3) + beq cr1,L(dP2) + +/* Remainder is 8 */ + .align 4 +L(dP1): + mtctr rTMP +/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early + (8-15 byte compare), we want to use only volitile registers. 
This + means we can avoid restoring non-volitile registers since we did not + change any on the early exit path. The key here is the non-early + exit path only cares about the condition code (cr5), not about which + register pair was used. */ + ld rWORD5,0(rSTR1) + ld rWORD6,0(rSTR2) + cmpld cr5,rWORD5,rWORD6 + blt cr7,L(dP1x) + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 +L(dP1e): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr1,rWORD3,rWORD4 + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) + bne cr0,L(dLcr0) + + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + bne cr1,L(dLcr1) + cmpld cr5,rWORD7,rWORD8 + bdnz L(dLoop) + bne cr6,L(dLcr6) + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + .align 3 +L(dP1x): + sldi. r12,rN,3 + bne cr5,L(dLcr5) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + li rRTN,0 + blr + +/* Remainder is 16 */ + .align 4 +L(dP2): + mtctr rTMP + ld rWORD5,0(rSTR1) + ld rWORD6,0(rSTR2) + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP2x) + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 +L(dP2e): + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr0,rWORD1,rWORD2 + ld rWORD3,24(rSTR1) + ld rWORD4,24(rSTR2) + cmpld cr1,rWORD3,rWORD4 + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr6,L(dLcr6) + bne cr5,L(dLcr5) + b L(dLoop2) +/* Again we are on a early exit path (16-23 byte compare), we want to + only use volitile registers and avoid restoring non-volitile + registers. */ + .align 4 +L(dP2x): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr5,rWORD3,rWORD4 + sldi. r12,rN,3 + bne cr6,L(dLcr6) + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr5,L(dLcr5) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + li rRTN,0 + blr + +/* Remainder is 24 */ + .align 4 +L(dP3): + mtctr rTMP + ld rWORD3,0(rSTR1) + ld rWORD4,0(rSTR2) + cmpld cr1,rWORD3,rWORD4 +L(dP3e): + ld rWORD5,8(rSTR1) + ld rWORD6,8(rSTR2) + cmpld cr6,rWORD5,rWORD6 + blt cr7,L(dP3x) + ld rWORD7,16(rSTR1) + ld rWORD8,16(rSTR2) + cmpld cr5,rWORD7,rWORD8 + ld rWORD1,24(rSTR1) + ld rWORD2,24(rSTR2) + cmpld cr0,rWORD1,rWORD2 + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr1,L(dLcr1) + bne cr6,L(dLcr6) + b L(dLoop1) +/* Again we are on a early exit path (24-31 byte compare), we want to + only use volitile registers and avoid restoring non-volitile + registers. */ + .align 4 +L(dP3x): + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr5,rWORD1,rWORD2 + sldi. r12,rN,3 + bne cr1,L(dLcr1) + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr6,L(dLcr6) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). 
*/ + bne cr5,L(dLcr5) + bne L(d00) + li rRTN,0 + blr + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(dP4): + mtctr rTMP + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpld cr0,rWORD1,rWORD2 +L(dP4e): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + ld rWORD5,16(rSTR1) + ld rWORD6,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + ldu rWORD7,24(rSTR1) + ldu rWORD8,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(dLcr0) + bne cr1,L(dLcr1) + bdz- L(d24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(dLoop): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(dLcr6) +L(dLoop1): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) +L(dLoop2): + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(dLcr0) +L(dLoop3): + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + bne cr1,L(dLcr1) + cmpld cr0,rWORD1,rWORD2 + bdnz L(dLoop) + +L(dL4): + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(dLcr6) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(dLcr5) + cmpld cr5,rWORD7,rWORD8 +L(d44): + bne cr0,L(dLcr0) +L(d34): + bne cr1,L(dLcr1) +L(d24): + bne cr6,L(dLcr6) +L(d14): + sldi. r12,rN,3 + bne cr5,L(dLcr5) +L(d04): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + beq L(zeroLength) +/* At this point we have a remainder of 1 to 7 bytes to compare. Since + we are aligned it is safe to load the whole double word, and use + shift right double to elliminate bits beyond the compare length. */ +L(d00): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + srd rWORD1,rWORD1,rN + srd rWORD2,rWORD2,rN + cmpld cr5,rWORD1,rWORD2 + bne cr5,L(dLcr5x) + li rRTN,0 + blr + .align 4 +L(dLcr0): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr0 + li rRTN,-1 + blr + .align 4 +L(dLcr1): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr1 + li rRTN,-1 + blr + .align 4 +L(dLcr6): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgtlr cr6 + li rRTN,-1 + blr + .align 4 +L(dLcr5): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(dLcr5x): + li rRTN,1 + bgtlr cr5 + li rRTN,-1 + blr + + .align 4 +L(bytealigned): + mtctr rN + beq cr6,L(zeroLength) + +/* We need to prime this loop. This loop is swing modulo scheduled + to avoid pipe delays. The dependent instruction latencies (load to + compare to conditional branch) is 2 to 3 cycles. In this loop each + dispatch group ends in a branch and takes 1 cycle. Effectively + the first iteration of the loop only serves to load operands and + branches based on compares are delayed until the next loop. + + So we must precondition some registers and condition codes so that + we don't exit the loop early on the first iteration. */ + + lbz rWORD1,0(rSTR1) + lbz rWORD2,0(rSTR2) + bdz L(b11) + cmpld cr0,rWORD1,rWORD2 + lbz rWORD3,1(rSTR1) + lbz rWORD4,1(rSTR2) + bdz L(b12) + cmpld cr1,rWORD3,rWORD4 + lbzu rWORD5,2(rSTR1) + lbzu rWORD6,2(rSTR2) + bdz L(b13) + .align 4 +L(bLoop): + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + bne cr0,L(bLcr0) + + cmpld cr6,rWORD5,rWORD6 + bdz L(b3i) + + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne cr1,L(bLcr1) + + cmpld cr0,rWORD1,rWORD2 + bdz L(b2i) + + lbzu rWORD5,1(rSTR1) + lbzu rWORD6,1(rSTR2) + bne cr6,L(bLcr6) + + cmpld cr1,rWORD3,rWORD4 + bdnz L(bLoop) + +/* We speculatively loading bytes before we have tested the previous + bytes. But we must avoid overrunning the length (in the ctr) to + prevent these speculative loads from causing a segfault. 
In this + case the loop will exit early (before the all pending bytes are + tested. In this case we must complete the pending operations + before returning. */ +L(b1i): + bne cr0,L(bLcr0) + bne cr1,L(bLcr1) + b L(bx56) + .align 4 +L(b2i): + bne cr6,L(bLcr6) + bne cr0,L(bLcr0) + b L(bx34) + .align 4 +L(b3i): + bne cr1,L(bLcr1) + bne cr6,L(bLcr6) + b L(bx12) + .align 4 +L(bLcr0): + li rRTN,1 + bgtlr cr0 + li rRTN,-1 + blr +L(bLcr1): + li rRTN,1 + bgtlr cr1 + li rRTN,-1 + blr +L(bLcr6): + li rRTN,1 + bgtlr cr6 + li rRTN,-1 + blr + +L(b13): + bne cr0,L(bx12) + bne cr1,L(bx34) +L(bx56): + sub rRTN,rWORD5,rWORD6 + blr + nop +L(b12): + bne cr0,L(bx12) +L(bx34): + sub rRTN,rWORD3,rWORD4 + blr +L(b11): +L(bx12): + sub rRTN,rWORD1,rWORD2 + blr + .align 4 +L(zeroLengthReturn): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(zeroLength): + li rRTN,0 + blr + + .align 4 +/* At this point we know the strings have different alignment and the + compare length is at least 8 bytes. rBITDIF containes the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word + aligned and can perform the DWunaligned loop. + + Otherwise we know that rSTR1 is not aready DW aligned yet. + So we can force the string addresses to the next lower DW + boundary and special case this first DW word using shift left to + ellimiate bits preceeding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This insures that the loop count is + correct and the first DW (shifted) is in the expected resister pair. */ +#define rSHL r29 /* Unaligned shift left count. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rB r27 /* Left rotation temp for rWORD2. */ +#define rD r26 /* Left rotation temp for rWORD4. */ +#define rF r25 /* Left rotation temp for rWORD6. */ +#define rH r24 /* Left rotation temp for rWORD8. */ +#define rA r0 /* Right rotation temp for rWORD2. */ +#define rC r12 /* Right rotation temp for rWORD4. */ +#define rE r0 /* Right rotation temp for rWORD6. */ +#define rG r12 /* Right rotation temp for rWORD8. */ +L(unaligned): + std r29,-24(r1) + cfi_offset(r29,-24) + clrldi rSHL,rSTR2,61 + beq cr6,L(duzeroLength) + std r28,-32(r1) + cfi_offset(r28,-32) + beq cr5,L(DWunaligned) + std r27,-40(r1) + cfi_offset(r27,-40) +/* Adjust the logical start of rSTR2 ro compensate for the extra bits + in the 1st rSTR1 DW. */ + sub r27,rSTR2,rBITDIF +/* But do not attempt to address the DW before that DW that contains + the actual start of rSTR2. */ + clrrdi rSTR2,rSTR2,3 + std r26,-48(r1) + cfi_offset(r26,-48) +/* Compute the leaft/right shift counts for the unalign rSTR2, + compensating for the logical (DW aligned) start of rSTR1. */ + clrldi rSHL,r27,61 + clrrdi rSTR1,rSTR1,3 + std r25,-56(r1) + cfi_offset(r25,-56) + sldi rSHL,rSHL,3 + cmpld cr5,r27,rSTR2 + add rN,rN,rBITDIF + sldi r11,rBITDIF,3 + std r24,-64(r1) + cfi_offset(r24,-64) + subfic rSHR,rSHL,64 + srdi rTMP,rN,5 /* Divide by 32 */ + andi. rBITDIF,rN,24 /* Get the DW remainder */ +/* We normally need to load 2 DWs to start the unaligned rSTR2, but in + this special case those bits may be discarded anyway. Also we + must avoid loading a DW where none of the bits are part of rSTR2 as + this may cross a page boundary and cause a page fault. 
*/ + li rWORD8,0 + blt cr5,L(dus0) + ld rWORD8,0(rSTR2) + la rSTR2,8(rSTR2) + sld rWORD8,rWORD8,rSHL + +L(dus0): + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + srd rG,rWORD2,rSHR + clrldi rN,rN,61 + beq L(duPs4) + mtctr rTMP + or rWORD8,rG,rWORD8 + bgt cr1,L(duPs3) + beq cr1,L(duPs2) + +/* Remainder is 8 */ + .align 4 +L(dusP1): + sld rB,rWORD2,rSHL + sld rWORD7,rWORD1,r11 + sld rWORD8,rWORD8,r11 + bge cr7,L(duP1e) +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ + cmpld cr5,rWORD7,rWORD8 + sldi. rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duPs2): + sld rH,rWORD2,rSHL + sld rWORD5,rWORD1,r11 + sld rWORD6,rWORD8,r11 + b L(duP2e) +/* Remainder is 24 */ + .align 4 +L(duPs3): + sld rF,rWORD2,rSHL + sld rWORD3,rWORD1,r11 + sld rWORD4,rWORD8,r11 + b L(duP3e) +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duPs4): + mtctr rTMP + or rWORD8,rG,rWORD8 + sld rD,rWORD2,rSHL + sld rWORD1,rWORD1,r11 + sld rWORD2,rWORD8,r11 + b L(duP4e) + +/* At this point we know rSTR1 is double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWunaligned): + std r27,-40(r1) + cfi_offset(r27,-40) + clrrdi rSTR2,rSTR2,3 + std r26,-48(r1) + cfi_offset(r26,-48) + srdi rTMP,rN,5 /* Divide by 32 */ + std r25,-56(r1) + cfi_offset(r25,-56) + andi. rBITDIF,rN,24 /* Get the DW remainder */ + std r24,-64(r1) + cfi_offset(r24,-64) + sldi rSHL,rSHL,3 + ld rWORD6,0(rSTR2) + ldu rWORD8,8(rSTR2) + cmpldi cr1,rBITDIF,16 + cmpldi cr7,rN,32 + clrldi rN,rN,61 + subfic rSHR,rSHL,64 + sld rH,rWORD6,rSHL + beq L(duP4) + mtctr rTMP + bgt cr1,L(duP3) + beq cr1,L(duP2) + +/* Remainder is 8 */ + .align 4 +L(duP1): + srd rG,rWORD8,rSHR + ld rWORD7,0(rSTR1) + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP1x) +L(duP1e): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr5,rWORD7,rWORD8 + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr0,rWORD1,rWORD2 + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + bne cr5,L(duLcr5) + or rWORD4,rC,rD + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr1,rWORD3,rWORD4 + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + bne cr0,L(duLcr0) + or rWORD6,rE,rF + cmpld cr6,rWORD5,rWORD6 + b L(duLoop3) + .align 4 +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ +L(duP1x): + cmpld cr5,rWORD7,rWORD8 + sldi. 
rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) +/* Remainder is 16 */ + .align 4 +L(duP2): + srd rE,rWORD8,rSHR + ld rWORD5,0(rSTR1) + or rWORD6,rE,rH + sld rH,rWORD8,rSHL +L(duP2e): + ld rWORD7,8(rSTR1) + ld rWORD8,8(rSTR2) + cmpld cr6,rWORD5,rWORD6 + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP2x) + ld rWORD1,16(rSTR1) + ld rWORD2,16(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + ld rWORD3,24(rSTR1) + ld rWORD4,24(rSTR2) + cmpld cr0,rWORD1,rWORD2 + bne cr5,L(duLcr5) + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + cmpld cr1,rWORD3,rWORD4 + b L(duLoop2) + .align 4 +L(duP2x): + cmpld cr5,rWORD7,rWORD8 + addi rSTR1,rSTR1,8 + addi rSTR2,rSTR2,8 + bne cr6,L(duLcr6) + sldi. rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) + +/* Remainder is 24 */ + .align 4 +L(duP3): + srd rC,rWORD8,rSHR + ld rWORD3,0(rSTR1) + sld rF,rWORD8,rSHL + or rWORD4,rC,rH +L(duP3e): + ld rWORD5,8(rSTR1) + ld rWORD6,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF + ld rWORD7,16(rSTR1) + ld rWORD8,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + blt cr7,L(duP3x) + ld rWORD1,24(rSTR1) + ld rWORD2,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + cmpld cr0,rWORD1,rWORD2 + b L(duLoop1) + .align 4 +L(duP3x): + addi rSTR1,rSTR1,16 + addi rSTR2,rSTR2,16 + bne cr1,L(duLcr1) + cmpld cr5,rWORD7,rWORD8 + bne cr6,L(duLcr6) + sldi. 
rN,rN,3 + bne cr5,L(duLcr5) + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + b L(dutrim) + +/* Count is a multiple of 32, remainder is 0 */ + .align 4 +L(duP4): + mtctr rTMP + srd rA,rWORD8,rSHR + ld rWORD1,0(rSTR1) + sld rD,rWORD8,rSHL + or rWORD2,rA,rH +L(duP4e): + ld rWORD3,8(rSTR1) + ld rWORD4,8(rSTR2) + cmpld cr0,rWORD1,rWORD2 + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD + ld rWORD5,16(rSTR1) + ld rWORD6,16(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr0,L(duLcr0) + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF + ldu rWORD7,24(rSTR1) + ldu rWORD8,24(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + cmpld cr5,rWORD7,rWORD8 + bdz L(du24) /* Adjust CTR as we start with +4 */ +/* This is the primary loop */ + .align 4 +L(duLoop): + ld rWORD1,8(rSTR1) + ld rWORD2,8(rSTR2) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(duLcr6) + srd rA,rWORD2,rSHR + sld rD,rWORD2,rSHL + or rWORD2,rA,rB +L(duLoop1): + ld rWORD3,16(rSTR1) + ld rWORD4,16(rSTR2) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(duLcr5) + srd rC,rWORD4,rSHR + sld rF,rWORD4,rSHL + or rWORD4,rC,rD +L(duLoop2): + ld rWORD5,24(rSTR1) + ld rWORD6,24(rSTR2) + cmpld cr5,rWORD7,rWORD8 + bne cr0,L(duLcr0) + srd rE,rWORD6,rSHR + sld rH,rWORD6,rSHL + or rWORD6,rE,rF +L(duLoop3): + ldu rWORD7,32(rSTR1) + ldu rWORD8,32(rSTR2) + cmpld cr0,rWORD1,rWORD2 + bne- cr1,L(duLcr1) + srd rG,rWORD8,rSHR + sld rB,rWORD8,rSHL + or rWORD8,rG,rH + bdnz L(duLoop) + +L(duL4): + bne cr1,L(duLcr1) + cmpld cr1,rWORD3,rWORD4 + bne cr6,L(duLcr6) + cmpld cr6,rWORD5,rWORD6 + bne cr5,L(duLcr5) + cmpld cr5,rWORD7,rWORD8 +L(du44): + bne cr0,L(duLcr0) +L(du34): + bne cr1,L(duLcr1) +L(du24): + bne cr6,L(duLcr6) +L(du14): + sldi. rN,rN,3 + bne cr5,L(duLcr5) +/* At this point we have a remainder of 1 to 7 bytes to compare. We use + shift right double to elliminate bits beyond the compare length. + This allows the use of double word subtract to compute the final + result. + + However it may not be safe to load rWORD2 which may be beyond the + string length. So we compare the bit length of the remainder to + the right shift count (rSHR). If the bit count is less than or equal + we do not need to load rWORD2 (all significant bits are already in + rB). */ + cmpld cr7,rN,rSHR + beq L(duZeroReturn) + li rA,0 + ble cr7,L(dutrim) + ld rWORD2,8(rSTR2) + srd rA,rWORD2,rSHR + .align 4 +L(dutrim): + ld rWORD1,8(rSTR1) + ld rWORD8,-8(r1) + subfic rN,rN,64 /* Shift count is 64 - (rN * 8). 
*/ + or rWORD2,rA,rB + ld rWORD7,-16(r1) + ld r29,-24(r1) + srd rWORD1,rWORD1,rN + srd rWORD2,rWORD2,rN + ld r28,-32(r1) + ld r27,-40(r1) + li rRTN,0 + cmpld cr0,rWORD1,rWORD2 + ld r26,-48(r1) + ld r25,-56(r1) + beq cr0,L(dureturn24) + li rRTN,1 + ld r24,-64(r1) + bgtlr cr0 + li rRTN,-1 + blr + .align 4 +L(duLcr0): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr0,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr1): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr1,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr6): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr6,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 4 +L(duLcr5): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) + li rRTN,1 + bgt cr5,L(dureturn29) + ld r29,-24(r1) + ld r28,-32(r1) + li rRTN,-1 + b L(dureturn27) + .align 3 +L(duZeroReturn): + li rRTN,0 + .align 4 +L(dureturn): + ld rWORD8,-8(r1) + ld rWORD7,-16(r1) +L(dureturn29): + ld r29,-24(r1) + ld r28,-32(r1) +L(dureturn27): + ld r27,-40(r1) +L(dureturn26): + ld r26,-48(r1) +L(dureturn25): + ld r25,-56(r1) +L(dureturn24): + ld r24,-64(r1) + blr +L(duzeroLength): + li rRTN,0 + blr + +END (BP_SYM (memcmp)) +libc_hidden_builtin_def (memcmp) +weak_alias (memcmp,bcmp) diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S new file mode 100644 index 0000000000..2e5beed15e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S @@ -0,0 +1,449 @@ +/* Optimized memcpy implementation for PowerPC64/POWER7. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include +#include +#include + + +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. */ + + .machine power7 +EALIGN (BP_SYM (memcpy), 5, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 + neg 0,3 + std 3,-16(1) + std 31,-8(1) + cfi_offset(31,-8) + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + + andi. 11,3,7 /* Check alignment of DST. */ + + + clrldi 10,4,61 /* Check alignment of SRC. */ + cmpld cr6,10,11 /* SRC and DST alignments match? */ + mr 12,4 + mr 31,5 + bne cr6,L(copy_GE_32_unaligned) + + srdi 9,5,3 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_aligned_cont) + + clrldi 0,0,61 + mtcrf 0x01,0 + subf 31,0,5 + + /* Get the SRC aligned to 8 bytes. */ + +1: bf 31,2f + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 +2: bf 30,4f + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +4: bf 29,0f + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +0: + clrldi 10,12,61 /* Check alignment of SRC again. 
*/ + srdi 9,31,3 /* Number of full doublewords remaining. */ + +L(copy_GE_32_aligned_cont): + + clrldi 11,31,61 + mtcrf 0x01,9 + + srdi 8,31,5 + cmpldi cr1,9,4 + cmpldi cr6,11,0 + mr 11,12 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + ld 6,0(12) + ld 7,8(12) + addi 11,12,16 + mtctr 8 + std 6,0(3) + std 7,8(3) + addi 10,3,16 + bf 31,4f + ld 0,16(12) + std 0,16(3) + blt cr1,3f + addi 11,12,24 + addi 10,3,24 + b 4f + + .align 4 +1: /* Copy 1 doubleword and set the counter. */ + mr 10,3 + mtctr 8 + bf 31,4f + ld 6,0(12) + addi 11,12,8 + std 6,0(3) + addi 10,3,8 + + /* Main aligned copy loop. Copies 32-bytes at a time. */ + .align 4 +4: + ld 6,0(11) + ld 7,8(11) + ld 8,16(11) + ld 0,24(11) + addi 11,11,32 + + std 6,0(10) + std 7,8(10) + std 8,16(10) + std 0,24(10) + addi 10,10,32 + bdnz 4b +3: + + /* Check for tail bytes. */ + rldicr 0,31,0,60 + mtcrf 0x01,31 + beq cr6,0f + +.L9: + add 3,3,0 + add 12,12,0 + + /* At this point we have a tail of 0-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return original DST pointer. */ + ld 31,-8(1) + ld 3,-16(1) + blr + + /* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + cmpldi cr6,5,8 + mr 12,4 + mtcrf 0x01,5 + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 + clrrdi 11,4,2 + andi. 0,8,3 + cmpldi cr1,5,16 + mr 10,5 + beq L(copy_LT_32_aligned) + + /* Force 4-bytes alignment for SRC. */ + mtocrf 0x01,0 + subf 10,0,5 +2: bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: bf 31,L(end_4bytes_alignment) + + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,10,16 + mtcrf 0x01,10 + +L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ + lwz 6,0(12) + lwz 7,4(12) + stw 6,0(3) + lwz 8,8(12) + stw 7,4(3) + lwz 6,12(12) + addi 12,12,16 + stw 8,8(3) + stw 6,12(3) + addi 3,3,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + lwz 6,0(12) + lwz 7,4(12) + addi 12,12,8 + stw 6,0(3) + stw 7,4(3) + addi 3,3,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2-3 bytes. */ + bf 30,1f + + lhz 6,0(12) + sth 6,0(3) + bf 31,0f + lbz 7,2(12) + stb 7,2(3) + ld 3,-16(1) + blr + + .align 4 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return original DST pointer. */ + ld 3,-16(1) + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(copy_LE_8): + bne cr6,4f + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + + lwz 6,0(4) + lwz 7,4(4) + stw 6,0(3) + stw 7,4(3) + ld 3,-16(1) /* Return original DST pointers. */ + blr + + .align 4 +4: /* Copies 4~7 bytes. */ + bf 29,2b + + lwz 6,0(4) + stw 6,0(3) + bf 30,5f + lhz 7,4(4) + sth 7,4(3) + bf 31,0f + lbz 8,6(4) + stb 8,6(3) + ld 3,-16(1) + blr + + .align 4 +5: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,4(4) + stb 6,4(3) + +0: /* Return original DST pointer. */ + ld 3,-16(1) + blr + + /* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. 
*/ + .align 4 +L(copy_GE_32_unaligned): + clrldi 0,0,60 /* Number of bytes until the 1st + quadword. */ + andi. 11,3,15 /* Check alignment of DST (against + quadwords). */ + srdi 9,5,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + + /* SRC is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ +1: /* Copy 1 byte. */ + bf 31,2f + + lbz 6,0(12) + addi 12,12,1 + stb 6,0(3) + addi 3,3,1 +2: /* Copy 2 bytes. */ + bf 30,4f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +4: /* Copy 4 bytes. */ + bf 29,8f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +8: /* Copy 8 bytes. */ + bf 28,0f + + ld 6,0(12) + addi 12,12,8 + std 6,0(3) + addi 3,3,8 +0: + clrldi 10,12,60 /* Check alignment of SRC. */ + srdi 9,31,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ +L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ + clrldi 11,31,60 + li 6,16 /* Index for 16-bytes offsets. */ + li 7,32 /* Index for 32-bytes offsets. */ + cmpldi cr1,11,0 + srdi 8,31,5 /* Setup the loop counter. */ + mr 10,3 + mr 11,12 + mtcrf 0x01,9 + cmpldi cr6,9,1 + lvsl 5,0,12 + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 + vperm 6,3,4,5 + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 + vor 3,4,4 + +L(setup_unaligned_loop): + mtctr 8 + ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ + .align 4 +L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, + but in order to get proper alignment, we may have to copy + some portions again. This is faster than having unaligned + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ + vperm 6,3,4,5 /* Merge the correctly-aligned portions + of vr3/vr4 into vr6. */ + lvx 3,11,7 /* vr3 = r11+32. */ + vperm 10,4,3,5 /* Merge the correctly-aligned portions + of vr3/vr4 into vr10. */ + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 + addi 10,10,32 + + bdnz L(unaligned_loop) + + .align 4 +L(end_unaligned_loop): + + /* Check for tail bytes. */ + rldicr 0,31,0,59 + mtcrf 0x01,31 + beq cr1,0f + + add 3,3,0 + add 12,12,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ +8: /* Copy 8 bytes. */ + bf 28,4f + + lwz 6,0(12) + lwz 7,4(12) + addi 12,12,8 + stw 6,0(3) + stw 7,4(3) + addi 3,3,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 +2: /* Copy 2~3 bytes. */ + bf 30,1f + + lhz 6,0(12) + addi 12,12,2 + sth 6,0(3) + addi 3,3,2 +1: /* Copy 1 byte. */ + bf 31,0f + + lbz 6,0(12) + stb 6,0(3) +0: /* Return original DST pointer. */ + ld 31,-8(1) + ld 3,-16(1) + blr + +END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) +libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S new file mode 100644 index 0000000000..02a9eedd6b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/memset.S @@ -0,0 +1,398 @@ +/* Optimized memset implementation for PowerPC64/POWER7. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Luis Machado . + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + + .machine power7 +EALIGN (BP_SYM (memset), 5, 0) + CALL_MCOUNT 3 + +L(_memset): + cmpldi cr7,5,31 + cmpldi cr6,5,8 + mr 10,3 + + /* Replicate byte to word. */ + rlwimi 4,4,8,16,23 + rlwimi 4,4,16,0,15 + ble cr6,L(small) /* If length <= 8, use short copy code. */ + + neg 0,3 + ble cr7,L(medium) /* If length < 32, use medium copy code. */ + + andi. 11,10,7 /* Check alignment of SRC. */ + insrdi 4,4,32,0 /* Replicate word to double word. */ + + mr 12,5 + beq L(big_aligned) + + clrldi 0,0,61 + mtocrf 0x01,0 + subf 5,0,5 + + /* Get DST aligned to 8 bytes. */ +1: bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: bf 30,4f + + sth 4,0(10) + addi 10,10,2 +4: bf 29,L(big_aligned) + + stw 4,0(10) + addi 10,10,4 + + .align 4 +L(big_aligned): + + cmpldi cr5,5,255 + li 0,32 + dcbtst 0,10 + cmpldi cr6,4,0 + srdi 9,5,3 /* Number of full doublewords remaining. */ + crand 27,26,21 + mtocrf 0x01,9 + bt 27,L(huge) + + /* From this point on, we'll copy 32+ bytes and the value + isn't 0 (so we can't use dcbz). */ + + srdi 8,5,5 + clrldi 11,5,61 + cmpldi cr6,11,0 + cmpldi cr1,9,4 + mtctr 8 + + /* Copy 1~3 doublewords so the main loop starts + at a multiple of 32 bytes. */ + + bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + mr 12,10 + blt cr1,L(tail_bytes) + b L(big_loop) + + .align 4 +1: /* Copy 1 doubleword. */ + bf 31,L(big_loop) + + std 4,0(10) + addi 10,10,8 + + /* Main aligned copy loop. Copies 32-bytes at a time and + ping-pong through r10 and r12 to avoid AGEN delays. */ + .align 4 +L(big_loop): + addi 12,10,32 + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + bdz L(tail_bytes) + + addi 10,10,64 + std 4,0(12) + std 4,8(12) + std 4,16(12) + std 4,24(12) + bdnz L(big_loop) + + mr 12,10 + b L(tail_bytes) + + .align 4 +L(tail_bytes): + + /* Check for tail bytes. */ + beqlr cr6 + + clrldi 0,5,61 + mtocrf 0x01,0 + + /* At this point we have a tail of 0-7 bytes and we know that the + destination is doubleword-aligned. */ +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(12) + addi 12,12,4 +2: /* Copy 2 bytes. */ + bf 30,1f + + sth 4,0(12) + addi 12,12,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(12) + blr + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out 128-bytes at a time. Before using + dcbz though, we need to get the destination 128-bytes aligned. */ + .align 4 +L(huge): + andi. 11,10,127 + neg 0,10 + beq L(huge_aligned) + + clrldi 0,0,57 + subf 5,0,5 + srdi 0,0,3 + mtocrf 0x01,0 + + /* Get DST aligned to 128 bytes. 
*/ +8: bf 28,4f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + std 4,32(10) + std 4,40(10) + std 4,48(10) + std 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(huge_aligned) + + std 4,0(10) + addi 10,10,8 + + +L(huge_aligned): + srdi 8,5,7 + clrldi 11,5,57 + cmpldi cr6,11,0 + mtctr 8 + + .align 4 +L(huge_loop): + dcbz 0,10 + addi 10,10,128 + bdnz L(huge_loop) + + /* Check how many bytes are still left. */ + beqlr cr6 + + subf 9,3,10 + subf 5,9,12 + srdi 8,5,3 + cmpldi cr6,8,0 + mtocrf 0x01,8 + + /* We have a tail o 1~127 bytes. Copy up to 15 doublewords for + speed. We'll handle the resulting tail bytes later. */ + beq cr6,L(tail) + +8: bf 28,4f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + std 4,32(10) + std 4,40(10) + std 4,48(10) + std 4,56(10) + addi 10,10,64 + .align 4 +4: bf 29,2f + + std 4,0(10) + std 4,8(10) + std 4,16(10) + std 4,24(10) + addi 10,10,32 + .align 4 +2: bf 30,1f + + std 4,0(10) + std 4,8(10) + addi 10,10,16 + .align 4 +1: bf 31,L(tail) + + std 4,0(10) + addi 10,10,8 + + /* Handle the rest of the tail bytes here. */ +L(tail): + mtocrf 0x01,5 + + .align 4 +4: bf 29,2f + + stw 4,0(10) + addi 10,10,4 + .align 4 +2: bf 30,1f + + sth 4,0(10) + addi 10,10,2 + .align 4 +1: bflr 31 + + stb 4,0(10) + blr + + /* Expanded tree to copy tail bytes without increments. */ + .align 4 +L(copy_tail): + bf 29,L(FXX) + + stw 4,0(10) + bf 30,L(TFX) + + sth 4,4(10) + bflr 31 + + stb 4,6(10) + blr + + .align 4 +L(FXX): bf 30,L(FFX) + + sth 4,0(10) + bflr 31 + + stb 4,2(10) + blr + + .align 4 +L(TFX): bflr 31 + + stb 4,4(10) + blr + + .align 4 +L(FFX): bflr 31 + + stb 4,0(10) + blr + + /* Handle copies of 9~31 bytes. */ + .align 4 +L(medium): + /* At least 9 bytes to go. */ + andi. 11,10,3 + clrldi 0,0,62 + beq L(medium_aligned) + + /* Force 4-bytes alignment for SRC. */ + mtocrf 0x01,0 + subf 5,0,5 +1: /* Copy 1 byte. */ + bf 31,2f + + stb 4,0(10) + addi 10,10,1 +2: /* Copy 2 bytes. */ + bf 30,L(medium_aligned) + + sth 4,0(10) + addi 10,10,2 + + .align 4 +L(medium_aligned): + /* At least 6 bytes to go, and DST is word-aligned. */ + cmpldi cr1,5,16 + mtocrf 0x01,5 + blt cr1,8f + + /* Copy 16 bytes. */ + stw 4,0(10) + stw 4,4(10) + stw 4,8(10) + stw 4,12(10) + addi 10,10,16 +8: /* Copy 8 bytes. */ + bf 28,4f + + stw 4,0(10) + stw 4,4(10) + addi 10,10,8 +4: /* Copy 4 bytes. */ + bf 29,2f + + stw 4,0(10) + addi 10,10,4 +2: /* Copy 2-3 bytes. */ + bf 30,1f + + sth 4,0(10) + addi 10,10,2 +1: /* Copy 1 byte. */ + bflr 31 + + stb 4,0(10) + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(small): + mtocrf 0x01,5 + bne cr6,L(copy_tail) + + stw 4,0(10) + stw 4,4(10) + blr + +END_GEN_TB (BP_SYM (memset),TB_TOCLESS) +libc_hidden_builtin_def (memset) + +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. */ +ENTRY (BP_SYM (__bzero)) + CALL_MCOUNT 3 + mr r5,r4 + li r4,0 + b L(_memset) +END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS) + +weak_alias (BP_SYM (__bzero), BP_SYM (bzero)) diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S new file mode 100644 index 0000000000..34f1e52df9 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S @@ -0,0 +1,181 @@ +/* Optimized strcmp implementation for POWER7/PowerPC64. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA + 02110-1301 USA. */ + +#include +#include +#include + +/* See strlen.s for comments on how the end-of-string testing works. */ + +/* int [r3] strncmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + +EALIGN (BP_SYM(strncmp),4,0) + CALL_MCOUNT 3 + +#define rTMP r0 +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and and that support was never completed. + Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r10 +#define rWORD4 r11 +#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ +#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */ +#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */ +#define rBITDIF r11 /* bits that differ in s1 & s2 words */ + + dcbt 0,rSTR1 + or rTMP,rSTR2,rSTR1 + lis r7F7F,0x7f7f + dcbt 0,rSTR2 + clrldi. rTMP,rTMP,61 + cmpldi cr1,rN,0 + lis rFEFE,-0x101 + bne L(unaligned) +/* We are doubleword alligned so set up for two loops. first a double word + loop, then fall into the byte loop if any residual. */ + srdi. rTMP,rN,3 + clrldi rN,rN,61 + addi rFEFE,rFEFE,-0x101 + addi r7F7F,r7F7F,0x7f7f + cmpldi cr1,rN,0 + beq L(unaligned) + + mtctr rTMP + ld rWORD1,0(rSTR1) + ld rWORD2,0(rSTR2) + sldi rTMP,rFEFE,32 + insrdi r7F7F,r7F7F,32,0 + add rFEFE,rFEFE,rTMP + b L(g1) + +L(g0): + ldu rWORD1,8(rSTR1) + bne cr1,L(different) + ldu rWORD2,8(rSTR2) +L(g1): add rTMP,rFEFE,rWORD1 + nor rNEG,r7F7F,rWORD1 + bdz L(tail) + and. rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + beq L(g0) + +/* OK. We've hit the end of the string. We need to be careful that + we don't compare two strings as different because of gunk beyond + the end of the strings... */ + +L(endstring): + and rTMP,r7F7F,rWORD1 + beq cr1,L(equal) + add rTMP,rTMP,r7F7F + xor. rBITDIF,rWORD1,rWORD2 + + andc rNEG,rNEG,rTMP + blt L(highbit) + cntlzd rBITDIF,rBITDIF + cntlzd rNEG,rNEG + addi rNEG,rNEG,7 + cmpd cr1,rNEG,rBITDIF + sub rRTN,rWORD1,rWORD2 + blt cr1,L(equal) + sradi rRTN,rRTN,63 + ori rRTN,rRTN,1 + blr +L(equal): + li rRTN,0 + blr + +L(different): + ldu rWORD1,-8(rSTR1) + xor. rBITDIF,rWORD1,rWORD2 + sub rRTN,rWORD1,rWORD2 + blt L(highbit) + sradi rRTN,rRTN,63 + ori rRTN,rRTN,1 + blr +L(highbit): + srdi rWORD2,rWORD2,56 + srdi rWORD1,rWORD1,56 + sub rRTN,rWORD1,rWORD2 + blr + + +/* Oh well. In this case, we just do a byte-by-byte comparison. */ + .align 4 +L(tail): + and. 
rTMP,rTMP,rNEG + cmpd cr1,rWORD1,rWORD2 + bne L(endstring) + addi rSTR1,rSTR1,8 + bne cr1,L(different) + addi rSTR2,rSTR2,8 + cmpldi cr1,rN,0 +L(unaligned): + mtctr rN + ble cr1,L(ux) +L(uz): + lbz rWORD1,0(rSTR1) + lbz rWORD2,0(rSTR2) + .align 4 +L(u1): + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne L(u4) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + bne L(u3) + cmpdi cr1,rWORD1,0 + bdz L(u4) + cmpd rWORD1,rWORD2 + beq cr1,L(u4) + lbzu rWORD3,1(rSTR1) + lbzu rWORD4,1(rSTR2) + bne L(u4) + cmpdi cr1,rWORD3,0 + bdz L(u3) + cmpd rWORD3,rWORD4 + beq cr1,L(u3) + lbzu rWORD1,1(rSTR1) + lbzu rWORD2,1(rSTR2) + beq L(u1) + +L(u3): sub rRTN,rWORD3,rWORD4 + blr +L(u4): sub rRTN,rWORD1,rWORD2 + blr +L(ux): + li rRTN,0 + blr +END (BP_SYM (strncmp)) +libc_hidden_builtin_def (strncmp)
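
For reference: the ftdiv-based fast paths in s_finite.S, s_isinf.S and s_isnan.S
above have no portable C equivalent, but the integer fallback they drop into
(store the double, reload it into a GPR, clear the sign, compare against the
0x7ff0000000000000 exponent pattern) can be sketched in C. This is an
illustrative sketch only, not part of the patch; the helper names are mine,
not glibc's.

#include <stdint.h>
#include <string.h>

/* Raw IEEE-754 bits of a double; the C analogue of the stfd/ld pair.  */
static uint64_t
double_bits (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

/* finite: exponent field not all ones once the sign bit is cleared.  */
static int
sketch_finite (double x)
{
  return (double_bits (x) & 0x7fffffffffffffffULL) < 0x7ff0000000000000ULL;
}

/* isinf: exactly the infinity pattern; the sign selects 1 or -1, matching
   the assembly's beqlr / li r3,-1 sequence.  */
static int
sketch_isinf (double x)
{
  uint64_t u = double_bits (x);
  if ((u & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
    return 0;
  return (u >> 63) ? -1 : 1;
}

/* isnan: anything strictly above the infinity pattern after clearing
   the sign bit.  */
static int
sketch_isnan (double x)
{
  return (double_bits (x) & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL;
}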
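
The end-of-string test in strncmp.S (the rFEFE/r7F7F constants and the
"and. rTMP,rTMP,rNEG" in the doubleword loop) is the standard branch-free
zero-byte detector referenced by the "See strlen.s" comment. A C rendering of
that single test, with names of my choosing rather than glibc's:

#include <stdint.h>

/* Nonzero iff the 8-byte value V contains a zero byte.
   V + 0xfefefefefefefeff is V - 0x0101010101010101, and
   ~(V | 0x7f7f7f7f7f7f7f7f) keeps only the high bit of each byte whose low
   seven bits are clear; their AND has a bit set exactly for the bytes of V
   that are 0x00.  */
static int
has_zero_byte (uint64_t v)
{
  uint64_t tmp = v + 0xfefefefefefefeffULL;     /* add rTMP,rFEFE,rWORD1  */
  uint64_t neg = ~(v | 0x7f7f7f7f7f7f7f7fULL);  /* nor rNEG,r7F7F,rWORD1  */
  return (tmp & neg) != 0;                      /* and. rTMP,rTMP,rNEG    */
}

When the test fires, the assembly falls through to L(endstring), which decides
whether the terminating zero or a differing byte comes first.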
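
memset.S first replicates the fill byte across a doubleword (the rlwimi/insrdi
sequence) so the bulk loops can issue 8-byte stores, then writes 32 bytes per
iteration in L(big_loop); the dcbz 128-byte zero-fill path has no portable C
counterpart and is omitted here. A simplified sketch under those assumptions
(alignment prologue and tail handling elided; names are illustrative, not
glibc's):

#include <stddef.h>
#include <stdint.h>

/* Fill NBLOCKS 32-byte blocks at 8-byte-aligned DST with byte C.  */
static void
set_aligned_blocks (uint64_t *dst, int c, size_t nblocks)
{
  uint64_t v = (uint8_t) c;
  v |= v << 8;    /* byte -> halfword, like rlwimi 4,4,8,16,23  */
  v |= v << 16;   /* halfword -> word, like rlwimi 4,4,16,0,15  */
  v |= v << 32;   /* word -> doubleword, like insrdi 4,4,32,0   */

  while (nblocks--)
    {
      dst[0] = v;
      dst[1] = v;
      dst[2] = v;
      dst[3] = v;
      dst += 4;   /* 32 bytes per iteration, as in L(big_loop).  */
    }
}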