diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
38 files changed, 0 insertions, 6793 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies deleted file mode 100644 index 9d68f39d22..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/Implies +++ /dev/null @@ -1,2 +0,0 @@ -powerpc/powerpc64/power6/fpu -powerpc/powerpc64/power6 diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile deleted file mode 100644 index 89a2296085..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -ifeq ($(subdir),elf) -# Prevent the use of VSX registers and insns in _dl_start, which under -O3 -# optimization may require a TOC reference before relocations are resolved. -CFLAGS-rtld.c += -mno-vsx -endif - -ifeq ($(subdir),string) -sysdep_routines += strstr-ppc64 -CFLAGS-strncase.c += -funroll-loops -CFLAGS-strncase_l.c += -funroll-loops -endif diff --git a/sysdeps/powerpc/powerpc64/power7/add_n.S b/sysdeps/powerpc/powerpc64/power7/add_n.S deleted file mode 100644 index 6425afbc9f..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/add_n.S +++ /dev/null @@ -1,98 +0,0 @@ -/* PowerPC64 mpn_add_n/mpn_sub_n -- mpn addition and - subtraction. - Copyright (C) 2003-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -/* cycles/limb - * POWER7 2.18 - */ - -#ifdef USE_AS_SUB -# define FUNC __mpn_sub_n -# define ADDSUBC subfe -#else -# define FUNC __mpn_add_n -# define ADDSUBC adde -#endif - -#define RP r3 /* Destination limb pointer. */ -#define UP r4 /* First source limb pointer. */ -#define VP r5 /* Second source limb pointer. */ -#define N r6 /* Limb count on entry; r6 is reused for data inside the loop. */ - -EALIGN(FUNC, 5, 0) -#ifdef USE_AS_SUB - addic r0, r1, -1 /* CA = 1: subfe yields ~v + u + CA, so u - v needs the carry set (assumes r1, the stack pointer, is nonzero). */ -#else - addic r0, r0, 0 /* CA = 0: adde must start with no carry-in. */ -#endif - andi. r7, N, 1 /* Odd limb count? */ - beq L(bx0) - - ld r7, 0(UP) /* Peel one limb so an even count remains. */ - ld r9, 0(VP) - ADDSUBC r11, r9, r7 - std r11, 0(RP) - cmpldi cr6, N, 1 - beq cr6, L(end) /* N == 1: done. */ - addi UP, UP, 8 - addi VP, VP, 8 - addi RP, RP, 8 - -L(bx0): addi r0, N, 2 - srdi r0, r0, 2 - mtctr r0 /* CTR = (N + 2) / 4 trips through L(mid). */ - - andi. r7, N, 2 - bne L(mid) /* N mod 4 >= 2: enter at the second half. */ - - addi UP, UP, 16 - addi VP, VP, 16 - addi RP, RP, 16 - - .align 5 -L(top): ld r6, -16(UP) /* r6 (N) now carries a limb; the count lives in CTR. */ - ld r7, -8(UP) - ld r8, -16(VP) - ld r9, -8(VP) - ADDSUBC r10, r8, N - ADDSUBC r11, r9, r7 - std r10, -16(RP) - std r11, -8(RP) -L(mid): ld r6, 0(UP) - ld r7, 8(UP) - ld r8, 0(VP) - ld r9, 8(VP) - ADDSUBC r10, r8, N - ADDSUBC r11, r9, r7 - std r10, 0(RP) - std r11, 8(RP) - addi UP, UP, 32 - addi VP, VP, 32 - addi RP, RP, 32 - bdnz L(top) - -L(end): subfe r3, r0, r0 /* r3 = CA - 1. */ -#ifdef USE_AS_SUB - neg r3, r3 /* Return the borrow: 1 - CA. */ -#else - addi r3, r3, 1 /* Return the carry: CA. 
*/ -#endif - blr -END(FUNC) diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c deleted file mode 100644 index 4a6a400e7a..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/bcopy.c +++ /dev/null @@ -1 +0,0 @@ -/* Implemented at memmove.S */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies deleted file mode 100644 index 30fa17646e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power6/fpu diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies deleted file mode 100644 index 410d289a6d..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies +++ /dev/null @@ -1 +0,0 @@ 
-powerpc/powerpc64/power6/fpu/multiarch diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S deleted file mode 100644 index 9ccc758c9e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S +++ /dev/null @@ -1,70 +0,0 @@ -/* finite(). PowerPC64/POWER7 version. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -/* int __finite(x): x is read from fp1; r3 is set to 1 on the paths that classify x as finite and to 0 otherwise. */ - .section ".toc","aw" -.LC0: /* 1.0 */ - .tc FD_ONE[TC],0x3ff0000000000000 - .section ".text" - .type __finite, @function - .machine power7 -EALIGN (__finite, 4, 0) - CALL_MCOUNT 0 - lfd fp0,.LC0@toc(r2) /* fp0 = the 1.0 constant at .LC0. */ - ftdiv cr7,fp1,fp0 /* Test x / 1.0; the outcome lands in cr7 (CR bits 28-31). */ - li r3,1 /* Preload the "finite" answer. */ - bflr 30 /* Fast path: return 1 when CR bit 30 is clear. */ - - /* If we are here, we either have +/-INF, - NaN or denormal. */ - - stfd fp1,-16(r1) /* Transfer FP to GPR's. */ - ori 2,2,0 /* Force a new dispatch group. 
*/ - lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value - (biased exponent and sign bit). */ - clrlwi r4,r4,17 /* r4 = abs(r4). */ - cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ - bltlr cr7 /* LT means finite, other non-finite. 
*/ - li r3,0 /* Exponent field is all ones: not finite. */ - blr - END (__finite) - -hidden_def (__finite) -weak_alias (__finite, finite) - -/* It turns out that the 'double' version will also always work for - single-precision. */ -strong_alias (__finite, __finitef) -hidden_def (__finitef) -weak_alias (__finitef, finitef) - -#if IS_IN (libm) -# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0) -compat_symbol (libm, __finite, __finitel, GLIBC_2_0) -compat_symbol (libm, finite, finitel, GLIBC_2_0) -# endif -#else -# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0) -compat_symbol (libc, __finite, __finitel, GLIBC_2_0); -compat_symbol (libc, finite, finitel, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S deleted file mode 100644 index 54bd94176d..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_finite.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S deleted file mode 100644 index 4482cddcfa..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S +++ /dev/null @@ -1,69 +0,0 @@ -/* isinf(). PowerPC64/POWER7 version. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -/* int __isinf(x): x is read from fp1; r3 is set to 0 on the fast path, 1 when the upper halfword equals 0x7ff0, and -1 otherwise. */ - .section ".toc","aw" -.LC0: /* 1.0 */ - .tc FD_ONE[TC],0x3ff0000000000000 - .section ".text" - .type __isinf, @function - .machine power7 -EALIGN (__isinf, 4, 0) - CALL_MCOUNT 0 - lfd fp0,.LC0@toc(r2) /* fp0 = the 1.0 constant at .LC0. */ - ftdiv cr7,fp1,fp0 /* Test x / 1.0; the outcome lands in cr7 (CR bits 28-31). */ - li r3,0 /* Preload the "not infinite" answer. */ - bflr 29 /* If not INF, return. */ - - /* Either we have -INF/+INF or a denormal. NOTE(review): every value reaching this point whose upper halfword is not 0x7ff0 returns -1, so correctness relies on only infinities getting past the CR bit 29 test -- confirm against the ftdiv definition. */ - - stfd fp1,-16(r1) /* Transfer FP to GPR's. */ - ori 2,2,0 /* Force a new dispatch group. */ - lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value - (biased exponent and sign bit). */ - cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ - li r3,1 /* Sign bit clear and exponent all ones: +INF. */ - beqlr cr7 /* EQ means INF, otherwise -INF. */ - li r3,-1 /* Anything else is reported as -INF. */ - blr - END (__isinf) - -hidden_def (__isinf) -weak_alias (__isinf, isinf) - -/* It turns out that the 'double' version will also always work for - single-precision. 
*/ -strong_alias (__isinf, __isinff) -hidden_def (__isinff) -weak_alias (__isinff, isinff) - -#ifdef NO_LONG_DOUBLE -strong_alias (__isinf, __isinfl) -weak_alias (__isinf, isinfl) -#endif - -#if !IS_IN (libm) -# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) -compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); -compat_symbol (libc, isinf, isinfl, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S deleted file mode 100644 index be759e091e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_isinf.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S deleted file mode 100644 index 46b08a0d37..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S +++ /dev/null @@ -1,68 +0,0 @@ -/* isnan(). PowerPC64/POWER7 version. 
- Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -/* int __isnan(x): x is read from fp1; r3 is set to 1 when fabs(x), compared as a 64-bit integer, exceeds the bit pattern of +INF, and to 0 otherwise. */ - .section ".toc","aw" -.LC0: /* 1.0 */ - .tc FD_ONE[TC],0x3ff0000000000000 - .section ".text" - .type __isnan, @function - .machine power7 -EALIGN (__isnan, 4, 0) - CALL_MCOUNT 0 - lfd fp0,.LC0@toc(r2) /* fp0 = the 1.0 constant at .LC0. */ - ftdiv cr7,fp1,fp0 /* Test x / 1.0; the outcome lands in cr7 (CR bits 28-31). */ - li r3,0 /* Preload the "not NaN" answer. */ - bflr 30 /* If not NaN, finish. */ - - stfd fp1,-16(r1) /* Transfer FP to GPR's. */ - ori 2,2,0 /* Force a new dispatch group. */ - ld r4,-16(r1) /* Load FP into GPR. */ - lis r0,0x7ff0 - sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */ - clrldi r4,r4,1 /* x = fabs(x) */ - cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */ - blelr cr7 /* LE means not NaN. */ - li r3,1 /* else return 1 */ - blr - END (__isnan) - -hidden_def (__isnan) -weak_alias (__isnan, isnan) - -/* It turns out that the 'double' version will also always work for - single-precision. 
*/ -strong_alias (__isnan, __isnanf) -hidden_def (__isnanf) -weak_alias (__isnanf, isnanf) - -#ifdef NO_LONG_DOUBLE -strong_alias (__isnan, __isnanl) -weak_alias (__isnan, isnanl) -#endif - -#if !IS_IN (libm) -# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) -compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); -compat_symbol (libc, isnan, isnanl, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S deleted file mode 100644 index b48c85e0d3..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_isnan.S. */ diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c deleted file mode 100644 index 2599c771d9..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/powerpc/power7/fpu/s_logb.c> diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c deleted file mode 100644 index 7a5a8032e0..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/powerpc/power7/fpu/s_logbf.c> diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c deleted file mode 100644 index 524ae2c78d..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c +++ /dev/null @@ -1 +0,0 @@ -#include <sysdeps/powerpc/power7/fpu/s_logbl.c> diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S deleted file mode 100644 index 5e9707aa02..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memchr.S +++ /dev/null @@ -1,199 +0,0 @@ -/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. 
- This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */ - -#ifndef MEMCHR -# define MEMCHR __memchr -#endif - .machine power7 -ENTRY (MEMCHR) - CALL_MCOUNT 3 - dcbt 0,r3 - clrrdi r8,r3,3 /* r8 = s rounded down to a doubleword boundary. */ - insrdi r4,r4,8,48 /* Replicate BYTE into the low halfword. */ - - /* Calculate the last acceptable address and check for possible - addition overflow by using saturated math: - r7 = r3 + r5 - r7 |= -(r7 < x) */ - add r7,r3,r5 - subfc r6,r3,r7 - subfe r9,r9,r9 - extsw r6,r9 - or r7,r7,r6 - - insrdi r4,r4,16,32 /* Replicate BYTE into the low word. */ - cmpldi r5,32 - li r9, -1 - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - insrdi r4,r4,32,0 /* Replicate BYTE across the whole doubleword. */ - addi r7,r7,-1 -#ifdef __LITTLE_ENDIAN__ - sld r9,r9,r6 -#else - srd r9,r9,r6 -#endif - ble L(small_range) - - ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */ - and r3,r3,r9 /* Drop matches in the padding bytes that precede s. */ - clrldi r5,r7,61 /* Byte count - 1 in last dword. */ - clrrdi r7,r7,3 /* Address of last doubleword. */ - cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? 
*/ - bne cr7,L(done) - - mtcrf 0x01,r8 - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - bt 28,L(loop_setup) - - /* Handle DWORD2 of pair. 
*/ - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr7,r3,0 - bne cr7,L(done) - -L(loop_setup): - /* The last dword we want to read in the loop below is the one - containing the last byte of the string, ie. the dword at - (s + size - 1) & ~7, or r7. The first dword read is at - r8 + 8, we read 2 * cnt dwords, so the last dword read will - be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives - cnt = (r7 - r8) / 16 */ - sub r6,r7,r8 - srdi r6,r6,4 /* Number of loop iterations. */ - mtctr r6 /* Setup the counter. */ - - /* Main loop to look for BYTE in the string. Since - it's a small loop (8 instructions), align it to 32-bytes. */ - .align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the byte-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r3,r12,r4 - cmpb r9,r11,r4 - or r6,r9,r3 /* Merge everything in one doubleword. */ - cmpldi cr7,r6,0 - bne cr7,L(found) - bdnz L(loop) - - /* We may have one more dword to read. */ - cmpld r8,r7 - beqlr /* r3 is 0 here (the loop found no match), so this returns NULL. */ - - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr6,r3,0 - bne cr6,L(done) - blr /* No match in the final dword: r3 is 0, i.e. NULL. */ - - .align 4 -L(found): - /* OK, one (or both) of the doublewords contains BYTE. Check - the first doubleword and decrement the address in case the first - doubleword really contains BYTE. */ - cmpldi cr6,r3,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* BYTE must be in the second doubleword. Adjust the address - again and move the result of cmpb to r3 so we can calculate the - pointer. */ - - mr r3,r9 - addi r8,r8,8 - - /* r3 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as BYTE in the original - doubleword from the string. Use that to calculate the pointer. - We need to make sure BYTE is *before* the end of the range. 
*/ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r0,r3,-1 - andc r0,r0,r3 - popcntd r0,r0 /* Count trailing zeros. */ -#else - cntlzd r0,r3 /* Count leading zeros before the match. 
*/ -#endif - cmpld r8,r7 /* Are we on the last dword? */ - srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ - add r3,r8,r0 - cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */ - bnelr - blelr cr7 - li r3,0 /* Match lies past the end of the range: return NULL. */ - blr - - .align 4 -L(null): - li r3,0 - blr - -/* Deals with size <= 32. */ - .align 4 -L(small_range): - cmpldi r5,0 - beq L(null) - ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ - and r3,r3,r9 - cmpldi cr7,r3,0 - clrldi r5,r7,61 /* Byte count - 1 in last dword. */ - clrrdi r7,r7,3 /* Address of last doubleword. */ - cmpld r8,r7 /* Are we done already? */ - bne cr7,L(done) - beqlr - - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr6,r3,0 - cmpld r8,r7 - bne cr6,L(done) /* Found something. */ - beqlr /* Hit end of string (length). */ - - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr6,r3,0 - cmpld r8,r7 - bne cr6,L(done) - beqlr - - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr6,r3,0 - cmpld r8,r7 - bne cr6,L(done) - beqlr - - ldu r12,8(r8) - cmpb r3,r12,r4 - cmpldi cr6,r3,0 - bne cr6,L(done) - blr - -END (MEMCHR) -weak_alias (__memchr, memchr) -libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S deleted file mode 100644 index 96ce8cee25..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memcmp.S +++ /dev/null @@ -1,1061 +0,0 @@ -/* Optimized memcmp implementation for POWER7/PowerPC64. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* int [r3] memcmp (const char *s1 [r3], - const char *s2 [r4], - size_t size [r5]) */ -#ifndef MEMCMP -# define MEMCMP memcmp -#endif - .machine power7 -EALIGN (MEMCMP, 4, 0) - CALL_MCOUNT 3 - -#define rRTN r3 -#define rSTR1 r3 /* first string arg */ -#define rSTR2 r4 /* second string arg */ -#define rN r5 /* max string length */ -#define rWORD1 r6 /* current word in s1 */ -#define rWORD2 r7 /* current word in s2 */ -#define rWORD3 r8 /* next word in s1 */ -#define rWORD4 r9 /* next word in s2 */ -#define rWORD5 r10 /* next word in s1 */ -#define rWORD6 r11 /* next word in s2 */ - -#define rOFF8 r20 /* 8 bytes offset. */ -#define rOFF16 r21 /* 16 bytes offset. */ -#define rOFF24 r22 /* 24 bytes offset. */ -#define rOFF32 r23 /* 24 bytes offset. */ -#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ -#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ -#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ -#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ -#define rSHR r28 /* Unaligned shift right count. */ -#define rSHL r29 /* Unaligned shift left count. 
*/ -#define rWORD7 r30 /* next word in s1 */ -#define rWORD8 r31 /* next word in s2 */ - -#define rWORD8SAVE (-8) -#define rWORD7SAVE (-16) -#define rOFF8SAVE (-24) -#define rOFF16SAVE (-32) -#define rOFF24SAVE (-40) -#define rOFF32SAVE (-48) -#define rSHRSAVE (-56) -#define rSHLSAVE (-64) -#define rWORD8SHIFTSAVE (-72) -#define rWORD2SHIFTSAVE (-80) -#define rWORD4SHIFTSAVE (-88) -#define rWORD6SHIFTSAVE (-96) - -#ifdef __LITTLE_ENDIAN__ -# define LD ldbrx -#else -# define LD ldx -#endif - - xor r0, rSTR2, rSTR1 - cmpldi cr6, rN, 0 - cmpldi cr1, rN, 12 - clrldi. r0, r0, 61 - clrldi r12, rSTR1, 61 - cmpldi cr5, r12, 0 - beq- cr6, L(zeroLength) - dcbt 0, rSTR1 - dcbt 0, rSTR2 -/* If less than 8 bytes or not aligned, use the unaligned - byte loop. */ - blt cr1, L(bytealigned) - std rWORD8, rWORD8SAVE(r1) - std rWORD7, rWORD7SAVE(r1) - std rOFF8, rOFF8SAVE(r1) - std rOFF16, rOFF16SAVE(r1) - std rOFF24, rOFF24SAVE(r1) - std rOFF32, rOFF32SAVE(r1) - cfi_offset(rWORD8, rWORD8SAVE) - cfi_offset(rWORD7, rWORD7SAVE) - cfi_offset(rOFF8, rOFF8SAVE) - cfi_offset(rOFF16, rOFF16SAVE) - cfi_offset(rOFF24, rOFF24SAVE) - cfi_offset(rOFF32, rOFF32SAVE) - - li rOFF8,8 - li rOFF16,16 - li rOFF24,24 - li rOFF32,32 - - bne L(unaligned) -/* At this point we know both strings have the same alignment and the - compare length is at least 8 bytes. r12 contains the low order - 3 bits of rSTR1 and cr5 contains the result of the logical compare - of r12 to 0. If r12 == 0 then we are already double word - aligned and can perform the DW aligned loop. - - Otherwise we know the two strings have the same alignment (but not - yet DW). So we force the string addresses to the next lower DW - boundary and special case this first DW using shift left to - eliminate bits preceding the first byte. Since we want to join the - normal (DW aligned) compare loop, starting at the second double word, - we need to adjust the length (rN) and special case the loop - versioning for the first DW. 
This ensures that the loop count is - correct and the first DW (shifted) is in the expected register pair. */ - .align 4 -L(samealignment): - clrrdi rSTR1, rSTR1, 3 - clrrdi rSTR2, rSTR2, 3 - beq cr5, L(DWaligned) - add rN, rN, r12 - sldi rWORD6, r12, 3 - srdi r0, rN, 5 /* Divide by 32 */ - andi. r12, rN, 24 /* Get the DW remainder */ - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - beq L(dPs4) - mtctr r0 - bgt cr1, L(dPs3) - beq cr1, L(dPs2) - -/* Remainder is 8 */ - .align 3 -L(dsP1): - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD2, rWORD6 - cmpld cr5, rWORD5, rWORD6 - blt cr7, L(dP1x) -/* Do something useful in this cycle since we have to branch anyway. */ - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 - b L(dP1e) -/* Remainder is 16 */ - .align 4 -L(dPs2): - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD2, rWORD6 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP2x) -/* Do something useful in this cycle since we have to branch anyway. */ - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 - b L(dP2e) -/* Remainder is 24 */ - .align 4 -L(dPs3): - sld rWORD3, rWORD1, rWORD6 - sld rWORD4, rWORD2, rWORD6 - cmpld cr1, rWORD3, rWORD4 - b L(dP3e) -/* Count is a multiple of 32, remainder is 0 */ - .align 4 -L(dPs4): - mtctr r0 - sld rWORD1, rWORD1, rWORD6 - sld rWORD2, rWORD2, rWORD6 - cmpld cr7, rWORD1, rWORD2 - b L(dP4e) - -/* At this point we know both strings are double word aligned and the - compare length is at least 8 bytes. */ - .align 4 -L(DWaligned): - andi. r12, rN, 24 /* Get the DW remainder */ - srdi r0, rN, 5 /* Divide by 32 */ - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - beq L(dP4) - bgt cr1, L(dP3) - beq cr1, L(dP2) - -/* Remainder is 8 */ - .align 4 -L(dP1): - mtctr r0 -/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early - (8-15 byte compare), we want to use only volatile registers. 
This - means we can avoid restoring non-volatile registers since we did not - change any on the early exit path. The key here is the non-early - exit path only cares about the condition code (cr5), not about which - register pair was used. */ - LD rWORD5, 0, rSTR1 - LD rWORD6, 0, rSTR2 - cmpld cr5, rWORD5, rWORD6 - blt cr7, L(dP1x) - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 -L(dP1e): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr1, rWORD3, rWORD4 - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5x) - bne cr7, L(dLcr7x) - - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - bne cr1, L(dLcr1) - cmpld cr5, rWORD7, rWORD8 - bdnz L(dLoop) - bne cr6, L(dLcr6) - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - .align 3 -L(dP1x): - sldi. r12, rN, 3 - bne cr5, L(dLcr5x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Remainder is 16 */ - .align 4 -L(dP2): - mtctr r0 - LD rWORD5, 0, rSTR1 - LD rWORD6, 0, rSTR2 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP2x) - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 -L(dP2e): - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - LD rWORD3, rOFF24, rSTR1 - LD rWORD4, rOFF24, rSTR2 - cmpld cr1, rWORD3, rWORD4 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(dLcr6) - bne cr5, L(dLcr5) - b L(dLoop2) - .align 4 -L(dP2x): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - sldi. r12, rN, 3 - bne cr6, L(dLcr6x) - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr1, L(dLcr1x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Remainder is 24 */ - .align 4 -L(dP3): - mtctr r0 - LD rWORD3, 0, rSTR1 - LD rWORD4, 0, rSTR2 - cmpld cr1, rWORD3, rWORD4 -L(dP3e): - LD rWORD5, rOFF8, rSTR1 - LD rWORD6, rOFF8, rSTR2 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP3x) - LD rWORD7, rOFF16, rSTR1 - LD rWORD8, rOFF16, rSTR2 - cmpld cr5, rWORD7, rWORD8 - LD rWORD1, rOFF24, rSTR1 - LD rWORD2, rOFF24, rSTR2 - cmpld cr7, rWORD1, rWORD2 - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - bne cr1, L(dLcr1) - bne cr6, L(dLcr6) - b L(dLoop1) -/* Again we are on a early exit path (24-31 byte compare), we want to - only use volatile registers and avoid restoring non-volatile - registers. */ - .align 4 -L(dP3x): - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - sldi. r12, rN, 3 - bne cr1, L(dLcr1x) - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - bne cr6, L(dLcr6x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ - bne cr7, L(dLcr7x) - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Count is a multiple of 32, remainder is 0 */ - .align 4 -L(dP4): - mtctr r0 - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpld cr7, rWORD1, rWORD2 -L(dP4e): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - LD rWORD5, rOFF16, rSTR1 - LD rWORD6, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - LD rWORD7, rOFF24, rSTR1 - LD rWORD8, rOFF24, rSTR2 - addi rSTR1, rSTR1, 24 - addi rSTR2, rSTR2, 24 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(dLcr7) - bne cr1, L(dLcr1) - bdz- L(d24) /* Adjust CTR as we start with +4 */ -/* This is the primary loop */ - .align 4 -L(dLoop): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(dLcr6) -L(dLoop1): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5) -L(dLoop2): - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(dLcr7) -L(dLoop3): - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - bne cr1, L(dLcr1) - cmpld cr7, rWORD1, rWORD2 - bdnz L(dLoop) - -L(dL4): - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(dLcr6) - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5) - cmpld cr5, rWORD7, rWORD8 -L(d44): - bne cr7, L(dLcr7) -L(d34): - bne cr1, L(dLcr1) -L(d24): - bne cr6, L(dLcr6) -L(d14): - sldi. r12, rN, 3 - bne cr5, L(dLcr5) -L(d04): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ - beq L(duzeroLength) -/* At this point we have a remainder of 1 to 7 bytes to compare. Since - we are aligned it is safe to load the whole double word, and use - shift right double to eliminate bits beyond the compare length. 
*/ -L(d00): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - srd rWORD1, rWORD1, rN - srd rWORD2, rWORD2, rN - cmpld cr7, rWORD1, rWORD2 - bne cr7, L(dLcr7x) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - - .align 4 -L(dLcr7): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr7x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr7 - li rRTN, -1 - blr - .align 4 -L(dLcr1): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr1x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr1 - li rRTN, -1 - blr - .align 4 -L(dLcr6): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr6x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - .align 4 -L(dLcr5): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr5x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr5 - li rRTN, -1 - blr - - .align 4 -L(bytealigned): - mtctr rN - -/* We need to prime this loop. This loop is swing modulo scheduled - to avoid pipe delays. The dependent instruction latencies (load to - compare to conditional branch) is 2 to 3 cycles. In this loop each - dispatch group ends in a branch and takes 1 cycle. Effectively - the first iteration of the loop only serves to load operands and - branches based on compares are delayed until the next loop. - - So we must precondition some registers and condition codes so that - we don't exit the loop early on the first iteration. 
*/ - - lbz rWORD1, 0(rSTR1) - lbz rWORD2, 0(rSTR2) - bdz L(b11) - cmpld cr7, rWORD1, rWORD2 - lbz rWORD3, 1(rSTR1) - lbz rWORD4, 1(rSTR2) - bdz L(b12) - cmpld cr1, rWORD3, rWORD4 - lbzu rWORD5, 2(rSTR1) - lbzu rWORD6, 2(rSTR2) - bdz L(b13) - .align 4 -L(bLoop): - lbzu rWORD1, 1(rSTR1) - lbzu rWORD2, 1(rSTR2) - bne cr7, L(bLcr7) - - cmpld cr6, rWORD5, rWORD6 - bdz L(b3i) - - lbzu rWORD3, 1(rSTR1) - lbzu rWORD4, 1(rSTR2) - bne cr1, L(bLcr1) - - cmpld cr7, rWORD1, rWORD2 - bdz L(b2i) - - lbzu rWORD5, 1(rSTR1) - lbzu rWORD6, 1(rSTR2) - bne cr6, L(bLcr6) - - cmpld cr1, rWORD3, rWORD4 - bdnz L(bLoop) - -/* We speculatively loading bytes before we have tested the previous - bytes. But we must avoid overrunning the length (in the ctr) to - prevent these speculative loads from causing a segfault. In this - case the loop will exit early (before the all pending bytes are - tested. In this case we must complete the pending operations - before returning. */ -L(b1i): - bne cr7, L(bLcr7) - bne cr1, L(bLcr1) - b L(bx56) - .align 4 -L(b2i): - bne cr6, L(bLcr6) - bne cr7, L(bLcr7) - b L(bx34) - .align 4 -L(b3i): - bne cr1, L(bLcr1) - bne cr6, L(bLcr6) - b L(bx12) - .align 4 -L(bLcr7): - li rRTN, 1 - bgtlr cr7 - li rRTN, -1 - blr -L(bLcr1): - li rRTN, 1 - bgtlr cr1 - li rRTN, -1 - blr -L(bLcr6): - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - -L(b13): - bne cr7, L(bx12) - bne cr1, L(bx34) -L(bx56): - sub rRTN, rWORD5, rWORD6 - blr - nop -L(b12): - bne cr7, L(bx12) -L(bx34): - sub rRTN, rWORD3, rWORD4 - blr -L(b11): -L(bx12): - sub rRTN, rWORD1, rWORD2 - blr - - .align 4 -L(zeroLength): - li rRTN, 0 - blr - - .align 4 -/* At this point we know the strings have different alignment and the - compare length is at least 8 bytes. r12 contains the low order - 3 bits of rSTR1 and cr5 contains the result of the logical compare - of r12 to 0. If r12 == 0 then rStr1 is double word - aligned and can perform the DWunaligned loop. - - Otherwise we know that rSTR1 is not already DW aligned yet. 
- So we can force the string addresses to the next lower DW - boundary and special case this first DW using shift left to - eliminate bits preceding the first byte. Since we want to join the - normal (DWaligned) compare loop, starting at the second double word, - we need to adjust the length (rN) and special case the loop - versioning for the first DW. This ensures that the loop count is - correct and the first DW (shifted) is in the expected register pair. */ -L(unaligned): - std rSHL, rSHLSAVE(r1) - cfi_offset(rSHL, rSHLSAVE) - clrldi rSHL, rSTR2, 61 - beq cr6, L(duzeroLength) - std rSHR, rSHRSAVE(r1) - cfi_offset(rSHR, rSHRSAVE) - beq cr5, L(DWunaligned) - std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) -/* Adjust the logical start of rSTR2 to compensate for the extra bits - in the 1st rSTR1 DW. */ - sub rWORD8_SHIFT, rSTR2, r12 -/* But do not attempt to address the DW before the DW that contains - the actual start of rSTR2. */ - clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) -/* Compute the left/right shift counts for the unaligned rSTR2, - compensating for the logical (DW aligned) start of rSTR1. */ - clrldi rSHL, rWORD8_SHIFT, 61 - clrrdi rSTR1, rSTR1, 3 - std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - sldi rSHL, rSHL, 3 - cmpld cr5, rWORD8_SHIFT, rSTR2 - add rN, rN, r12 - sldi rWORD6, r12, 3 - std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) - cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) - cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) - subfic rSHR, rSHL, 64 - srdi r0, rN, 5 /* Divide by 32 */ - andi. r12, rN, 24 /* Get the DW remainder */ -/* We normally need to load 2 DWs to start the unaligned rSTR2, but in - this special case those bits may be discarded anyway. Also we - must avoid loading a DW where none of the bits are part of rSTR2 as - this may cross a page boundary and cause a page fault.
*/ - li rWORD8, 0 - blt cr5, L(dus0) - LD rWORD8, 0, rSTR2 - addi rSTR2, rSTR2, 8 - sld rWORD8, rWORD8, rSHL - -L(dus0): - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - srd r12, rWORD2, rSHR - clrldi rN, rN, 61 - beq L(duPs4) - mtctr r0 - or rWORD8, r12, rWORD8 - bgt cr1, L(duPs3) - beq cr1, L(duPs2) - -/* Remainder is 8 */ - .align 4 -L(dusP1): - sld rWORD8_SHIFT, rWORD2, rSHL - sld rWORD7, rWORD1, rWORD6 - sld rWORD8, rWORD8, rWORD6 - bge cr7, L(duP1e) -/* At this point we exit early with the first double word compare - complete and remainder of 0 to 7 bytes. See L(du14) for details on - how we handle the remaining bytes. */ - cmpld cr5, rWORD7, rWORD8 - sldi. rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) -/* Remainder is 16 */ - .align 4 -L(duPs2): - sld rWORD6_SHIFT, rWORD2, rSHL - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD8, rWORD6 - b L(duP2e) -/* Remainder is 24 */ - .align 4 -L(duPs3): - sld rWORD4_SHIFT, rWORD2, rSHL - sld rWORD3, rWORD1, rWORD6 - sld rWORD4, rWORD8, rWORD6 - b L(duP3e) -/* Count is a multiple of 32, remainder is 0 */ - .align 4 -L(duPs4): - mtctr r0 - or rWORD8, r12, rWORD8 - sld rWORD2_SHIFT, rWORD2, rSHL - sld rWORD1, rWORD1, rWORD6 - sld rWORD2, rWORD8, rWORD6 - b L(duP4e) - -/* At this point we know rSTR1 is double word aligned and the - compare length is at least 8 bytes. */ - .align 4 -L(DWunaligned): - std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - srdi r0, rN, 5 /* Divide by 32 */ - std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - andi. 
r12, rN, 24 /* Get the DW remainder */ - std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) - cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) - cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) - cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) - sldi rSHL, rSHL, 3 - LD rWORD6, 0, rSTR2 - LD rWORD8, rOFF8, rSTR2 - addi rSTR2, rSTR2, 8 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - subfic rSHR, rSHL, 64 - sld rWORD6_SHIFT, rWORD6, rSHL - beq L(duP4) - mtctr r0 - bgt cr1, L(duP3) - beq cr1, L(duP2) - -/* Remainder is 8 */ - .align 4 -L(duP1): - srd r12, rWORD8, rSHR - LD rWORD7, 0, rSTR1 - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP1x) -L(duP1e): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - bne cr5, L(duLcr5) - or rWORD4, r12, rWORD2_SHIFT - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr1, rWORD3, rWORD4 - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - bne cr7, L(duLcr7) - or rWORD6, r0, rWORD4_SHIFT - cmpld cr6, rWORD5, rWORD6 - b L(duLoop3) - .align 4 -/* At this point we exit early with the first double word compare - complete and remainder of 0 to 7 bytes. See L(du14) for details on - how we handle the remaining bytes. */ -L(duP1x): - cmpld cr5, rWORD7, rWORD8 - sldi. 
rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) -/* Remainder is 16 */ - .align 4 -L(duP2): - srd r0, rWORD8, rSHR - LD rWORD5, 0, rSTR1 - or rWORD6, r0, rWORD6_SHIFT - sld rWORD6_SHIFT, rWORD8, rSHL -L(duP2e): - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr6, rWORD5, rWORD6 - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP2x) - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - LD rWORD3, rOFF24, rSTR1 - LD rWORD4, rOFF24, rSTR2 - cmpld cr7, rWORD1, rWORD2 - bne cr5, L(duLcr5) - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - cmpld cr1, rWORD3, rWORD4 - b L(duLoop2) - .align 4 -L(duP2x): - cmpld cr5, rWORD7, rWORD8 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(duLcr6) - sldi. 
rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) - -/* Remainder is 24 */ - .align 4 -L(duP3): - srd r12, rWORD8, rSHR - LD rWORD3, 0, rSTR1 - sld rWORD4_SHIFT, rWORD8, rSHL - or rWORD4, r12, rWORD6_SHIFT -L(duP3e): - LD rWORD5, rOFF8, rSTR1 - LD rWORD6, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT - LD rWORD7, rOFF16, rSTR1 - LD rWORD8, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP3x) - LD rWORD1, rOFF24, rSTR1 - LD rWORD2, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - cmpld cr7, rWORD1, rWORD2 - b L(duLoop1) - .align 4 -L(duP3x): - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - sldi. 
rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) - -/* Count is a multiple of 32, remainder is 0 */ - .align 4 -L(duP4): - mtctr r0 - srd r0, rWORD8, rSHR - LD rWORD1, 0, rSTR1 - sld rWORD2_SHIFT, rWORD8, rSHL - or rWORD2, r0, rWORD6_SHIFT -L(duP4e): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT - LD rWORD5, rOFF16, rSTR1 - LD rWORD6, rOFF16, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr7, L(duLcr7) - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT - LD rWORD7, rOFF24, rSTR1 - LD rWORD8, rOFF24, rSTR2 - addi rSTR1, rSTR1, 24 - addi rSTR2, rSTR2, 24 - cmpld cr6, rWORD5, rWORD6 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - cmpld cr5, rWORD7, rWORD8 - bdz L(du24) /* Adjust CTR as we start with +4 */ -/* This is the primary loop */ - .align 4 -L(duLoop): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT -L(duLoop1): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(duLcr5) - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT -L(duLoop2): - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(duLcr7) - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT -L(duLoop3): - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - cmpld cr7, rWORD1, rWORD2 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - bdnz L(duLoop) - -L(duL4): - cmpld cr1, rWORD3, 
rWORD4 - bne cr6, L(duLcr6) - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(duLcr5) - cmpld cr5, rWORD7, rWORD8 -L(du44): - bne cr7, L(duLcr7) -L(du34): - bne cr1, L(duLcr1) -L(du24): - bne cr6, L(duLcr6) -L(du14): - sldi. rN, rN, 3 - bne cr5, L(duLcr5) -/* At this point we have a remainder of 1 to 7 bytes to compare. We use - shift right double to eliminate bits beyond the compare length. - - However it may not be safe to load rWORD2 which may be beyond the - string length. So we compare the bit length of the remainder to - the right shift count (rSHR). If the bit count is less than or equal - we do not need to load rWORD2 (all significant bits are already in - rWORD8_SHIFT). */ - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - .align 4 -L(dutrim): - LD rWORD1, rOFF8, rSTR1 - ld rWORD8, -8(r1) - subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ - or rWORD2, r0, rWORD8_SHIFT - ld rWORD7, rWORD7SAVE(r1) - ld rSHL, rSHLSAVE(r1) - srd rWORD1, rWORD1, rN - srd rWORD2, rWORD2, rN - ld rSHR, rSHRSAVE(r1) - ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - li rRTN, 0 - cmpld cr7, rWORD1, rWORD2 - ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - beq cr7, L(dureturn24) - li rRTN, 1 - ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - bgtlr cr7 - li rRTN, -1 - blr - .align 4 -L(duLcr7): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr7, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr1): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr1, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr6): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr6, L(dureturn29) - ld rSHL, rSHLSAVE(r1) 
- ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr5): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr5, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - - .align 3 -L(duZeroReturn): - li rRTN, 0 - .align 4 -L(dureturn): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dureturn29): - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) -L(dureturn27): - ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) -L(dureturn24): - ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - blr - -L(duzeroLength): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -END (MEMCMP) -libc_hidden_builtin_def (memcmp) -weak_alias (memcmp, bcmp) diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S deleted file mode 100644 index e08993cbc3..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S +++ /dev/null @@ -1,430 +0,0 @@ -/* Optimized memcpy implementation for PowerPC64/POWER7. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - -/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); - Returns 'dst'. */ - -#ifndef MEMCPY -# define MEMCPY memcpy -#endif - -#define dst 11 /* Use r11 so r3 kept unchanged. */ -#define src 4 -#define cnt 5 - - .machine power7 -EALIGN (MEMCPY, 5, 0) - CALL_MCOUNT 3 - - cmpldi cr1,cnt,31 - neg 0,3 - ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move - code. */ - -/* Align copies using VSX instructions to quadword. It is to avoid alignment - traps when memcpy is used on non-cacheable memory (for instance, memory - mapped I/O). */ - andi. 10,3,15 - clrldi 11,4,60 - cmpld cr6,10,11 /* SRC and DST alignments match? */ - - mr dst,3 - bne cr6,L(copy_GE_32_unaligned) - beq L(aligned_copy) - - mtocrf 0x01,0 - clrldi 0,0,60 - -/* Get the DST and SRC aligned to 16 bytes. */ -1: - bf 31,2f - lbz 6,0(src) - addi src,src,1 - stb 6,0(dst) - addi dst,dst,1 -2: - bf 30,4f - lhz 6,0(src) - addi src,src,2 - sth 6,0(dst) - addi dst,dst,2 -4: - bf 29,8f - lwz 6,0(src) - addi src,src,4 - stw 6,0(dst) - addi dst,dst,4 -8: - bf 28,16f - ld 6,0(src) - addi src,src,8 - std 6,0(dst) - addi dst,dst,8 -16: - subf cnt,0,cnt - -/* Main aligned copy loop. Copies 128 bytes at a time. */ -L(aligned_copy): - li 6,16 - li 7,32 - li 8,48 - mtocrf 0x02,cnt - srdi 12,cnt,7 - cmpdi 12,0 - beq L(aligned_tail) - lxvd2x 6,0,src - lxvd2x 7,src,6 - mtctr 12 - b L(aligned_128loop) - - .align 4 -L(aligned_128head): - /* for the 2nd + iteration of this loop. 
*/ - lxvd2x 6,0,src - lxvd2x 7,src,6 -L(aligned_128loop): - lxvd2x 8,src,7 - lxvd2x 9,src,8 - stxvd2x 6,0,dst - addi src,src,64 - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 - lxvd2x 6,0,src - lxvd2x 7,src,6 - addi dst,dst,64 - lxvd2x 8,src,7 - lxvd2x 9,src,8 - addi src,src,64 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 - addi dst,dst,64 - bdnz L(aligned_128head) - -L(aligned_tail): - mtocrf 0x01,cnt - bf 25,32f - lxvd2x 6,0,src - lxvd2x 7,src,6 - lxvd2x 8,src,7 - lxvd2x 9,src,8 - addi src,src,64 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 - stxvd2x 8,dst,7 - stxvd2x 9,dst,8 - addi dst,dst,64 -32: - bf 26,16f - lxvd2x 6,0,src - lxvd2x 7,src,6 - addi src,src,32 - stxvd2x 6,0,dst - stxvd2x 7,dst,6 - addi dst,dst,32 -16: - bf 27,8f - lxvd2x 6,0,src - addi src,src,16 - stxvd2x 6,0,dst - addi dst,dst,16 -8: - bf 28,4f - ld 6,0(src) - addi src,src,8 - std 6,0(dst) - addi dst,dst,8 -4: /* Copies 4~7 bytes. */ - bf 29,L(tail2) - lwz 6,0(src) - stw 6,0(dst) - bf 30,L(tail5) - lhz 7,4(src) - sth 7,4(dst) - bflr 31 - lbz 8,6(src) - stb 8,6(dst) - /* Return original DST pointer. */ - blr - - -/* Handle copies of 0~31 bytes. */ - .align 4 -L(copy_LT_32): - mr dst,3 - cmpldi cr6,cnt,8 - mtocrf 0x01,cnt - ble cr6,L(copy_LE_8) - - /* At least 9 bytes to go. */ - neg 8,4 - andi. 0,8,3 - cmpldi cr1,cnt,16 - beq L(copy_LT_32_aligned) - - /* Force 4-byte alignment for SRC. */ - mtocrf 0x01,0 - subf cnt,0,cnt -2: - bf 30,1f - lhz 6,0(src) - addi src,src,2 - sth 6,0(dst) - addi dst,dst,2 -1: - bf 31,L(end_4bytes_alignment) - lbz 6,0(src) - addi src,src,1 - stb 6,0(dst) - addi dst,dst,1 - - .align 4 -L(end_4bytes_alignment): - cmpldi cr1,cnt,16 - mtocrf 0x01,cnt - -L(copy_LT_32_aligned): - /* At least 6 bytes to go, and SRC is word-aligned. */ - blt cr1,8f - - /* Copy 16 bytes. */ - lwz 6,0(src) - lwz 7,4(src) - stw 6,0(dst) - lwz 8,8(src) - stw 7,4(dst) - lwz 6,12(src) - addi src,src,16 - stw 8,8(dst) - stw 6,12(dst) - addi dst,dst,16 -8: /* Copy 8 bytes. 
*/ - bf 28,L(tail4) - lwz 6,0(src) - lwz 7,4(src) - addi src,src,8 - stw 6,0(dst) - stw 7,4(dst) - addi dst,dst,8 - - .align 4 -/* Copies 4~7 bytes. */ -L(tail4): - bf 29,L(tail2) - lwz 6,0(src) - stw 6,0(dst) - bf 30,L(tail5) - lhz 7,4(src) - sth 7,4(dst) - bflr 31 - lbz 8,6(src) - stb 8,6(dst) - /* Return original DST pointer. */ - blr - - .align 4 -/* Copies 2~3 bytes. */ -L(tail2): - bf 30,1f - lhz 6,0(src) - sth 6,0(dst) - bflr 31 - lbz 7,2(src) - stb 7,2(dst) - blr - - .align 4 -L(tail5): - bflr 31 - lbz 6,4(src) - stb 6,4(dst) - blr - - .align 4 -1: - bflr 31 - lbz 6,0(src) - stb 6,0(dst) - /* Return original DST pointer. */ - blr - - -/* Handles copies of 0~8 bytes. */ - .align 4 -L(copy_LE_8): - bne cr6,L(tail4) - - /* Though we could've used ld/std here, they are still - slow for unaligned cases. */ - - lwz 6,0(src) - lwz 7,4(src) - stw 6,0(dst) - stw 7,4(dst) - blr - - -/* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. */ - .align 4 -L(copy_GE_32_unaligned): - clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ - srdi 9,cnt,4 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_unaligned_cont) - - /* DST is not quadword aligned, get it aligned. */ - - mtocrf 0x01,0 - subf cnt,0,cnt - - /* Vector instructions work best when proper alignment (16-bytes) - is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: - bf 31,2f - lbz 6,0(src) - addi src,src,1 - stb 6,0(dst) - addi dst,dst,1 -2: - bf 30,4f - lhz 6,0(src) - addi src,src,2 - sth 6,0(dst) - addi dst,dst,2 -4: - bf 29,8f - lwz 6,0(src) - addi src,src,4 - stw 6,0(dst) - addi dst,dst,4 -8: - bf 28,0f - ld 6,0(src) - addi src,src,8 - std 6,0(dst) - addi dst,dst,8 -0: - srdi 9,cnt,4 /* Number of full quadwords remaining. */ - - /* The proper alignment is present, it is OK to copy the bytes now. 
*/ -L(copy_GE_32_unaligned_cont): - - /* Setup two indexes to speed up the indexed vector operations. */ - clrldi 10,cnt,60 - li 6,16 /* Index for 16-bytes offsets. */ - li 7,32 /* Index for 32-bytes offsets. */ - cmpldi cr1,10,0 - srdi 8,cnt,5 /* Setup the loop counter. */ - mtocrf 0x01,9 - cmpldi cr6,9,1 -#ifdef __LITTLE_ENDIAN__ - lvsr 5,0,src -#else - lvsl 5,0,src -#endif - lvx 3,0,src - li 0,0 - bf 31,L(setup_unaligned_loop) - - /* Copy another 16 bytes to align to 32-bytes due to the loop. */ - lvx 4,src,6 -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - addi src,src,16 - stvx 6,0,dst - addi dst,dst,16 - vor 3,4,4 - clrrdi 0,src,60 - -L(setup_unaligned_loop): - mtctr 8 - ble cr6,L(end_unaligned_loop) - - /* Copy 32 bytes at a time using vector instructions. */ - .align 4 -L(unaligned_loop): - - /* Note: vr6/vr10 may contain data that was already copied, - but in order to get proper alignment, we may have to copy - some portions again. This is faster than having unaligned - vector instructions though. */ - - lvx 4,src,6 -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - lvx 3,src,7 -#ifdef __LITTLE_ENDIAN__ - vperm 10,3,4,5 -#else - vperm 10,4,3,5 -#endif - addi src,src,32 - stvx 6,0,dst - stvx 10,dst,6 - addi dst,dst,32 - bdnz L(unaligned_loop) - - clrrdi 0,src,60 - - .align 4 -L(end_unaligned_loop): - - /* Check for tail bytes. */ - mtocrf 0x01,cnt - beqlr cr1 - - add src,src,0 - - /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ - /* Copy 8 bytes. */ - bf 28,4f - lwz 6,0(src) - lwz 7,4(src) - addi src,src,8 - stw 6,0(dst) - stw 7,4(dst) - addi dst,dst,8 -4: /* Copy 4~7 bytes. */ - bf 29,L(tail2) - lwz 6,0(src) - stw 6,0(dst) - bf 30,L(tail5) - lhz 7,4(src) - sth 7,4(dst) - bflr 31 - lbz 8,6(src) - stb 8,6(dst) - /* Return original DST pointer. 
*/ - blr - -END_GEN_TB (MEMCPY,TB_TOCLESS) -libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S deleted file mode 100644 index 4c0f7c3571..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memmove.S +++ /dev/null @@ -1,835 +0,0 @@ -/* Optimized memmove implementation for PowerPC64/POWER7. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - -/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5]) - - This implementation checks whether 'dest' starts inside the 'src' region - (dest - src < len, compared as unsigned). If it does not, an optimized - forward copy is used (similar to memcpy for POWER7, - embedded here to gain some cycles). - If it does, an optimized backwards copy is used - instead. */ - -#ifndef MEMMOVE -# define MEMMOVE memmove -#endif - .machine power7 -EALIGN (MEMMOVE, 5, 0) - CALL_MCOUNT 3 - -L(_memmove): - subf r9,r4,r3 - cmpld cr7,r9,r5 - blt cr7,L(memmove_bwd) - - cmpldi cr1,r5,31 - neg 0,3 - ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move - code. */ - - andi. 10,3,15 - clrldi 11,4,60 - cmpld cr6,10,11 /* SRC and DST alignments match?
*/ - - mr r11,3 - bne cr6,L(copy_GE_32_unaligned) - beq L(aligned_copy) - - mtocrf 0x01,0 - clrldi 0,0,60 - -/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ -1: - bf 31,2f - lbz 6,0(r4) - addi r4,r4,1 - stb 6,0(r11) - addi r11,r11,1 -2: - bf 30,4f - lhz 6,0(r4) - addi r4,r4,2 - sth 6,0(r11) - addi r11,r11,2 -4: - bf 29,8f - lwz 6,0(r4) - addi r4,r4,4 - stw 6,0(r11) - addi r11,r11,4 -8: - bf 28,16f - ld 6,0(r4) - addi r4,r4,8 - std 6,0(r11) - addi r11,r11,8 -16: - subf r5,0,r5 - -/* Main aligned copy loop. Copies 128 bytes at a time. */ -L(aligned_copy): - li 6,16 - li 7,32 - li 8,48 - mtocrf 0x02,r5 - srdi 12,r5,7 - cmpdi 12,0 - beq L(aligned_tail) - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 - mtctr 12 - b L(aligned_128loop) - - .align 4 -L(aligned_128head): - /* for the 2nd + iteration of this loop. */ - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 -L(aligned_128loop): - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 - stxvd2x 6,0,r11 - addi r4,r4,64 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 - addi r11,r11,64 - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 - addi r4,r4,64 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 - addi r11,r11,64 - bdnz L(aligned_128head) - -L(aligned_tail): - mtocrf 0x01,r5 - bf 25,32f - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 - lxvd2x 8,r4,7 - lxvd2x 9,r4,8 - addi r4,r4,64 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 - stxvd2x 8,r11,7 - stxvd2x 9,r11,8 - addi r11,r11,64 -32: - bf 26,16f - lxvd2x 6,0,r4 - lxvd2x 7,r4,6 - addi r4,r4,32 - stxvd2x 6,0,r11 - stxvd2x 7,r11,6 - addi r11,r11,32 -16: - bf 27,8f - lxvd2x 6,0,r4 - addi r4,r4,16 - stxvd2x 6,0,r11 - addi r11,r11,16 -8: - bf 28,4f - ld 6,0(r4) - addi r4,r4,8 - std 6,0(r11) - addi r11,r11,8 -4: /* Copies 4~7 bytes. */ - bf 29,L(tail2) - lwz 6,0(r4) - stw 6,0(r11) - bf 30,L(tail5) - lhz 7,4(r4) - sth 7,4(r11) - bflr 31 - lbz 8,6(r4) - stb 8,6(r11) - /* Return original DST pointer. */ - blr - -/* Handle copies of 0~31 bytes. 
*/ - .align 4 -L(copy_LT_32): - mr r11,3 - cmpldi cr6,r5,8 - mtocrf 0x01,r5 - ble cr6,L(copy_LE_8) - - /* At least 9 bytes to go. */ - neg 8,4 - andi. 0,8,3 - cmpldi cr1,r5,16 - beq L(copy_LT_32_aligned) - - /* Force 4-byte alignment for SRC. */ - mtocrf 0x01,0 - subf r5,0,r5 -2: - bf 30,1f - lhz 6,0(r4) - addi r4,r4,2 - sth 6,0(r11) - addi r11,r11,2 -1: - bf 31,L(end_4bytes_alignment) - lbz 6,0(r4) - addi r4,r4,1 - stb 6,0(r11) - addi r11,r11,1 - - .align 4 -L(end_4bytes_alignment): - cmpldi cr1,r5,16 - mtocrf 0x01,r5 - -L(copy_LT_32_aligned): - /* At least 6 bytes to go, and SRC is word-aligned. */ - blt cr1,8f - - /* Copy 16 bytes. */ - lwz 6,0(r4) - lwz 7,4(r4) - stw 6,0(r11) - lwz 8,8(r4) - stw 7,4(r11) - lwz 6,12(r4) - addi r4,r4,16 - stw 8,8(r11) - stw 6,12(r11) - addi r11,r11,16 -8: /* Copy 8 bytes. */ - bf 28,L(tail4) - lwz 6,0(r4) - lwz 7,4(r4) - addi r4,r4,8 - stw 6,0(r11) - stw 7,4(r11) - addi r11,r11,8 - - .align 4 -/* Copies 4~7 bytes. */ -L(tail4): - bf 29,L(tail2) - lwz 6,0(r4) - stw 6,0(r11) - bf 30,L(tail5) - lhz 7,4(r4) - sth 7,4(r11) - bflr 31 - lbz 8,6(r4) - stb 8,6(r11) - /* Return original DST pointer. */ - blr - - .align 4 -/* Copies 2~3 bytes. */ -L(tail2): - bf 30,1f - lhz 6,0(r4) - sth 6,0(r11) - bflr 31 - lbz 7,2(r4) - stb 7,2(r11) - blr - - .align 4 -L(tail5): - bflr 31 - lbz 6,4(r4) - stb 6,4(r11) - blr - - .align 4 -1: - bflr 31 - lbz 6,0(r4) - stb 6,0(r11) - /* Return original DST pointer. */ - blr - -/* Handles copies of 0~8 bytes. */ - .align 4 -L(copy_LE_8): - bne cr6,L(tail4) - - /* Though we could've used ld/std here, they are still - slow for unaligned cases. */ - - lwz 6,0(r4) - lwz 7,4(r4) - stw 6,0(r11) - stw 7,4(r11) - blr - - -/* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. */ - .align 4 -L(copy_GE_32_unaligned): - clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. 
*/ - srdi 9,r5,4 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_unaligned_cont) - - /* DST is not quadword aligned, get it aligned. */ - - mtocrf 0x01,0 - subf r5,0,r5 - - /* Vector instructions work best when proper alignment (16-bytes) - is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: - bf 31,2f - lbz 6,0(r4) - addi r4,r4,1 - stb 6,0(r11) - addi r11,r11,1 -2: - bf 30,4f - lhz 6,0(r4) - addi r4,r4,2 - sth 6,0(r11) - addi r11,r11,2 -4: - bf 29,8f - lwz 6,0(r4) - addi r4,r4,4 - stw 6,0(r11) - addi r11,r11,4 -8: - bf 28,0f - ld 6,0(r4) - addi r4,r4,8 - std 6,0(r11) - addi r11,r11,8 -0: - srdi 9,r5,4 /* Number of full quadwords remaining. */ - - /* The proper alignment is present, it is OK to copy the bytes now. */ -L(copy_GE_32_unaligned_cont): - - /* Setup two indexes to speed up the indexed vector operations. */ - clrldi 10,r5,60 - li 6,16 /* Index for 16-bytes offsets. */ - li 7,32 /* Index for 32-bytes offsets. */ - cmpldi cr1,10,0 - srdi 8,r5,5 /* Setup the loop counter. */ - mtocrf 0x01,9 - cmpldi cr6,9,1 -#ifdef __LITTLE_ENDIAN__ - lvsr 5,0,r4 -#else - lvsl 5,0,r4 -#endif - lvx 3,0,r4 - li 0,0 - bf 31,L(setup_unaligned_loop) - - /* Copy another 16 bytes to align to 32-bytes due to the loop. */ - lvx 4,r4,6 -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - addi r4,r4,16 - stvx 6,0,r11 - addi r11,r11,16 - vor 3,4,4 - clrrdi 0,r4,60 - -L(setup_unaligned_loop): - mtctr 8 - ble cr6,L(end_unaligned_loop) - - /* Copy 32 bytes at a time using vector instructions. */ - .align 4 -L(unaligned_loop): - - /* Note: vr6/vr10 may contain data that was already copied, - but in order to get proper alignment, we may have to copy - some portions again. This is faster than having unaligned - vector instructions though. 
*/ - - lvx 4,r4,6 -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - lvx 3,r4,7 -#ifdef __LITTLE_ENDIAN__ - vperm 10,3,4,5 -#else - vperm 10,4,3,5 -#endif - addi r4,r4,32 - stvx 6,0,r11 - stvx 10,r11,6 - addi r11,r11,32 - bdnz L(unaligned_loop) - - clrrdi 0,r4,60 - - .align 4 -L(end_unaligned_loop): - - /* Check for tail bytes. */ - mtocrf 0x01,r5 - beqlr cr1 - - add r4,r4,0 - - /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ - /* Copy 8 bytes. */ - bf 28,4f - lwz 6,0(r4) - lwz 7,4(r4) - addi r4,r4,8 - stw 6,0(r11) - stw 7,4(r11) - addi r11,r11,8 -4: /* Copy 4~7 bytes. */ - bf 29,L(tail2) - lwz 6,0(r4) - stw 6,0(r11) - bf 30,L(tail5) - lhz 7,4(r4) - sth 7,4(r11) - bflr 31 - lbz 8,6(r4) - stb 8,6(r11) - /* Return original DST pointer. */ - blr - - /* Start of the backward memcpy implementation: the algorithm first checks - whether src and dest have the same alignment and, if so, aligns both to 16 - bytes and copies using VSX instructions. - If not, it aligns dest to 16 bytes and uses VMX (AltiVec) instructions - to read two 16-byte blocks at a time, shift/permute the bytes read, and write - them aligned to dest. */ -L(memmove_bwd): - cmpldi cr1,r5,31 - /* Copy is done backwards: update the pointers and check alignment. */ - add r11,r3,r5 - add r4,r4,r5 - mr r0,r11 - ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move - code. */ - - andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */ - clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */ - cmpld cr6,r10,r9 /* SRC and DST alignments match? */ - - bne cr6,L(copy_GE_32_unaligned_bwd) - beq L(aligned_copy_bwd) - - mtocrf 0x01,r0 - clrldi r0,r0,60 - -/* Get the DST and SRC aligned to 16 bytes.
*/ -1: - bf 31,2f - lbz r6,-1(r4) - subi r4,r4,1 - stb r6,-1(r11) - subi r11,r11,1 -2: - bf 30,4f - lhz r6,-2(r4) - subi r4,r4,2 - sth r6,-2(r11) - subi r11,r11,2 -4: - bf 29,8f - lwz r6,-4(r4) - subi r4,r4,4 - stw r6,-4(r11) - subi r11,r11,4 -8: - bf 28,16f - ld r6,-8(r4) - subi r4,r4,8 - std r6,-8(r11) - subi r11,r11,8 -16: - subf r5,0,r5 - -/* Main aligned copy loop. Copies 128 bytes at a time. */ -L(aligned_copy_bwd): - li r6,-16 - li r7,-32 - li r8,-48 - li r9,-64 - mtocrf 0x02,r5 - srdi r12,r5,7 - cmpdi r12,0 - beq L(aligned_tail_bwd) - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 - mtctr 12 - b L(aligned_128loop_bwd) - - .align 4 -L(aligned_128head_bwd): - /* for the 2nd + iteration of this loop. */ - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 -L(aligned_128loop_bwd): - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 - stxvd2x v6,r11,r6 - subi r4,r4,64 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 - lxvd2x v6,r4,r6 - lxvd2x v7,r4,7 - subi r11,r11,64 - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 - subi r4,r4,64 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 - subi r11,r11,64 - bdnz L(aligned_128head_bwd) - -L(aligned_tail_bwd): - mtocrf 0x01,r5 - bf 25,32f - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 - lxvd2x v8,r4,r8 - lxvd2x v9,r4,r9 - subi r4,r4,64 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 - stxvd2x v8,r11,r8 - stxvd2x v9,r11,r9 - subi r11,r11,64 -32: - bf 26,16f - lxvd2x v6,r4,r6 - lxvd2x v7,r4,r7 - subi r4,r4,32 - stxvd2x v6,r11,r6 - stxvd2x v7,r11,r7 - subi r11,r11,32 -16: - bf 27,8f - lxvd2x v6,r4,r6 - subi r4,r4,16 - stxvd2x v6,r11,r6 - subi r11,r11,16 -8: - bf 28,4f - ld r6,-8(r4) - subi r4,r4,8 - std r6,-8(r11) - subi r11,r11,8 -4: /* Copies 4~7 bytes. */ - bf 29,L(tail2_bwd) - lwz r6,-4(r4) - stw r6,-4(r11) - bf 30,L(tail5_bwd) - lhz r7,-6(r4) - sth r7,-6(r11) - bflr 31 - lbz r8,-7(r4) - stb r8,-7(r11) - /* Return original DST pointer. */ - blr - -/* Handle copies of 0~31 bytes. 
*/ - .align 4 -L(copy_LT_32_bwd): - cmpldi cr6,r5,8 - mtocrf 0x01,r5 - ble cr6,L(copy_LE_8_bwd) - - /* At least 9 bytes to go. */ - neg r8,r4 - andi. r0,r8,3 - cmpldi cr1,r5,16 - beq L(copy_LT_32_aligned_bwd) - - /* Force 4-byte alignment for SRC. */ - mtocrf 0x01,0 - subf r5,0,r5 -2: - bf 30,1f - lhz r6,-2(r4) - subi r4,r4,2 - sth r6,-2(r11) - subi r11,r11,2 -1: - bf 31,L(end_4bytes_alignment_bwd) - lbz 6,-1(r4) - subi r4,r4,1 - stb 6,-1(r11) - subi r11,r11,1 - - .align 4 -L(end_4bytes_alignment_bwd): - cmpldi cr1,r5,16 - mtocrf 0x01,r5 - -L(copy_LT_32_aligned_bwd): - /* At least 6 bytes to go, and SRC is word-aligned. */ - blt cr1,8f - - /* Copy 16 bytes. */ - lwz r6,-4(r4) - lwz r7,-8(r4) - stw r6,-4(r11) - lwz r8,-12(r4) - stw r7,-8(r11) - lwz r6,-16(r4) - subi r4,r4,16 - stw r8,-12(r11) - stw r6,-16(r11) - subi r11,r11,16 -8: /* Copy 8 bytes. */ - bf 28,L(tail4_bwd) - lwz r6,-4(r4) - lwz r7,-8(r4) - subi r4,r4,8 - stw r6,-4(r11) - stw r7,-8(r11) - subi r11,r11,8 - - .align 4 -/* Copies 4~7 bytes. */ -L(tail4_bwd): - bf 29,L(tail2_bwd) - lwz 6,-4(r4) - stw 6,-4(r11) - bf 30,L(tail5_bwd) - lhz 7,-6(r4) - sth 7,-6(r11) - bflr 31 - lbz 8,-7(r4) - stb 8,-7(r11) - /* Return original DST pointer. */ - blr - - .align 4 -/* Copies 2~3 bytes. */ -L(tail2_bwd): - bf 30,1f - lhz 6,-2(r4) - sth 6,-2(r11) - bflr 31 - lbz 7,-3(r4) - stb 7,-3(r11) - blr - - .align 4 -L(tail5_bwd): - bflr 31 - lbz 6,-5(r4) - stb 6,-5(r11) - blr - - .align 4 -1: - bflr 31 - lbz 6,-1(r4) - stb 6,-1(r11) - /* Return original DST pointer. */ - blr - - -/* Handles copies of 0~8 bytes. */ - .align 4 -L(copy_LE_8_bwd): - bne cr6,L(tail4_bwd) - - /* Though we could've used ld/std here, they are still - slow for unaligned cases. */ - lwz 6,-8(r4) - lwz 7,-4(r4) - stw 6,-8(r11) - stw 7,-4(r11) - blr - - -/* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. 
*/ - .align 4 -L(copy_GE_32_unaligned_bwd): - andi. r10,r11,15 /* Check alignment of DST against 16 bytes.. */ - srdi r9,r5,4 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_unaligned_cont_bwd) - - /* DST is not quadword aligned and r10 holds the address masked to - compare alignments. */ - mtocrf 0x01,r10 - subf r5,r10,r5 - - /* Vector instructions work best when proper alignment (16-bytes) - is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: - bf 31,2f - lbz r6,-1(r4) - subi r4,r4,1 - stb r6,-1(r11) - subi r11,r11,1 -2: - bf 30,4f - lhz r6,-2(r4) - subi r4,r4,2 - sth r6,-2(r11) - subi r11,r11,2 -4: - bf 29,8f - lwz r6,-4(r4) - subi r4,r4,4 - stw r6,-4(r11) - subi r11,r11,4 -8: - bf 28,0f - ld r6,-8(r4) - subi r4,r4,8 - std r6,-8(r11) - subi r11,r11,8 -0: - srdi r9,r5,4 /* Number of full quadwords remaining. */ - - /* The proper alignment is present, it is OK to copy the bytes now. */ -L(copy_GE_32_unaligned_cont_bwd): - - /* Setup two indexes to speed up the indexed vector operations. */ - clrldi r10,r5,60 - li r6,-16 /* Index for 16-bytes offsets. */ - li r7,-32 /* Index for 32-bytes offsets. */ - cmpldi cr1,10,0 - srdi r8,r5,5 /* Setup the loop counter. */ - mtocrf 0x01,9 - cmpldi cr6,r9,1 -#ifdef __LITTLE_ENDIAN__ - lvsr v5,r0,r4 -#else - lvsl v5,r0,r4 -#endif - lvx v3,0,r4 - li r0,0 - bf 31,L(setup_unaligned_loop_bwd) - - /* Copy another 16 bytes to align to 32-bytes due to the loop. */ - lvx v4,r4,r6 -#ifdef __LITTLE_ENDIAN__ - vperm v6,v3,v4,v5 -#else - vperm v6,v4,v3,v5 -#endif - subi r4,r4,16 - stvx v6,r11,r6 - subi r11,r11,16 - vor v3,v4,v4 - clrrdi r0,r4,60 - -L(setup_unaligned_loop_bwd): - mtctr r8 - ble cr6,L(end_unaligned_loop_bwd) - - /* Copy 32 bytes at a time using vector instructions. */ - .align 4 -L(unaligned_loop_bwd): - - /* Note: vr6/vr10 may contain data that was already copied, - but in order to get proper alignment, we may have to copy - some portions again. 
This is faster than having unaligned - vector instructions though. */ - - lvx v4,r4,r6 -#ifdef __LITTLE_ENDIAN__ - vperm v6,v3,v4,v5 -#else - vperm v6,v4,v3,v5 -#endif - lvx v3,r4,r7 -#ifdef __LITTLE_ENDIAN__ - vperm v10,v4,v3,v5 -#else - vperm v10,v3,v4,v5 -#endif - subi r4,r4,32 - stvx v6,r11,r6 - stvx v10,r11,r7 - subi r11,r11,32 - bdnz L(unaligned_loop_bwd) - - clrrdi r0,r4,60 - - .align 4 -L(end_unaligned_loop_bwd): - - /* Check for tail bytes. */ - mtocrf 0x01,r5 - beqlr cr1 - - add r4,r4,0 - - /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ - /* Copy 8 bytes. */ - bf 28,4f - lwz r6,-4(r4) - lwz r7,-8(r4) - subi r4,r4,8 - stw r6,-4(r11) - stw r7,-8(r11) - subi r11,r11,8 -4: /* Copy 4~7 bytes. */ - bf 29,L(tail2_bwd) - lwz r6,-4(r4) - stw r6,-4(r11) - bf 30,L(tail5_bwd) - lhz r7,-6(r4) - sth r7,-6(r11) - bflr 31 - lbz r8,-7(r4) - stb r8,-7(r11) - /* Return original DST pointer. */ - blr -END_GEN_TB (MEMMOVE, TB_TOCLESS) -libc_hidden_builtin_def (memmove) - - -/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5]) - Implemented in this file to avoid linker create a stub function call - in the branch to '_memmove'. */ -ENTRY (__bcopy) - mr r6,r3 - mr r3,r4 - mr r4,r6 - b L(_memmove) -END (__bcopy) -weak_alias (__bcopy, bcopy) diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S deleted file mode 100644 index 4e15d1e40c..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S +++ /dev/null @@ -1,472 +0,0 @@ -/* Optimized mempcpy implementation for POWER7. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - -/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); - Returns 'dst' + 'len'. */ - -#ifndef MEMPCPY -# define MEMPCPY __mempcpy -#endif - .machine power7 -EALIGN (MEMPCPY, 5, 0) - CALL_MCOUNT 3 - - cmpldi cr1,5,31 - neg 0,3 - std 3,-16(1) - std 31,-8(1) - cfi_offset(31,-8) - ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move - code. */ - - andi. 11,3,7 /* Check alignment of DST. */ - - - clrldi 10,4,61 /* Check alignment of SRC. */ - cmpld cr6,10,11 /* SRC and DST alignments match? */ - mr 12,4 - mr 31,5 - bne cr6,L(copy_GE_32_unaligned) - - srdi 9,5,3 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_aligned_cont) - - clrldi 0,0,61 - mtcrf 0x01,0 - subf 31,0,5 - - /* Get the SRC aligned to 8 bytes. */ - -1: bf 31,2f - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: bf 30,4f - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: bf 29,0f - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -0: - clrldi 10,12,61 /* Check alignment of SRC again. */ - srdi 9,31,3 /* Number of full doublewords remaining. */ - -L(copy_GE_32_aligned_cont): - - clrldi 11,31,61 - mtcrf 0x01,9 - - srdi 8,31,5 - cmpldi cr1,9,4 - cmpldi cr6,11,0 - mr 11,12 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. */ - - bf 30,1f - ld 6,0(12) - ld 7,8(12) - addi 11,12,16 - mtctr 8 - std 6,0(3) - std 7,8(3) - addi 10,3,16 - bf 31,4f - ld 0,16(12) - std 0,16(3) - blt cr1,3f - addi 11,12,24 - addi 10,3,24 - b 4f - - .align 4 -1: /* Copy 1 doubleword and set the counter. 
*/ - mr 10,3 - mtctr 8 - bf 31,4f - ld 6,0(12) - addi 11,12,8 - std 6,0(3) - addi 10,3,8 - - /* Main aligned copy loop. Copies 32-bytes at a time. */ - .align 4 -4: - ld 6,0(11) - ld 7,8(11) - ld 8,16(11) - ld 0,24(11) - addi 11,11,32 - - std 6,0(10) - std 7,8(10) - std 8,16(10) - std 0,24(10) - addi 10,10,32 - bdnz 4b -3: - - /* Check for tail bytes. */ - rldicr 0,31,0,60 - mtcrf 0x01,31 - beq cr6,0f - -.L9: - add 3,3,0 - add 12,12,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return DST + LEN pointer. */ - ld 31,-8(1) - ld 3,-16(1) - add 3,3,5 - blr - - /* Handle copies of 0~31 bytes. */ - .align 4 -L(copy_LT_32): - cmpldi cr6,5,8 - mr 12,4 - mtcrf 0x01,5 - ble cr6,L(copy_LE_8) - - /* At least 9 bytes to go. */ - neg 8,4 - clrrdi 11,4,2 - andi. 0,8,3 - cmpldi cr1,5,16 - mr 10,5 - beq L(copy_LT_32_aligned) - - /* Force 4-bytes alignment for SRC. */ - mtocrf 0x01,0 - subf 10,0,5 -2: bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: bf 31,L(end_4bytes_alignment) - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 - - .align 4 -L(end_4bytes_alignment): - cmpldi cr1,10,16 - mtcrf 0x01,10 - -L(copy_LT_32_aligned): - /* At least 6 bytes to go, and SRC is word-aligned. */ - blt cr1,8f - - /* Copy 16 bytes. */ - lwz 6,0(12) - lwz 7,4(12) - stw 6,0(3) - lwz 8,8(12) - stw 7,4(3) - lwz 6,12(12) - addi 12,12,16 - stw 8,8(3) - stw 6,12(3) - addi 3,3,16 -8: /* Copy 8 bytes. */ - bf 28,4f - - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2-3 bytes. 
*/ - bf 30,1f - - lhz 6,0(12) - sth 6,0(3) - bf 31,0f - lbz 7,2(12) - stb 7,2(3) - ld 3,-16(1) - add 3,3,5 - blr - - .align 4 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return DST + LEN pointer. */ - ld 3,-16(1) - add 3,3,5 - blr - - /* Handles copies of 0~8 bytes. */ - .align 4 -L(copy_LE_8): - bne cr6,4f - - /* Though we could've used ld/std here, they are still - slow for unaligned cases. */ - - lwz 6,0(4) - lwz 7,4(4) - stw 6,0(3) - stw 7,4(3) - ld 3,-16(1) /* Return DST + LEN pointer. */ - add 3,3,5 - blr - - .align 4 -4: /* Copies 4~7 bytes. */ - bf 29,2b - - lwz 6,0(4) - stw 6,0(3) - bf 30,5f - lhz 7,4(4) - sth 7,4(3) - bf 31,0f - lbz 8,6(4) - stb 8,6(3) - ld 3,-16(1) - add 3,3,5 - blr - - .align 4 -5: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,4(4) - stb 6,4(3) - -0: /* Return DST + LEN pointer. */ - ld 3,-16(1) - add 3,3,5 - blr - - /* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. */ - .align 4 -L(copy_GE_32_unaligned): - clrldi 0,0,60 /* Number of bytes until the 1st - quadword. */ - andi. 11,3,15 /* Check alignment of DST (against - quadwords). */ - srdi 9,5,4 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_unaligned_cont) - - /* SRC is not quadword aligned, get it aligned. */ - - mtcrf 0x01,0 - subf 31,0,5 - - /* Vector instructions work best when proper alignment (16-bytes) - is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: /* Copy 1 byte. */ - bf 31,2f - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: /* Copy 2 bytes. */ - bf 30,4f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: /* Copy 4 bytes. */ - bf 29,8f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -8: /* Copy 8 bytes. */ - bf 28,0f - - ld 6,0(12) - addi 12,12,8 - std 6,0(3) - addi 3,3,8 -0: - clrldi 10,12,60 /* Check alignment of SRC. 
*/ - srdi 9,31,4 /* Number of full quadwords remaining. */ - - /* The proper alignment is present, it is OK to copy the bytes now. */ -L(copy_GE_32_unaligned_cont): - - /* Setup two indexes to speed up the indexed vector operations. */ - clrldi 11,31,60 - li 6,16 /* Index for 16-bytes offsets. */ - li 7,32 /* Index for 32-bytes offsets. */ - cmpldi cr1,11,0 - srdi 8,31,5 /* Setup the loop counter. */ - mr 10,3 - mr 11,12 - mtcrf 0x01,9 - cmpldi cr6,9,1 -#ifdef __LITTLE_ENDIAN__ - lvsr 5,0,12 -#else - lvsl 5,0,12 -#endif - lvx 3,0,12 - bf 31,L(setup_unaligned_loop) - - /* Copy another 16 bytes to align to 32-bytes due to the loop . */ - lvx 4,12,6 -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - addi 11,12,16 - addi 10,3,16 - stvx 6,0,3 - vor 3,4,4 - -L(setup_unaligned_loop): - mtctr 8 - ble cr6,L(end_unaligned_loop) - - /* Copy 32 bytes at a time using vector instructions. */ - .align 4 -L(unaligned_loop): - - /* Note: vr6/vr10 may contain data that was already copied, - but in order to get proper alignment, we may have to copy - some portions again. This is faster than having unaligned - vector instructions though. */ - - lvx 4,11,6 /* vr4 = r11+16. */ -#ifdef __LITTLE_ENDIAN__ - vperm 6,4,3,5 -#else - vperm 6,3,4,5 -#endif - lvx 3,11,7 /* vr3 = r11+32. */ -#ifdef __LITTLE_ENDIAN__ - vperm 10,3,4,5 -#else - vperm 10,4,3,5 -#endif - addi 11,11,32 - stvx 6,0,10 - stvx 10,10,6 - addi 10,10,32 - - bdnz L(unaligned_loop) - - .align 4 -L(end_unaligned_loop): - - /* Check for tail bytes. */ - rldicr 0,31,0,59 - mtcrf 0x01,31 - beq cr1,0f - - add 3,3,0 - add 12,12,0 - - /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ -8: /* Copy 8 bytes. */ - bf 28,4f - - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2~3 bytes. 
*/ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return DST + LEN pointer. */ - ld 31,-8(1) - ld 3,-16(1) - add 3,3,5 - blr - -END_GEN_TB (MEMPCPY,TB_TOCLESS) -libc_hidden_def (__mempcpy) -weak_alias (__mempcpy, mempcpy) -libc_hidden_builtin_def (mempcpy) diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S deleted file mode 100644 index 4276768915..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memrchr.S +++ /dev/null @@ -1,201 +0,0 @@ -/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5]) */ - -#ifndef MEMRCHR -# define MEMRCHR __memrchr -#endif - .machine power7 -ENTRY (MEMRCHR) - CALL_MCOUNT 3 - add r7,r3,r5 /* Calculate the last acceptable address. */ - neg r0,r7 - addi r7,r7,-1 - mr r10,r3 - clrrdi r6,r7,7 - li r9,3<<5 - dcbt r9,r6,8 /* Stream hint, decreasing addresses. */ - - /* Replicate BYTE to doubleword. 
*/ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - li r6,-8 - li r9,-1 - rlwinm r0,r0,3,26,28 /* Calculate padding. */ - clrrdi r8,r7,3 - srd r9,r9,r0 - cmpldi r5,32 - clrrdi r0,r10,3 - ble L(small_range) - -#ifdef __LITTLE_ENDIAN__ - ldx r12,0,r8 -#else - ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ -#endif - cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ - and r3,r3,r9 - cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r8 - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - bf 28,L(loop_setup) - - /* Handle DWORD2 of pair. */ -#ifdef __LITTLE_ENDIAN__ - ldx r12,r8,r6 -#else - ldbrx r12,r8,r6 -#endif - addi r8,r8,-8 - cmpb r3,r12,r4 - cmpldi cr7,r3,0 - bne cr7,L(done) - -L(loop_setup): - /* The last dword we want to read in the loop below is the one - containing the first byte of the string, ie. the dword at - s & ~7, or r0. The first dword read is at r8 - 8, we - read 2 * cnt dwords, so the last dword read will be at - r8 - 8 - 16 * cnt + 8. Solving for cnt gives - cnt = (r8 - r0) / 16 */ - sub r5,r8,r0 - addi r8,r8,-8 - srdi r9,r5,4 /* Number of loop iterations. */ - mtctr r9 /* Setup the counter. */ - - /* Main loop to look for BYTE backwards in the string. - FIXME: Investigate whether 32 byte align helps with this - 9 instruction loop. */ - .align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the byte-checking process for bigger strings. */ - -#ifdef __LITTLE_ENDIAN__ - ldx r12,0,r8 - ldx r11,r8,r6 -#else - ldbrx r12,0,r8 - ldbrx r11,r8,r6 -#endif - cmpb r3,r12,r4 - cmpb r9,r11,r4 - or r5,r9,r3 /* Merge everything in one doubleword. */ - cmpldi cr7,r5,0 - bne cr7,L(found) - addi r8,r8,-16 - bdnz L(loop) - - /* We may have one more word to read. 
*/ - cmpld r8,r0 - bnelr - -#ifdef __LITTLE_ENDIAN__ - ldx r12,0,r8 -#else - ldbrx r12,0,r8 -#endif - cmpb r3,r12,r4 - cmpldi cr7,r3,0 - bne cr7,L(done) - blr - - .align 4 -L(found): - /* OK, one (or both) of the dwords contains BYTE. Check - the first dword. */ - cmpldi cr6,r3,0 - bne cr6,L(done) - - /* BYTE must be in the second word. Adjust the address - again and move the result of cmpb to r3 so we can calculate the - pointer. */ - - mr r3,r9 - addi r8,r8,-8 - - /* r3 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as BYTE in the original - word from the string. Use that to calculate the pointer. - We need to make sure BYTE is *before* the end of the - range. */ -L(done): - cntlzd r9,r3 /* Count leading zeros before the match. */ - cmpld r8,r0 /* Are we on the last word? */ - srdi r6,r9,3 /* Convert leading zeros to bytes. */ - addi r0,r6,-7 - sub r3,r8,r0 - cmpld cr7,r3,r10 - bnelr - bgelr cr7 - li r3,0 - blr - - .align 4 -L(null): - li r3,0 - blr - -/* Deals with size <= 32. */ - .align 4 -L(small_range): - cmpldi r5,0 - beq L(null) - -#ifdef __LITTLE_ENDIAN__ - ldx r12,0,r8 -#else - ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ -#endif - cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ - and r3,r3,r9 - cmpldi cr7,r3,0 - bne cr7,L(done) - - /* Are we done already? */ - cmpld r8,r0 - addi r8,r8,-8 - beqlr - - .align 5 -L(loop_small): -#ifdef __LITTLE_ENDIAN__ - ldx r12,0,r8 -#else - ldbrx r12,0,r8 -#endif - cmpb r3,r12,r4 - cmpld r8,r0 - cmpldi cr7,r3,0 - bne cr7,L(done) - addi r8,r8,-8 - bne L(loop_small) - blr - -END (MEMRCHR) -weak_alias (__memrchr, memrchr) -libc_hidden_builtin_def (memrchr) diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S deleted file mode 100644 index 21933c0672..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/memset.S +++ /dev/null @@ -1,399 +0,0 @@ -/* Optimized memset implementation for PowerPC64/POWER7. 
- Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); - Returns 's'. */ - -#ifndef MEMSET -# define MEMSET memset -#endif - .machine power7 -EALIGN (MEMSET, 5, 0) - CALL_MCOUNT 3 - -L(_memset): - cmpldi cr7,5,31 - cmpldi cr6,5,8 - mr 10,3 - - /* Replicate byte to word. */ - insrdi 4,4,8,48 - insrdi 4,4,16,32 - ble cr6,L(small) /* If length <= 8, use short copy code. */ - - neg 0,3 - ble cr7,L(medium) /* If length < 32, use medium copy code. */ - - andi. 11,10,7 /* Check alignment of SRC. */ - insrdi 4,4,32,0 /* Replicate word to double word. */ - - mr 12,5 - beq L(big_aligned) - - clrldi 0,0,61 - mtocrf 0x01,0 - subf 5,0,5 - - /* Get DST aligned to 8 bytes. */ -1: bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: bf 30,4f - - sth 4,0(10) - addi 10,10,2 -4: bf 29,L(big_aligned) - - stw 4,0(10) - addi 10,10,4 - - .align 4 -L(big_aligned): - - cmpldi cr5,5,255 - li 0,32 - dcbtst 0,10 - cmpldi cr6,4,0 - srdi 9,5,3 /* Number of full doublewords remaining. */ - crand 27,26,21 - mtocrf 0x01,9 - bt 27,L(huge) - - /* From this point on, we'll copy 32+ bytes and the value - isn't 0 (so we can't use dcbz). 
*/ - - srdi 8,5,5 - clrldi 11,5,61 - cmpldi cr6,11,0 - cmpldi cr1,9,4 - mtctr 8 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. */ - - bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - bf 31,L(big_loop) - - std 4,0(10) - addi 10,10,8 - mr 12,10 - blt cr1,L(tail_bytes) - b L(big_loop) - - .align 4 -1: /* Copy 1 doubleword. */ - bf 31,L(big_loop) - - std 4,0(10) - addi 10,10,8 - - /* Main aligned copy loop. Copies 32-bytes at a time and - ping-pong through r10 and r12 to avoid AGEN delays. */ - .align 4 -L(big_loop): - addi 12,10,32 - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - bdz L(tail_bytes) - - addi 10,10,64 - std 4,0(12) - std 4,8(12) - std 4,16(12) - std 4,24(12) - bdnz L(big_loop) - - mr 12,10 - b L(tail_bytes) - - .align 4 -L(tail_bytes): - - /* Check for tail bytes. */ - beqlr cr6 - - clrldi 0,5,61 - mtocrf 0x01,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(12) - addi 12,12,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - sth 4,0(12) - addi 12,12,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(12) - blr - - /* Special case when value is 0 and we have a long length to deal - with. Use dcbz to zero out 128-bytes at a time. Before using - dcbz though, we need to get the destination 128-bytes aligned. */ - .align 4 -L(huge): - andi. 11,10,127 - neg 0,10 - beq L(huge_aligned) - - clrldi 0,0,57 - subf 5,0,5 - srdi 0,0,3 - mtocrf 0x01,0 - - /* Get DST aligned to 128 bytes. 
*/ -8: bf 28,4f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - std 4,32(10) - std 4,40(10) - std 4,48(10) - std 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(huge_aligned) - - std 4,0(10) - addi 10,10,8 - - -L(huge_aligned): - srdi 8,5,7 - clrldi 11,5,57 - cmpldi cr6,11,0 - mtctr 8 - - .align 4 -L(huge_loop): - dcbz 0,10 - addi 10,10,128 - bdnz L(huge_loop) - - /* Check how many bytes are still left. */ - beqlr cr6 - - subf 9,3,10 - subf 5,9,12 - srdi 8,5,3 - cmpldi cr6,8,0 - mtocrf 0x01,8 - - /* We have a tail o 1~127 bytes. Copy up to 15 doublewords for - speed. We'll handle the resulting tail bytes later. */ - beq cr6,L(tail) - -8: bf 28,4f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - std 4,32(10) - std 4,40(10) - std 4,48(10) - std 4,56(10) - addi 10,10,64 - .align 4 -4: bf 29,2f - - std 4,0(10) - std 4,8(10) - std 4,16(10) - std 4,24(10) - addi 10,10,32 - .align 4 -2: bf 30,1f - - std 4,0(10) - std 4,8(10) - addi 10,10,16 - .align 4 -1: bf 31,L(tail) - - std 4,0(10) - addi 10,10,8 - - /* Handle the rest of the tail bytes here. */ -L(tail): - mtocrf 0x01,5 - - .align 4 -4: bf 29,2f - - stw 4,0(10) - addi 10,10,4 - .align 4 -2: bf 30,1f - - sth 4,0(10) - addi 10,10,2 - .align 4 -1: bflr 31 - - stb 4,0(10) - blr - - /* Expanded tree to copy tail bytes without increments. */ - .align 4 -L(copy_tail): - bf 29,L(FXX) - - stw 4,0(10) - bf 30,L(TFX) - - sth 4,4(10) - bflr 31 - - stb 4,6(10) - blr - - .align 4 -L(FXX): bf 30,L(FFX) - - sth 4,0(10) - bflr 31 - - stb 4,2(10) - blr - - .align 4 -L(TFX): bflr 31 - - stb 4,4(10) - blr - - .align 4 -L(FFX): bflr 31 - - stb 4,0(10) - blr - - /* Handle copies of 9~31 bytes. */ - .align 4 -L(medium): - /* At least 9 bytes to go. */ - andi. 11,10,3 - clrldi 0,0,62 - beq L(medium_aligned) - - /* Force 4-bytes alignment for DST. 
*/ - mtocrf 0x01,0 - subf 5,0,5 -1: /* Copy 1 byte. */ - bf 31,2f - - stb 4,0(10) - addi 10,10,1 -2: /* Copy 2 bytes. */ - bf 30,L(medium_aligned) - - sth 4,0(10) - addi 10,10,2 - - .align 4 -L(medium_aligned): - /* At least 6 bytes to go, and DST is word-aligned. */ - cmpldi cr1,5,16 - mtocrf 0x01,5 - blt cr1,8f - - /* Copy 16 bytes. */ - stw 4,0(10) - stw 4,4(10) - stw 4,8(10) - stw 4,12(10) - addi 10,10,16 -8: /* Copy 8 bytes. */ - bf 28,4f - - stw 4,0(10) - stw 4,4(10) - addi 10,10,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - stw 4,0(10) - addi 10,10,4 -2: /* Copy 2-3 bytes. */ - bf 30,1f - - sth 4,0(10) - addi 10,10,2 -1: /* Copy 1 byte. */ - bflr 31 - - stb 4,0(10) - blr - - /* Handles copies of 0~8 bytes. */ - .align 4 -L(small): - mtocrf 0x01,5 - bne cr6,L(copy_tail) - - stw 4,0(10) - stw 4,4(10) - blr - -END_GEN_TB (MEMSET,TB_TOCLESS) -libc_hidden_builtin_def (memset) - -/* Copied from bzero.S to prevent the linker from inserting a stub - between bzero and memset. */ -ENTRY (__bzero) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b L(_memset) -END (__bzero) -#ifndef __bzero -weak_alias (__bzero, bzero) -#endif diff --git a/sysdeps/powerpc/powerpc64/power7/multiarch/Implies b/sysdeps/powerpc/powerpc64/power7/multiarch/Implies deleted file mode 100644 index bf5d6171a5..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/multiarch/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power6/multiarch diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S deleted file mode 100644 index 48afb75943..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S +++ /dev/null @@ -1,115 +0,0 @@ -/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* int [r3] rawmemchr (void *s [r3], int c [r4]) */ - -#ifndef RAWMEMCHR -# define RAWMEMCHR __rawmemchr -#endif - .machine power7 -ENTRY (RAWMEMCHR) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - /* Now r4 has a doubleword of c bytes. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r5,r12,r4 /* Compare each byte against c byte. */ -#ifdef __LITTLE_ENDIAN__ - srd r5,r5,r6 - sld r5,r5,r6 -#else - sld r5,r5,r6 /* Move left to discard ignored bits. */ - srd r5,r5,r6 /* Bring the bits back as zeros. */ -#endif - cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r8 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle DWORD2 of pair. */ - ldu r12,8(r8) - cmpb r5,r12,r4 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - /* Main loop to look for the end of the string. 
Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the byte-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r5,r12,r4 - cmpb r6,r11,r4 - or r7,r5,r6 - cmpdi cr7,r7,0 - beq cr7,L(loop) - - /* OK, one (or both) of the doublewords contains a 'c' byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The 'c' byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - pointer. */ - mr r5,r6 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the 'c' byte in the original - doubleword from the string. Use that fact to find out what is - the position of the byte inside the string. */ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 /* Count trailing zeros. */ -#else - cntlzd r0,r5 /* Count leading zeros before the match. */ -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching char. */ - blr -END (RAWMEMCHR) -weak_alias (__rawmemchr,rawmemchr) -libc_hidden_builtin_def (__rawmemchr) diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S deleted file mode 100644 index a346dd7e28..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/stpncpy.S +++ /dev/null @@ -1,24 +0,0 @@ -/* Optimized stpncpy implementation for PowerPC64/POWER7. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define USE_AS_STPNCPY -#include <sysdeps/powerpc/powerpc64/power7/strncpy.S> - -weak_alias (__stpncpy, stpncpy) -libc_hidden_def (__stpncpy) -libc_hidden_builtin_def (stpncpy) diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S deleted file mode 100644 index e856b8a593..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S +++ /dev/null @@ -1,126 +0,0 @@ -/* Optimized strcasecmp implementation for PowerPC64. - Copyright (C) 2011-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <locale-defines.h> - -/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) - - or if defined USE_IN_EXTENDED_LOCALE_MODEL: - - int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4], - __locale_t loc [r5]) */ - -#ifndef STRCMP -# define __STRCMP __strcasecmp -# define STRCMP strcasecmp -#endif - -ENTRY (__STRCMP) -#ifndef USE_IN_EXTENDED_LOCALE_MODEL - CALL_MCOUNT 2 -#else - CALL_MCOUNT 3 -#endif - -#define rRTN r3 /* Return value */ -#define rSTR1 r5 /* 1st string */ -#define rSTR2 r4 /* 2nd string */ -#define rLOCARG r5 /* 3rd argument: locale_t */ -#define rCHAR1 r6 /* Byte read from 1st string */ -#define rCHAR2 r7 /* Byte read from 2nd string */ -#define rADDR1 r8 /* Address of tolower(rCHAR1) */ -#define rADDR2 r12 /* Address of tolower(rCHAR2) */ -#define rLWR1 r8 /* Word tolower(rCHAR1) */ -#define rLWR2 r12 /* Word tolower(rCHAR2) */ -#define rTMP r9 -#define rLOC r11 /* Default locale address */ - - cmpd cr7, r3, r4 -#ifndef USE_IN_EXTENDED_LOCALE_MODEL - ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) - add rLOC, rTMP, __libc_tsd_LOCALE@tls - ld rLOC, 0(rLOC) -#else - mr rLOC, rLOCARG -#endif - ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) - mr rSTR1, rRTN - li rRTN, 0 - beqlr cr7 - - - /* Unrolling loop for POWER: loads are done with 'lbz' plus - offset and string descriptors are only updated in the end - of loop unrolling. */ - - lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ - lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ -L(loop): - cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ - sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ - sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ - lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ - lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ - cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? 
*/ - crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 == '\0') || (r == 0) */ - beq cr1, L(done) - lbz rCHAR1, 1(rSTR1) - lbz rCHAR2, 1(rSTR2) - cmpdi rCHAR1, 0 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - lbz rCHAR1, 2(rSTR1) - lbz rCHAR2, 2(rSTR2) - cmpdi rCHAR1, 0 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - lbz rCHAR1, 3(rSTR1) - lbz rCHAR2, 3(rSTR2) - cmpdi rCHAR1, 0 - /* Increment both string descriptors */ - addi rSTR1, rSTR1, 4 - addi rSTR2, rSTR2, 4 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1,L(done) - lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ - lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ - b L(loop) -L(done): - subf r0, rLWR2, rLWR1 - extsw rRTN, r0 - blr -END (__STRCMP) - -weak_alias (__STRCMP, STRCMP) -libc_hidden_builtin_def (__STRCMP) diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S deleted file mode 100644 index c13c4ebcb8..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S +++ /dev/null @@ -1,5 +0,0 @@ -#define USE_IN_EXTENDED_LOCALE_MODEL -#define STRCMP strcasecmp_l -#define __STRCMP __strcasecmp_l - -#include "strcasecmp.S" diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S deleted file mode 100644 index a18e2e101c..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strchr.S +++ /dev/null @@ -1,230 +0,0 @@ -/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRCHR -# define STRCHR strchr -#endif - -/* int [r3] strchr (char *s [r3], int c [r4]) */ - .machine power7 -ENTRY (STRCHR) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - cmpdi cr7,r4,0 - ld r12,0(r8) /* Load doubleword from memory. */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - - beq cr7,L(null_match) - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - /* Now r4 has a doubleword of c bytes and r0 has - a doubleword of null bytes. */ - - cmpb r10,r12,r4 /* Compare each byte against c byte. */ - cmpb r11,r12,r0 /* Compare each byte against null byte. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - srd r11,r11,r6 - sld r10,r10,r6 - sld r11,r11,r6 -#else - sld r10,r10,r6 - sld r11,r11,r6 - srd r10,r10,r6 - srd r11,r11,r6 -#endif - or r5,r10,r11 /* OR the results to speed things up. */ - cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes - have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r8 - - /* Are we now aligned to a doubleword boundary? If so, skip to - the main loop. 
Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r9,16(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - cmpb r6,r9,r4 - cmpb r7,r9,r0 - or r12,r10,r11 - or r9,r6,r7 - or r5,r12,r9 - cmpdi cr7,r5,0 - beq cr7,L(loop) - - /* OK, one (or both) of the doublewords contains a c/null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c/null byte. */ - - cmpdi cr6,r12,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r10 so we can calculate - the pointer. */ - - mr r10,r6 - mr r11,r7 - addi r8,r8,8 - - /* r10/r11 have the output of the cmpb instructions, that is, - 0xff in the same position as the c/null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r3,r10,-1 - andc r3,r3,r10 - popcntd r0,r3 - addi r4,r11,-1 - andc r4,r4,r11 - cmpld cr7,r3,r4 - bgt cr7,L(no_match) -#else - cntlzd r0,r10 /* Count leading zeros before c matches. */ - cmpld cr7,r11,r10 - bgt cr7,L(no_match) -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching c byte - or null in case c was not found. */ - blr - - .align 4 -L(no_match): - li r3,0 - blr - -/* We are here because strchr was called with a null byte. */ - .align 4 -L(null_match): - /* r0 has a doubleword of null bytes. */ - - cmpb r5,r12,r0 /* Compare each byte against null bytes. 
*/ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r5,r5,r6 - sld r5,r5,r6 -#else - sld r5,r5,r6 - srd r5,r5,r6 -#endif - cmpdi cr7,r5,0 /* If r5 == 0, no null bytes - have been found. */ - bne cr7,L(done_null) - - mtcrf 0x01,r8 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop_null) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r5,r12,r0 - cmpdi cr7,r5,0 - bne cr7,L(done_null) - b L(loop_null) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - /* Main loop to look for the end of the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop_null): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r5,r12,r0 - cmpb r10,r11,r0 - or r6,r5,r10 - cmpdi cr7,r6,0 - beq cr7,L(loop_null) - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done_null) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r5 so we can calculate the - pointer. */ - - mr r5,r10 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done_null): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 -#else - cntlzd r0,r5 /* Count leading zeros before the match. 
*/ -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching null byte. */ - blr -END (STRCHR) -weak_alias (strchr, index) -libc_hidden_builtin_def (strchr) diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S deleted file mode 100644 index 27bc1f0682..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S +++ /dev/null @@ -1,131 +0,0 @@ -/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRCHRNUL -# define STRCHRNUL __strchrnul -#endif -/* int [r3] strchrnul (char *s [r3], int c [r4]) */ - .machine power7 -ENTRY (STRCHRNUL) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load doubleword from memory. */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - - /* Now r4 has a doubleword of c bytes and r0 has - a doubleword of null bytes. 
*/ - - cmpb r10,r12,r0 /* Compare each byte against null byte. */ - cmpb r9,r12,r4 /* Compare each byte against c byte. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and to bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - srd r9,r9,r6 - sld r10,r10,r6 - sld r9,r9,r6 -#else - sld r10,r10,r6 - sld r9,r9,r6 - srd r10,r10,r6 - srd r9,r9,r6 -#endif - or r5,r9,r10 /* OR the results to speed things up. */ - cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes - have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r8 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle DWORD2 of pair. */ - ldu r12,8(r8) - cmpb r10,r12,r0 - cmpb r9,r12,r4 - or r5,r9,r10 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r10,r12,r0 - cmpb r9,r12,r4 - cmpb r6,r11,r0 - cmpb r7,r11,r4 - or r5,r9,r10 - or r10,r6,r7 - or r11,r5,r10 - cmpdi cr7,r11,0 - beq cr7,L(loop) - - /* OK, one (or both) of the doublewords contains a c/null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c/null byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r5 so we can calculate - the pointer. */ - mr r5,r10 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the c/null byte in the original - doubleword from the string. Use that to calculate the pointer. 
*/ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 -#else - cntlzd r0,r5 /* Count leading zeros before the match. */ -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of matching c/null byte. */ - blr -END (STRCHRNUL) -weak_alias (STRCHRNUL, strchrnul) -libc_hidden_builtin_def (STRCHRNUL) diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S deleted file mode 100644 index 14e14f457e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strcmp.S +++ /dev/null @@ -1,168 +0,0 @@ -/* Optimized strcmp implementation for Power7 using 'cmpb' instruction - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* The optimization is achieved here through cmpb instruction. - 8byte aligned strings are processed with double word comparision - and unaligned strings are handled effectively with loop unrolling - technique */ - -#include <sysdep.h> - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ - - .machine power7 -EALIGN (STRCMP, 4, 0) - CALL_MCOUNT 2 - - or r9, r3, r4 - rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? 
*/ - bne cr0, L(process_unaligned_bytes) - li r5, 0 - - .align 4 -/* process input parameters on double word aligned boundary */ -L(unrollDword): - ld r8,0(r3) - ld r10,0(r4) - cmpb r7,r8,r5 - cmpdi cr7,r7,0 - mr r9,r7 - bne cr7,L(null_found) - cmpld cr7,r8,r10 - bne cr7,L(different) - - ld r8,8(r3) - ld r10,8(r4) - cmpb r7,r8,r5 - cmpdi cr7,r7,0 - mr r9,r7 - bne cr7,L(null_found) - cmpld cr7,r8,r10 - bne cr7,L(different) - - ld r8,16(r3) - ld r10,16(r4) - cmpb r7,r8,r5 - cmpdi cr7,r7,0 - mr r9,r7 - bne cr7,L(null_found) - cmpld cr7,r8,r10 - bne cr7,L(different) - - ld r8,24(r3) - ld r10,24(r4) - cmpb r7,r8,r5 - cmpdi cr7,r7,0 - mr r9,r7 - bne cr7,L(null_found) - cmpld cr7,r8,r10 - bne cr7,L(different) - - addi r3, r3, 32 - addi r4, r4, 32 - beq cr7, L(unrollDword) - - .align 4 -L(null_found): -#ifdef __LITTLE_ENDIAN__ - neg r7,r9 - and r9,r9,r7 - li r7,-1 - cntlzd r9,r9 - subfic r9,r9,71 - sld r9,r7,r9 -#else - cntlzd r9,r9 - li r7,-1 - addi r9,r9,8 - srd r9,r7,r9 -#endif - or r8,r8,r9 - or r10,r10,r9 - -L(different): - cmpb r9,r8,r10 -#ifdef __LITTLE_ENDIAN__ - addi r7,r9,1 - andc r9,r7,r9 - cntlzd r9,r9 - subfic r9,r9,63 -#else - not r9,r9 - cntlzd r9,r9 - subfic r9,r9,56 -#endif - srd r3,r8,r9 - srd r10,r10,r9 - rldicl r10,r10,0,56 - rldicl r3,r3,0,56 - subf r3,r10,r3 - blr - - .align 4 -L(process_unaligned_bytes): - lbz r9, 0(r3) /* load byte from s1 */ - lbz r10, 0(r4) /* load byte from s2 */ - cmpdi cr7, r9, 0 /* compare *s1 with NULL */ - beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ - cmplw cr7, r9, r10 /* compare *s1 and *s2 */ - bne cr7, L(ComputeDiff) /* branch to compute difference and return */ - - lbz r9, 1(r3) /* load next byte from s1 */ - lbz r10, 1(r4) /* load next byte from s2 */ - cmpdi cr7, r9, 0 /* compare *s1 with NULL */ - beq cr7, L(diffOfNULL) /* if *s1 is NULL , return *s1 - *s2 */ - cmplw cr7, r9, r10 /* compare *s1 and *s2 */ - bne cr7, L(ComputeDiff) /* branch to compute difference and return */ - - lbz r9, 2(r3) /* 
unroll 3rd byte here */ - lbz r10, 2(r4) - cmpdi cr7, r9, 0 - beq cr7, L(diffOfNULL) - cmplw cr7, r9, r10 - bne 7, L(ComputeDiff) - - lbz r9, 3(r3) /* unroll 4th byte now */ - lbz r10, 3(r4) - addi r3, r3, 4 /* increment s1 by unroll factor */ - cmpdi cr7, r9, 0 - cmplw cr6, 9, r10 - beq cr7, L(diffOfNULL) - addi r4, r4, 4 /* increment s2 by unroll factor */ - beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ - - .align 4 -L(ComputeDiff): - extsw r9, r9 - subf r10, r10, r9 /* compute s1 - s2 */ - extsw r3, r10 - blr /* return */ - - .align 4 -L(diffOfNULL): - li r9, 0 - subf r10, r10, r9 /* compute s1 - s2 */ - extsw r3, r10 /* sign extend result */ - blr /* return */ - -END (STRCMP) -libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S deleted file mode 100644 index 63848c460c..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strlen.S +++ /dev/null @@ -1,107 +0,0 @@ -/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -/* int [r3] strlen (char *s [r3]) */ - -#ifndef STRLEN -# define STRLEN strlen -#endif - .machine power7 -ENTRY (STRLEN) - CALL_MCOUNT 1 - dcbt 0,r3 - clrrdi r4,r3,3 /* Align the address to doubleword boundary. */ - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - li r5,-1 /* MASK = 0xffffffffffffffff. */ - ld r12,0(r4) /* Load doubleword from memory. */ -#ifdef __LITTLE_ENDIAN__ - sld r5,r5,r6 -#else - srd r5,r5,r6 /* MASK = MASK >> padding. */ -#endif - orc r9,r12,r5 /* Mask bits that are not part of the string. */ - cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ - cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r4 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle DWORD2 of pair. */ - ldu r12,8(r4) - cmpb r10,r12,r0 - cmpdi cr7,r10,0 - bne cr7,L(done) - - /* Main loop to look for the end of the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - - ld r12, 8(r4) - ldu r11, 16(r4) - cmpb r10,r12,r0 - cmpb r9,r11,r0 - or r8,r9,r10 /* Merge everything in one doubleword. */ - cmpdi cr7,r8,0 - beq cr7,L(loop) - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - cmpdi cr6,r10,0 - addi r4,r4,-8 - bne cr6,L(done) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - length. 
*/ - - mr r10,r9 - addi r4,r4,8 - - /* r10 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the length. */ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r9, r10, -1 /* Form a mask from trailing zeros. */ - andc r9, r9, r10 - popcntd r0, r9 /* Count the bits in the mask. */ -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - subf r5,r3,r4 - srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ - add r3,r5,r0 /* Compute final length. */ - blr -END (STRLEN) -libc_hidden_builtin_def (strlen) diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S deleted file mode 100644 index d53b31be8e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strncmp.S +++ /dev/null @@ -1,227 +0,0 @@ -/* Optimized strncmp implementation for POWER7/PowerPC64. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRNCMP -# define STRNCMP strncmp -#endif - -/* See strlen.s for comments on how the end-of-string testing works. 
*/ - -/* int [r3] strncmp (const char *s1 [r3], - const char *s2 [r4], - size_t size [r5]) */ - -EALIGN (STRNCMP,5,0) - CALL_MCOUNT 3 - -#define rTMP2 r0 -#define rRTN r3 -#define rSTR1 r3 /* first string arg */ -#define rSTR2 r4 /* second string arg */ -#define rN r5 /* max string length */ -#define rWORD1 r6 /* current word in s1 */ -#define rWORD2 r7 /* current word in s2 */ -#define rWORD3 r10 -#define rWORD4 r11 -#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ -#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */ -#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */ -#define rBITDIF r11 /* bits that differ in s1 & s2 words */ -#define rTMP r12 - - dcbt 0,rSTR1 - nop - or rTMP,rSTR2,rSTR1 - lis r7F7F,0x7f7f - dcbt 0,rSTR2 - nop - clrldi. rTMP,rTMP,61 - cmpldi cr1,rN,0 - lis rFEFE,-0x101 - bne L(unaligned) -/* We are doubleword aligned so set up for two loops. first a double word - loop, then fall into the byte loop if any residual. */ - srdi. rTMP,rN,3 - clrldi rN,rN,61 - addi rFEFE,rFEFE,-0x101 - addi r7F7F,r7F7F,0x7f7f - cmpldi cr1,rN,0 - beq L(unaligned) - - mtctr rTMP - ld rWORD1,0(rSTR1) - ld rWORD2,0(rSTR2) - sldi rTMP,rFEFE,32 - insrdi r7F7F,r7F7F,32,0 - add rFEFE,rFEFE,rTMP - b L(g1) - -L(g0): - ldu rWORD1,8(rSTR1) - bne cr1,L(different) - ldu rWORD2,8(rSTR2) -L(g1): add rTMP,rFEFE,rWORD1 - nor rNEG,r7F7F,rWORD1 - bdz L(tail) - and. rTMP,rTMP,rNEG - cmpd cr1,rWORD1,rWORD2 - beq L(g0) - -/* OK. We've hit the end of the string. We need to be careful that - we don't compare two strings as different because of gunk beyond - the end of the strings... */ - -#ifdef __LITTLE_ENDIAN__ -L(endstring): - addi rTMP2, rTMP, -1 - beq cr1, L(equal) - andc rTMP2, rTMP2, rTMP - rldimi rTMP2, rTMP2, 1, 0 - and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */ - and rWORD1, rWORD1, rTMP2 - cmpd cr1, rWORD1, rWORD2 - beq cr1, L(equal) - cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. 
*/ - addi rNEG, rBITDIF, 1 - orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ - sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ - andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ - andc rWORD2, rWORD2, rNEG - xor. rBITDIF, rWORD1, rWORD2 - sub rRTN, rWORD1, rWORD2 - blt L(highbit) - sradi rRTN, rRTN, 63 /* must return an int. */ - ori rRTN, rRTN, 1 - blr -L(equal): - li rRTN, 0 - blr - -L(different): - ld rWORD1, -8(rSTR1) - cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */ - addi rNEG, rBITDIF, 1 - orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ - sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ - andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ - andc rWORD2, rWORD2, rNEG - xor. rBITDIF, rWORD1, rWORD2 - sub rRTN, rWORD1, rWORD2 - blt L(highbit) - sradi rRTN, rRTN, 63 - ori rRTN, rRTN, 1 - blr -L(highbit): - sradi rRTN, rWORD2, 63 - ori rRTN, rRTN, 1 - blr - -#else -L(endstring): - and rTMP,r7F7F,rWORD1 - beq cr1,L(equal) - add rTMP,rTMP,r7F7F - xor. rBITDIF,rWORD1,rWORD2 - andc rNEG,rNEG,rTMP - blt L(highbit) - cntlzd rBITDIF,rBITDIF - cntlzd rNEG,rNEG - addi rNEG,rNEG,7 - cmpd cr1,rNEG,rBITDIF - sub rRTN,rWORD1,rWORD2 - blt cr1,L(equal) - sradi rRTN,rRTN,63 /* must return an int. */ - ori rRTN,rRTN,1 - blr -L(equal): - li rRTN,0 - blr - -L(different): - ld rWORD1,-8(rSTR1) - xor. rBITDIF,rWORD1,rWORD2 - sub rRTN,rWORD1,rWORD2 - blt L(highbit) - sradi rRTN,rRTN,63 - ori rRTN,rRTN,1 - blr -L(highbit): - sradi rRTN,rWORD2,63 - ori rRTN,rRTN,1 - blr -#endif - -/* Oh well. In this case, we just do a byte-by-byte comparison. */ - .align 4 -L(tail): - and. 
rTMP,rTMP,rNEG - cmpd cr1,rWORD1,rWORD2 - bne L(endstring) - addi rSTR1,rSTR1,8 - bne cr1,L(different) - addi rSTR2,rSTR2,8 - cmpldi cr1,rN,0 -L(unaligned): - mtctr rN - ble cr1,L(ux) -L(uz): - lbz rWORD1,0(rSTR1) - lbz rWORD2,0(rSTR2) - .align 4 -L(u1): - cmpdi cr1,rWORD1,0 - bdz L(u4) - cmpd rWORD1,rWORD2 - beq cr1,L(u4) - bne L(u4) - lbzu rWORD3,1(rSTR1) - lbzu rWORD4,1(rSTR2) - cmpdi cr1,rWORD3,0 - bdz L(u3) - cmpd rWORD3,rWORD4 - beq cr1,L(u3) - bne L(u3) - lbzu rWORD1,1(rSTR1) - lbzu rWORD2,1(rSTR2) - cmpdi cr1,rWORD1,0 - bdz L(u4) - cmpd rWORD1,rWORD2 - beq cr1,L(u4) - bne L(u4) - lbzu rWORD3,1(rSTR1) - lbzu rWORD4,1(rSTR2) - cmpdi cr1,rWORD3,0 - bdz L(u3) - cmpd rWORD3,rWORD4 - beq cr1,L(u3) - bne L(u3) - lbzu rWORD1,1(rSTR1) - lbzu rWORD2,1(rSTR2) - b L(u1) - -L(u3): sub rRTN,rWORD3,rWORD4 - blr -L(u4): sub rRTN,rWORD1,rWORD2 - blr -L(ux): - li rRTN,0 - blr -END (STRNCMP) -libc_hidden_builtin_def (strncmp) diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S deleted file mode 100644 index 0224f74898..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strncpy.S +++ /dev/null @@ -1,722 +0,0 @@ -/* Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -/* Implements the functions - - char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5]) - - AND - - char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5]) - - The algorithm is as follows: - > if src and dest are 8 byte aligned, perform double word copy - else - > copy byte by byte on unaligned addresses. - - The aligned comparison are made using cmpb instructions. */ - -/* The focus on optimization for performance improvements are as follows: - 1. data alignment [gain from aligned memory access on read/write] - 2. POWER7 gains performance with loop unrolling/unwinding - [gain by reduction of branch penalty]. - 3. The final pad with null bytes is done by calling an optimized - memset. */ - -#ifdef USE_AS_STPNCPY -# ifndef STPNCPY -# define FUNC_NAME __stpncpy -# else -# define FUNC_NAME STPNCPY -# endif -#else -# ifndef STRNCPY -# define FUNC_NAME strncpy -# else -# define FUNC_NAME STRNCPY -# endif -#endif /* !USE_AS_STPNCPY */ - -#define FRAMESIZE (FRAME_MIN_SIZE+32) - -#ifndef MEMSET -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define MEMSET __GI_memset -# else -# define MEMSET memset -# endif -#endif - - .machine power7 -EALIGN(FUNC_NAME, 4, 0) - CALL_MCOUNT 3 - - mflr r0 /* load link register LR to r0 */ - or r10, r3, r4 /* to verify source and destination */ - rldicl. r8, r10, 0, 61 /* is double word aligned .. ? 
*/ - - std r19, -8(r1) /* save callers register , r19 */ - std r18, -16(r1) /* save callers register , r18 */ - std r0, 16(r1) /* store the link register */ - stdu r1, -FRAMESIZE(r1) /* create the stack frame */ - - mr r9, r3 /* save r3 into r9 for use */ - mr r18, r3 /* save r3 for retCode of strncpy */ - bne 0, L(unaligned) - -L(aligned): - srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */ - cmpldi cr7, r11, 3 /* if count > 4 ; perform unrolling 4 times */ - ble 7, L(update1) - - ld r10, 0(r4) /* load doubleWord from src */ - cmpb r8, r10, r8 /* compare src with NULL ,we read just now */ - cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ - bne cr7, L(update3) - - std r10, 0(r3) /* copy doubleword at offset=0 */ - ld r10, 8(r4) /* load next doubleword from offset=8 */ - cmpb r8, r10, r8 /* compare src with NULL , we read just now */ - cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */ - bne 7,L(HopBy8) - - addi r8, r11, -4 - mr r7, r3 - srdi r8, r8, 2 - mr r6, r4 - addi r8, r8, 1 - li r12, 0 - mtctr r8 - b L(dwordCopy) - - .p2align 4 -L(dWordUnroll): - std r8, 16(r9) - ld r8, 24(r4) /* load dword,perform loop unrolling again */ - cmpb r10, r8, r10 - cmpdi cr7, r10, 0 - bne cr7, L(HopBy24) - - std r8, 24(r7) /* copy dword at offset=24 */ - addi r9, r9, 32 - addi r4, r4, 32 - bdz L(leftDwords) /* continue with loop on counter */ - - ld r3, 32(r6) - cmpb r8, r3, r10 - cmpdi cr7, r8, 0 - bne cr7, L(update2) - - std r3, 32(r7) - ld r10, 40(r6) - cmpb r8, r10, r8 - cmpdi cr7, r8, 0 - bne cr7, L(HopBy40) - - mr r6, r4 /* update values */ - mr r7, r9 - mr r11, r0 - mr r5, r19 - -L(dwordCopy): - std r10, 8(r9) /* copy dword at offset=8 */ - addi r19, r5, -32 - addi r0, r11, -4 - ld r8, 16(r4) - cmpb r10, r8, r12 - cmpdi cr7, r10, 0 - beq cr7, L(dWordUnroll) - - addi r9, r9, 16 /* increment dst by 16 */ - addi r4, r4, 16 /* increment src by 16 */ - addi r5, r5, -16 /* decrement length 'n' by 16 */ - addi r0, r11, -2 /* decrement loop counter */ - 
-L(dWordUnrollOFF): - ld r10, 0(r4) /* load first dword */ - li r8, 0 /* load mask */ - cmpb r8, r10, r8 - cmpdi cr7, r8, 0 - bne cr7, L(byte_by_byte) - mtctr r0 - li r7, 0 - b L(CopyDword) - - .p2align 4 -L(loadDWordandCompare): - ld r10, 0(r4) - cmpb r8, r10, r7 - cmpdi cr7, r8, 0 - bne cr7, L(byte_by_byte) - -L(CopyDword): - addi r9, r9, 8 - std r10, -8(r9) - addi r4, r4, 8 - addi r5, r5, -8 - bdnz L(loadDWordandCompare) - -L(byte_by_byte): - cmpldi cr7, r5, 3 - ble cr7, L(verifyByte) - srdi r10, r5, 2 - mr r19, r9 - mtctr r10 - b L(firstByteUnroll) - - .p2align 4 -L(bytes_unroll): - lbz r10, 1(r4) /* load byte from src */ - cmpdi cr7, r10, 0 /* compare for NULL */ - stb r10, 1(r19) /* store byte to dst */ - beq cr7, L(updtDestComputeN2ndByte) - - addi r4, r4, 4 /* advance src */ - - lbz r10, -2(r4) /* perform loop unrolling for byte r/w */ - cmpdi cr7, r10, 0 - stb r10, 2(r19) - beq cr7, L(updtDestComputeN3rdByte) - - lbz r10, -1(r4) /* perform loop unrolling for byte r/w */ - addi r19, r19, 4 - cmpdi cr7, r10, 0 - stb r10, -1(r19) - beq cr7, L(ComputeNByte) - - bdz L(update0) - -L(firstByteUnroll): - lbz r10, 0(r4) /* perform loop unrolling for byte r/w */ - cmpdi cr7, 10, 0 - stb r10, 0(r19) - bne cr7, L(bytes_unroll) - addi r19, r19, 1 - -L(ComputeNByte): - subf r9, r19, r9 /* compute 'n'n bytes to fill */ - add r8, r9, r5 - -L(zeroFill): - cmpdi cr7, r8, 0 /* compare if length is zero */ - beq cr7, L(update3return) - - mr r3, r19 /* fill buffer with */ - li r4, 0 /* zero fill buffer */ - mr r5, r8 /* how many bytes to fill buffer with */ - bl MEMSET /* call optimized memset */ - nop - -L(update3return): -#ifdef USE_AS_STPNCPY - addi r3, r19, -1 /* update return value */ -#endif - -L(hop2return): -#ifndef USE_AS_STPNCPY - mr r3, r18 /* set return value */ -#endif - addi r1, r1, FRAMESIZE /* restore stack pointer */ - ld r0, 16(r1) /* read the saved link register */ - ld r18, -16(r1) /* restore callers save register, r18 */ - ld r19, -8(r1) /* restore callers 
save register, r19 */ - mtlr r0 /* branch to link register */ - blr /* return */ - - .p2align 4 -L(update0): - mr r9, r19 - - .p2align 4 -L(verifyByte): - rldicl. r8, r5, 0, 62 -#ifdef USE_AS_STPNCPY - mr r3, r9 -#endif - beq cr0, L(hop2return) - mtctr r8 - addi r4, r4, -1 - mr r19, r9 - b L(oneBYone) - - .p2align 4 -L(proceed): - bdz L(done) - -L(oneBYone): - lbzu r10, 1(r4) /* copy byte */ - addi r19, r19, 1 - addi r8, r8, -1 - cmpdi cr7, r10, 0 - stb r10, -1(r19) - bne cr7, L(proceed) - b L(zeroFill) - - .p2align 4 -L(done): - addi r1, r1, FRAMESIZE /* restore stack pointer */ -#ifdef USE_AS_STPNCPY - mr r3, r19 /* set the return value */ -#else - mr r3, r18 /* set the return value */ -#endif - ld r0, 16(r1) /* read the saved link register */ - ld r18, -16(r1) /* restore callers save register, r18 */ - ld r19, -8(r1) /* restore callers save register, r19 */ - mtlr r0 /* branch to link register */ - blr /* return */ - -L(update1): - mr r0, r11 - mr r19, r5 - - .p2align 4 -L(leftDwords): - cmpdi cr7, r0, 0 - mr r5, r19 - bne cr7, L(dWordUnrollOFF) - b L(byte_by_byte) - - .p2align 4 -L(updtDestComputeN2ndByte): - addi r19, r19, 2 /* update dst by 2 */ - subf r9, r19, r9 /* compute distance covered */ - add r8, r9, r5 - b L(zeroFill) - - .p2align 4 -L(updtDestComputeN3rdByte): - addi r19, r19, 3 /* update dst by 3 */ - subf r9, r19, r9 /* compute distance covered */ - add r8, r9, r5 - b L(zeroFill) - - .p2align 4 -L(HopBy24): - addi r9, r9, 24 /* increment dst by 24 */ - addi r4, r4, 24 /* increment src by 24 */ - addi r5, r5, -24 /* decrement length 'n' by 24 */ - addi r0, r11, -3 /* decrement loop counter */ - b L(dWordUnrollOFF) - - .p2align 4 -L(update2): - mr r5, r19 - b L(dWordUnrollOFF) - - .p2align 4 -L(HopBy40): - addi r9, r7, 40 /* increment dst by 40 */ - addi r4, r6, 40 /* increment src by 40 */ - addi r5, r5, -40 /* decrement length 'n' by 40 */ - addi r0, r11, -5 /* decrement loop counter */ - b L(dWordUnrollOFF) - -L(update3): - mr r0, r11 - b 
L(dWordUnrollOFF) - -L(HopBy8): - addi r9, r3, 8 /* increment dst by 8 */ - addi r4, r4, 8 /* increment src by 8 */ - addi r5, r5, -8 /* decrement length 'n' by 8 */ - addi r0, r11, -1 /* decrement loop counter */ - b L(dWordUnrollOFF) - -L(unaligned): - cmpdi r5, 16 /* Proceed byte by byte for less than 16 */ - ble L(byte_by_byte) - rldicl r7, r3, 0, 61 - rldicl r6, r4, 0, 61 - cmpdi r6, 0 /* Check src alignment */ - beq L(srcaligndstunalign) - /* src is unaligned */ - rlwinm r10, r4, 3,26,28 /* Calculate padding. */ - clrrdi r4, r4, 3 /* Align the addr to dw boundary */ - ld r8, 0(r4) /* Load doubleword from memory. */ - li r0, 0 - /* Discard bits not part of the string */ -#ifdef __LITTLE_ENDIAN__ - srd r7, r8, r10 -#else - sld r7, r8, r10 -#endif - cmpb r0, r7, r0 /* Compare each byte against null */ - /* Discard bits not part of the string */ -#ifdef __LITTLE_ENDIAN__ - sld r0, r0, r10 -#else - srd r0, r0, r10 -#endif - cmpdi r0, 0 - bne L(bytebybyte) /* if it has null, copy byte by byte */ - subfic r6, r6, 8 - rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */ - rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */ - addi r3, r3, -1 - - cmpdi r12, 0 /* check dest alignment */ - beq L(srcunaligndstalign) - - /* both src and dst unaligned */ -#ifdef __LITTLE_ENDIAN__ - sld r8, r7, r10 - mr r11, r10 - addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ -#else - srd r8, r7, r10 - subfic r11, r10, 64 -#endif - /* dst alignment is greater then src alignment? 
*/ - cmpd cr7, r12, r10 - ble cr7, L(dst_align_small) - /* src alignment is less than dst */ - - /* Calculate the dst alignment difference */ - subfic r7, r9, 8 - mtctr r7 - - /* Write until dst is aligned */ - cmpdi r0, r7, 4 - blt L(storebyte1) /* less than 4, store byte by byte */ - beq L(equal1) /* if its 4, store word */ - addi r0, r7, -4 /* greater than 4, so stb and stw */ - mtctr r0 -L(storebyte1): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte1) - - subfic r7, r9, 8 /* Check the remaining bytes */ - cmpdi r0, r7, 4 - blt L(proceed1) - - .align 4 -L(equal1): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ - srd r7, r8, r11 -#else - subfic r11, r11, 64 - sld r7, r8, r11 - srdi r7, r7, 32 -#endif - stw r7, 1(r3) - addi r3, r3, 4 - addi r5, r5, -4 - -L(proceed1): - mr r7, r8 - /* calculate the Left over bytes to be written */ - subfic r11, r10, 64 - subfic r12, r12, 64 - subf r12, r12, r11 /* remaining bytes on second dw */ - subfic r10, r12, 64 /* remaining bytes on first dw */ - subfic r9, r9, 8 - subf r6, r9, r6 /* recalculate padding */ -L(srcunaligndstalign): - addi r3, r3, 1 - subfic r12, r10, 64 /* remaining bytes on second dw */ - addi r4, r4, 8 - li r0,0 - b L(storedouble) - - .align 4 -L(dst_align_small): - mtctr r6 - /* Write until src is aligned */ -L(storebyte2): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte2) - - addi r4, r4, 8 /* Increment src pointer */ - addi r3, r3, 1 /* Increment dst pointer */ - mr r9, r3 - li r8, 0 - cmpd cr7, r12, r10 - beq cr7, L(aligned) - rldicl r6, r3, 0, 61 /* Recalculate padding */ - mr r7, r6 - - /* src is algined */ -L(srcaligndstunalign): - mr r9, r3 - mr r6, r7 - ld r8, 0(r4) - subfic r10, r7, 8 - mr 
r7, r8 - li r0, 0 /* Check null */ - cmpb r0, r8, r0 - cmpdi r0, 0 - bne L(byte_by_byte) /* Do byte by byte if there is NULL */ - rlwinm r12, r3, 3,26,28 /* Calculate padding */ - addi r3, r3, -1 - /* write byte by byte until aligned */ -#ifdef __LITTLE_ENDIAN__ - li r11, -8 -#else - li r11, 64 -#endif - mtctr r10 - cmpdi r0, r10, 4 - blt L(storebyte) - beq L(equal) - addi r0, r10, -4 - mtctr r0 -L(storebyte): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 /* Adjust byte pointer on dw */ -#else - addi r11, r11, -8 -#endif - srd r7, r8, r11 - stbu r7, 1(r3) - addi r5, r5, -1 - bdnz L(storebyte) - - cmpdi r0, r10, 4 - blt L(align) - - .align 4 -L(equal): -#ifdef __LITTLE_ENDIAN__ - addi r11, r11, 8 - srd r7, r8, r11 -#else - subfic r11, r11, 64 - sld r7, r8, r11 - srdi r7, r7, 32 -#endif - stw r7, 1(r3) - addi r5, r5, -4 - addi r3, r3, 4 -L(align): - addi r3, r3, 1 - addi r4, r4, 8 /* Increment src pointer */ - subfic r10, r12, 64 - li r0, 0 - /* dst addr aligned to 8 */ -L(storedouble): - cmpdi r5, 8 - ble L(null1) - ld r7, 0(r4) /* load next dw */ - cmpb r0, r7, r0 - cmpdi r0, 0 /* check for null on each new dw */ - bne L(null) -#ifdef __LITTLE_ENDIAN__ - srd r9, r8, r10 /* bytes from first dw */ - sld r11, r7, r12 /* bytes from second dw */ -#else - sld r9, r8, r10 - srd r11, r7, r12 -#endif - or r11, r9, r11 /* make as a single dw */ - std r11, 0(r3) /* store as std on aligned addr */ - mr r8, r7 /* still few bytes left to be written */ - addi r3, r3, 8 /* increment dst addr */ - addi r4, r4, 8 /* increment src addr */ - addi r5, r5, -8 - b L(storedouble) /* Loop until NULL */ - - .align 4 - -/* We've hit the end of the string. Do the rest byte-by-byte. 
*/ -L(null): - addi r3, r3, -1 - mr r10, r12 - mtctr r6 -#ifdef __LITTLE_ENDIAN__ - subfic r10, r10, 64 - addi r10, r10, -8 -#endif - cmpdi r0, r5, 4 - blt L(loop) - cmpdi r0, r6, 4 - blt L(loop) - - /* we can still use stw if leftover >= 4 */ -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, 8 - srd r11, r8, r10 -#else - subfic r10, r10, 64 - sld r11, r8, r10 - srdi r11, r11, 32 -#endif - stw r11, 1(r3) - addi r5, r5, -4 - addi r3, r3, 4 - cmpdi r0, r5, 0 - beq L(g1) - cmpdi r0, r6, 4 - beq L(bytebybyte1) - addi r10, r10, 32 -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, -8 -#else - subfic r10, r10, 64 -#endif - addi r0, r6, -4 - mtctr r0 - /* remaining byte by byte part of first dw */ -L(loop): -#ifdef __LITTLE_ENDIAN__ - addi r10, r10, 8 -#else - addi r10, r10, -8 -#endif - srd r0, r8, r10 - stbu r0, 1(r3) - addi r5, r5, -1 - cmpdi r0, r5, 0 - beq L(g1) - bdnz L(loop) -L(bytebybyte1): - addi r3, r3, 1 - /* remaining byte by byte part of second dw */ -L(bytebybyte): - addi r3, r3, -8 - addi r4, r4, -1 - -#ifdef __LITTLE_ENDIAN__ - extrdi. r0, r7, 8, 56 - stbu r7, 8(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 48 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 40 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 32 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 24 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 16 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 8 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi r0, r7, 8, 0 - stbu r0, 1(r3) - addi r5, r5, -1 - b L(g2) -#else - extrdi. r0, r7, 8, 0 - stbu r0, 8(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 8 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. 
r0, r7, 8, 16 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 24 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 32 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 40 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - extrdi. r0, r7, 8, 48 - stbu r0, 1(r3) - addi r5, r5, -1 - beq L(g2) - cmpdi r5, 0 - beq L(g1) - stbu r7, 1(r3) - addi r5, r5, -1 - b L(g2) -#endif -L(g1): -#ifdef USE_AS_STPNCPY - addi r3, r3, 1 -#endif -L(g2): - addi r3, r3, 1 - mr r19, r3 - mr r8, r5 - b L(zeroFill) -L(null1): - mr r9, r3 - subf r4, r6, r4 - b L(byte_by_byte) -END(FUNC_NAME) -#ifndef USE_AS_STPNCPY -libc_hidden_builtin_def (strncpy) -#endif diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S deleted file mode 100644 index a970b6ce30..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strnlen.S +++ /dev/null @@ -1,182 +0,0 @@ -/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2010-2017 Free Software Foundation, Inc. - Contributed by Luis Machado <luisgpm@br.ibm.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#ifndef STRNLEN -# define STRNLEN __strnlen -#endif - -/* int [r3] strnlen (char *s [r3], int size [r4]) */ - .machine power7 -ENTRY (STRNLEN) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 - add r7,r3,r4 /* Calculate the last acceptable address. */ - cmpldi r4,32 - li r0,0 /* Doubleword with null chars. */ - addi r7,r7,-1 - - /* If we have less than 33 bytes to search, skip to a faster code. */ - ble L(small_range) - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - sld r10,r10,r6 -#else - sld r10,r10,r6 - srd r10,r10,r6 -#endif - cmpldi cr7,r10,0 /* If r10 == 0, no null's have been found. */ - bne cr7,L(done) - - clrrdi r7,r7,3 /* Address of last doubleword. */ - mtcrf 0x01,r8 - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop_setup) - - /* Handle DWORD2 of pair. */ - ldu r12,8(r8) - cmpb r10,r12,r0 - cmpldi cr7,r10,0 - bne cr7,L(done) - -L(loop_setup): - /* The last dword we want to read in the loop below is the one - containing the last byte of the string, ie. the dword at - (s + size - 1) & ~7, or r7. The first dword read is at - r8 + 8, we read 2 * cnt dwords, so the last dword read will - be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives - cnt = (r7 - r8) / 16 */ - sub r5,r7,r8 - srdi r6,r5,4 /* Number of loop iterations. */ - mtctr r6 /* Setup the counter. */ - - /* Main loop to look for the null byte in the string. Since - it's a small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - - ld r12,8(r8) - ldu r11,16(r8) - cmpb r10,r12,r0 - cmpb r9,r11,r0 - or r5,r9,r10 /* Merge everything in one doubleword. 
*/ - cmpldi cr7,r5,0 - bne cr7,L(found) - bdnz L(loop) - - /* We may have one more dword to read. */ - cmpld cr6,r8,r7 - beq cr6,L(end_max) - - ldu r12,8(r8) - cmpb r10,r12,r0 - cmpldi cr6,r10,0 - bne cr6,L(done) - -L(end_max): - mr r3,r4 - blr - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - .align 4 -L(found): - cmpldi cr6,r10,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - length. */ - - mr r10,r9 - addi r8,r8,8 - - /* r10 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the length. - We need to make sure the null char is *before* the end of the - range. */ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r0,r10,-1 - andc r0,r0,r10 - popcntd r0,r0 -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - sub r3,r8,r3 - srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ - add r3,r3,r0 /* Length until the match. */ - cmpld r3,r4 - blelr - mr r3,r4 - blr - -/* Deals with size <= 32. */ - .align 4 -L(small_range): - cmpldi r4,0 - beq L(end_max) - - clrrdi r7,r7,3 /* Address of last doubleword. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r10,r12,r0 /* Check for null bytes in DWORD1. 
*/ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - sld r10,r10,r6 -#else - sld r10,r10,r6 - srd r10,r10,r6 -#endif - cmpldi cr7,r10,0 - bne cr7,L(done) - - cmpld r8,r7 - beq L(end_max) - - .p2align 5 -L(loop_small): - ldu r12,8(r8) - cmpb r10,r12,r0 - cmpldi cr6,r10,0 - bne cr6,L(done) - cmpld r8,r7 - bne L(loop_small) - mr r3,r4 - blr - -END (STRNLEN) -libc_hidden_def (__strnlen) -weak_alias (__strnlen, strnlen) -libc_hidden_def (strnlen) diff --git a/sysdeps/powerpc/powerpc64/power7/strrchr.S b/sysdeps/powerpc/powerpc64/power7/strrchr.S deleted file mode 100644 index c22393deb5..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strrchr.S +++ /dev/null @@ -1,260 +0,0 @@ -/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* int [r3] strrchr (char *s [r3], int c [r4]) */ - -#ifndef STRRCHR -# define STRRCHR strrchr -#endif - - .machine power7 -ENTRY (STRRCHR) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - cmpdi cr7,r4,0 - ld r12,0(r8) /* Load doubleword from memory. */ - li r9,0 /* used to store last occurence */ - li r0,0 /* Doubleword with null chars to use - with cmpb. 
*/ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - - beq cr7,L(null_match) - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - /* r4 is changed now ,if its passed as more chars - check for null again */ - cmpdi cr7,r4,0 - beq cr7,L(null_match) - /* Now r4 has a doubleword of c bytes and r0 has - a doubleword of null bytes. */ - - cmpb r10,r12,r4 /* Compare each byte against c byte. */ - cmpb r11,r12,r0 /* Compare each byte against null byte. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - srd r11,r11,r6 - sld r10,r10,r6 - sld r11,r11,r6 -#else - sld r10,r10,r6 - sld r11,r11,r6 - srd r10,r10,r6 - srd r11,r11,r6 -#endif - or r5,r10,r11 /* OR the results to speed things up. */ - cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes - have been found. */ - bne cr7,L(done) - -L(align): - mtcrf 0x01,r8 - - /* Are we now aligned to a doubleword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r7,16(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - cmpb r6,r7,r4 - cmpb r7,r7,r0 - or r12,r10,r11 - or r5,r6,r7 - or r5,r12,r5 - cmpdi cr7,r5,0 - beq cr7,L(loop) - - /* OK, one (or both) of the doublewords contains a c/null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c/null byte. 
*/ - cmpdi cr6,r12,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r10 so we can calculate - the pointer. */ - - mr r10,r6 - mr r11,r7 - addi r8,r8,8 - - /* r10/r11 have the output of the cmpb instructions, that is, - 0xff in the same position as the c/null byte in the original - doubleword from the string. Use that to calculate the pointer. */ - -L(done): - /* if there are more than one 0xff in r11, find the first pos of ff - in r11 and fill r10 with 0 from that position */ - cmpdi cr7,r11,0 - beq cr7,L(no_null) -#ifdef __LITTLE_ENDIAN__ - addi r3,r11,-1 - andc r3,r3,r11 - popcntd r0,r3 -#else - cntlzd r0,r11 -#endif - subfic r0,r0,63 - li r6,-1 -#ifdef __LITTLE_ENDIAN__ - srd r0,r6,r0 -#else - sld r0,r6,r0 -#endif - and r10,r0,r10 -L(no_null): -#ifdef __LITTLE_ENDIAN__ - cntlzd r0,r10 /* Count leading zeros before c matches. */ - addi r3,r10,-1 - andc r3,r3,r10 - addi r10,r11,-1 - andc r10,r10,r11 - cmpld cr7,r3,r10 - bgt cr7,L(no_match) -#else - addi r3,r10,-1 /* Count trailing zeros before c matches. */ - andc r3,r3,r10 - popcntd r0,r3 - cmpld cr7,r11,r10 - bgt cr7,L(no_match) -#endif - srdi r0,r0,3 /* Convert trailing zeros to bytes. */ - subfic r0,r0,7 - add r9,r8,r0 /* Return address of the matching c byte - or null in case c was not found. */ - li r0,0 - cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ - beq cr7,L(align) - - .align 4 -L(no_match): - mr r3,r9 - blr - -/* We are here because strrchr was called with a null byte. */ - .align 4 -L(null_match): - /* r0 has a doubleword of null bytes. */ - - cmpb r5,r12,r0 /* Compare each byte against null bytes. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. 
*/ -#ifdef __LITTLE_ENDIAN__ - srd r5,r5,r6 - sld r5,r5,r6 -#else - sld r5,r5,r6 - srd r5,r5,r6 -#endif - cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes - have been found. */ - bne cr7,L(done_null) - - mtcrf 0x01,r8 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop_null) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r5,r12,r0 - cmpdi cr7,r5,0 - bne cr7,L(done_null) - b L(loop_null) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - /* Main loop to look for the end of the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop_null): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r5,r12,r0 - cmpb r10,r11,r0 - or r6,r5,r10 - cmpdi cr7,r6,0 - beq cr7,L(loop_null) - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done_null) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - pointer. */ - - mr r5,r10 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done_null): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 -#else - cntlzd r0,r5 /* Count leading zeros before the match. */ -#endif - srdi r0,r0,3 /* Convert trailing zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching null byte. 
*/ - blr -END (STRRCHR) -weak_alias (strrchr, rindex) -libc_hidden_builtin_def (strrchr) diff --git a/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c b/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c deleted file mode 100644 index a917b2157e..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c +++ /dev/null @@ -1,27 +0,0 @@ -/* Optimized strstr implementation for PowerPC64/POWER7. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <string.h> - -#define STRSTR __strstr_ppc -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(__name) - -extern __typeof (strstr) __strstr_ppc attribute_hidden; - -#include <string/strstr.c> diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S deleted file mode 100644 index 260db2ed6d..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strstr.S +++ /dev/null @@ -1,521 +0,0 @@ -/* Optimized strstr implementation for PowerPC64/POWER7. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Char * [r3] strstr (char *s [r3], char * pat[r4]) */ - -/* The performance gain is obtained using aligned memory access, load - * doubleword and usage of cmpb instruction for quicker comparison. */ - -#define ITERATIONS 64 - -#ifndef STRSTR -# define STRSTR strstr -#endif - -#ifndef STRLEN -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRLEN __GI_strlen -# else -# define STRLEN strlen -# endif -#endif - -#ifndef STRNLEN -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRNLEN __GI_strnlen -# else -# define STRNLEN __strnlen -# endif -#endif - -#ifndef STRCHR -# ifdef SHARED -# define STRCHR __GI_strchr -# else -# define STRCHR strchr -# endif -#endif - -#define FRAMESIZE (FRAME_MIN_SIZE+32) - .machine power7 -EALIGN (STRSTR, 4, 0) - CALL_MCOUNT 2 - mflr r0 /* Load link register LR to r0. */ - std r31, -8(r1) /* Save callers register r31. */ - std r30, -16(r1) /* Save callers register r30. */ - std r29, -24(r1) /* Save callers register r29. */ - std r28, -32(r1) /* Save callers register r28. */ - std r0, 16(r1) /* Store the link register. 
*/ - cfi_offset(r31, -8) - cfi_offset(r30, -16) - cfi_offset(r28, -32) - cfi_offset(r29, -24) - cfi_offset(lr, 16) - stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ - cfi_adjust_cfa_offset(FRAMESIZE) - - dcbt 0, r3 - dcbt 0, r4 - cmpdi cr7, r3, 0 - beq cr7, L(retnull) - cmpdi cr7, r4, 0 - beq cr7, L(retnull) - - mr r29, r3 - mr r30, r4 - mr r3, r4 - bl STRLEN - nop - - cmpdi cr7, r3, 0 /* If search str is null. */ - beq cr7, L(ret_r3) - - mr r31, r3 - mr r4, r3 - mr r3, r29 - bl STRNLEN - nop - - cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ - blt cr7, L(retnull) - mr r3, r29 - lbz r4, 0(r30) - bl STRCHR - nop - - mr r11, r3 - /* If first char of search str is not present. */ - cmpdi cr7, r3, 0 - ble cr7, L(end) - /* Reg r28 is used to count the number of iterations. */ - li r28, 0 - rldicl r8, r3, 0, 52 /* Page cross check. */ - cmpldi cr7, r8, 4096-16 - bgt cr7, L(bytebybyte) - - rldicl r8, r30, 0, 52 - cmpldi cr7, r8, 4096-16 - bgt cr7, L(bytebybyte) - - /* If len(r4) < 8 handle in a different way. */ - /* Shift position based on null and use cmpb. */ - cmpdi cr7, r31, 8 - blt cr7, L(lessthan8) - - /* Len(r4) >= 8 reaches here. */ - mr r8, r3 /* Save r3 for future use. */ - mr r4, r30 /* Restore r4. */ - li r0, 0 - rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ - clrrdi r4, r4, 3 /* Make r4 aligned to 8. */ - ld r6, 0(r4) - addi r4, r4, 8 - cmpdi cr7, r10, 0 /* Check if its already aligned? */ - beq cr7, L(begin1) -#ifdef __LITTLE_ENDIAN__ - srd r6, r6, r10 /* Discard unwanted bits. */ -#else - sld r6, r6, r10 -#endif - ld r9, 0(r4) - subfic r10, r10, 64 -#ifdef __LITTLE_ENDIAN__ - sld r9, r9, r10 /* Discard unwanted bits. */ -#else - srd r9, r9, r10 -#endif - or r6, r6, r9 /* Form complete search str. */ -L(begin1): - mr r29, r6 - rlwinm r10, r3, 3, 26, 28 - clrrdi r3, r3, 3 - ld r5, 0(r3) - cmpb r9, r0, r6 /* Check if input has null. */ - cmpdi cr7, r9, 0 - bne cr7, L(return3) - cmpb r9, r0, r5 /* Check if input has null. 
*/ -#ifdef __LITTLE_ENDIAN__ - srd r9, r9, r10 -#else - sld r9, r9, r10 -#endif - cmpdi cr7, r9, 0 - bne cr7, L(retnull) - - li r12, -8 /* Shift values. */ - li r11, 72 /* Shift values. */ - cmpdi cr7, r10, 0 - beq cr7, L(nextbyte1) - mr r12, r10 - addi r12, r12, -8 - subfic r11, r12, 64 - -L(nextbyte1): - ldu r7, 8(r3) /* Load next dw. */ - addi r12, r12, 8 /* Shift one byte and compare. */ - addi r11, r11, -8 -#ifdef __LITTLE_ENDIAN__ - srd r9, r5, r12 /* Rotate based on mask. */ - sld r10, r7, r11 -#else - sld r9, r5, r12 - srd r10, r7, r11 -#endif - /* Form single dw from few bytes on first load and second load. */ - or r10, r9, r10 - /* Check for null in the formed dw. */ - cmpb r9, r0, r10 - cmpdi cr7, r9, 0 - bne cr7, L(retnull) - /* Cmpb search str and input str. */ - cmpb r9, r10, r6 - cmpdi cr7, r9, -1 - beq cr7, L(match) - addi r8, r8, 1 - b L(begin) - - .align 4 -L(match): - /* There is a match of 8 bytes, check next bytes. */ - cmpdi cr7, r31, 8 - beq cr7, L(return) - /* Update next starting point r8. */ - srdi r9, r11, 3 - subf r9, r9, r3 - mr r8, r9 - -L(secondmatch): - mr r5, r7 - rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */ - ld r6, 0(r4) - addi r4, r4, 8 - cmpdi cr7, r10, 0 /* Check if its already aligned? */ - beq cr7, L(proceed3) -#ifdef __LITTLE_ENDIAN__ - srd r6, r6, r10 /* Discard unwanted bits. */ - cmpb r9, r0, r6 - sld r9, r9, r10 -#else - sld r6, r6, r10 - cmpb r9, r0, r6 - srd r9, r9, r10 -#endif - cmpdi cr7, r9, 0 - bne cr7, L(proceed3) - ld r9, 0(r4) - subfic r10, r10, 64 -#ifdef __LITTLE_ENDIAN__ - sld r9, r9, r10 /* Discard unwanted bits. */ -#else - srd r9, r9, r10 -#endif - or r6, r6, r9 /* Form complete search str. 
*/ - -L(proceed3): - li r7, 0 - addi r3, r3, 8 - cmpb r9, r0, r5 - cmpdi cr7, r9, 0 - bne cr7, L(proceed4) - ld r7, 0(r3) -L(proceed4): -#ifdef __LITTLE_ENDIAN__ - srd r9, r5, r12 - sld r10, r7, r11 -#else - sld r9, r5, r12 - srd r10, r7, r11 -#endif - /* Form single dw with few bytes from first and second load. */ - or r10, r9, r10 - cmpb r9, r0, r6 - cmpdi cr7, r9, 0 - bne cr7, L(return4) - /* Check for null in the formed dw. */ - cmpb r9, r0, r10 - cmpdi cr7, r9, 0 - bne cr7, L(retnull) - /* If the next 8 bytes dont match, start search again. */ - cmpb r9, r10, r6 - cmpdi cr7, r9, -1 - bne cr7, L(reset) - /* If the next 8 bytes match, load and compare next 8. */ - b L(secondmatch) - - .align 4 -L(reset): - /* Start the search again. */ - addi r8, r8, 1 - b L(begin) - - .align 4 -L(return3): - /* Count leading zeros and compare partial dw. */ -#ifdef __LITTLE_ENDIAN__ - addi r7, r9, -1 - andc r7, r7, r9 - popcntd r7, r7 - subfic r7, r7, 64 - sld r10, r5, r7 - sld r6, r6, r7 -#else - cntlzd r7, r9 - subfic r7, r7, 64 - srd r10, r5, r7 - srd r6, r6, r7 -#endif - cmpb r9, r10, r6 - cmpdi cr7, r9, -1 - addi r8, r8, 1 - /* Start search again if there is no match. */ - bne cr7, L(begin) - /* If the words match, update return values. */ - subfic r7, r7, 64 - srdi r7, r7, 3 - add r3, r3, r7 - subf r3, r31, r3 - b L(end) - - .align 4 -L(return4): - /* Count leading zeros and compare partial dw. */ -#ifdef __LITTLE_ENDIAN__ - addi r7, r9, -1 - andc r7, r7, r9 - popcntd r7, r7 - subfic r7, r7, 64 - sld r10, r10, r7 - sld r6, r6, r7 -#else - cntlzd r7, r9 - subfic r7, r7, 64 - srd r10, r10, r7 - srd r6, r6, r7 -#endif - cmpb r9, r10, r6 - cmpdi cr7, r9, -1 - addi r8, r8, 1 - bne cr7, L(begin) - subfic r7, r7, 64 - srdi r11, r11, 3 - subf r3, r11, r3 - srdi r7, r7, 3 - add r3, r3, r7 - subf r3, r31, r3 - b L(end) - - .align 4 -L(begin): - mr r3, r8 - /* When our iterations exceed ITERATIONS,fall back to default. 
*/ - addi r28, r28, 1 - cmpdi cr7, r28, ITERATIONS - beq cr7, L(default) - lbz r4, 0(r30) - bl STRCHR - nop - /* If first char of search str is not present. */ - cmpdi cr7, r3, 0 - ble cr7, L(end) - mr r8, r3 - mr r4, r30 /* Restore r4. */ - li r0, 0 - mr r6, r29 - clrrdi r4, r4, 3 - addi r4, r4, 8 - b L(begin1) - - /* Handle less than 8 search string. */ - .align 4 -L(lessthan8): - mr r4, r3 - mr r9, r30 - li r0, 0 - - rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */ - srdi r8, r10, 3 /* Padding in bytes. */ - clrrdi r9, r9, 3 /* Make r4 aligned to 8. */ - ld r6, 0(r9) - cmpdi cr7, r10, 0 /* Check if its already aligned? */ - beq cr7, L(proceed2) -#ifdef __LITTLE_ENDIAN__ - srd r6, r6, r10 /* Discard unwanted bits. */ -#else - sld r6, r6, r10 -#endif - subfic r8, r8, 8 - cmpd cr7, r8, r31 /* Next load needed? */ - bge cr7, L(proceed2) - ld r7, 8(r9) - subfic r10, r10, 64 -#ifdef __LITTLE_ENDIAN__ - sld r7, r7, r10 /* Discard unwanted bits. */ -#else - srd r7, r7, r10 -#endif - or r6, r6, r7 /* Form complete search str. */ -L(proceed2): - mr r29, r6 - rlwinm r10, r3, 3, 26, 28 - clrrdi r7, r3, 3 /* Make r3 aligned. */ - ld r5, 0(r7) - sldi r8, r31, 3 - subfic r8, r8, 64 -#ifdef __LITTLE_ENDIAN__ - sld r6, r6, r8 - cmpb r9, r0, r5 - srd r9, r9, r10 -#else - srd r6, r6, r8 - cmpb r9, r0, r5 - sld r9, r9, r10 -#endif - cmpdi cr7, r9, 0 - bne cr7, L(noload) - cmpdi cr7, r10, 0 - beq cr7, L(continue) - ld r7, 8(r7) -L(continue1): - mr r12, r10 - addi r12, r12, -8 - subfic r11, r12, 64 - b L(nextbyte) - - .align 4 -L(continue): - ld r7, 8(r7) - li r12, -8 /* Shift values. */ - li r11, 72 /* Shift values. */ -L(nextbyte): - addi r12, r12, 8 /* Mask for rotation. 
*/ - addi r11, r11, -8 -#ifdef __LITTLE_ENDIAN__ - srd r9, r5, r12 - sld r10, r7, r11 - or r10, r9, r10 - sld r10, r10, r8 - cmpb r9, r0, r10 - srd r9, r9, r8 -#else - sld r9, r5, r12 - srd r10, r7, r11 - or r10, r9, r10 - srd r10, r10, r8 - cmpb r9, r0, r10 - sld r9, r9, r8 -#endif - cmpdi cr7, r9, 0 - bne cr7, L(retnull) - cmpb r9, r10, r6 - cmpdi cr7, r9, -1 - beq cr7, L(end) - addi r3, r4, 1 - /* When our iterations exceed ITERATIONS,fall back to default. */ - addi r28, r28, 1 - cmpdi cr7, r28, ITERATIONS - beq cr7, L(default) - lbz r4, 0(r30) - bl STRCHR - nop - /* If first char of search str is not present. */ - cmpdi cr7, r3, 0 - ble cr7, L(end) - mr r4, r3 - mr r6, r29 - li r0, 0 - b L(proceed2) - - .align 4 -L(noload): - /* Reached null in r3, so skip next load. */ - li r7, 0 - b L(continue1) - - .align 4 -L(return): - /* Update return values. */ - srdi r9, r11, 3 - subf r3, r9, r3 - b L(end) - - /* Handling byte by byte. */ - .align 4 -L(bytebybyte): - mr r8, r3 - addi r8, r8, -1 -L(loop1): - addi r8, r8, 1 - mr r3, r8 - mr r4, r30 - lbz r6, 0(r4) - cmpdi cr7, r6, 0 - beq cr7, L(updater3) -L(loop): - lbz r5, 0(r3) - cmpdi cr7, r5, 0 - beq cr7, L(retnull) - cmpld cr7, r6, r5 - bne cr7, L(loop1) - addi r3, r3, 1 - addi r4, r4, 1 - lbz r6, 0(r4) - cmpdi cr7, r6, 0 - beq cr7, L(updater3) - b L(loop) - - /* Handling return values. */ - .align 4 -L(updater3): - subf r3, r31, r3 /* Reduce len of r4 from r3. */ - b L(end) - - .align 4 -L(ret_r3): - mr r3, r29 /* Return r3. */ - b L(end) - - .align 4 -L(retnull): - li r3, 0 /* Return NULL. */ - b L(end) - - .align 4 -L(default): - mr r4, r30 - bl __strstr_ppc - nop - - .align 4 -L(end): - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - cfi_adjust_cfa_offset(-FRAMESIZE) - ld r0, 16(r1) /* Restore the saved link register. */ - ld r28, -32(r1) /* Restore callers save register r28. */ - ld r29, -24(r1) /* Restore callers save register r29. */ - ld r30, -16(r1) /* Restore callers save register r30. 
*/ - ld r31, -8(r1) /* Restore callers save register r31. */ - mtlr r0 /* Branch to link register. */ - blr -END (STRSTR) -libc_hidden_builtin_def (strstr) diff --git a/sysdeps/powerpc/powerpc64/power7/sub_n.S b/sysdeps/powerpc/powerpc64/power7/sub_n.S deleted file mode 100644 index 848dad5718..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/sub_n.S +++ /dev/null @@ -1,23 +0,0 @@ -/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and - subtraction. - Copyright (C) 2013-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#define USE_AS_SUB -#include "add_n.S" |