diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8')
35 files changed, 0 insertions, 7733 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/Implies b/sysdeps/powerpc/powerpc64/power8/Implies deleted file mode 100644 index 9a5e3c7277..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/Implies +++ /dev/null @@ -1,2 +0,0 @@ -powerpc/powerpc64/power7/fpu -powerpc/powerpc64/power7 diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile deleted file mode 100644 index 71a59529f3..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -ifeq ($(subdir),string) -sysdep_routines += strcasestr-ppc64 -endif diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies deleted file mode 100644 index 1187cdfb0a..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power7/fpu/ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S deleted file mode 100644 index 4c42926a74..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S +++ /dev/null @@ -1,303 +0,0 @@ -/* Optimized expf(). PowerPC64/POWER8 version. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -/* Short algorithm description: - * - * Let K = 64 (table size). - * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) - * where: - * x = m*log(2)/K + y, y in [0.0..log(2)/K] - * m = n*K + j, m,n,j - signed integer, j in [0..K-1] - * values of 2^(j/K) are tabulated as T[j]. - * - * P(y) is a minimax polynomial approximation of expf(y)-1 - * on small interval [0.0..log(2)/K]. - * - * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as - * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y - * - * Special cases: - * expf(NaN) = NaN - * expf(+INF) = +INF - * expf(-INF) = 0 - * expf(x) = 1 for subnormals - * for finite argument, only expf(0)=1 is exact - * expf(x) overflows if x>88.7228317260742190 - * expf(x) underflows if x<-103.972076416015620 - */ - -#define C1 0x42ad496b /* Single precision 125*log(2). */ -#define C2 0x31800000 /* Single precision 2^(-28). */ -#define SP_INF 0x7f800000 /* Single precision Inf. */ -#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */ - -#define DATA_OFFSET r9 - -/* Implements the function - - float [fp1] expf (float [fp1] x) */ - - .machine power8 -EALIGN(__ieee754_expf, 4, 0) - addis DATA_OFFSET,r2,.Lanchor@toc@ha - addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l - - xscvdpspn v0,v1 - mfvsrd r8,v0 /* r8 = x */ - lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET) - lfd fp3,(.P2-.Lanchor)(DATA_OFFSET) - rldicl r3,r8,32,33 /* r3 = |x| */ - lis r4,C1@ha /* r4 = 125*log(2) */ - ori r4,r4,C1@l - cmpw r3,r4 - lfd fp5,(.P3-.Lanchor)(DATA_OFFSET) - lfd fp4,(.RS-.Lanchor)(DATA_OFFSET) - fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */ - bge L(special_paths) /* |x| >= 125*log(2) ? */ - - lis r4,C2@ha - ori r4,r4,C2@l - cmpw r3,r4 - blt L(small_args) /* |x| < 2^(-28) ? 
*/ - - /* Main path: here if 2^(-28) <= |x| < 125*log(2) */ - frsp fp6,fp2 - xscvdpsp v2,v2 - mfvsrd r8,v2 - mr r3,r8 /* r3 = m */ - rldicl r8,r8,32,58 /* r8 = j */ - lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) - fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ - srdi r3,r3,32 - clrrwi r3,r3,6 /* r3 = n */ - lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) - fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ - fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ - lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) - lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) - lis r4,SP_EXP_BIAS@ha - ori r4,r4,SP_EXP_BIAS@l - add r3,r3,r4 - rldic r3,r3,49,1 /* r3 = 2^n */ - fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ - fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ - mtvsrd v1,r3 - xscvspdp v1,v1 - fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */ - fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ - sldi r8,r8,3 /* Access doublewords from T[j]. */ - addi r6,DATA_OFFSET,(.Ttable-.Lanchor) - lfdx fp3,r6,r8 - fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ - fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ - frsp fp1,fp1 - blr - - .align 4 -/* x is either underflow, overflow, infinite or NaN. */ -L(special_paths): - srdi r8,r8,32 - rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive. - r8 = 4, otherwise. */ - addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor) - lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */ - cmpw r3,r4 - /* |x| <= .SPRANGE[signbit(x)] */ - ble L(near_under_or_overflow) - - lis r4,SP_INF@ha - ori r4,r4,SP_INF@l - cmpw r3,r4 - bge L(arg_inf_or_nan) /* |x| > Infinite ? 
*/ - - addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor) - lfsx fp1,r6,r8 - fmuls fp1,fp1,fp1 /* Square 2^100 or 2^-100 to force overflow or underflow. */ - blr - - - .align 4 -L(small_args): - /* |x| < 2^(-28): return 1.0 + x rounded to single precision. */ - lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET) - fadds fp1,fp1,fp2 - blr - - - .align 4 -L(arg_inf_or_nan:) - bne L(arg_nan) - - /* expf(+INF) = +INF - expf(-INF) = 0 */ - addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor) - lfsx fp1,r6,r8 - blr - - - .align 4 -L(arg_nan): - /* expf(NaN) = NaN */ - fadd fp1,fp1,fp1 - frsp fp1,fp1 - blr - - .align 4 -L(near_under_or_overflow): - frsp fp6,fp2 - xscvdpsp v2,v2 - mfvsrd r8,v2 - mr r3,r8 /* r3 = m */ - rldicl r8,r8,32,58 /* r8 = j */ - lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) - fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ - srdi r3,r3,32 - clrrwi r3,r3,6 /* r3 = n */ - lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) - fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ - fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ - lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) - lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) - ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET) - add r3,r3,r4 - rldic r3,r3,46,1 /* r3 = 2^n (double precision bits) */ - fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ - fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ - mtvsrd v1,r3 - fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */ - fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ - sldi r8,r8,3 /* Access doublewords from T[j]. */ - addi r6,DATA_OFFSET,(.Ttable-.Lanchor) - lfdx fp3,r6,r8 - fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ - fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ - frsp fp1,fp1 - blr -END(__ieee754_expf) - - .section .rodata, "a",@progbits -.Lanchor: - .balign 8 -/* Table T[j] = 2^(j/K). Double precision. 
*/ -.Ttable: - .8byte 0x3ff0000000000000 - .8byte 0x3ff02c9a3e778061 - .8byte 0x3ff059b0d3158574 - .8byte 0x3ff0874518759bc8 - .8byte 0x3ff0b5586cf9890f - .8byte 0x3ff0e3ec32d3d1a2 - .8byte 0x3ff11301d0125b51 - .8byte 0x3ff1429aaea92de0 - .8byte 0x3ff172b83c7d517b - .8byte 0x3ff1a35beb6fcb75 - .8byte 0x3ff1d4873168b9aa - .8byte 0x3ff2063b88628cd6 - .8byte 0x3ff2387a6e756238 - .8byte 0x3ff26b4565e27cdd - .8byte 0x3ff29e9df51fdee1 - .8byte 0x3ff2d285a6e4030b - .8byte 0x3ff306fe0a31b715 - .8byte 0x3ff33c08b26416ff - .8byte 0x3ff371a7373aa9cb - .8byte 0x3ff3a7db34e59ff7 - .8byte 0x3ff3dea64c123422 - .8byte 0x3ff4160a21f72e2a - .8byte 0x3ff44e086061892d - .8byte 0x3ff486a2b5c13cd0 - .8byte 0x3ff4bfdad5362a27 - .8byte 0x3ff4f9b2769d2ca7 - .8byte 0x3ff5342b569d4f82 - .8byte 0x3ff56f4736b527da - .8byte 0x3ff5ab07dd485429 - .8byte 0x3ff5e76f15ad2148 - .8byte 0x3ff6247eb03a5585 - .8byte 0x3ff6623882552225 - .8byte 0x3ff6a09e667f3bcd - .8byte 0x3ff6dfb23c651a2f - .8byte 0x3ff71f75e8ec5f74 - .8byte 0x3ff75feb564267c9 - .8byte 0x3ff7a11473eb0187 - .8byte 0x3ff7e2f336cf4e62 - .8byte 0x3ff82589994cce13 - .8byte 0x3ff868d99b4492ed - .8byte 0x3ff8ace5422aa0db - .8byte 0x3ff8f1ae99157736 - .8byte 0x3ff93737b0cdc5e5 - .8byte 0x3ff97d829fde4e50 - .8byte 0x3ff9c49182a3f090 - .8byte 0x3ffa0c667b5de565 - .8byte 0x3ffa5503b23e255d - .8byte 0x3ffa9e6b5579fdbf - .8byte 0x3ffae89f995ad3ad - .8byte 0x3ffb33a2b84f15fb - .8byte 0x3ffb7f76f2fb5e47 - .8byte 0x3ffbcc1e904bc1d2 - .8byte 0x3ffc199bdd85529c - .8byte 0x3ffc67f12e57d14b - .8byte 0x3ffcb720dcef9069 - .8byte 0x3ffd072d4a07897c - .8byte 0x3ffd5818dcfba487 - .8byte 0x3ffda9e603db3285 - .8byte 0x3ffdfc97337b9b5f - .8byte 0x3ffe502ee78b3ff6 - .8byte 0x3ffea4afa2a490da - .8byte 0x3ffefa1bee615a27 - .8byte 0x3fff50765b6e4540 - .8byte 0x3fffa7c1819e90d8 - -.KLN2: - .8byte 0x40571547652b82fe /* Double precision K/log(2). */ - -/* Double precision polynomial coefficients. 
*/ -.P0: - .8byte 0x3fefffffffffe7c6 -.P1: - .8byte 0x3fe00000008d6118 -.P2: - .8byte 0x3fc55550da752d4f -.P3: - .8byte 0x3fa56420eb78fa85 - -.RS: - .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */ -.NLN2K: - .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */ -.DP_EXP_BIAS: - .8byte 0x000000000000ffc0 /* Double precision exponent bias. */ - - .balign 4 -.SPone: - .4byte 0x3f800000 /* Single precision 1.0. */ -.SP_RS: - .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */ - -.SPRANGE: /* Single precision overflow/underflow bounds. */ - .4byte 0x42b17217 /* if x>this bound, then result overflows. */ - .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */ - -.SPLARGE_SMALL: - .4byte 0x71800000 /* 2^100. */ - .4byte 0x0d800000 /* 2^-100. */ - -.INF_ZERO: - .4byte 0x7f800000 /* Single precision Inf. */ - .4byte 0 /* Single precision zero. */ - -strong_alias (__ieee754_expf, __expf_finite) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies deleted file mode 100644 index 7fd86fdf87..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power7/fpu/multiarch diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S deleted file mode 100644 index 8dfa0076e0..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S +++ /dev/null @@ -1,508 +0,0 @@ -/* Optimized cosf(). PowerPC64/POWER8 version. - Copyright (C) 2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#define _ERRNO_H 1 -#include <bits/errno.h> - -#define FRAMESIZE (FRAME_MIN_SIZE+16) - -#define FLOAT_EXPONENT_SHIFT 23 -#define FLOAT_EXPONENT_BIAS 127 -#define INTEGER_BITS 3 - -#define PI_4 0x3f490fdb /* PI/4 */ -#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ -#define TWO_PN5 0x3d000000 /* 2^-5 */ -#define TWO_PN27 0x32000000 /* 2^-27 */ -#define INFINITY 0x7f800000 -#define TWO_P23 0x4b000000 /* 2^23 */ -#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ - - /* Implements the function - - float [fp1] cosf (float [fp1] x) */ - - .machine power8 -EALIGN(__cosf, 4, 0) - addis r9,r2,L(anchor)@toc@ha - addi r9,r9,L(anchor)@toc@l - - lis r4,PI_4@h - ori r4,r4,PI_4@l - - xscvdpspn v0,v1 - mfvsrd r8,v0 - rldicl r3,r8,32,33 /* Remove sign bit. */ - - cmpw r3,r4 - bge L(greater_or_equal_pio4) - - lis r4,TWO_PN5@h - ori r4,r4,TWO_PN5@l - - cmpw r3,r4 - blt L(less_2pn5) - - /* Chebyshev polynomial of the form: - * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ - - lfd fp9,(L(C0)-L(anchor))(r9) - lfd fp10,(L(C1)-L(anchor))(r9) - lfd fp11,(L(C2)-L(anchor))(r9) - lfd fp12,(L(C3)-L(anchor))(r9) - lfd fp13,(L(C4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - lfd fp3,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ - fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ - fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ - fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ - fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ - frsp fp1,fp1 /* Round to single precision. 
*/ - - blr - - .balign 16 -L(greater_or_equal_pio4): - lis r4,NINEPI_4@h - ori r4,r4,NINEPI_4@l - cmpw r3,r4 - bge L(greater_or_equal_9pio4) - - /* Calculate quotient of |x|/(PI/4). */ - lfd fp2,(L(invpio4)-L(anchor))(r9) - fabs fp1,fp1 /* |x| */ - fmul fp2,fp1,fp2 /* |x|/(PI/4) */ - fctiduz fp2,fp2 /* Truncate the quotient to an unsigned integer. */ - mfvsrd r3,v2 /* n = trunc(|x| / (PI/4)) */ - - /* Now use that quotient to find |x| mod (PI/2). */ - addi r7,r3,1 /* r7 = n + 1 */ - rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ - addi r6,r9,(L(pio2_table)-L(anchor)) - lfdx fp4,r5,r6 /* fp4 = pio2_table[(n+1) >> 1] */ - fsub fp1,fp1,fp4 /* |x| - ((n+1) >> 1) * PI/2 */ - - .balign 16 -L(reduced): - /* Now we are in the range -PI/4 to PI/4. */ - - /* Work out if we are in a positive or negative primary interval. */ - addi r7,r7,2 /* r7 = n + 3 */ - rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */ - - /* Load a 1.0 or -1.0. */ - addi r5,r9,(L(ones)-L(anchor)) - sldi r4,r4,3 - lfdx fp0,r4,r5 - - /* Are we in the primary interval of sin or cos? */ - andi. r4,r7,0x2 - bne L(cos) - - /* Chebyshev polynomial of the form: - x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ - - lfd fp9,(L(S0)-L(anchor))(r9) - lfd fp10,(L(S1)-L(anchor))(r9) - lfd fp11,(L(S2)-L(anchor))(r9) - lfd fp12,(L(S3)-L(anchor))(r9) - lfd fp13,(L(S4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - fmul fp3,fp2,fp1 /* x^3 */ - - fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ - fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ - fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ - fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ - fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ - fmul fp4,fp4,fp0 /* Add in the sign. */ - frsp fp1,fp4 /* Round to single precision. */ - - blr - - .balign 16 -L(cos): - /* Chebyshev polynomial of the form: - 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). 
*/ - - lfd fp9,(L(C0)-L(anchor))(r9) - lfd fp10,(L(C1)-L(anchor))(r9) - lfd fp11,(L(C2)-L(anchor))(r9) - lfd fp12,(L(C3)-L(anchor))(r9) - lfd fp13,(L(C4)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - lfd fp3,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ - fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ - fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ - fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ - fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ - fmul fp4,fp4,fp0 /* Add in the sign. */ - frsp fp1,fp4 /* Round to single precision. */ - - blr - - .balign 16 -L(greater_or_equal_9pio4): - lis r4,INFINITY@h - ori r4,r4,INFINITY@l - cmpw r3,r4 - bge L(inf_or_nan) - - lis r4,TWO_P23@h - ori r4,r4,TWO_P23@l - cmpw r3,r4 - bge L(greater_or_equal_2p23) - - fabs fp1,fp1 /* |x| */ - - /* Calculate quotient of |x|/(PI/4). */ - lfd fp2,(L(invpio4)-L(anchor))(r9) - - lfd fp3,(L(DPone)-L(anchor))(r9) - lfd fp4,(L(DPhalf)-L(anchor))(r9) - fmul fp2,fp1,fp2 /* |x|/(PI/4) */ - friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ - - /* Calculate (n + 1) / 2. */ - fadd fp2,fp2,fp3 /* n + 1 */ - fmul fp3,fp2,fp4 /* (n + 1) / 2 */ - friz fp3,fp3 - - lfd fp4,(L(pio2hi)-L(anchor))(r9) - lfd fp5,(L(pio2lo)-L(anchor))(r9) - - fmul fp6,fp4,fp3 - fadd fp6,fp6,fp1 - fmadd fp1,fp5,fp3,fp6 - - fctiduz fp2,fp2 - mfvsrd r7,v2 /* n + 1 */ - - b L(reduced) - - .balign 16 -L(inf_or_nan): - bne L(skip_errno_setting) /* Is a NAN? */ - - /* We delayed the creation of the stack frame, as well as the saving of - the link register, because only at this point, we are sure that - doing so is actually needed. */ - - stfd fp1,-8(r1) - - /* Save the link register. */ - mflr r0 - std r0,16(r1) - cfi_offset(lr, 16) - - /* Create the stack frame. */ - stdu r1,-FRAMESIZE(r1) - cfi_adjust_cfa_offset(FRAMESIZE) - - bl JUMPTARGET(__errno_location) - nop - - /* Restore the stack frame. 
*/ - addi r1,r1,FRAMESIZE - cfi_adjust_cfa_offset(-FRAMESIZE) - /* Restore the link register. */ - ld r0,16(r1) - mtlr r0 - - lfd fp1,-8(r1) - - /* errno = EDOM */ - li r4,EDOM - stw r4,0(r3) - -L(skip_errno_setting): - fsub fp1,fp1,fp1 /* x - x */ - blr - - .balign 16 -L(greater_or_equal_2p23): - fabs fp1,fp1 - - srwi r4,r3,FLOAT_EXPONENT_SHIFT - subi r4,r4,FLOAT_EXPONENT_BIAS - - /* We reduce the input modulo pi/4, so we need 3 bits of integer - to determine where in 2*pi we are. Index into our array - accordingly. */ - addi r4,r4,INTEGER_BITS - - /* To avoid an expensive divide, for the range we care about (0 - 127) - we can transform x/28 into: - - x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 - - mulhwu returns the top 32 bits of the 64 bit result, doing the - shift for us in the same instruction. The top 32 bits are undefined, - so we have to mask them. */ - - lis r6,FX_FRACTION_1_28@h - ori r6,r6,FX_FRACTION_1_28@l - mulhwu r5,r4,r6 - clrldi r5,r5,32 - - /* Get our pointer into the invpio4_table array. */ - sldi r4,r5,3 - addi r6,r9,(L(invpio4_table)-L(anchor)) - add r4,r4,r6 - - lfd fp2,0(r4) - lfd fp3,8(r4) - lfd fp4,16(r4) - lfd fp5,24(r4) - - fmul fp6,fp2,fp1 - fmul fp7,fp3,fp1 - fmul fp8,fp4,fp1 - fmul fp9,fp5,fp1 - - /* Mask off larger integer bits in highest double word that we don't - care about to avoid losing precision when combining with smaller - values. */ - fctiduz fp10,fp6 - mfvsrd r7,v10 - rldicr r7,r7,0,(63-INTEGER_BITS) - mtvsrd v10,r7 - fcfidu fp10,fp10 /* Integer bits. */ - - fsub fp6,fp6,fp10 /* highest -= integer bits */ - - /* Work out the integer component, rounded down. Use the top two - limbs for this. */ - fadd fp10,fp6,fp7 /* highest + higher */ - - fctiduz fp10,fp10 - mfvsrd r7,v10 - andi. r0,r7,1 - fcfidu fp10,fp10 - - /* Subtract integer component from highest limb. */ - fsub fp12,fp6,fp10 - - beq L(even_integer) - - /* Our integer component is odd, so we are in the -PI/4 to 0 primary - region. 
We need to shift our result down by PI/4, and to do this - in the mod (4/PI) space we simply subtract 1. */ - lfd fp11,(L(DPone)-L(anchor))(r9) - fsub fp12,fp12,fp11 - - /* Now add up all the limbs in order. */ - fadd fp12,fp12,fp7 - fadd fp12,fp12,fp8 - fadd fp12,fp12,fp9 - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - -L(even_integer): - lfd fp11,(L(DPone)-L(anchor))(r9) /* fp11 = 1.0 for the overflow check below. */ - - /* Now add up all the limbs in order. The accumulator is the - floating-point register fp12 throughout, as in the odd path. */ - fadd fp12,fp12,fp7 - fadd fp12,fp12,fp8 - fadd fp12,fp12,fp9 - - /* We need to check if the addition of all the limbs resulted in us - overflowing 1.0. */ - fcmpu 0,fp12,fp11 /* fp12 > 1.0 ? */ - bgt L(greater_than_one) - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - -L(greater_than_one): - /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our - integer, and subtract 1.0 from our result. Since that makes the - integer component odd, we need to subtract another 1.0 as - explained above. */ - addi r7,r7,1 - - lfd fp11,(L(DPtwo)-L(anchor))(r9) - fsub fp12,fp12,fp11 - - /* And finally multiply by pi/4. */ - lfd fp13,(L(pio4)-L(anchor))(r9) - fmul fp1,fp12,fp13 - - addi r7,r7,1 - b L(reduced) - - .balign 16 -L(less_2pn5): - lis r4,TWO_PN27@h - ori r4,r4,TWO_PN27@l - - cmpw r3,r4 - blt L(less_2pn27) - - /* A simpler Chebyshev approximation is close enough for this range: - 1.0+x^2*(CC0+x^3*CC1). */ - - lfd fp10,(L(CC0)-L(anchor))(r9) - lfd fp11,(L(CC1)-L(anchor))(r9) - - fmul fp2,fp1,fp1 /* x^2 */ - fmul fp3,fp2,fp1 /* x^3 */ - lfd fp1,(L(DPone)-L(anchor))(r9) - - fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */ - fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */ - - frsp fp1,fp1 /* Round to single precision. */ - - blr - - .balign 16 -L(less_2pn27): - /* Handle some special cases: - - cosf(subnormal) raises inexact - cosf(min_normalized) raises inexact - cosf(normalized) raises inexact. 
*/ - - lfd fp2,(L(DPone)-L(anchor))(r9) - - fabs fp1,fp1 /* |x| */ - fsub fp1,fp2,fp1 /* 1.0-|x| */ - - frsp fp1,fp1 - - blr - -END (__cosf) - - .section .rodata, "a" - - .balign 8 - -L(anchor): - - /* Chebyshev constants for sin, range -PI/4 - PI/4. */ -L(S0): .8byte 0xbfc5555555551cd9 -L(S1): .8byte 0x3f81111110c2688b -L(S2): .8byte 0xbf2a019f8b4bd1f9 -L(S3): .8byte 0x3ec71d7264e6b5b4 -L(S4): .8byte 0xbe5a947e1674b58a - - /* Chebyshev constants for cos, range 2^-27 - 2^-5. */ -L(CC0): .8byte 0xbfdfffffff5cc6fd -L(CC1): .8byte 0x3fa55514b178dac5 - - /* Chebyshev constants for cos, range -PI/4 - PI/4. */ -L(C0): .8byte 0xbfdffffffffe98ae -L(C1): .8byte 0x3fa55555545c50c7 -L(C2): .8byte 0xbf56c16b348b6874 -L(C3): .8byte 0x3efa00eb9ac43cc0 -L(C4): .8byte 0xbe923c97dd8844d7 - -L(invpio2): - .8byte 0x3fe45f306dc9c883 /* 2/PI */ - -L(invpio4): - .8byte 0x3ff45f306dc9c883 /* 4/PI */ - -L(invpio4_table): - .8byte 0x0000000000000000 - .8byte 0x3ff45f306c000000 - .8byte 0x3e3c9c882a000000 - .8byte 0x3c54fe13a8000000 - .8byte 0x3aaf47d4d0000000 - .8byte 0x38fbb81b6c000000 - .8byte 0x3714acc9e0000000 - .8byte 0x3560e4107c000000 - .8byte 0x33bca2c756000000 - .8byte 0x31fbd778ac000000 - .8byte 0x300b7246e0000000 - .8byte 0x2e5d2126e8000000 - .8byte 0x2c97003248000000 - .8byte 0x2ad77504e8000000 - .8byte 0x290921cfe0000000 - .8byte 0x274deb1cb0000000 - .8byte 0x25829a73e0000000 - .8byte 0x23fd1046be000000 - .8byte 0x2224baed10000000 - .8byte 0x20709d338e000000 - .8byte 0x1e535a2f80000000 - .8byte 0x1cef904e64000000 - .8byte 0x1b0d639830000000 - .8byte 0x1964ce7d24000000 - .8byte 0x17b908bf16000000 - -L(pio4): - .8byte 0x3fe921fb54442d18 /* PI/4 */ - -/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb - to avoid losing significant bits when multiplying with up to - (2^22)/(pi/2). 
*/ -L(pio2hi): - .8byte 0xbff921fb54400000 - -L(pio2lo): - .8byte 0xbdd0b4611a626332 - -L(pio2_table): - .8byte 0 - .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ - .8byte 0x400921fb54442d18 /* 2 * PI/2 */ - .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ - .8byte 0x401921fb54442d18 /* 4 * PI/2 */ - .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ - .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ - .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ - .8byte 0x402921fb54442d18 /* 8 * PI/2 */ - .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ - .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ - -L(small): - .8byte 0x3cd0000000000000 /* 2^-50 */ - -L(ones): - .8byte 0x3ff0000000000000 /* +1.0 */ - .8byte 0xbff0000000000000 /* -1.0 */ - -L(DPhalf): - .8byte 0x3fe0000000000000 /* 0.5 */ - -L(DPone): - .8byte 0x3ff0000000000000 /* 1.0 */ - -L(DPtwo): - .8byte 0x4000000000000000 /* 2.0 */ - -weak_alias(__cosf, cosf) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S deleted file mode 100644 index fcdcb60293..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S +++ /dev/null @@ -1,56 +0,0 @@ -/* isfinite(). PowerPC64/POWER8 version. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ - -/* int [r3] __finite ([fp1] x) */ - -EALIGN (__finite, 4, 0) - CALL_MCOUNT 0 - MFVSRD_R3_V1 - lis r9,0x8010 - clrldi r3,r3,1 /* r3 = r3 & 0x7fffffffffffffff (clear sign bit) */ - rldicr r9,r9,32,31 /* r9 = 0x8010000000000000 */ - add r3,r3,r9 /* Top bit stays set only if exponent < 0x7ff. */ - rldicl r3,r3,1,63 /* r3 = top bit: 1 if finite, 0 for Inf/NaN. */ - blr -END (__finite) - -hidden_def (__finite) -weak_alias (__finite, finite) - -/* It turns out that the 'double' version will also always work for - single-precision. */ -strong_alias (__finite, __finitef) -hidden_def (__finitef) -weak_alias (__finitef, finitef) - -#if IS_IN (libm) -# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0) -compat_symbol (libm, __finite, __finitel, GLIBC_2_0) -compat_symbol (libm, finite, finitel, GLIBC_2_0) -# endif -#else -# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0) -compat_symbol (libc, __finite, __finitel, GLIBC_2_0); -compat_symbol (libc, finite, finitel, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S deleted file mode 100644 index 54bd94176d..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_finite.S. */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S deleted file mode 100644 index 32814e4525..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S +++ /dev/null @@ -1,61 +0,0 @@ -/* isinf(). PowerPC64/POWER8 version. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ - -/* int [r3] __isinf([fp1] x) */ - -EALIGN (__isinf, 4, 0) - CALL_MCOUNT 0 - MFVSRD_R3_V1 - lis r9,0x7ff0 /* r9 = 0x7ff00000 */ - rldicl r10,r3,0,1 /* r10 = r3 & 0x7fffffffffffffff (sign bit cleared) */ - sldi r9,r9,32 /* r9 = 0x7ff0000000000000 (bits of +Inf) */ - cmpd cr7,r10,r9 /* (x & 0x7fffffffffffffff) == bits of +Inf ? */ - beq cr7,L(inf) - li r3,0 /* Not inf */ - blr -L(inf): - sradi r3,r3,63 /* r3 = 0 for +Inf, -1 for -Inf (arithmetic shift) */ - ori r3,r3,1 /* r3 = 1 for +Inf, -1 for -Inf */ - blr -END (__isinf) - -hidden_def (__isinf) -weak_alias (__isinf, isinf) - -/* It turns out that the 'double' version will also always work for - single-precision. */ -strong_alias (__isinf, __isinff) -hidden_def (__isinff) -weak_alias (__isinff, isinff) - -#ifdef NO_LONG_DOUBLE -strong_alias (__isinf, __isinfl) -weak_alias (__isinf, isinfl) -#endif - -#if !IS_IN (libm) -# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) -compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); -compat_symbol (libc, isinf, isinfl, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S deleted file mode 100644 index be759e091e..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_isinf.S. 
*/ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S deleted file mode 100644 index af52e502b7..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S +++ /dev/null @@ -1,56 +0,0 @@ -/* isnan(). PowerPC64/POWER8 version. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ - -/* int [r3] __isnan([fp1] x) */ - -EALIGN (__isnan, 4, 0) - CALL_MCOUNT 0 - MFVSRD_R3_V1 - lis r9,0x7ff0 - clrldi r3,r3,1 /* r3 = r3 & 0x7fffffffffffffff (clear sign bit) */ - rldicr r9,r9,32,31 /* r9 = 0x7ff0000000000000 (bits of +Inf) */ - subf r3,r3,r9 /* r3 = r9 - r3: negative only when x is a NaN. */ - rldicl r3,r3,1,63 /* r3 = top bit of the difference: 1 if NaN. */ - blr -END (__isnan) - -hidden_def (__isnan) -weak_alias (__isnan, isnan) - -/* It turns out that the 'double' version will also always work for - single-precision. 
*/ -strong_alias (__isnan, __isnanf) -hidden_def (__isnanf) -weak_alias (__isnanf, isnanf) - -#ifdef NO_LONG_DOUBLE -strong_alias (__isnan, __isnanl) -weak_alias (__isnan, isnanl) -#endif - -#if !IS_IN (libm) -# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) -compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); -compat_symbol (libc, isnan, isnanl, GLIBC_2_0); -# endif -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S deleted file mode 100644 index b48c85e0d3..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S +++ /dev/null @@ -1 +0,0 @@ -/* This function uses the same code as s_isnan.S. */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S deleted file mode 100644 index aa180b6901..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S +++ /dev/null @@ -1,45 +0,0 @@ -/* Round double to long int. POWER8 PowerPC64 version. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <math_ldbl_opt.h> - -#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ - -/* long long int[r3] __llrint (double x[fp1]) */ -ENTRY (__llrint) - CALL_MCOUNT 0 - fctid fp1,fp1 - MFVSRD_R3_V1 - blr -END (__llrint) - -strong_alias (__llrint, __lrint) -weak_alias (__llrint, llrint) -weak_alias (__lrint, lrint) - -#ifdef NO_LONG_DOUBLE -strong_alias (__llrint, __llrintl) -weak_alias (__llrint, llrintl) -strong_alias (__lrint, __lrintl) -weak_alias (__lrint, lrintl) -#endif -#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1) -compat_symbol (libm, __llrint, llrintl, GLIBC_2_1) -compat_symbol (libm, __lrint, lrintl, GLIBC_2_1) -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S deleted file mode 100644 index 043fc6a089..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S +++ /dev/null @@ -1,48 +0,0 @@ -/* llround function. POWER8 PowerPC64 version. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/
-
-#include <sysdep.h>
-#include <endian.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1, emitted as a raw instruction word  */
-
-/* long long [r3] llround (double x [fp1])  */
-
-ENTRY (__llround)
-	CALL_MCOUNT 0
-	frin	fp1,fp1	/* Round to nearest integer, halfway cases away from zero.  */
-	fctidz	fp1,fp1	/* Convert To Integer DW round toward 0 (value is already integral).  */
-	MFVSRD_R3_V1	/* Move the integer from vs1 (which aliases fp1) into r3.  */
-	blr
-END (__llround)
-
-strong_alias (__llround, __lround)	/* __lround shares this entry point.  */
-weak_alias (__llround, llround)
-weak_alias (__lround, lround)
-
-#ifdef NO_LONG_DOUBLE
-weak_alias (__llround, llroundl)
-strong_alias (__llround, __llroundl)
-weak_alias (__lround, lroundl)
-strong_alias (__lround, __lroundl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
-compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
deleted file mode 100644
index fb0add3462..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
+++ /dev/null
@@ -1,519 +0,0 @@
-/* Optimized sinf().  PowerPC64/POWER8 version.
-   Copyright (C) 2016-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.
*/
-
-#include <sysdep.h>
-#define _ERRNO_H 1
-#include <bits/errno.h>
-
-#define FRAMESIZE (FRAME_MIN_SIZE+16)
-
-#define FLOAT_EXPONENT_SHIFT 23
-#define FLOAT_EXPONENT_BIAS 127
-#define INTEGER_BITS 3
-
-#define PI_4		0x3f490fdb	/* PI/4 */
-#define NINEPI_4	0x40e231d6	/* 9 * PI/4 */
-#define TWO_PN5		0x3d000000	/* 2^-5 */
-#define TWO_PN27	0x32000000	/* 2^-27 */
-#define INFINITY	0x7f800000
-#define TWO_P23		0x4b000000	/* 2^23 */
-#define FX_FRACTION_1_28 0x9249250	/* ~ 0x100000000 / 28, rounded up */
-
-	/* Implements the function
-
-	   float [fp1] sinf (float [fp1] x)  */
-
-	.machine power8
-EALIGN(__sinf, 4, 0)
-	addis	r9,r2,L(anchor)@toc@ha	/* r9 = &L(anchor), base for all constant loads.  */
-	addi	r9,r9,L(anchor)@toc@l
-
-	lis	r4,PI_4@h
-	ori	r4,r4,PI_4@l
-
-	xscvdpspn v0,v1		/* Single-precision bit pattern of x.  NOTE(review): v0/v1 are used as VSX register numbers 0/1 here.  */
-	mfvsrd	r8,v0		/* r8 = those bits, in the high word.  */
-	rldicl	r3,r8,32,33	/* Remove sign bit.  */
-
-	cmpw	r3,r4
-	bge	L(greater_or_equal_pio4)
-
-	lis	r4,TWO_PN5@h
-	ori	r4,r4,TWO_PN5@l
-
-	cmpw	r3,r4
-	blt	L(less_2pn5)
-
-	/* Chebyshev polynomial of the form:
-	 * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
-
-	lfd	fp9,(L(S0)-L(anchor))(r9)
-	lfd	fp10,(L(S1)-L(anchor))(r9)
-	lfd	fp11,(L(S2)-L(anchor))(r9)
-	lfd	fp12,(L(S3)-L(anchor))(r9)
-	lfd	fp13,(L(S4)-L(anchor))(r9)
-
-	fmul	fp2,fp1,fp1	/* x^2 */
-	fmul	fp3,fp2,fp1	/* x^3 */
-
-	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
-	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
-	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
-	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
-	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
-	frsp	fp1,fp1		/* Round to single precision.  */
-
-	blr
-
-	.balign 16
-L(greater_or_equal_pio4):
-	lis	r4,NINEPI_4@h
-	ori	r4,r4,NINEPI_4@l
-	cmpw	r3,r4
-	bge	L(greater_or_equal_9pio4)
-
-	/* Calculate quotient of |x|/(PI/4).  */
-	lfd	fp2,(L(invpio4)-L(anchor))(r9)
-	fabs	fp1,fp1		/* |x| */
-	fmul	fp2,fp1,fp2	/* |x|/(PI/4) */
-	fctiduz	fp2,fp2
-	mfvsrd	r3,v2		/* n = trunc (|x| / (PI/4)), the integer quotient */
-
-	/* Now use that quotient to find |x| mod (PI/2).  */
-	addi	r7,r3,1
-	rldicr	r5,r7,2,60	/* ((n+1) >> 1) << 3 */
-	addi	r6,r9,(L(pio2_table)-L(anchor))
-	lfdx	fp4,r5,r6
-	fsub	fp1,fp1,fp4
-
-	.balign 16
-L(reduced):
-	/* Now we are in the range -PI/4 to PI/4.  */
-
-	/* Work out if we are in a positive or negative primary interval.  */
-	rldicl	r4,r7,62,63	/* ((n+1) >> 2) & 1 */
-
-	/* We are operating on |x|, so we need to add back the original
-	   sign.  */
-	rldicl	r8,r8,33,63	/* (x >> 31) & 1, ie the sign bit.  */
-	xor	r4,r4,r8	/* 0 if result should be positive,
-				   1 if negative.  */
-
-	/* Load a 1.0 or -1.0.  */
-	addi	r5,r9,(L(ones)-L(anchor))
-	sldi	r4,r4,3
-	lfdx	fp0,r4,r5
-
-	/* Are we in the primary interval of sin or cos?  */
-	andi.	r4,r7,0x2
-	bne	L(cos)
-
-	/* Chebyshev polynomial of the form:
-	   x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
-
-	lfd	fp9,(L(S0)-L(anchor))(r9)
-	lfd	fp10,(L(S1)-L(anchor))(r9)
-	lfd	fp11,(L(S2)-L(anchor))(r9)
-	lfd	fp12,(L(S3)-L(anchor))(r9)
-	lfd	fp13,(L(S4)-L(anchor))(r9)
-
-	fmul	fp2,fp1,fp1	/* x^2 */
-	fmul	fp3,fp2,fp1	/* x^3 */
-
-	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
-	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
-	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
-	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
-	fmadd	fp4,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
-	fmul	fp4,fp4,fp0	/* Add in the sign.  */
-	frsp	fp1,fp4		/* Round to single precision.  */
-
-	blr
-
-	.balign 16
-L(cos):
-	/* Chebyshev polynomial of the form:
-	   1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
-
-	lfd	fp9,(L(C0)-L(anchor))(r9)
-	lfd	fp10,(L(C1)-L(anchor))(r9)
-	lfd	fp11,(L(C2)-L(anchor))(r9)
-	lfd	fp12,(L(C3)-L(anchor))(r9)
-	lfd	fp13,(L(C4)-L(anchor))(r9)
-
-	fmul	fp2,fp1,fp1	/* x^2 */
-	lfd	fp3,(L(DPone)-L(anchor))(r9)
-
-	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
-	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
-	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
-	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
-	fmadd	fp4,fp2,fp4,fp3		/* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
-	fmul	fp4,fp4,fp0	/* Add in the sign.  */
-	frsp	fp1,fp4		/* Round to single precision.  */
-
-	blr
-
-	.balign 16
-L(greater_or_equal_9pio4):
-	lis	r4,INFINITY@h
-	ori	r4,r4,INFINITY@l
-	cmpw	r3,r4
-	bge	L(inf_or_nan)
-
-	lis	r4,TWO_P23@h
-	ori	r4,r4,TWO_P23@l
-	cmpw	r3,r4
-	bge	L(greater_or_equal_2p23)
-
-	fabs	fp1,fp1		/* |x| */
-
-	/* Calculate quotient of |x|/(PI/4).  */
-	lfd	fp2,(L(invpio4)-L(anchor))(r9)
-
-	lfd	fp3,(L(DPone)-L(anchor))(r9)
-	lfd	fp4,(L(DPhalf)-L(anchor))(r9)
-	fmul	fp2,fp1,fp2	/* |x|/(PI/4) */
-	friz	fp2,fp2		/* n = floor(|x|/(PI/4)) */
-
-	/* Calculate (n + 1) / 2.  */
-	fadd	fp2,fp2,fp3	/* n + 1 */
-	fmul	fp3,fp2,fp4	/* (n + 1) / 2 */
-	friz	fp3,fp3
-
-	lfd	fp4,(L(pio2hi)-L(anchor))(r9)
-	lfd	fp5,(L(pio2lo)-L(anchor))(r9)
-
-	fmul	fp6,fp4,fp3
-	fadd	fp6,fp6,fp1
-	fmadd	fp1,fp5,fp3,fp6
-
-	fctiduz	fp2,fp2
-	mfvsrd	r7,v2		/* n + 1 */
-
-	b	L(reduced)
-
-	.balign 16
-L(inf_or_nan):
-	bne	L(skip_errno_setting)	/* Is a NAN?  */
-
-	/* We delayed the creation of the stack frame, as well as the saving of
-	   the link register, because only at this point, we are sure that
-	   doing so is actually needed.  */
-
-	stfd	fp1,-8(r1)
-
-	/* Save the link register.  */
-	mflr	r0
-	std	r0,16(r1)
-	cfi_offset(lr, 16)
-
-	/* Create the stack frame.  */
-	stdu	r1,-FRAMESIZE(r1)
-	cfi_adjust_cfa_offset(FRAMESIZE)
-
-	bl	JUMPTARGET(__errno_location)
-	nop
-
-	/* Restore the stack frame.  */
-	addi	r1,r1,FRAMESIZE
-	cfi_adjust_cfa_offset(-FRAMESIZE)
-	/* Restore the link register.  */
-	ld	r0,16(r1)
-	mtlr	r0
-
-	lfd	fp1,-8(r1)
-
-	/* errno = EDOM */
-	li	r4,EDOM
-	stw	r4,0(r3)
-
-L(skip_errno_setting):
-	fsub	fp1,fp1,fp1	/* x - x */
-	blr
-
-	.balign 16
-L(greater_or_equal_2p23):
-	fabs	fp1,fp1
-
-	srwi	r4,r3,FLOAT_EXPONENT_SHIFT
-	subi	r4,r4,FLOAT_EXPONENT_BIAS
-
-	/* We reduce the input modulo pi/4, so we need 3 bits of integer
-	   to determine where in 2*pi we are.  Index into our array
-	   accordingly.  */
-	addi	r4,r4,INTEGER_BITS
-
-	/* To avoid an expensive divide, for the range we care about (0 - 127)
-	   we can transform x/28 into:
-
-	   x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
-
-	   mulhwu returns the top 32 bits of the 64 bit result, doing the
-	   shift for us in the same instruction.  The top 32 bits are undefined,
-	   so we have to mask them.  */
-
-	lis	r6,FX_FRACTION_1_28@h
-	ori	r6,r6,FX_FRACTION_1_28@l
-	mulhwu	r5,r4,r6
-	clrldi	r5,r5,32
-
-	/* Get our pointer into the invpio4_table array.  */
-	sldi	r4,r5,3
-	addi	r6,r9,(L(invpio4_table)-L(anchor))
-	add	r4,r4,r6
-
-	lfd	fp2,0(r4)
-	lfd	fp3,8(r4)
-	lfd	fp4,16(r4)
-	lfd	fp5,24(r4)
-
-	fmul	fp6,fp2,fp1
-	fmul	fp7,fp3,fp1
-	fmul	fp8,fp4,fp1
-	fmul	fp9,fp5,fp1
-
-	/* Mask off larger integer bits in highest double word that we don't
-	   care about to avoid losing precision when combining with smaller
-	   values.  */
-	fctiduz	fp10,fp6
-	mfvsrd	r7,v10
-	rldicr	r7,r7,0,(63-INTEGER_BITS)
-	mtvsrd	v10,r7
-	fcfidu	fp10,fp10		/* Integer bits.  */
-
-	fsub	fp6,fp6,fp10		/* highest -= integer bits */
-
-	/* Work out the integer component, rounded down.  Use the top two
-	   limbs for this.  */
-	fadd	fp10,fp6,fp7		/* highest + higher */
-
-	fctiduz	fp10,fp10
-	mfvsrd	r7,v10
-	andi.	r0,r7,1
-	fcfidu	fp10,fp10
-
-	/* Subtract integer component from highest limb.  */
-	fsub	fp12,fp6,fp10
-
-	beq	L(even_integer)
-
-	/* Our integer component is odd, so we are in the -PI/4 to 0 primary
-	   region.  We need to shift our result down by PI/4, and to do this
-	   in the mod (4/PI) space we simply subtract 1.  */
-	lfd	fp11,(L(DPone)-L(anchor))(r9)
-	fsub	fp12,fp12,fp11
-
-	/* Now add up all the limbs in order.  */
-	fadd	fp12,fp12,fp7
-	fadd	fp12,fp12,fp8
-	fadd	fp12,fp12,fp9
-
-	/* And finally multiply by pi/4.  */
-	lfd	fp13,(L(pio4)-L(anchor))(r9)
-	fmul	fp1,fp12,fp13
-
-	addi	r7,r7,1
-	b	L(reduced)
-
-L(even_integer):
-	lfd	fp11,(L(DPone)-L(anchor))(r9)
-
-	/* Now add up all the limbs in order.  */
-	fadd	fp12,fp12,fp7
-	fadd	fp12,fp12,fp8	/* FPR operand: was spelled r12, a GPR name.  */
-	fadd	fp12,fp12,fp9	/* Likewise.  */
-
-	/* We need to check if the addition of all the limbs resulted in us
-	   overflowing 1.0.  */
-	fcmpu	0,fp12,fp11
-	bgt	L(greater_than_one)
-
-	/* And finally multiply by pi/4.  */
-	lfd	fp13,(L(pio4)-L(anchor))(r9)
-	fmul	fp1,fp12,fp13
-
-	addi	r7,r7,1
-	b	L(reduced)
-
-L(greater_than_one):
-	/* We did overflow 1.0 when adding up all the limbs.  Add 1.0 to our
-	   integer, and subtract 1.0 from our result.  Since that makes the
-	   integer component odd, we need to subtract another 1.0 as
-	   explained above.  */
-	addi	r7,r7,1
-
-	lfd	fp11,(L(DPtwo)-L(anchor))(r9)
-	fsub	fp12,fp12,fp11
-
-	/* And finally multiply by pi/4.  */
-	lfd	fp13,(L(pio4)-L(anchor))(r9)
-	fmul	fp1,fp12,fp13
-
-	addi	r7,r7,1
-	b	L(reduced)
-
-	.balign 16
-L(less_2pn5):
-	lis	r4,TWO_PN27@h
-	ori	r4,r4,TWO_PN27@l
-
-	cmpw	r3,r4
-	blt	L(less_2pn27)
-
-	/* A simpler Chebyshev approximation is close enough for this range:
-	   x+x^3*(SS0+x^2*SS1).  */
-
-	lfd	fp10,(L(SS0)-L(anchor))(r9)
-	lfd	fp11,(L(SS1)-L(anchor))(r9)
-
-	fmul	fp2,fp1,fp1	/* x^2 */
-	fmul	fp3,fp2,fp1	/* x^3 */
-
-	fmadd	fp4,fp2,fp11,fp10	/* SS0+x^2*SS1 */
-	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(SS0+x^2*SS1) */
-
-	frsp	fp1,fp1		/* Round to single precision.  */
-
-	blr
-
-	.balign 16
-L(less_2pn27):
-	cmpwi	r3,0
-	beq	L(zero)
-
-	/* Handle some special cases:
-
-	   sinf(subnormal) raises inexact/underflow
-	   sinf(min_normalized) raises inexact/underflow
-	   sinf(normalized) raises inexact.  */
-
-	lfd	fp2,(L(small)-L(anchor))(r9)
-
-	fmul	fp2,fp1,fp2	/* x * small */
-	fsub	fp1,fp1,fp2	/* x - x * small */
-
-	frsp	fp1,fp1
-
-	blr
-
-	.balign 16
-L(zero):
-	blr
-
-END (__sinf)
-
-	.section .rodata, "a"
-
-	.balign 8
-
-L(anchor):
-
-	/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
-L(S0):	.8byte	0xbfc5555555551cd9
-L(S1):	.8byte	0x3f81111110c2688b
-L(S2):	.8byte	0xbf2a019f8b4bd1f9
-L(S3):	.8byte	0x3ec71d7264e6b5b4
-L(S4):	.8byte	0xbe5a947e1674b58a
-
-	/* Chebyshev constants for sin, range 2^-27 - 2^-5.  */
-L(SS0):	.8byte	0xbfc555555543d49d
-L(SS1):	.8byte	0x3f8110f475cec8c5
-
-	/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
-L(C0):	.8byte	0xbfdffffffffe98ae
-L(C1):	.8byte	0x3fa55555545c50c7
-L(C2):	.8byte	0xbf56c16b348b6874
-L(C3):	.8byte	0x3efa00eb9ac43cc0
-L(C4):	.8byte	0xbe923c97dd8844d7
-
-L(invpio2):
-	.8byte	0x3fe45f306dc9c883	/* 2/PI */
-
-L(invpio4):
-	.8byte	0x3ff45f306dc9c883	/* 4/PI */
-
-L(invpio4_table):
-	.8byte	0x0000000000000000
-	.8byte	0x3ff45f306c000000
-	.8byte	0x3e3c9c882a000000
-	.8byte	0x3c54fe13a8000000
-	.8byte	0x3aaf47d4d0000000
-	.8byte	0x38fbb81b6c000000
-	.8byte	0x3714acc9e0000000
-	.8byte	0x3560e4107c000000
-	.8byte	0x33bca2c756000000
-	.8byte	0x31fbd778ac000000
-	.8byte	0x300b7246e0000000
-	.8byte	0x2e5d2126e8000000
-	.8byte	0x2c97003248000000
-	.8byte	0x2ad77504e8000000
-	.8byte	0x290921cfe0000000
-	.8byte	0x274deb1cb0000000
-	.8byte	0x25829a73e0000000
-	.8byte	0x23fd1046be000000
-	.8byte	0x2224baed10000000
-	.8byte	0x20709d338e000000
-	.8byte	0x1e535a2f80000000
-	.8byte	0x1cef904e64000000
-	.8byte	0x1b0d639830000000
-	.8byte	0x1964ce7d24000000
-	.8byte	0x17b908bf16000000
-
-L(pio4):
-	.8byte	0x3fe921fb54442d18	/* PI/4 */
-
-/* -PI/2 as a sum of two doubles (both limbs are stored negated).  We only
-   use 32 bits of the upper limb to avoid losing significant bits when
-   multiplying with up to (2^22)/(pi/2).  */
-L(pio2hi):
-	.8byte	0xbff921fb54400000
-
-L(pio2lo):
-	.8byte	0xbdd0b4611a626332
-
-L(pio2_table):
-	.8byte	0
-	.8byte	0x3ff921fb54442d18	/* 1 * PI/2 */
-	.8byte	0x400921fb54442d18	/* 2 * PI/2 */
-	.8byte	0x4012d97c7f3321d2	/* 3 * PI/2 */
-	.8byte	0x401921fb54442d18	/* 4 * PI/2 */
-	.8byte	0x401f6a7a2955385e	/* 5 * PI/2 */
-	.8byte	0x4022d97c7f3321d2	/* 6 * PI/2 */
-	.8byte	0x4025fdbbe9bba775	/* 7 * PI/2 */
-	.8byte	0x402921fb54442d18	/* 8 * PI/2 */
-	.8byte	0x402c463abeccb2bb	/* 9 * PI/2 */
-	.8byte	0x402f6a7a2955385e	/* 10 * PI/2 */
-
-L(small):
-	.8byte	0x3cd0000000000000	/* 2^-50 */
-
-L(ones):
-	.8byte	0x3ff0000000000000	/* +1.0 */
-	.8byte	0xbff0000000000000	/* -1.0 */
-
-L(DPhalf):
-	.8byte	0x3fe0000000000000	/* 0.5 */
-
-L(DPone):
-	.8byte	0x3ff0000000000000	/* 1.0 */
-
-L(DPtwo):
-	.8byte	0x4000000000000000	/* 2.0 */
-
-weak_alias(__sinf, sinf)
diff --git a/sysdeps/powerpc/powerpc64/power8/memcmp.S b/sysdeps/powerpc/powerpc64/power8/memcmp.S
deleted file mode 100644
index 46b9c0067a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/memcmp.S
+++ /dev/null
@@ -1,1447 +0,0 @@
-/* Optimized memcmp implementation for POWER7/PowerPC64.
-   Copyright (C) 2010-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.
*/ - -#include <sysdep.h> - -/* int [r3] memcmp (const char *s1 [r3], - const char *s2 [r4], - size_t size [r5]) */ - -/* TODO: change these to the actual instructions when the minimum required - binutils allows it. */ -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) -#ifndef MEMCMP -# define MEMCMP memcmp -#endif - .machine power7 -EALIGN (MEMCMP, 4, 0) - CALL_MCOUNT 3 - -#define rRTN r3 -#define rSTR1 r3 /* First string arg. */ -#define rSTR2 r4 /* Second string arg. */ -#define rN r5 /* Max string length. */ -#define rWORD1 r6 /* Current word in s1. */ -#define rWORD2 r7 /* Current word in s2. */ -#define rWORD3 r8 /* Next word in s1. */ -#define rWORD4 r9 /* Next word in s2. */ -#define rWORD5 r10 /* Next word in s1. */ -#define rWORD6 r11 /* Next word in s2. */ - -#define rOFF8 r20 /* 8 bytes offset. */ -#define rOFF16 r21 /* 16 bytes offset. */ -#define rOFF24 r22 /* 24 bytes offset. */ -#define rOFF32 r23 /* 24 bytes offset. */ -#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ -#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ -#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ -#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ -#define rSHR r28 /* Unaligned shift right count. */ -#define rSHL r29 /* Unaligned shift left count. */ -#define rWORD7 r30 /* Next word in s1. */ -#define rWORD8 r31 /* Next word in s2. */ - -#define rWORD8SAVE (-8) -#define rWORD7SAVE (-16) -#define rOFF8SAVE (-24) -#define rOFF16SAVE (-32) -#define rOFF24SAVE (-40) -#define rOFF32SAVE (-48) -#define rSHRSAVE (-56) -#define rSHLSAVE (-64) -#define rWORD8SHIFTSAVE (-72) -#define rWORD2SHIFTSAVE (-80) -#define rWORD4SHIFTSAVE (-88) -#define rWORD6SHIFTSAVE (-96) - -#ifdef __LITTLE_ENDIAN__ -# define LD ldbrx -#else -# define LD ldx -#endif - - xor r10, rSTR2, rSTR1 - cmpldi cr6, rN, 0 - cmpldi cr1, rN, 8 - clrldi. 
r0, r10, 61 - clrldi r12, rSTR1, 61 - cmpldi cr5, r12, 0 - beq- cr6, L(zeroLength) - dcbt 0, rSTR1 - dcbt 0, rSTR2 - /* If less than 8 bytes or not aligned, use the unaligned - byte loop. */ - blt cr1, L(bytealigned) - bne L(unalignedqw) -/* At this point we know both strings have the same alignment and the - compare length is at least 8 bytes. r12 contains the low order - 3 bits of rSTR1 and cr5 contains the result of the logical compare - of r12 to 0. If r12 == 0 then we are already double word - aligned and can perform the DW aligned loop. */ - - .align 4 -L(samealignment): - or r11, rSTR2, rSTR1 - clrldi. r11, r11, 60 - beq L(qw_align) - /* Try to align to QW else proceed to DW loop. */ - clrldi. r10, r10, 60 - bne L(DW) - /* For the difference to reach QW alignment, load as DW. */ - clrrdi rSTR1, rSTR1, 3 - clrrdi rSTR2, rSTR2, 3 - subfic r10, r12, 8 - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - sldi r9, r10, 3 - subfic r9, r9, 64 - sld rWORD1, rWORD1, r9 - sld rWORD2, rWORD2, r9 - cmpld cr6, rWORD1, rWORD2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(ret_diff) - subf rN, r10, rN - - cmpld cr6, r11, r12 - bgt cr6, L(qw_align) - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpld cr6, rWORD1, rWORD2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(different) - cmpldi cr6, rN, 8 - ble cr6, L(zeroLength) - addi rN, rN, -8 - /* Now both rSTR1 and rSTR2 are aligned to QW. */ - .align 4 -L(qw_align): - vspltisb v0, 0 - srdi. r6, rN, 6 - li r8, 16 - li r10, 32 - li r11, 48 - ble cr0, L(lessthan64) - mtctr r6 - vspltisb v8, 0 - vspltisb v6, 0 - /* Aligned vector loop. */ - .align 4 -L(aligned_loop): - lvx v4, 0, rSTR1 - lvx v5, 0, rSTR2 - vcmpequb. v7, v6, v8 - bnl cr6, L(different3) - lvx v6, rSTR1, r8 - lvx v8, rSTR2, r8 - vcmpequb. v7, v5, v4 - bnl cr6, L(different2) - lvx v4, rSTR1, r10 - lvx v5, rSTR2, r10 - vcmpequb. v7, v6, v8 - bnl cr6, L(different3) - lvx v6, rSTR1, r11 - lvx v8, rSTR2, r11 - vcmpequb. 
v7, v5, v4 - bnl cr6, L(different2) - addi rSTR1, rSTR1, 64 - addi rSTR2, rSTR2, 64 - bdnz L(aligned_loop) - vcmpequb. v7, v6, v8 - bnl cr6, L(different3) - clrldi rN, rN, 58 - /* Handle remainder for aligned loop. */ - .align 4 -L(lessthan64): - mr r9, rSTR1 - cmpdi cr6, rN, 0 - li rSTR1, 0 - blelr cr6 - lvx v4, 0, r9 - lvx v5, 0, rSTR2 - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r8 - lvx v5, rSTR2, r8 - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r10 - lvx v5, rSTR2, r10 - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r11 - lvx v5, rSTR2, r11 - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - blr - - /* Calculate and return the difference. */ - .align 4 -L(different1): - cmpdi cr6, rN, 16 - bge cr6, L(different2) - /* Discard unwanted bytes. */ -#ifdef __LITTLE_ENDIAN__ - lvsr v1, 0, rN - vperm v4, v4, v0, v1 - vperm v5, v5, v0, v1 -#else - lvsl v1, 0, rN - vperm v4, v0, v4, v1 - vperm v5, v0, v5, v1 -#endif - vcmpequb. v7, v4, v5 - li rRTN, 0 - bltlr cr6 - .align 4 -L(different2): -#ifdef __LITTLE_ENDIAN__ - /* Reverse bytes for direct comparison. */ - lvsl v10, r0, r0 - vspltisb v8, 15 - vsububm v9, v8, v10 - vperm v4, v4, v0, v9 - vperm v5, v5, v0, v9 -#endif - MFVRD(r7, v4) - MFVRD(r9, v5) - cmpld cr6, r7, r9 - bne cr6, L(ret_diff) - /* Difference in second DW. */ - vsldoi v4, v4, v4, 8 - vsldoi v5, v5, v5, 8 - MFVRD(r7, v4) - MFVRD(r9, v5) - cmpld cr6, r7, r9 -L(ret_diff): - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - .align 4 -L(different3): -#ifdef __LITTLE_ENDIAN__ - /* Reverse bytes for direct comparison. */ - vspltisb v9, 15 - lvsl v10, r0, r0 - vsububm v9, v9, v10 - vperm v6, v6, v0, v9 - vperm v8, v8, v0, v9 -#endif - MFVRD(r7, v6) - MFVRD(r9, v8) - cmpld cr6, r7, r9 - bne cr6, L(ret_diff) - /* Difference in second DW. 
*/ - vsldoi v6, v6, v6, 8 - vsldoi v8, v8, v8, 8 - MFVRD(r7, v6) - MFVRD(r9, v8) - cmpld cr6, r7, r9 - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - - .align 4 -L(different): - cmpldi cr7, rN, 8 - bgt cr7, L(end) - /* Skip unwanted bytes. */ - sldi r8, rN, 3 - subfic r8, r8, 64 - srd rWORD1, rWORD1, r8 - srd rWORD2, rWORD2, r8 - cmpld cr6, rWORD1, rWORD2 - li rRTN, 0 - beqlr cr6 -L(end): - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - - .align 4 -L(unalignedqw): - /* Proceed to DW unaligned loop,if there is a chance of pagecross. */ - rldicl r9, rSTR1, 0, 52 - add r9, r9, rN - cmpldi cr0, r9, 4096-16 - bgt cr0, L(unaligned) - rldicl r9, rSTR2, 0, 52 - add r9, r9, rN - cmpldi cr0, r9, 4096-16 - bgt cr0, L(unaligned) - li r0, 0 - li r8, 16 - vspltisb v0, 0 - /* Check if rSTR1 is aligned to QW. */ - andi. r11, rSTR1, 0xF - beq L(s1_align) - - /* Compare 16B and align S1 to QW. */ -#ifdef __LITTLE_ENDIAN__ - lvsr v10, 0, rSTR1 /* Compute mask. */ - lvsr v6, 0, rSTR2 /* Compute mask. */ -#else - lvsl v10, 0, rSTR1 /* Compute mask. */ - lvsl v6, 0, rSTR2 /* Compute mask. */ -#endif - lvx v5, 0, rSTR2 - lvx v9, rSTR2, r8 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v9, v5, v6 -#else - vperm v5, v5, v9, v6 -#endif - lvx v4, 0, rSTR1 - lvx v9, rSTR1, r8 -#ifdef __LITTLE_ENDIAN__ - vperm v4, v9, v4, v10 -#else - vperm v4, v4, v9, v10 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - cmpldi cr6, rN, 16 - ble cr6, L(zeroLength) - subfic r11, r11, 16 - subf rN, r11, rN - add rSTR1, rSTR1, r11 - add rSTR2, rSTR2, r11 - - /* As s1 is QW aligned prepare for unaligned loop. */ - .align 4 -L(s1_align): -#ifdef __LITTLE_ENDIAN__ - lvsr v6, 0, rSTR2 -#else - lvsl v6, 0, rSTR2 -#endif - lvx v5, 0, rSTR2 - srdi. r6, rN, 6 - li r10, 32 - li r11, 48 - ble cr0, L(lessthan64_unalign) - mtctr r6 - li r9, 64 - /* Unaligned vector loop. 
*/ - .align 4 -L(unalign_qwloop): - lvx v4, 0, rSTR1 - lvx v10, rSTR2, r8 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different2) - vor v5, v10, v10 - lvx v4, rSTR1, r8 - lvx v10, rSTR2, r10 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different2) - vor v5, v10, v10 - lvx v4, rSTR1, r10 - lvx v10, rSTR2, r11 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different2) - vor v5, v10, v10 - lvx v4, rSTR1, r11 - lvx v10, rSTR2, r9 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different2) - vor v5, v10, v10 - addi rSTR1, rSTR1, 64 - addi rSTR2, rSTR2, 64 - bdnz L(unalign_qwloop) - clrldi rN, rN, 58 - /* Handle remainder for unaligned loop. */ - .align 4 -L(lessthan64_unalign): - mr r9, rSTR1 - cmpdi cr6, rN, 0 - li rSTR1, 0 - blelr cr6 - lvx v4, 0, r9 - lvx v10, rSTR2, r8 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - vor v5, v10, v10 - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r8 - lvx v10, rSTR2, r10 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - vor v5, v10, v10 - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r10 - lvx v10, rSTR2, r11 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. v7, v5, v4 - bnl cr6, L(different1) - vor v5, v10, v10 - addi rN, rN, -16 - - cmpdi cr6, rN, 0 - blelr cr6 - lvx v4, r9, r11 - addi r11, r11, 16 - lvx v10, rSTR2, r11 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v10, v5, v6 -#else - vperm v5, v5, v10, v6 -#endif - vcmpequb. 
v7, v5, v4 - bnl cr6, L(different1) - blr - -/* Otherwise we know the two strings have the same alignment (but not - yet DW). So we force the string addresses to the next lower DW - boundary and special case this first DW using shift left to - eliminate bits preceding the first byte. Since we want to join the - normal (DW aligned) compare loop, starting at the second double word, - we need to adjust the length (rN) and special case the loop - versioning for the first DW. This ensures that the loop count is - correct and the first DW (shifted) is in the expected register pair. */ - .align 4 -L(DW): - std rWORD8, rWORD8SAVE(r1) - std rWORD7, rWORD7SAVE(r1) - std rOFF8, rOFF8SAVE(r1) - std rOFF16, rOFF16SAVE(r1) - std rOFF24, rOFF24SAVE(r1) - std rOFF32, rOFF32SAVE(r1) - cfi_offset(rWORD8, rWORD8SAVE) - cfi_offset(rWORD7, rWORD7SAVE) - cfi_offset(rOFF8, rOFF8SAVE) - cfi_offset(rOFF16, rOFF16SAVE) - cfi_offset(rOFF24, rOFF24SAVE) - cfi_offset(rOFF32, rOFF32SAVE) - - li rOFF8,8 - li rOFF16,16 - li rOFF24,24 - li rOFF32,32 - clrrdi rSTR1, rSTR1, 3 - clrrdi rSTR2, rSTR2, 3 - beq cr5, L(DWaligned) - add rN, rN, r12 - sldi rWORD6, r12, 3 - srdi r0, rN, 5 /* Divide by 32. */ - andi. r12, rN, 24 /* Get the DW remainder. */ - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - beq L(dPs4) - mtctr r0 - bgt cr1, L(dPs3) - beq cr1, L(dPs2) - -/* Remainder is 8. */ - .align 3 -L(dsP1): - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD2, rWORD6 - cmpld cr5, rWORD5, rWORD6 - blt cr7, L(dP1x) -/* Do something useful in this cycle since we have to branch anyway. */ - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 - b L(dP1e) -/* Remainder is 16. */ - .align 4 -L(dPs2): - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD2, rWORD6 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP2x) -/* Do something useful in this cycle since we have to branch anyway. 
*/ - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 - b L(dP2e) -/* Remainder is 24. */ - .align 4 -L(dPs3): - sld rWORD3, rWORD1, rWORD6 - sld rWORD4, rWORD2, rWORD6 - cmpld cr1, rWORD3, rWORD4 - b L(dP3e) -/* Count is a multiple of 32, remainder is 0. */ - .align 4 -L(dPs4): - mtctr r0 - sld rWORD1, rWORD1, rWORD6 - sld rWORD2, rWORD2, rWORD6 - cmpld cr7, rWORD1, rWORD2 - b L(dP4e) - -/* At this point we know both strings are double word aligned and the - compare length is at least 8 bytes. */ - .align 4 -L(DWaligned): - andi. r12, rN, 24 /* Get the DW remainder. */ - srdi r0, rN, 5 /* Divide by 32. */ - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - beq L(dP4) - bgt cr1, L(dP3) - beq cr1, L(dP2) - -/* Remainder is 8. */ - .align 4 -L(dP1): - mtctr r0 -/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early - (8-15 byte compare), we want to use only volatile registers. This - means we can avoid restoring non-volatile registers since we did not - change any on the early exit path. The key here is the non-early - exit path only cares about the condition code (cr5), not about which - register pair was used. */ - LD rWORD5, 0, rSTR1 - LD rWORD6, 0, rSTR2 - cmpld cr5, rWORD5, rWORD6 - blt cr7, L(dP1x) - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 -L(dP1e): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr1, rWORD3, rWORD4 - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5x) - bne cr7, L(dLcr7x) - - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - bne cr1, L(dLcr1) - cmpld cr5, rWORD7, rWORD8 - bdnz L(dLoop) - bne cr6, L(dLcr6) - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - .align 3 -L(dP1x): - sldi. r12, rN, 3 - bne cr5, L(dLcr5x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Remainder is 16. */ - .align 4 -L(dP2): - mtctr r0 - LD rWORD5, 0, rSTR1 - LD rWORD6, 0, rSTR2 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP2x) - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 -L(dP2e): - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - LD rWORD3, rOFF24, rSTR1 - LD rWORD4, rOFF24, rSTR2 - cmpld cr1, rWORD3, rWORD4 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(dLcr6) - bne cr5, L(dLcr5) - b L(dLoop2) - .align 4 -L(dP2x): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - sldi. r12, rN, 3 - bne cr6, L(dLcr6x) - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr1, L(dLcr1x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Remainder is 24. */ - .align 4 -L(dP3): - mtctr r0 - LD rWORD3, 0, rSTR1 - LD rWORD4, 0, rSTR2 - cmpld cr1, rWORD3, rWORD4 -L(dP3e): - LD rWORD5, rOFF8, rSTR1 - LD rWORD6, rOFF8, rSTR2 - cmpld cr6, rWORD5, rWORD6 - blt cr7, L(dP3x) - LD rWORD7, rOFF16, rSTR1 - LD rWORD8, rOFF16, rSTR2 - cmpld cr5, rWORD7, rWORD8 - LD rWORD1, rOFF24, rSTR1 - LD rWORD2, rOFF24, rSTR2 - cmpld cr7, rWORD1, rWORD2 - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - bne cr1, L(dLcr1) - bne cr6, L(dLcr6) - b L(dLoop1) -/* Again we are on a early exit path (24-31 byte compare), we want to - only use volatile registers and avoid restoring non-volatile - registers. */ - .align 4 -L(dP3x): - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - sldi. r12, rN, 3 - bne cr1, L(dLcr1x) - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - bne cr6, L(dLcr6x) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ - bne cr7, L(dLcr7x) - bne L(d00) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -/* Count is a multiple of 32, remainder is 0. */ - .align 4 -L(dP4): - mtctr r0 - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpld cr7, rWORD1, rWORD2 -L(dP4e): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - LD rWORD5, rOFF16, rSTR1 - LD rWORD6, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - LD rWORD7, rOFF24, rSTR1 - LD rWORD8, rOFF24, rSTR2 - addi rSTR1, rSTR1, 24 - addi rSTR2, rSTR2, 24 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(dLcr7) - bne cr1, L(dLcr1) - bdz- L(d24) /* Adjust CTR as we start with +4. */ -/* This is the primary loop. */ - .align 4 -L(dLoop): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(dLcr6) -L(dLoop1): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5) -L(dLoop2): - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(dLcr7) -L(dLoop3): - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - bne cr1, L(dLcr1) - cmpld cr7, rWORD1, rWORD2 - bdnz L(dLoop) - -L(dL4): - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(dLcr6) - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(dLcr5) - cmpld cr5, rWORD7, rWORD8 -L(d44): - bne cr7, L(dLcr7) -L(d34): - bne cr1, L(dLcr1) -L(d24): - bne cr6, L(dLcr6) -L(d14): - sldi. r12, rN, 3 - bne cr5, L(dLcr5) -L(d04): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ - beq L(duzeroLength) -/* At this point we have a remainder of 1 to 7 bytes to compare. Since - we are aligned it is safe to load the whole double word, and use - shift right double to eliminate bits beyond the compare length. 
*/ -L(d00): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - srd rWORD1, rWORD1, rN - srd rWORD2, rWORD2, rN - cmpld cr7, rWORD1, rWORD2 - bne cr7, L(dLcr7x) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - - .align 4 -L(dLcr7): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr7x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr7 - li rRTN, -1 - blr - .align 4 -L(dLcr1): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr1x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr1 - li rRTN, -1 - blr - .align 4 -L(dLcr6): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr6x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - .align 4 -L(dLcr5): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dLcr5x): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 1 - bgtlr cr5 - li rRTN, -1 - blr - - .align 4 -L(bytealigned): - mtctr rN - -/* We need to prime this loop. This loop is swing modulo scheduled - to avoid pipe delays. The dependent instruction latencies (load to - compare to conditional branch) is 2 to 3 cycles. In this loop each - dispatch group ends in a branch and takes 1 cycle. Effectively - the first iteration of the loop only serves to load operands and - branches based on compares are delayed until the next loop. - - So we must precondition some registers and condition codes so that - we don't exit the loop early on the first iteration. 
*/ - - lbz rWORD1, 0(rSTR1) - lbz rWORD2, 0(rSTR2) - bdz L(b11) - cmpld cr7, rWORD1, rWORD2 - lbz rWORD3, 1(rSTR1) - lbz rWORD4, 1(rSTR2) - bdz L(b12) - cmpld cr1, rWORD3, rWORD4 - lbzu rWORD5, 2(rSTR1) - lbzu rWORD6, 2(rSTR2) - bdz L(b13) - .align 4 -L(bLoop): - lbzu rWORD1, 1(rSTR1) - lbzu rWORD2, 1(rSTR2) - bne cr7, L(bLcr7) - - cmpld cr6, rWORD5, rWORD6 - bdz L(b3i) - - lbzu rWORD3, 1(rSTR1) - lbzu rWORD4, 1(rSTR2) - bne cr1, L(bLcr1) - - cmpld cr7, rWORD1, rWORD2 - bdz L(b2i) - - lbzu rWORD5, 1(rSTR1) - lbzu rWORD6, 1(rSTR2) - bne cr6, L(bLcr6) - - cmpld cr1, rWORD3, rWORD4 - bdnz L(bLoop) - -/* We speculatively loading bytes before we have tested the previous - bytes. But we must avoid overrunning the length (in the ctr) to - prevent these speculative loads from causing a segfault. In this - case the loop will exit early (before the all pending bytes are - tested. In this case we must complete the pending operations - before returning. */ -L(b1i): - bne cr7, L(bLcr7) - bne cr1, L(bLcr1) - b L(bx56) - .align 4 -L(b2i): - bne cr6, L(bLcr6) - bne cr7, L(bLcr7) - b L(bx34) - .align 4 -L(b3i): - bne cr1, L(bLcr1) - bne cr6, L(bLcr6) - b L(bx12) - .align 4 -L(bLcr7): - li rRTN, 1 - bgtlr cr7 - li rRTN, -1 - blr -L(bLcr1): - li rRTN, 1 - bgtlr cr1 - li rRTN, -1 - blr -L(bLcr6): - li rRTN, 1 - bgtlr cr6 - li rRTN, -1 - blr - -L(b13): - bne cr7, L(bx12) - bne cr1, L(bx34) -L(bx56): - sub rRTN, rWORD5, rWORD6 - blr - nop -L(b12): - bne cr7, L(bx12) -L(bx34): - sub rRTN, rWORD3, rWORD4 - blr -L(b11): -L(bx12): - sub rRTN, rWORD1, rWORD2 - blr - - .align 4 -L(zeroLength): - li rRTN, 0 - blr - - .align 4 -/* At this point we know the strings have different alignment and the - compare length is at least 8 bytes. r12 contains the low order - 3 bits of rSTR1 and cr5 contains the result of the logical compare - of r12 to 0. If r12 == 0 then rStr1 is double word - aligned and can perform the DWunaligned loop. - - Otherwise we know that rSTR1 is not already DW aligned yet. 
- So we can force the string addresses to the next lower DW - boundary and special case this first DW using shift left to - eliminate bits preceding the first byte. Since we want to join the - normal (DWaligned) compare loop, starting at the second double word, - we need to adjust the length (rN) and special case the loop - versioning for the first DW. This ensures that the loop count is - correct and the first DW (shifted) is in the expected resister pair. */ -L(unaligned): - std rWORD8, rWORD8SAVE(r1) - std rWORD7, rWORD7SAVE(r1) - std rOFF8, rOFF8SAVE(r1) - std rOFF16, rOFF16SAVE(r1) - std rOFF24, rOFF24SAVE(r1) - std rOFF32, rOFF32SAVE(r1) - cfi_offset(rWORD8, rWORD8SAVE) - cfi_offset(rWORD7, rWORD7SAVE) - cfi_offset(rOFF8, rOFF8SAVE) - cfi_offset(rOFF16, rOFF16SAVE) - cfi_offset(rOFF24, rOFF24SAVE) - cfi_offset(rOFF32, rOFF32SAVE) - li rOFF8,8 - li rOFF16,16 - li rOFF24,24 - li rOFF32,32 - std rSHL, rSHLSAVE(r1) - cfi_offset(rSHL, rSHLSAVE) - clrldi rSHL, rSTR2, 61 - beq cr6, L(duzeroLength) - std rSHR, rSHRSAVE(r1) - cfi_offset(rSHR, rSHRSAVE) - beq cr5, L(DWunaligned) - std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) -/* Adjust the logical start of rSTR2 to compensate for the extra bits - in the 1st rSTR1 DW. */ - sub rWORD8_SHIFT, rSTR2, r12 -/* But do not attempt to address the DW before that DW that contains - the actual start of rSTR2. */ - clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) -/* Compute the left/right shift counts for the unaligned rSTR2, - compensating for the logical (DW aligned) start of rSTR1. 
*/ - clrldi rSHL, rWORD8_SHIFT, 61 - clrrdi rSTR1, rSTR1, 3 - std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - sldi rSHL, rSHL, 3 - cmpld cr5, rWORD8_SHIFT, rSTR2 - add rN, rN, r12 - sldi rWORD6, r12, 3 - std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) - cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) - cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) - subfic rSHR, rSHL, 64 - srdi r0, rN, 5 /* Divide by 32. */ - andi. r12, rN, 24 /* Get the DW remainder. */ -/* We normally need to load 2 DWs to start the unaligned rSTR2, but in - this special case those bits may be discarded anyway. Also we - must avoid loading a DW where none of the bits are part of rSTR2 as - this may cross a page boundary and cause a page fault. */ - li rWORD8, 0 - blt cr5, L(dus0) - LD rWORD8, 0, rSTR2 - addi rSTR2, rSTR2, 8 - sld rWORD8, rWORD8, rSHL - -L(dus0): - LD rWORD1, 0, rSTR1 - LD rWORD2, 0, rSTR2 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - srd r12, rWORD2, rSHR - clrldi rN, rN, 61 - beq L(duPs4) - mtctr r0 - or rWORD8, r12, rWORD8 - bgt cr1, L(duPs3) - beq cr1, L(duPs2) - -/* Remainder is 8. */ - .align 4 -L(dusP1): - sld rWORD8_SHIFT, rWORD2, rSHL - sld rWORD7, rWORD1, rWORD6 - sld rWORD8, rWORD8, rWORD6 - bge cr7, L(duP1e) -/* At this point we exit early with the first double word compare - complete and remainder of 0 to 7 bytes. See L(du14) for details on - how we handle the remaining bytes. */ - cmpld cr5, rWORD7, rWORD8 - sldi. rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) -/* Remainder is 16. */ - .align 4 -L(duPs2): - sld rWORD6_SHIFT, rWORD2, rSHL - sld rWORD5, rWORD1, rWORD6 - sld rWORD6, rWORD8, rWORD6 - b L(duP2e) -/* Remainder is 24. */ - .align 4 -L(duPs3): - sld rWORD4_SHIFT, rWORD2, rSHL - sld rWORD3, rWORD1, rWORD6 - sld rWORD4, rWORD8, rWORD6 - b L(duP3e) -/* Count is a multiple of 32, remainder is 0. 
*/ - .align 4 -L(duPs4): - mtctr r0 - or rWORD8, r12, rWORD8 - sld rWORD2_SHIFT, rWORD2, rSHL - sld rWORD1, rWORD1, rWORD6 - sld rWORD2, rWORD8, rWORD6 - b L(duP4e) - -/* At this point we know rSTR1 is double word aligned and the - compare length is at least 8 bytes. */ - .align 4 -L(DWunaligned): - std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - srdi r0, rN, 5 /* Divide by 32. */ - std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - andi. r12, rN, 24 /* Get the DW remainder. */ - std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) - cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) - cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) - cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) - sldi rSHL, rSHL, 3 - LD rWORD6, 0, rSTR2 - LD rWORD8, rOFF8, rSTR2 - addi rSTR2, rSTR2, 8 - cmpldi cr1, r12, 16 - cmpldi cr7, rN, 32 - clrldi rN, rN, 61 - subfic rSHR, rSHL, 64 - sld rWORD6_SHIFT, rWORD6, rSHL - beq L(duP4) - mtctr r0 - bgt cr1, L(duP3) - beq cr1, L(duP2) - -/* Remainder is 8. */ - .align 4 -L(duP1): - srd r12, rWORD8, rSHR - LD rWORD7, 0, rSTR1 - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP1x) -L(duP1e): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr5, rWORD7, rWORD8 - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr7, rWORD1, rWORD2 - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - bne cr5, L(duLcr5) - or rWORD4, r12, rWORD2_SHIFT - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr1, rWORD3, rWORD4 - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - bne cr7, L(duLcr7) - or rWORD6, r0, rWORD4_SHIFT - cmpld cr6, rWORD5, rWORD6 - b L(duLoop3) - .align 4 -/* At this point we exit early with the first double word compare - complete and remainder of 0 to 7 bytes. See L(du14) for details on - how we handle the remaining bytes. 
*/ -L(duP1x): - cmpld cr5, rWORD7, rWORD8 - sldi. rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) -/* Remainder is 16. */ - .align 4 -L(duP2): - srd r0, rWORD8, rSHR - LD rWORD5, 0, rSTR1 - or rWORD6, r0, rWORD6_SHIFT - sld rWORD6_SHIFT, rWORD8, rSHL -L(duP2e): - LD rWORD7, rOFF8, rSTR1 - LD rWORD8, rOFF8, rSTR2 - cmpld cr6, rWORD5, rWORD6 - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP2x) - LD rWORD1, rOFF16, rSTR1 - LD rWORD2, rOFF16, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - LD rWORD3, rOFF24, rSTR1 - LD rWORD4, rOFF24, rSTR2 - cmpld cr7, rWORD1, rWORD2 - bne cr5, L(duLcr5) - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - cmpld cr1, rWORD3, rWORD4 - b L(duLoop2) - .align 4 -L(duP2x): - cmpld cr5, rWORD7, rWORD8 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 - bne cr6, L(duLcr6) - sldi. rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) - -/* Remainder is 24. 
*/ - .align 4 -L(duP3): - srd r12, rWORD8, rSHR - LD rWORD3, 0, rSTR1 - sld rWORD4_SHIFT, rWORD8, rSHL - or rWORD4, r12, rWORD6_SHIFT -L(duP3e): - LD rWORD5, rOFF8, rSTR1 - LD rWORD6, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT - LD rWORD7, rOFF16, rSTR1 - LD rWORD8, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - blt cr7, L(duP3x) - LD rWORD1, rOFF24, rSTR1 - LD rWORD2, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - cmpld cr7, rWORD1, rWORD2 - b L(duLoop1) - .align 4 -L(duP3x): - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 - cmpld cr5, rWORD7, rWORD8 - bne cr6, L(duLcr6) - sldi. rN, rN, 3 - bne cr5, L(duLcr5) - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - b L(dutrim) - -/* Count is a multiple of 32, remainder is 0. */ - .align 4 -L(duP4): - mtctr r0 - srd r0, rWORD8, rSHR - LD rWORD1, 0, rSTR1 - sld rWORD2_SHIFT, rWORD8, rSHL - or rWORD2, r0, rWORD6_SHIFT -L(duP4e): - LD rWORD3, rOFF8, rSTR1 - LD rWORD4, rOFF8, rSTR2 - cmpld cr7, rWORD1, rWORD2 - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT - LD rWORD5, rOFF16, rSTR1 - LD rWORD6, rOFF16, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr7, L(duLcr7) - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT - LD rWORD7, rOFF24, rSTR1 - LD rWORD8, rOFF24, rSTR2 - addi rSTR1, rSTR1, 24 - addi rSTR2, rSTR2, 24 - cmpld cr6, rWORD5, rWORD6 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - cmpld cr5, rWORD7, rWORD8 - bdz L(du24) /* Adjust CTR as we start with +4. 
*/ -/* This is the primary loop. */ - .align 4 -L(duLoop): - LD rWORD1, rOFF8, rSTR1 - LD rWORD2, rOFF8, rSTR2 - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(duLcr6) - srd r0, rWORD2, rSHR - sld rWORD2_SHIFT, rWORD2, rSHL - or rWORD2, r0, rWORD8_SHIFT -L(duLoop1): - LD rWORD3, rOFF16, rSTR1 - LD rWORD4, rOFF16, rSTR2 - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(duLcr5) - srd r12, rWORD4, rSHR - sld rWORD4_SHIFT, rWORD4, rSHL - or rWORD4, r12, rWORD2_SHIFT -L(duLoop2): - LD rWORD5, rOFF24, rSTR1 - LD rWORD6, rOFF24, rSTR2 - cmpld cr5, rWORD7, rWORD8 - bne cr7, L(duLcr7) - srd r0, rWORD6, rSHR - sld rWORD6_SHIFT, rWORD6, rSHL - or rWORD6, r0, rWORD4_SHIFT -L(duLoop3): - LD rWORD7, rOFF32, rSTR1 - LD rWORD8, rOFF32, rSTR2 - addi rSTR1, rSTR1, 32 - addi rSTR2, rSTR2, 32 - cmpld cr7, rWORD1, rWORD2 - bne cr1, L(duLcr1) - srd r12, rWORD8, rSHR - sld rWORD8_SHIFT, rWORD8, rSHL - or rWORD8, r12, rWORD6_SHIFT - bdnz L(duLoop) - -L(duL4): - cmpld cr1, rWORD3, rWORD4 - bne cr6, L(duLcr6) - cmpld cr6, rWORD5, rWORD6 - bne cr5, L(duLcr5) - cmpld cr5, rWORD7, rWORD8 -L(du44): - bne cr7, L(duLcr7) -L(du34): - bne cr1, L(duLcr1) -L(du24): - bne cr6, L(duLcr6) -L(du14): - sldi. rN, rN, 3 - bne cr5, L(duLcr5) -/* At this point we have a remainder of 1 to 7 bytes to compare. We use - shift right double to eliminate bits beyond the compare length. - - However it may not be safe to load rWORD2 which may be beyond the - string length. So we compare the bit length of the remainder to - the right shift count (rSHR). If the bit count is less than or equal - we do not need to load rWORD2 (all significant bits are already in - rWORD8_SHIFT). */ - cmpld cr7, rN, rSHR - beq L(duZeroReturn) - li r0, 0 - ble cr7, L(dutrim) - LD rWORD2, rOFF8, rSTR2 - srd r0, rWORD2, rSHR - .align 4 -L(dutrim): - LD rWORD1, rOFF8, rSTR1 - ld rWORD8, -8(r1) - subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). 
*/ - or rWORD2, r0, rWORD8_SHIFT - ld rWORD7, rWORD7SAVE(r1) - ld rSHL, rSHLSAVE(r1) - srd rWORD1, rWORD1, rN - srd rWORD2, rWORD2, rN - ld rSHR, rSHRSAVE(r1) - ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - li rRTN, 0 - cmpld cr7, rWORD1, rWORD2 - ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) - beq cr7, L(dureturn24) - li rRTN, 1 - ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - bgtlr cr7 - li rRTN, -1 - blr - .align 4 -L(duLcr7): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr7, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr1): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr1, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr6): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr6, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - .align 4 -L(duLcr5): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) - li rRTN, 1 - bgt cr5, L(dureturn29) - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) - li rRTN, -1 - b L(dureturn27) - - .align 3 -L(duZeroReturn): - li rRTN, 0 - .align 4 -L(dureturn): - ld rWORD8, rWORD8SAVE(r1) - ld rWORD7, rWORD7SAVE(r1) -L(dureturn29): - ld rSHL, rSHLSAVE(r1) - ld rSHR, rSHRSAVE(r1) -L(dureturn27): - ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) - ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) - ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) -L(dureturn24): - ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - blr - -L(duzeroLength): - ld rOFF8, rOFF8SAVE(r1) - ld rOFF16, rOFF16SAVE(r1) - ld rOFF24, rOFF24SAVE(r1) - ld rOFF32, rOFF32SAVE(r1) - li rRTN, 0 - blr - -END (MEMCMP) 
-libc_hidden_builtin_def (memcmp) -weak_alias (memcmp, bcmp) diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S deleted file mode 100644 index bc734c9f4f..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/memset.S +++ /dev/null @@ -1,458 +0,0 @@ -/* Optimized memset implementation for PowerPC64/POWER8. - Copyright (C) 2014-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */ - -/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); - Returns 's'. Every store in this routine goes through the copy of s kept in r10. */ - -#ifndef MEMSET -# define MEMSET memset -#endif - - /* No need to use .machine power8 since mtvsrd is already - handled by the define. This avoids breakage on binutils - versions that do not support this machine specifier. */ - .machine power7 -EALIGN (MEMSET, 5, 0) - CALL_MCOUNT 3 - -L(_memset): - cmpldi cr7,r5,31 - neg r0,r3 /* r0 = negated DST; its low bits give the distance to the next alignment boundary. */ - mr r10,r3 /* r10 = working DST pointer. */ - - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 /* Replicate byte to word. */ - ble cr7,L(write_LT_32) - - andi. r11,r10,15 /* Check alignment of DST. */ - insrdi r4,r4,32,0 /* Replicate word to double word. */ - - beq L(big_aligned) - - mtocrf 0x01,r0 /* cr7 = low 4 bits of r0, tested by the bf 28..31 below. */ - clrldi r0,r0,60 /* r0 = byte count written by the alignment stores. */ - - /* Get DST aligned to 16 bytes. 
*/ -1: bf 31,2f - stb r4,0(r10) - addi r10,r10,1 - -2: bf 30,4f - sth r4,0(r10) - addi r10,r10,2 - -4: bf 29,8f - stw r4,0(r10) - addi r10,r10,4 - -8: bf 28,16f - std r4,0(r10) - addi r10,r10,8 - -16: subf r5,r0,r5 /* Deduct the alignment bytes from the length. */ - - .align 4 -L(big_aligned): - /* For sizes larger than 255 two possible paths: - - if constant is '0', zero full cache lines with dcbz - - otherwise uses vector instructions. */ - cmpldi cr5,r5,255 - dcbtst 0,r10 - cmpldi cr6,r4,0 - crand 27,26,21 /* CR bit 27 = cr6.eq (r4 == 0) AND cr5.gt (r5 > 255). */ - bt 27,L(huge_dcbz) - bge cr5,L(huge_vector) /* Taken for r5 >= 255 when the dcbz path was not. */ - - - /* Size between 32 and 255 bytes with constant different than 0, use - doubleword store instruction to achieve best throughput. */ - srdi r8,r5,5 /* r8 = number of 32-byte chunks. */ - clrldi r11,r5,59 /* r11 = r5 mod 32, the tail length. */ - cmpldi cr6,r11,0 - cmpdi r8,0 - beq L(tail_bytes) - mtctr r8 - - /* Main aligned write loop, writes 32-bytes at a time. */ - .align 4 -L(big_loop): - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - bdz L(tail_bytes) - - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,10,32 /* Bare 10 is GPR r10. */ - bdnz L(big_loop) - - b L(tail_bytes) - - /* Write remaining 1~31 bytes. */ - .align 4 -L(tail_bytes): - beqlr cr6 /* Nothing left when r11 == 0. */ - - srdi r7,r11,4 - clrldi r8,r11,60 - mtocrf 0x01,r7 - - .align 4 - bf 31,8f - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - - .align 4 -8: mtocrf 0x1,r8 - bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - .align 4 -4: bf 29,2f - stw 4,0(10) /* Bare 4 and 10 are GPRs r4 and r10, here and below. */ - addi 10,10,4 - - .align 4 -2: bf 30,1f - sth 4,0(10) - addi 10,10,2 - - .align 4 -1: bflr 31 - stb 4,0(10) - blr - - /* Size larger than 255 bytes with constant different than 0, use - vector instruction to achieve best throughput. */ -L(huge_vector): - /* Replicate set byte to quadword in VMX register. */ - MTVSRD_V1_R4 - xxpermdi 32,v0,v1,0 - vspltb v2,v0,15 - - /* Main aligned write loop: 128 bytes at a time. 
*/ - li r6,16 - li r7,32 - li r8,48 - mtocrf 0x02,r5 /* cr6 = r5 bits worth 128/64/32/16; bf 25..27 below test the 64/32/16 ones. */ - srdi r12,r5,7 /* r12 = number of 128-byte chunks. */ - cmpdi r12,0 - beq L(aligned_tail) - mtctr r12 - b L(aligned_128loop) - - .align 4 -L(aligned_128loop): - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - bdnz L(aligned_128loop) - - /* Write remaining 1~127 bytes. */ -L(aligned_tail): - mtocrf 0x01,r5 /* cr7 = low 4 bits of r5 (8/4/2/1 bytes). */ - bf 25,32f - stvx v2,0,r10 - stvx v2,r10,r6 - stvx v2,r10,r7 - stvx v2,r10,r8 - addi r10,r10,64 - -32: bf 26,16f - stvx v2,0,r10 - stvx v2,r10,r6 - addi r10,r10,32 - -16: bf 27,8f - stvx v2,0,r10 - addi r10,r10,16 - -8: bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - /* Writes 4~7 bytes. */ -4: bf 29,L(tail2) - stw r4,0(r10) - bf 30,L(tail5) - sth r4,4(r10) - bflr 31 - stb r4,6(r10) - /* Return original DST pointer. */ - blr - - /* Special case when value is 0 and we have a long length to deal - with. Use dcbz to zero out a full cacheline of 128 bytes at a time. - Before using dcbz though, we need to get the destination 128-byte - aligned. */ - .align 4 -L(huge_dcbz): - andi. r11,r10,127 - neg r0,r10 - beq L(huge_dcbz_aligned) - - clrldi r0,r0,57 /* r0 = bytes needed to reach 128-byte alignment. */ - subf r5,r0,r5 - srdi r0,r0,3 /* Same distance in doublewords, for the cr7 tests below. */ - mtocrf 0x01,r0 - - /* Write 1~128 bytes until DST is aligned to 128 bytes. */ -8: bf 28,4f - - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - std r4,32(r10) - std r4,40(r10) - std r4,48(r10) - std r4,56(r10) - addi r10,r10,64 - - .align 4 -4: bf 29,2f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - - .align 4 -2: bf 30,1f - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - - .align 4 -1: bf 31,L(huge_dcbz_aligned) - std r4,0(r10) - addi r10,r10,8 - -L(huge_dcbz_aligned): - /* Setup dcbz unroll offsets and count numbers. 
*/ - srdi r8,r5,9 /* r8 = number of 512-byte chunks. */ - clrldi r11,r5,55 /* r11 = r5 mod 512, the tail length. */ - cmpldi cr6,r11,0 - li r9,128 - cmpdi r8,0 - beq L(huge_tail) - li r7,256 - li r6,384 - mtctr r8 - - .align 4 -L(huge_loop): - /* Sets 512 bytes to zero in each iteration, the loop unrolling shows - a throughput boost for large sizes (2048 bytes or higher). */ - dcbz 0,r10 - dcbz r9,r10 - dcbz r7,r10 - dcbz r6,r10 - addi r10,r10,512 - bdnz L(huge_loop) - - beqlr cr6 - -L(huge_tail): - srdi r6,r11,8 - srdi r7,r11,4 - clrldi r8,r11,4 /* Clears only the top 4 bits; r11 is below 512, so r8 = r11. */ - cmpldi cr6,r8,0 - mtocrf 0x01,r6 - - beq cr6,L(tail) - - /* We have 1~511 bytes remaining. */ - .align 4 -32: bf 31,16f - dcbz 0,r10 - dcbz r9,r10 - addi r10,r10,256 - - .align 4 -16: mtocrf 0x01,r7 - bf 28,8f - dcbz 0,r10 - addi r10,r10,128 - - .align 4 -8: bf 29,4f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - std r4,32(r10) - std r4,40(r10) - std r4,48(r10) - std r4,56(r10) - addi r10,r10,64 - - .align 4 -4: bf 30,2f - std r4,0(r10) - std r4,8(r10) - std r4,16(r10) - std r4,24(r10) - addi r10,r10,32 - - .align 4 -2: bf 31,L(tail) - std r4,0(r10) - std r4,8(r10) - addi r10,r10,16 - .align 4 - - /* Remaining 1~15 bytes. */ -L(tail): - mtocrf 0x01,r8 - - .align /* NOTE(review): no operand here, unlike the other .align 4 directives in this routine; confirm the intended value. */ -8: bf 28,4f - std r4,0(r10) - addi r10,r10,8 - - .align 4 -4: bf 29,2f - stw r4,0(r10) - addi r10,r10,4 - - .align 4 -2: bf 30,1f - sth r4,0(r10) - addi r10,r10,2 - - .align 4 -1: bflr 31 - stb r4,0(r10) - blr - - /* Handle short writes of 0~31 bytes. Best throughput is achieved - by just unrolling all operations. */ - .align 4 -L(write_LT_32): - cmpldi cr6,5,8 /* Bare 5 is GPR r5. */ - mtocrf 0x01,r5 - ble cr6,L(write_LE_8) - - /* At least 9 bytes to go. */ - neg r8,r4 - andi. r0,r8,3 - cmpldi cr1,r5,16 - beq L(write_LT_32_aligned) - - /* Force 4-byte alignment for DST (memset has no SRC). NOTE(review): r0 was computed above from r4, the fill value, not from r10, the destination; the stores stay in bounds either way, but confirm the intended register. 
*/ - mtocrf 0x01,r0 - subf r5,r0,r5 - -2: bf 30,1f - sth r4,0(r10) - addi r10,r10,2 - -1: bf 31,L(end_4bytes_alignment) - stb r4,0(r10) - addi r10,r10,1 - - .align 4 -L(end_4bytes_alignment): - cmpldi cr1,r5,16 - mtocrf 0x01,r5 - -L(write_LT_32_aligned): - blt cr1,8f - - stw r4,0(r10) - stw r4,4(r10) - stw r4,8(r10) - stw r4,12(r10) - addi r10,r10,16 - -8: bf 28,L(tail4) - stw r4,0(r10) - stw r4,4(r10) - addi r10,r10,8 - - .align 4 - /* Writes 4~7 bytes. */ -L(tail4): - bf 29,L(tail2) - stw r4,0(r10) - bf 30,L(tail5) - sth r4,4(r10) - bflr 31 - stb r4,6(r10) - blr - - .align 4 - /* Writes 2~3 bytes. */ -L(tail2): - bf 30,1f - sth r4,0(r10) - bflr 31 - stb r4,2(r10) - blr - - .align 4 -L(tail5): - bflr 31 /* The word at offset 0 is already written; only an optional 5th byte remains. */ - stb r4,4(r10) - blr - - .align 4 -1: bflr 31 - stb r4,0(r10) - blr - - /* Handles writes of 0~8 bytes. */ - .align 4 -L(write_LE_8): - bne cr6,L(tail4) /* cr6.eq means exactly 8 bytes; anything shorter uses the tail code. */ - - stw r4,0(r10) - stw r4,4(r10) - blr -END_GEN_TB (MEMSET,TB_TOCLESS) -libc_hidden_builtin_def (memset) - -/* Copied from bzero.S to prevent the linker from inserting a stub - between bzero and memset. */ -ENTRY (__bzero) - CALL_MCOUNT 3 - mr r5,r4 /* Length arrives in r4; memset expects it in r5. */ - li r4,0 /* Fill value is zero. */ - b L(_memset) -END (__bzero) -#ifndef __bzero -weak_alias (__bzero, bzero) -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/multiarch/Implies deleted file mode 100644 index 1fc7b7cd39..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power7/multiarch diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S deleted file mode 100644 index 955e738cee..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/stpcpy.S +++ /dev/null @@ -1,24 +0,0 @@ -/* Optimized stpcpy implementation for PowerPC64/POWER8. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define USE_AS_STPCPY /* Set before the include so the shared strcpy.S source can build its stpcpy variant. */ -#include <sysdeps/powerpc/powerpc64/power8/strcpy.S> - -weak_alias (__stpcpy, stpcpy) /* The aliases name __stpcpy, so the included source is expected to define that symbol. */ -libc_hidden_def (__stpcpy) -libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S deleted file mode 100644 index c14d984dd0..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/stpncpy.S +++ /dev/null @@ -1,24 +0,0 @@ -/* Optimized stpncpy implementation for PowerPC64/POWER8. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#define USE_AS_STPNCPY -#include <sysdeps/powerpc/powerpc64/power8/strncpy.S> - -weak_alias (__stpncpy, stpncpy) -libc_hidden_def (__stpncpy) -libc_hidden_builtin_def (stpncpy) diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S deleted file mode 100644 index 88b17a6eb1..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S +++ /dev/null @@ -1,457 +0,0 @@ -/* Optimized strcasecmp implementation for PowerPC64. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <locale-defines.h> - -/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */ - -#ifndef USE_AS_STRNCASECMP -# define __STRCASECMP __strcasecmp -# define STRCASECMP strcasecmp -#else -# define __STRCASECMP __strncasecmp -# define STRCASECMP strncasecmp -#endif -/* Convert 16 bytes to lowercase and compare */ -#define TOLOWER() \ - vaddubm v8, v4, v1; \ - vaddubm v7, v4, v3; \ - vcmpgtub v8, v8, v2; \ - vsel v4, v7, v4, v8; \ - vaddubm v8, v5, v1; \ - vaddubm v7, v5, v3; \ - vcmpgtub v8, v8, v2; \ - vsel v5, v7, v5, v8; \ - vcmpequb. v7, v5, v4; - -/* - * Get 16 bytes for unaligned case. - * reg1: Vector to hold next 16 bytes. - * reg2: Address to read from. 
- * reg3: Permute control vector. - * v8: Tmp vector used to mask unwanted bytes. - * v9: Tmp vector,0 when null is found on first 16 bytes - */ -#ifdef __LITTLE_ENDIAN__ -#define GET16BYTES(reg1, reg2, reg3) \ - lvx reg1, 0, reg2; \ - vspltisb v8, -1; \ - vperm v8, v8, reg1, reg3; \ - vcmpequb. v8, v0, v8; \ - beq cr6, 1f; \ - vspltisb v9, 0; \ - b 2f; \ - .align 4; \ -1: \ - addi r6, reg2, 16; \ - lvx v9, 0, r6; \ -2: \ - vperm reg1, v9, reg1, reg3; -#else -#define GET16BYTES(reg1, reg2, reg3) \ - lvx reg1, 0, reg2; \ - vspltisb v8, -1; \ - vperm v8, reg1, v8, reg3; \ - vcmpequb. v8, v0, v8; \ - beq cr6, 1f; \ - vspltisb v9, 0; \ - b 2f; \ - .align 4; \ -1: \ - addi r6, reg2, 16; \ - lvx v9, 0, r6; \ -2: \ - vperm reg1, reg1, v9, reg3; -#endif - -/* Check null in v4, v5 and convert to lower. */ -#define CHECKNULLANDCONVERT() \ - vcmpequb. v7, v0, v5; \ - beq cr6, 3f; \ - vcmpequb. v7, v0, v4; \ - beq cr6, 3f; \ - b L(null_found); \ - .align 4; \ -3: \ - TOLOWER() - -#ifdef _ARCH_PWR8 -# define VCLZD_V8_v7 vclzd v8, v7; -# define MFVRD_R3_V1 mfvrd r3, v1; -# define VSUBUDM_V9_V8 vsubudm v9, v9, v8; -# define VPOPCNTD_V8_V8 vpopcntd v8, v8; -# define VADDUQM_V7_V8 vadduqm v9, v7, v8; -#else -# define VCLZD_V8_v7 .long 0x11003fc2 -# define MFVRD_R3_V1 .long 0x7c230067 -# define VSUBUDM_V9_V8 .long 0x112944c0 -# define VPOPCNTD_V8_V8 .long 0x110047c3 -# define VADDUQM_V7_V8 .long 0x11274100 -#endif - - .machine power7 - -ENTRY (__STRCASECMP) -#ifdef USE_AS_STRNCASECMP - CALL_MCOUNT 3 -#else - CALL_MCOUNT 2 -#endif -#define rRTN r3 /* Return value */ -#define rSTR1 r10 /* 1st string */ -#define rSTR2 r4 /* 2nd string */ -#define rCHAR1 r6 /* Byte read from 1st string */ -#define rCHAR2 r7 /* Byte read from 2nd string */ -#define rADDR1 r8 /* Address of tolower(rCHAR1) */ -#define rADDR2 r12 /* Address of tolower(rCHAR2) */ -#define rLWR1 r8 /* Word tolower(rCHAR1) */ -#define rLWR2 r12 /* Word tolower(rCHAR2) */ -#define rTMP r9 -#define rLOC r11 /* Default locale 
address */ - - cmpd cr7, rRTN, rSTR2 - - /* Get locale address. */ - ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) - add rLOC, rTMP, __libc_tsd_LOCALE@tls - ld rLOC, 0(rLOC) - - mr rSTR1, rRTN - li rRTN, 0 - beqlr cr7 -#ifdef USE_AS_STRNCASECMP - cmpdi cr7, r5, 0 - beq cr7, L(retnull) - cmpdi cr7, r5, 16 - blt cr7, L(bytebybyte) -#endif - vspltisb v0, 0 - vspltisb v8, -1 - /* Check for null in initial characters. - Check max of 16 char depending on the alignment. - If null is present, proceed byte by byte. */ - lvx v4, 0, rSTR1 -#ifdef __LITTLE_ENDIAN__ - lvsr v10, 0, rSTR1 /* Compute mask. */ - vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */ -#else - lvsl v10, 0, rSTR1 - vperm v9, v4, v8, v10 -#endif - vcmpequb. v9, v0, v9 /* Check for null bytes. */ - bne cr6, L(bytebybyte) - lvx v5, 0, rSTR2 - /* Calculate alignment. */ -#ifdef __LITTLE_ENDIAN__ - lvsr v6, 0, rSTR2 - vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */ -#else - lvsl v6, 0, rSTR2 - vperm v9, v5, v8, v6 -#endif - vcmpequb. v9, v0, v9 /* Check for null bytes. */ - bne cr6, L(bytebybyte) - /* Check if locale has non ascii characters. */ - ld rTMP, 0(rLOC) - addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES - lwz rTMP, 0(r6) - cmpdi cr7, rTMP, 1 - beq cr7, L(bytebybyte) - - /* Load vector registers with values used for TOLOWER. */ - /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */ - vspltisb v3, 2 - vspltisb v9, 4 - vsl v3, v3, v9 - vaddubm v1, v3, v3 - vnor v1, v1, v1 - vspltisb v2, 7 - vsububm v2, v3, v2 - - andi. rADDR1, rSTR1, 0xF - beq cr0, L(align) - addi r6, rSTR1, 16 - lvx v9, 0, r6 - /* Compute 16 bytes from previous two loads. */ -#ifdef __LITTLE_ENDIAN__ - vperm v4, v9, v4, v10 -#else - vperm v4, v4, v9, v10 -#endif -L(align): - andi. rADDR2, rSTR2, 0xF - beq cr0, L(align1) - addi r6, rSTR2, 16 - lvx v9, 0, r6 - /* Compute 16 bytes from previous two loads. 
*/ -#ifdef __LITTLE_ENDIAN__ - vperm v5, v9, v5, v6 -#else - vperm v5, v5, v9, v6 -#endif -L(align1): - CHECKNULLANDCONVERT() - blt cr6, L(match) - b L(different) - .align 4 -L(match): - clrldi r6, rSTR1, 60 - subfic r7, r6, 16 -#ifdef USE_AS_STRNCASECMP - sub r5, r5, r7 -#endif - add rSTR1, rSTR1, r7 - add rSTR2, rSTR2, r7 - andi. rADDR2, rSTR2, 0xF - addi rSTR1, rSTR1, -16 - addi rSTR2, rSTR2, -16 - beq cr0, L(aligned) -#ifdef __LITTLE_ENDIAN__ - lvsr v6, 0, rSTR2 -#else - lvsl v6, 0, rSTR2 -#endif - /* There are 2 loops depending on the input alignment. - Each loop gets 16 bytes from s1 and s2, check for null, - convert to lowercase and compare. Loop till difference - or null occurs. */ -L(s1_align): - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 -#ifdef USE_AS_STRNCASECMP - cmpdi cr7, r5, 16 - blt cr7, L(bytebybyte) - addi r5, r5, -16 -#endif - lvx v4, 0, rSTR1 - GET16BYTES(v5, rSTR2, v6) - CHECKNULLANDCONVERT() - blt cr6, L(s1_align) - b L(different) - .align 4 -L(aligned): - addi rSTR1, rSTR1, 16 - addi rSTR2, rSTR2, 16 -#ifdef USE_AS_STRNCASECMP - cmpdi cr7, r5, 16 - blt cr7, L(bytebybyte) - addi r5, r5, -16 -#endif - lvx v4, 0, rSTR1 - lvx v5, 0, rSTR2 - CHECKNULLANDCONVERT() - blt cr6, L(aligned) - - /* Calculate and return the difference. */ -L(different): - vaddubm v1, v3, v3 - vcmpequb v7, v0, v7 -#ifdef __LITTLE_ENDIAN__ - /* Count trailing zero. */ - vspltisb v8, -1 - VADDUQM_V7_V8 - vandc v8, v9, v7 - VPOPCNTD_V8_V8 - vspltb v6, v8, 15 - vcmpequb. v6, v6, v1 - blt cr6, L(shift8) -#else - /* Count leading zero. */ - VCLZD_V8_v7 - vspltb v6, v8, 7 - vcmpequb. v6, v6, v1 - blt cr6, L(shift8) - vsro v8, v8, v1 -#endif - b L(skipsum) - .align 4 -L(shift8): - vsumsws v8, v8, v0 -L(skipsum): -#ifdef __LITTLE_ENDIAN__ - /* Shift registers based on leading zero count. */ - vsro v6, v5, v8 - vsro v7, v4, v8 - /* Merge and move to GPR. */ - vmrglb v6, v6, v7 - vslo v1, v6, v1 - MFVRD_R3_V1 - /* Place the characters that are different in first position. 
*/ - sldi rSTR2, rRTN, 56 - srdi rSTR2, rSTR2, 56 - sldi rSTR1, rRTN, 48 - srdi rSTR1, rSTR1, 56 -#else - vslo v6, v5, v8 - vslo v7, v4, v8 - vmrghb v1, v6, v7 - MFVRD_R3_V1 - srdi rSTR2, rRTN, 48 - sldi rSTR2, rSTR2, 56 - srdi rSTR2, rSTR2, 56 - srdi rSTR1, rRTN, 56 -#endif - subf rRTN, rSTR1, rSTR2 - extsw rRTN, rRTN - blr - - .align 4 - /* OK. We've hit the end of the string. We need to be careful that - we don't compare two strings as different because of junk beyond - the end of the strings... */ -L(null_found): - vaddubm v10, v3, v3 -#ifdef __LITTLE_ENDIAN__ - /* Count trailing zero. */ - vspltisb v8, -1 - VADDUQM_V7_V8 - vandc v8, v9, v7 - VPOPCNTD_V8_V8 - vspltb v6, v8, 15 - vcmpequb. v6, v6, v10 - blt cr6, L(shift_8) -#else - /* Count leading zero. */ - VCLZD_V8_v7 - vspltb v6, v8, 7 - vcmpequb. v6, v6, v10 - blt cr6, L(shift_8) - vsro v8, v8, v10 -#endif - b L(skipsum1) - .align 4 -L(shift_8): - vsumsws v8, v8, v0 -L(skipsum1): - /* Calculate shift count based on count of zero. */ - vspltisb v10, 7 - vslb v10, v10, v10 - vsldoi v9, v0, v10, 1 - VSUBUDM_V9_V8 - vspltisb v8, 8 - vsldoi v8, v0, v8, 1 - VSUBUDM_V9_V8 - /* Shift and remove junk after null character. */ -#ifdef __LITTLE_ENDIAN__ - vslo v5, v5, v9 - vslo v4, v4, v9 -#else - vsro v5, v5, v9 - vsro v4, v4, v9 -#endif - /* Convert and compare 16 bytes. */ - TOLOWER() - blt cr6, L(retnull) - b L(different) - .align 4 -L(retnull): - li rRTN, 0 - blr - .align 4 -L(bytebybyte): - /* Unrolling loop for POWER: loads are done with 'lbz' plus - offset and string descriptors are only updated in the end - of loop unrolling. */ - ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) - lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ - lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ -#ifdef USE_AS_STRNCASECMP - rldicl rTMP, r5, 62, 2 - cmpdi cr7, rTMP, 0 - beq cr7, L(lessthan4) - mtctr rTMP -#endif -L(loop): - cmpdi rCHAR1, 0 /* *s1 == '\0' ? 
*/ - sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ - sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ - lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ - lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ - cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */ - crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */ - beq cr1, L(done) - lbz rCHAR1, 1(rSTR1) - lbz rCHAR2, 1(rSTR2) - cmpdi rCHAR1, 0 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - lbz rCHAR1, 2(rSTR1) - lbz rCHAR2, 2(rSTR2) - cmpdi rCHAR1, 0 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - lbz rCHAR1, 3(rSTR1) - lbz rCHAR2, 3(rSTR2) - cmpdi rCHAR1, 0 - /* Increment both string descriptors */ - addi rSTR1, rSTR1, 4 - addi rSTR2, rSTR2, 4 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ - lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ -#ifdef USE_AS_STRNCASECMP - bdnz L(loop) -#else - b L(loop) -#endif -#ifdef USE_AS_STRNCASECMP -L(lessthan4): - clrldi r5, r5, 62 - cmpdi cr7, r5, 0 - beq cr7, L(retnull) - mtctr r5 -L(loop1): - cmpdi rCHAR1, 0 - sldi rADDR1, rCHAR1, 2 - sldi rADDR2, rCHAR2, 2 - lwzx rLWR1, rLOC, rADDR1 - lwzx rLWR2, rLOC, rADDR2 - cmpw cr1, rLWR1, rLWR2 - crorc 4*cr1+eq,eq,4*cr1+eq - beq cr1, L(done) - addi rSTR1, rSTR1, 1 - addi rSTR2, rSTR2, 1 - lbz rCHAR1, 0(rSTR1) - lbz rCHAR2, 0(rSTR2) - bdnz L(loop1) -#endif -L(done): - subf r0, rLWR2, rLWR1 - extsw rRTN, r0 - blr -END (__STRCASECMP) - -weak_alias (__STRCASECMP, STRCASECMP) -libc_hidden_builtin_def (__STRCASECMP) diff --git 
a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c deleted file mode 100644 index 0e746b7718..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Optimized strcasestr implementation for PowerPC64/POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <string.h> - -#define STRCASESTR __strcasestr_ppc -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(__name) - -#undef weak_alias -#define weak_alias(a,b) -extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden; - -#include <string/strcasestr.c> diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/sysdeps/powerpc/powerpc64/power8/strcasestr.S deleted file mode 100644 index 6ac6572f3b..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcasestr.S +++ /dev/null @@ -1,538 +0,0 @@ -/* Optimized strcasestr implementation for PowerPC64/POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <locale-defines.h> - -/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ - -/* The performance gain is obtained by comparing 16 bytes. */ - -/* When the first char of r4 is hit ITERATIONS times in r3 - fallback to default. */ -#define ITERATIONS 64 - -#ifndef STRCASESTR -# define STRCASESTR __strcasestr -#endif - -#ifndef STRLEN -/* For builds without IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRLEN __GI_strlen -# else -# define STRLEN strlen -# endif -#endif - -#ifndef STRNLEN -/* For builds without IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRNLEN __GI_strnlen -# else -# define STRNLEN __strnlen -# endif -#endif - -#ifndef STRCHR -# ifdef SHARED -# define STRCHR __GI_strchr -# else -# define STRCHR strchr -# endif -#endif - -/* Convert 16 bytes of v4 and reg to lowercase and compare. */ -#define TOLOWER(reg) \ - vcmpgtub v6, v4, v1; \ - vcmpgtub v7, v2, v4; \ - vand v8, v7, v6; \ - vand v8, v8, v3; \ - vor v4, v8, v4; \ - vcmpgtub v6, reg, v1; \ - vcmpgtub v7, v2, reg; \ - vand v8, v7, v6; \ - vand v8, v8, v3; \ - vor reg, v8, reg; \ - vcmpequb. 
v6, reg, v4; - -/* TODO: change these to the actual instructions when the minimum required - binutils allows it. */ -#ifdef _ARCH_PWR8 -#define VCLZD_V8_v7 vclzd v8, v7; -#else -#define VCLZD_V8_v7 .long 0x11003fc2 -#endif - -#define FRAMESIZE (FRAME_MIN_SIZE+48) -/* TODO: change this to .machine power8 when the minimum required binutils - allows it. */ - .machine power7 -/* Case-insensitive search for pat [r4] in s [r3]. r3 returns the position of the match in s, or 0 when there is none. Register roles set up below: r29 = current candidate position in s, r30 = pat, r27/r28 = lower/upper case of pat[0]. */ -EALIGN (STRCASESTR, 4, 0) - CALL_MCOUNT 2 - mflr r0 /* Load link register LR to r0. */ - std r31, -8(r1) /* Save callers register r31. */ - std r30, -16(r1) /* Save callers register r30. */ - std r29, -24(r1) /* Save callers register r29. */ - std r28, -32(r1) /* Save callers register r28. */ - std r27, -40(r1) /* Save callers register r27. */ - std r0, 16(r1) /* Store the link register. */ - cfi_offset(r31, -8) - cfi_offset(r30, -16) - cfi_offset(r29, -24) - cfi_offset(r28, -32) - cfi_offset(r27, -40) - cfi_offset(lr, 16) - stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ - cfi_adjust_cfa_offset(FRAMESIZE) - - dcbt 0, r3 - dcbt 0, r4 - cmpdi cr7, r3, 0 /* Input validation. */ - beq cr7, L(retnull) - cmpdi cr7, r4, 0 - beq cr7, L(retnull) - - mr r29, r3 /* r29 = s. */ - mr r30, r4 /* r30 = pat. */ - /* Load first byte from r4 and check if it is null. */ - lbz r6, 0(r4) - cmpdi cr7, r6, 0 - beq cr7, L(ret_r3) /* Empty pattern: return s. */ - - ld r10, __libc_tsd_LOCALE@got@tprel(r2) - add r9, r10, __libc_tsd_LOCALE@tls - ld r9, 0(r9) - ld r9, LOCALE_CTYPE_TOUPPER(r9) - sldi r10, r6, 2 /* Convert to upper case. */ - lwzx r28, r9, r10 /* r28 = toupper (pat[0]). */ - - ld r10, __libc_tsd_LOCALE@got@tprel(r2) - add r11, r10, __libc_tsd_LOCALE@tls - ld r11, 0(r11) - ld r11, LOCALE_CTYPE_TOLOWER(r11) - sldi r10, r6, 2 /* Convert to lower case. */ - lwzx r27, r11, r10 /* r27 = tolower (pat[0]). */ - - /* Check if the first char is present. */ - mr r4, r27 - bl STRCHR - nop - mr r5, r3 - mr r3, r29 - mr r29, r5 /* r29 = position of the lower case hit (or 0). */ - mr r4, r28 - bl STRCHR - nop - cmpdi cr7, r29, 0 - beq cr7, L(firstpos) - cmpdi cr7, r3, 0 - beq cr7, L(skipcheck) - /* Both hits are addresses: compare all 64 bits as unsigned values. A 32-bit signed compare could select the later hit. */ - cmpld cr7, r3, r29 - ble cr7, L(firstpos) - /* Move r3 to the first occurrence.
*/ -L(skipcheck): - mr r3, r29 -L(firstpos): - mr r29, r3 - - sldi r9, r27, 8 - or r28, r9, r28 - /* Reg r27 is used to count the number of iterations. */ - li r27, 0 - /* If first char of search str is not present. */ - cmpdi cr7, r3, 0 - ble cr7, L(end) - - /* Find the length of pattern. */ - mr r3, r30 - bl STRLEN - nop - - cmpdi cr7, r3, 0 /* If search str is null. */ - beq cr7, L(ret_r3) - - mr r31, r3 - mr r4, r3 - mr r3, r29 - bl STRNLEN - nop - - cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ - blt cr7, L(retnull) - - mr r3, r29 - - /* Locales not matching ASCII for single bytes. */ - ld r10, __libc_tsd_LOCALE@got@tprel(r2) - add r9, r10, __libc_tsd_LOCALE@tls - ld r9, 0(r9) - ld r7, 0(r9) - addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES - lwz r8, 0(r7) - cmpdi cr7, r8, 1 - beq cr7, L(bytebybyte) - - /* If len(r4) < 16 handle byte by byte. */ - /* For shorter strings we will not use vector registers. */ - cmpdi cr7, r31, 16 - blt cr7, L(bytebybyte) - - /* Comparison values used for TOLOWER. */ - /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ - vspltish v0, 0 - vspltisb v5, 2 - vspltisb v4, 4 - vsl v3, v5, v4 - vaddubm v1, v3, v3 - vspltisb v5, 15 - vaddubm v2, v5, v5 - vaddubm v2, v1, v2 - vspltisb v4, -3 - vaddubm v2, v2, v4 - - /* - 1. Load 16 bytes from r3 and r4 - 2. Check if there is null, If yes, proceed byte by byte path. - 3. Else,Convert both to lowercase and compare. - 4. If they are same proceed to 1. - 5. If they dont match, find if first char of r4 is present in the - loaded 16 byte of r3. - 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. - */ - - mr r8, r3 /* Save r3 for future use. */ - mr r4, r30 /* Restore r4. */ - clrldi r10, r4, 60 - lvx v5, 0, r4 /* Load 16 bytes from r4. */ - cmpdi cr7, r10, 0 - beq cr7, L(begin2) - /* If r4 is unaligned, load another 16 bytes. 
*/ -#ifdef __LITTLE_ENDIAN__ - lvsr v7, 0, r4 -#else - lvsl v7, 0, r4 -#endif - addi r5, r4, 16 - lvx v9, 0, r5 -#ifdef __LITTLE_ENDIAN__ - vperm v5, v9, v5, v7 -#else - vperm v5, v5, v9, v7 -#endif -L(begin2): - lvx v4, 0, r3 - vcmpequb. v7, v0, v4 /* Check for null. */ - beq cr6, L(nullchk6) /* Taken when no byte of v4 is null. */ - b L(trailcheck) - - .align 4 -L(nullchk6): - clrldi r10, r3, 60 - cmpdi cr7, r10, 0 - beq cr7, L(next16) -#ifdef __LITTLE_ENDIAN__ - lvsr v7, 0, r3 -#else - lvsl v7, 0, r3 -#endif - addi r5, r3, 16 - /* If r3 is unaligned, load another 16 bytes. */ - lvx v10, 0, r5 -#ifdef __LITTLE_ENDIAN__ - vperm v4, v10, v4, v7 -#else - vperm v4, v4, v10, v7 -#endif -L(next16): - vcmpequb. v6, v0, v5 /* Check for null. */ - beq cr6, L(nullchk) - b L(trailcheck) - - .align 4 -L(nullchk): - vcmpequb. v6, v0, v4 /* Check the merged 16 bytes of r3 for null. */ - beq cr6, L(nullchk1) - b L(retnull) - - .align 4 -L(nullchk1): - /* Convert both v5 (bytes of r4) and v4 (bytes of r3) to lowercase and compare them; v3 only holds the case bit used by TOLOWER. */ - TOLOWER(v5) - /* If both are same, branch to match. */ - blt cr6, L(match) - /* Find if the first char is present in next 15 bytes. */ -#ifdef __LITTLE_ENDIAN__ - vspltb v6, v5, 15 - vsldoi v7, v0, v4, 15 -#else - vspltb v6, v5, 0 - vspltisb v7, 8 - vslo v7, v4, v7 -#endif - vcmpequb v7, v6, v7 - vcmpequb. v6, v0, v7 - /* Shift r3 by 16 bytes and proceed. */ - blt cr6, L(shift16) - VCLZD_V8_v7 -#ifdef __LITTLE_ENDIAN__ - vspltb v6, v8, 15 -#else - vspltb v6, v8, 7 -#endif - vcmpequb. v6, v6, v1 - /* Shift r3 by 8 bytes and proceed. */ - blt cr6, L(shift8) - b L(begin) - - .align 4 -L(match): - /* There is a match of 16 bytes, check next bytes. */ - cmpdi cr7, r31, 16 /* r31 holds the pattern length. */ - mr r29, r3 - beq cr7, L(ret_r3) - -L(secondmatch): - addi r3, r3, 16 - addi r4, r4, 16 - /* Load next 16 bytes of r3 and r4 and compare. */ - clrldi r10, r4, 60 - cmpdi cr7, r10, 0 - beq cr7, L(nextload) - /* Handle unaligned case. */ - vor v6, v9, v9 - vcmpequb.
v7, v0, v6 - beq cr6, L(nullchk2) - b L(trailcheck) - - .align 4 -L(nullchk2): -#ifdef __LITTLE_ENDIAN__ - lvsr v7, 0, r4 -#else - lvsl v7, 0, r4 -#endif - addi r5, r4, 16 - /* If r4 is unaligned, load another 16 bytes. */ - lvx v9, 0, r5 -#ifdef __LITTLE_ENDIAN__ - vperm v11, v9, v6, v7 -#else - vperm v11, v6, v9, v7 -#endif - b L(compare) - - .align 4 -L(nextload): - lvx v11, 0, r4 -L(compare): - vcmpequb. v7, v0, v11 /* Check the 16 bytes of r4 for null. */ - beq cr6, L(nullchk3) - b L(trailcheck) - - .align 4 -L(nullchk3): - clrldi r10, r3, 60 - cmpdi cr7, r10, 0 - beq cr7, L(nextload1) - /* Handle unaligned case. */ - vor v4, v10, v10 /* Start from the quadword kept in v10. */ - vcmpequb. v7, v0, v4 - beq cr6, L(nullchk4) - b L(retnull) - - .align 4 -L(nullchk4): -#ifdef __LITTLE_ENDIAN__ - lvsr v7, 0, r3 -#else - lvsl v7, 0, r3 -#endif - addi r5, r3, 16 - /* If r3 is unaligned, load another 16 bytes. */ - lvx v10, 0, r5 -#ifdef __LITTLE_ENDIAN__ - vperm v4, v10, v4, v7 -#else - vperm v4, v4, v10, v7 -#endif - b L(compare1) - - .align 4 -L(nextload1): - lvx v4, 0, r3 -L(compare1): - vcmpequb. v7, v0, v4 /* Check the 16 bytes of r3 for null. */ - beq cr6, L(nullchk5) - b L(retnull) - - .align 4 -L(nullchk5): - /* Convert both v11 (bytes of r4) and v4 (bytes of r3) to lowercase and compare them. */ - TOLOWER(v11) - /* If both are same, branch to secondmatch. */ - blt cr6, L(secondmatch) - /* Continue the search. */ - b L(begin) - - .align 4 -L(trailcheck): - /* A null was seen in the loaded bytes: finish this candidate byte by byte using the locale tolower table in r11. */ - ld r10, __libc_tsd_LOCALE@got@tprel(r2) - add r11, r10, __libc_tsd_LOCALE@tls - ld r11, 0(r11) - ld r11, LOCALE_CTYPE_TOLOWER(r11) -L(loop2): - lbz r5, 0(r3) /* Load byte from r3. */ - lbz r6, 0(r4) /* Load next byte from r4. */ - cmpdi cr7, r6, 0 /* Is it null? */ - beq cr7, L(updater3) - cmpdi cr7, r5, 0 /* Is it null? */ - beq cr7, L(retnull) /* If yes, return. */ - addi r3, r3, 1 - addi r4, r4, 1 /* Increment r4. */ - sldi r10, r5, 2 /* Convert to lower case. */ - lwzx r10, r11, r10 - sldi r7, r6, 2 /* Convert to lower case. */ - lwzx r7, r11, r7 - cmpw cr7, r7, r10 /* Compare with byte from r4.
*/ - bne cr7, L(begin) - b L(loop2) - - .align 4 -L(shift8): - addi r8, r8, 7 - b L(begin) - .align 4 -L(shift16): - addi r8, r8, 15 - .align 4 -L(begin): - addi r8, r8, 1 - mr r3, r8 - /* When our iterations exceed ITERATIONS, fall back to default. */ - addi r27, r27, 1 - cmpdi cr7, r27, ITERATIONS - beq cr7, L(default) - mr r4, r30 /* Restore r4. */ - b L(begin2) - - /* Handling byte by byte. */ - .align 4 -L(loop1): - mr r3, r8 - addi r27, r27, 1 - cmpdi cr7, r27, ITERATIONS - beq cr7, L(default) - mr r29, r8 - srdi r4, r28, 8 /* r4 = second-lowest byte of r28. */ - /* Check if the first char is present. */ - bl STRCHR - nop - mr r5, r3 - mr r3, r29 - mr r29, r5 - sldi r4, r28, 56 - srdi r4, r4, 56 /* r4 = lowest byte of r28. */ - bl STRCHR - nop - cmpdi cr7, r29, 0 - beq cr7, L(nextpos) - cmpdi cr7, r3, 0 - beq cr7, L(skipcheck1) - /* Both hits are addresses: compare all 64 bits as unsigned values. A 32-bit signed compare could select the later hit. */ - cmpld cr7, r3, r29 - ble cr7, L(nextpos) - /* Move r3 to first occurrence. */ -L(skipcheck1): - mr r3, r29 -L(nextpos): - mr r29, r3 - cmpdi cr7, r3, 0 - ble cr7, L(retnull) -L(bytebybyte): - ld r10, __libc_tsd_LOCALE@got@tprel(r2) - add r11, r10, __libc_tsd_LOCALE@tls - ld r11, 0(r11) - ld r11, LOCALE_CTYPE_TOLOWER(r11) - mr r4, r30 /* Restore r4. */ - mr r8, r3 /* Save r3. */ - addi r8, r8, 1 /* r8 = where the next search resumes. */ - -L(loop): - addi r3, r3, 1 - lbz r5, 0(r3) /* Load byte from r3. */ - addi r4, r4, 1 /* Increment r4. */ - lbz r6, 0(r4) /* Load next byte from r4. */ - cmpdi cr7, r6, 0 /* Is it null? */ - beq cr7, L(updater3) - cmpdi cr7, r5, 0 /* Is it null? */ - beq cr7, L(retnull) /* If yes, return. */ - sldi r10, r5, 2 /* Convert to lower case. */ - lwzx r10, r11, r10 - sldi r7, r6, 2 /* Convert to lower case. */ - lwzx r7, r11, r7 - cmpw cr7, r7, r10 /* Compare with byte from r4. */ - bne cr7, L(loop1) - b L(loop) - - /* Handling return values. */ - .align 4 -L(updater3): - subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ - b L(end) - - .align 4 -L(ret_r3): - mr r3, r29 /* Return point of match. */ - b L(end) - - .align 4 -L(retnull): - li r3, 0 /* Substring was not found.
*/ - b L(end) - - .align 4 -L(default): - mr r4, r30 - bl __strcasestr_ppc - nop - - .align 4 -L(end): - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - cfi_adjust_cfa_offset(-FRAMESIZE) - ld r0, 16(r1) /* Restore the saved link register. */ - ld r27, -40(r1) - ld r28, -32(r1) - ld r29, -24(r1) /* Restore callers save register r29. */ - ld r30, -16(r1) /* Restore callers save register r30. */ - ld r31, -8(r1) /* Restore callers save register r31. */ - cfi_restore(lr) - cfi_restore(r27) - cfi_restore(r28) - cfi_restore(r29) - cfi_restore(r30) - cfi_restore(r31) - mtlr r0 /* Branch to link register. */ - blr -END (STRCASESTR) - -weak_alias (__strcasestr, strcasestr) -libc_hidden_def (__strcasestr) -libc_hidden_builtin_def (strcasestr) diff --git a/sysdeps/powerpc/powerpc64/power8/strchr.S b/sysdeps/powerpc/powerpc64/power8/strchr.S deleted file mode 100644 index e0c185c162..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strchr.S +++ /dev/null @@ -1,377 +0,0 @@ -/* Optimized strchr implementation for PowerPC64/POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#ifdef USE_AS_STRCHRNUL -# ifndef STRCHRNUL -# define FUNC_NAME __strchrnul -# else -# define FUNC_NAME STRCHRNUL -# endif -#else -# ifndef STRCHR -# define FUNC_NAME strchr -# else -# define FUNC_NAME STRCHR -# endif -#endif /* !USE_AS_STRCHRNUL */ - -/* int [r3] strchr (char *s [r3], int c [r4]) */ -/* TODO: change these to the actual instructions when the minimum required - binutils allows it. */ -#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define VBPERMQ(t,a,b) .long (0x1000054c \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) -/* TODO: change this to .machine power8 when the minimum required binutils - allows it. */ - .machine power7 -ENTRY (FUNC_NAME) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - cmpdi cr7,r4,0 - ld r12,0(r8) /* Load doubleword from memory. */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - - beq cr7,L(null_match) - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - /* Now r4 has a doubleword of c bytes and r0 has - a doubleword of null bytes. */ - - cmpb r10,r12,r4 /* Compare each byte against c byte. */ - cmpb r11,r12,r0 /* Compare each byte against null byte. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - srd r11,r11,r6 - sld r10,r10,r6 - sld r11,r11,r6 -#else - sld r10,r10,r6 - sld r11,r11,r6 - srd r10,r10,r6 - srd r11,r11,r6 -#endif - or r5,r10,r11 /* OR the results to speed things up. */ - cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes - have been found. */ - bne cr7,L(done) - - mtcrf 0x01,r8 - - /* Are we now aligned to a doubleword boundary? If so, skip to - the main loop. 
Otherwise, go through the alignment code. */ - - bt 28,L(loop) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r9,16(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - cmpb r6,r9,r4 - cmpb r7,r9,r0 - or r5,r10,r11 - or r9,r6,r7 - or r12,r5,r9 - cmpdi cr7,r12,0 - beq cr7,L(vector) - /* OK, one (or both) of the doublewords contains a c/null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c/null byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r10 so we can calculate - the pointer. */ - - mr r10,r6 - mr r11,r7 - addi r8,r8,8 -#ifdef USE_AS_STRCHRNUL - mr r5, r9 -#endif - /* r10/r11 have the output of the cmpb instructions, that is, - 0xff in the same position as the c/null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done): -#ifdef USE_AS_STRCHRNUL - mr r10, r5 -#endif -#ifdef __LITTLE_ENDIAN__ - addi r3,r10,-1 - andc r3,r3,r10 - popcntd r0,r3 -# ifndef USE_AS_STRCHRNUL - addi r4,r11,-1 - andc r4,r4,r11 - cmpld cr7,r3,r4 - bgt cr7,L(no_match) -# endif -#else - cntlzd r0,r10 /* Count leading zeros before c matches. */ -# ifndef USE_AS_STRCHRNUL - cmpld cr7,r11,r10 - bgt cr7,L(no_match) -# endif -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching c byte - or null in case c was not found. */ - blr - - /* Check the first 32B in GPR's and move to vectorized loop. 
*/ - .p2align 5 -L(vector): - addi r3, r8, 8 - andi. r10, r3, 31 - bne cr0, L(loop) - vspltisb v0, 0 - /* Precompute vbpermq constant. */ - vspltisb v10, 3 - lvsl v11, r0, r0 - vslb v10, v11, v10 - MTVRD(v1,r4) - li r5, 16 - vspltb v1, v1, 7 - /* Compare 32 bytes in each loop. */ -L(continue): - lvx v4, 0, r3 - lvx v5, r3, r5 - vcmpequb v2, v0, v4 - vcmpequb v3, v0, v5 - vcmpequb v6, v1, v4 - vcmpequb v7, v1, v5 - vor v8, v2, v3 - vor v9, v6, v7 - vor v11, v8, v9 - vcmpequb. v11, v0, v11 - addi r3, r3, 32 - blt cr6, L(continue) - /* One (or both) of the quadwords contains a c/null byte. */ - addi r3, r3, -32 -#ifndef USE_AS_STRCHRNUL - vcmpequb. v11, v0, v9 - blt cr6, L(no_match) -#endif - /* Permute the first bit of each byte into bits 48-63. */ - VBPERMQ(v2, v2, v10) - VBPERMQ(v3, v3, v10) - VBPERMQ(v6, v6, v10) - VBPERMQ(v7, v7, v10) - /* Shift each component into its correct position for merging. */ -#ifdef __LITTLE_ENDIAN__ - vsldoi v3, v3, v3, 2 - vsldoi v7, v7, v7, 2 -#else - vsldoi v2, v2, v2, 6 - vsldoi v3, v3, v3, 4 - vsldoi v6, v6, v6, 6 - vsldoi v7, v7, v7, 4 -#endif - - /* Merge the results and move to a GPR. */ - vor v1, v3, v2 - vor v2, v6, v7 - vor v4, v1, v2 - MFVRD(r5, v4) -#ifdef __LITTLE_ENDIAN__ - addi r6, r5, -1 - andc r6, r6, r5 - popcntd r6, r6 -#else - cntlzd r6, r5 /* Count leading zeros before the match. */ -#endif - add r3, r3, r6 /* Compute final length. */ - /* Return NULL if null found before c. */ -#ifndef USE_AS_STRCHRNUL - lbz r4, 0(r3) - cmpdi cr7, r4, 0 - beq cr7, L(no_match) -#endif - blr - -#ifndef USE_AS_STRCHRNUL - .align 4 -L(no_match): - li r3,0 - blr -#endif - -/* We are here because strchr was called with a null byte. */ - .align 4 -L(null_match): - /* r0 has a doubleword of null bytes. */ - - cmpb r5,r12,r0 /* Compare each byte against null bytes. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. 
*/ -#ifdef __LITTLE_ENDIAN__ - srd r5,r5,r6 - sld r5,r5,r6 -#else - sld r5,r5,r6 - srd r5,r5,r6 -#endif - cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes - have been found. */ - bne cr7,L(done_null) - - mtcrf 0x01,r8 - - /* Are we now aligned to a quadword boundary? If so, skip to - the main loop. Otherwise, go through the alignment code. */ - - bt 28,L(loop_null) - - /* Handle WORD2 of pair. */ - ldu r12,8(r8) - cmpb r5,r12,r0 - cmpdi cr7,r5,0 - bne cr7,L(done_null) - b L(loop_null) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - /* Main loop to look for the end of the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop_null): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r5,r12,r0 - cmpb r10,r11,r0 - or r6,r5,r10 - cmpdi cr7,r6,0 - beq cr7,L(vector1) - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done_null) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - pointer. */ - - mr r5,r10 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done_null): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 -#else - cntlzd r0,r5 /* Count leading zeros before the match. */ -#endif - srdi r0,r0,3 /* Convert leading zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching null byte. */ - blr - .p2align 5 -L(vector1): - addi r3, r8, 8 - andi. 
r10, r3, 31 - bne cr0, L(loop_null) - vspltisb v8, -1 - vspltisb v0, 0 - vspltisb v10, 3 - lvsl v11, r0, r0 - vslb v10, v11, v10 - li r5, 16 -L(continue1): - lvx v4, 0, r3 - lvx v5, r3, r5 - vcmpequb v2, v0, v4 - vcmpequb v3, v0, v5 - vor v8, v2, v3 - vcmpequb. v11, v0, v8 - addi r3, r3, 32 - blt cr6, L(continue1) - addi r3, r3, -32 -L(end1): - VBPERMQ(v2, v2, v10) - VBPERMQ(v3, v3, v10) - /* Shift each component into its correct position for merging. */ -#ifdef __LITTLE_ENDIAN__ - vsldoi v3, v3, v3, 2 -#else - vsldoi v2, v2, v2, 6 - vsldoi v3, v3, v3, 4 -#endif - - /* Merge the results and move to a GPR. */ - vor v4, v3, v2 - MFVRD(r5, v4) -#ifdef __LITTLE_ENDIAN__ - addi r6, r5, -1 - andc r6, r6, r5 - popcntd r6, r6 -#else - cntlzd r6, r5 /* Count leading zeros before the match. */ -#endif - add r3, r3, r6 /* Compute final length. */ - blr -END (FUNC_NAME) - -#ifndef USE_AS_STRCHRNUL -weak_alias (strchr, index) -libc_hidden_builtin_def (strchr) -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/sysdeps/powerpc/powerpc64/power8/strchrnul.S deleted file mode 100644 index 3bf4b275dd..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strchrnul.S +++ /dev/null @@ -1,23 +0,0 @@ -/* Optimized strchrnul implementation for PowerPC64/POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define USE_AS_STRCHRNUL 1 -#include <sysdeps/powerpc/powerpc64/power8/strchr.S> - -weak_alias (__strchrnul,strchrnul) -libc_hidden_builtin_def (__strchrnul) diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S deleted file mode 100644 index 770484f1e1..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcmp.S +++ /dev/null @@ -1,247 +0,0 @@ -/* Optimized strcmp implementation for PowerPC64/POWER8. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* Implements the function - - size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) - - The implementation uses unaligned doubleword access to avoid specialized - code paths depending of data alignment. Although recent powerpc64 uses - 64K as default, the page cross handling assumes minimum page size of - 4k. 
*/ - -EALIGN (STRCMP, 4, 0) - li r0,0 /* Doubleword of null bytes to use with cmpb. */ - - /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using - the code: - - (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) - - with PAGE_SIZE being 4096 and ITER_SIZE being 16. */ - - rldicl r7,r3,0,52 - rldicl r9,r4,0,52 - cmpldi cr7,r7,4096-16 - bgt cr7,L(pagecross_check) - cmpldi cr5,r9,4096-16 - bgt cr5,L(pagecross_check) - - /* For short string up to 16 bytes, load both s1 and s2 using - unaligned dwords and compare. */ - ld r8,0(r3) - ld r10,0(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc. r9,r12,r11 - bne cr0,L(different_nocmpb) - - ld r8,8(r3) - ld r10,8(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc. r9,r12,r11 - bne cr0,L(different_nocmpb) - - addi r7,r3,16 - addi r4,r4,16 - -L(align_8b): - /* Now it has checked for first 16 bytes, align source1 to doubleword - and adjust source2 address. */ - rldicl r9,r7,0,61 /* source1 alignment to doubleword */ - subf r4,r9,r4 /* Adjust source2 address based on source1 - alignment. */ - rldicr r7,r7,0,60 /* Align source1 to doubleword. */ - - /* At this point, source1 alignment is 0 and source2 alignment is - between 0 and 7. Check if source2 alignment is 0, meaning both - sources have the same alignment. */ - andi. r9,r4,0x7 - bne cr0,L(loop_diff_align) - - /* If both source1 and source2 are doubleword aligned, there is no - need for page boundary cross checks. */ - - ld r8,0(r7) - ld r10,0(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc. r9,r12,r11 - bne cr0,L(different_nocmpb) - - .align 4 -L(loop_equal_align): - ld r8,8(r7) - ld r10,8(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc. r9,r12,r11 - bne cr0,L(different_nocmpb) - - ld r8,16(r7) - ld r10,16(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc.
r9,r12,r11 - bne cr0,L(different_nocmpb) - - b L(loop_equal_align) - - /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb - result and r10 the dword from s2. To code isolate the byte - up to end (including the '\0'), masking with 0xFF the remaining - ones: - - #if __LITTLE_ENDIAN__ - (__builtin_ffsl (x) - 1) = counting trailing zero bits - r9 = (__builtin_ffsl (r9) - 1) + 8; - r9 = -1UL << r9 - #else - r9 = __builtin_clzl (r9) + 8; - r9 = -1UL >> r9 - #endif - r8 = r8 | r9 - r10 = r10 | r9 */ - -#ifdef __LITTLE_ENDIAN__ - nor r9,r9,r9 -L(different_nocmpb): - neg r3,r9 - and r9,r9,r3 - cntlzd r9,r9 - subfic r9,r9,63 -#else - not r9,r9 -L(different_nocmpb): - cntlzd r9,r9 - subfic r9,r9,56 -#endif - srd r3,r8,r9 - srd r10,r10,r9 - rldicl r10,r10,0,56 - rldicl r3,r3,0,56 - subf r3,r10,r3 - extsw r3,r3 - blr - - .align 4 -L(pagecross_check): - subfic r9,r9,4096 - subfic r7,r7,4096 - cmpld cr7,r7,r9 - bge cr7,L(pagecross) - mr r7,r9 - - /* If unaligned 16 bytes reads across a 4K page boundary, it uses - a simple byte a byte comparison until the page alignment for s1 - is reached. */ -L(pagecross): - add r7,r3,r7 - subf r9,r3,r7 - mtctr r9 - - .align 4 -L(pagecross_loop): - /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 - and if *s1 is '\0'. */ - lbz r9,0(r3) - lbz r10,0(r4) - addi r3,r3,1 - addi r4,r4,1 - cmplw cr7,r9,r10 - cmpdi cr5,r9,0 /* cmpdi takes an immediate: test *s1 == '\0'. */ - bne cr7,L(pagecross_ne) - beq cr5,L(pagecross_nullfound) - bdnz L(pagecross_loop) - b L(align_8b) - - .align 4 - /* The unaligned read of source2 will cross a 4K page boundary, - and the different byte or NULL may be in the remaining page - bytes. Since it can not use the unaligned load, the algorithm - reads and compares 8 bytes to keep source1 doubleword aligned.
*/ -L(check_source2_byte): - li r9,8 - mtctr r9 - - .align 4 -L(check_source2_byte_loop): - lbz r9,0(r7) - lbz r10,0(r4) - addi r7,r7,1 - addi r4,r4,1 - cmplw cr7,r9,r10 /* Compare the s1 byte (r9) with the s2 byte (r10). */ - cmpdi cr5,r9,0 /* cr5 records whether the s1 byte is '\0'. */ - bne cr7,L(pagecross_ne) - beq cr5,L(pagecross_nullfound) - bdnz L(check_source2_byte_loop) - - /* If source2 is unaligned to doubleword, the code needs to check - on each iteration if the unaligned doubleword access will cross - a 4k page boundary. */ - .align 5 -L(loop_unaligned): - ld r8,0(r7) - ld r10,0(r4) - cmpb r12,r8,r0 - cmpb r11,r8,r10 - orc. r9,r12,r11 - bne cr0,L(different_nocmpb) - addi r7,r7,8 - addi r4,r4,8 - -L(loop_diff_align): - /* Check if [src2]+8 cross a 4k page boundary: - - srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) - - with PAGE_SIZE being 4096. */ - rldicl r9,r4,0,52 - cmpldi cr7,r9,4088 - ble cr7,L(loop_unaligned) - b L(check_source2_byte) - - .align 4 -L(pagecross_ne): - extsw r3,r9 - mr r9,r10 -L(pagecross_retdiff): - subf r9,r9,r3 - extsw r3,r9 - blr - - .align 4 -L(pagecross_nullfound): - li r3,0 - b L(pagecross_retdiff) -END (STRCMP) -libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S deleted file mode 100644 index 7f2cee4b1b..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcpy.S +++ /dev/null @@ -1,270 +0,0 @@ -/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifdef USE_AS_STPCPY -# ifndef STPCPY -# define FUNC_NAME __stpcpy -# else -# define FUNC_NAME STPCPY -# endif -#else -# ifndef STRCPY -# define FUNC_NAME strcpy -# else -# define FUNC_NAME STRCPY -# endif -#endif /* !USE_AS_STPCPY */ - -/* Implements the function - - char * [r3] strcpy (char *dest [r3], const char *src [r4]) - - or - - char * [r3] stpcpy (char *dest [r3], const char *src [r4]) - - if USE_AS_STPCPY is defined. - - The implementation uses unaligned doubleword access to avoid specialized - code paths depending on data alignment. Although recent powerpc64 uses - 64K as default, the page cross handling assumes minimum page size of - 4k. */ - - .machine power7 -EALIGN (FUNC_NAME, 4, 0) - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - - /* Check if the [src]+15 will cross a 4K page by checking if the bit - indicating the page size changes. Basically: - - uint64_t srcin = (uint64_t)src; - uint64_t ob = srcin & 4096UL; - uint64_t nb = (srcin+15UL) & 4096UL; - if (ob ^ nb) - goto pagecross; */ - - addi r9,r4,15 - xor r9,r9,r4 - rlwinm. r9,r9,0,19,19 - bne L(pagecross) - - /* For short string (less than 16 bytes), just calculate its size as - strlen and issues a memcpy if null is found. */ - mr r7,r4 - ld r12,0(r7) /* Load doubleword from memory. */ - cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ - cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ - bne cr7,L(done) - - ldu r8,8(r7) - cmpb r10,r8,r0 - cmpdi cr7,r10,0 - bne cr7,L(done) - - b L(loop_before) - - .align 4 -L(pagecross): - clrrdi r7,r4,3 /* Align the address to doubleword boundary. */ - rlwinm r6,r4,3,26,28 /* Calculate padding. */ - li r5,-1 /* MASK = 0xffffffffffffffff. */ - ld r12,0(r7) /* Load doubleword from memory.
*/ -#ifdef __LITTLE_ENDIAN__ - sld r5,r5,r6 -#else - srd r5,r5,r6 /* MASK = MASK >> padding. */ -#endif - orc r9,r12,r5 /* Mask bits that are not part of the string. */ - cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ - cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ - bne cr7,L(done) - - ldu r6,8(r7) - cmpb r10,r6,r0 - cmpdi cr7,r10,0 - bne cr7,L(done) - - ld r12,0(r7) - cmpb r10,r12,r0 - cmpdi cr7,r10,0 - bne cr7,L(done) - - ldu r6,8(r7) - cmpb r10,r6,r0 - cmpdi cr7,r10,0 - bne cr7,L(done) - - /* We checked for 24 - x bytes, with x being the source alignment - (0 <= x <= 16), and no zero has been found. Start the loop - copy with doubleword aligned address. */ - mr r7,r4 - ld r12, 0(r7) - ldu r8, 8(r7) - -L(loop_before): - /* Save the two doublewords read from source and align the source - to 16 bytes for the loop. */ - mr r11,r3 - std r12,0(r11) - std r8,8(r11) - addi r11,r11,16 - rldicl r9,r4,0,60 - subf r7,r9,r7 - subf r11,r9,r11 - b L(loop_start) - - .align 5 -L(loop): - std r12, 0(r11) - std r6, 8(r11) - addi r11,r11,16 -L(loop_start): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - - ld r12, 8(r7) - ldu r6, 16(r7) - cmpb r10,r12,r0 - cmpb r9,r6,r0 - or r8,r9,r10 /* Merge everything in one doubleword. */ - cmpdi cr7,r8,0 - beq cr7,L(loop) - - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - addi r4,r7,-8 - cmpdi cr6,r10,0 - addi r7,r7,-8 - bne cr6,L(done2) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - length.
*/ - - mr r10,r9 - addi r7,r7,8 - b L(done2) - - /* r10 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the length. */ -L(done): - mr r11,r3 -L(done2): -#ifdef __LITTLE_ENDIAN__ - addi r9, r10, -1 /* Form a mask from trailing zeros. */ - andc r9, r9, r10 - popcntd r6, r9 /* Count the bits in the mask. */ -#else - cntlzd r6,r10 /* Count leading zeros before the match. */ -#endif - subf r5,r4,r7 - srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */ - add r8,r5,r6 /* Compute final length. */ -#ifdef USE_AS_STPCPY - /* stpcpy returns the dest address plus the size not counting the - final '\0'. */ - add r3,r11,r8 -#endif - addi r8,r8,1 /* Account for the final '\0'. */ - - cmpldi cr6,r8,8 - mtocrf 0x01,r8 - ble cr6,L(copy_LE_8) - - cmpldi cr1,r8,16 - blt cr1,8f - - /* Handle copies of 0~31 bytes. */ - .align 4 -L(copy_LT_32): - /* More than 8 bytes to go; skip the 16-byte copy when fewer than 16 - remain (cr1 holds the comparison of r8 with 16). */ - blt cr1,8f - - /* Copy 16 bytes. */ - ld r6,0(r4) - ld r8,8(r4) - addi r4,r4,16 - std r6,0(r11) - std r8,8(r11) - addi r11,r11,16 -8: /* Copy 8 bytes. */ - bf 28,L(tail4) - ld r6,0(r4) - addi r4,r4,8 - std r6,0(r11) - addi r11,r11,8 - - .align 4 -/* Copies 4~7 bytes. */ -L(tail4): - bf 29,L(tail2) - lwz r6,0(r4) - stw r6,0(r11) - bf 30,L(tail5) - lhz r7,4(r4) - sth r7,4(r11) - bflr 31 - lbz r8,6(r4) - stb r8,6(r11) - blr - - .align 4 -/* Copies 2~3 bytes. */ -L(tail2): - bf 30,1f - lhz r6,0(r4) - sth r6,0(r11) - bflr 31 - lbz r7,2(r4) - stb r7,2(r11) - blr - - .align 4 -L(tail5): - bf 31,1f - lbz r6,4(r4) - stb r6,4(r11) - blr - - .align 4 -1: - bflr 31 - lbz r6,0(r4) - stb r6,0(r11) - blr - -/* Handles copies of 0~8 bytes.
*/ - .align 4 -L(copy_LE_8): - bne cr6,L(tail4) - ld r6,0(r4) - std r6,0(r11) - blr -END (FUNC_NAME) - -#ifndef USE_AS_STPCPY -libc_hidden_builtin_def (strcpy) -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/strcspn.S b/sysdeps/powerpc/powerpc64/power8/strcspn.S deleted file mode 100644 index c9a7a2e3c3..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strcspn.S +++ /dev/null @@ -1,20 +0,0 @@ -/* Optimized strcspn implementation for PowerPC64/POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define USE_AS_STRCSPN 1 -#include <sysdeps/powerpc/powerpc64/power8/strspn.S> diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S deleted file mode 100644 index 8f4a1fc1dc..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strlen.S +++ /dev/null @@ -1,301 +0,0 @@ -/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized - loop. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version.
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* TODO: change these to the actual instructions when the minimum required - binutils allows it. */ -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define VBPERMQ(t,a,b) .long (0x1000054c \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) - -/* int [r3] strlen (char *s [r3]) */ - -#ifndef STRLEN -# define STRLEN strlen -#endif - -/* TODO: change this to .machine power8 when the minimum required binutils - allows it. */ - .machine power7 -EALIGN (STRLEN, 4, 0) - CALL_MCOUNT 1 - dcbt 0,r3 - clrrdi r4,r3,3 /* Align the address to doubleword boundary. */ - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - li r5,-1 /* MASK = 0xffffffffffffffff. */ - ld r12,0(r4) /* Load doubleword from memory. */ -#ifdef __LITTLE_ENDIAN__ - sld r5,r5,r6 -#else - srd r5,r5,r6 /* MASK = MASK >> padding. */ -#endif - orc r9,r12,r5 /* Mask bits that are not part of the string. */ - cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ - cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ - bne cr7,L(done) - - /* For shorter strings (< 64 bytes), we will not use vector registers, - as the overhead isn't worth it. So, let's use GPRs instead. This - will be done the same way as we do in the POWER7 implementation. - Let's see if we are aligned to a quadword boundary. If so, we can - jump to the first (non-vectorized) loop. Otherwise, we have to - handle the next DWORD first.
*/ - mtcrf 0x01,r4 - mr r9,r4 - addi r9,r9,8 - bt 28,L(align64) - - /* Handle the next 8 bytes so we are aligned to a quadword - boundary. */ - ldu r5,8(r4) - cmpb r10,r5,r0 - cmpdi cr7,r10,0 - addi r9,r9,8 - bne cr7,L(done) - -L(align64): - /* Proceed to the old (POWER7) implementation, checking two doublewords - per iteration. For the first 56 bytes, we will just check for null - characters. After that, we will also check if we are 64-byte aligned - so we can jump to the vectorized implementation. We will unroll - these loops to avoid excessive branching. */ - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - /* Are we 64-byte aligned? If so, jump to the vectorized loop. - Note: aligning to 64-byte will necessarily slow down performance for - strings around 64 bytes in length due to the extra comparisons - required to check alignment for the vectorized loop. This is a - necessary tradeoff we are willing to take in order to speed up the - calculation for larger strings. */ - andi. r10,r9,63 - beq cr0,L(preloop) - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - andi. r10,r9,63 - beq cr0,L(preloop) - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - andi. r10,r9,63 - beq cr0,L(preloop) - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - bne cr7,L(dword_zero) - - andi.
r10,r9,63 - beq cr0,L(preloop) - ld r6,8(r4) - ldu r5,16(r4) - cmpb r10,r6,r0 - cmpb r11,r5,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - addi r9,r9,16 - - /* At this point, we are necessarily 64-byte aligned. If no zeroes were - found, jump to the vectorized loop. */ - beq cr7,L(preloop) - -L(dword_zero): - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. */ - - cmpdi cr6,r10,0 - addi r4,r4,-8 - bne cr6,L(done) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - length. */ - - mr r10,r11 - addi r4,r4,8 - - /* If the null byte was found in the non-vectorized code, compute the - final length. r10 has the output of the cmpb instruction, that is, - it contains 0xff in the same position as the null byte in the - original doubleword from the string. Use that to calculate the - length. */ -L(done): -#ifdef __LITTLE_ENDIAN__ - addi r9, r10,-1 /* Form a mask from trailing zeros. */ - andc r9, r9,r10 - popcntd r0, r9 /* Count the bits in the mask. */ -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - subf r5,r3,r4 - srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ - add r3,r5,r0 /* Compute final length. */ - blr - - /* Vectorized implementation starts here. */ - .p2align 4 -L(preloop): - /* Set up for the loop. */ - mr r4,r9 - li r7, 16 /* Load required offsets. */ - li r8, 32 - li r9, 48 - li r12, 8 - vxor v0,v0,v0 /* VR with null chars to use with - vcmpequb. */ - - /* Main loop to look for the end of the string. We will read in - 64-byte chunks. Align it to 32 bytes and unroll it 3 times to - leverage the icache performance. */ - .p2align 5 -L(loop): - lvx v1,r4,r0 /* Load 4 quadwords. */ - lvx v2,r4,r7 - lvx v3,r4,r8 - lvx v4,r4,r9 - vminub v5,v1,v2 /* Compare and merge into one VR for speed.
*/ - vminub v6,v3,v4 - vminub v7,v5,v6 - vcmpequb. v7,v7,v0 /* Check for NULLs. */ - addi r4,r4,64 /* Adjust address for the next iteration. */ - bne cr6,L(vmx_zero) - - lvx v1,r4,r0 /* Load 4 quadwords. */ - lvx v2,r4,r7 - lvx v3,r4,r8 - lvx v4,r4,r9 - vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ - vminub v6,v3,v4 - vminub v7,v5,v6 - vcmpequb. v7,v7,v0 /* Check for NULLs. */ - addi r4,r4,64 /* Adjust address for the next iteration. */ - bne cr6,L(vmx_zero) - - lvx v1,r4,r0 /* Load 4 quadwords. */ - lvx v2,r4,r7 - lvx v3,r4,r8 - lvx v4,r4,r9 - vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ - vminub v6,v3,v4 - vminub v7,v5,v6 - vcmpequb. v7,v7,v0 /* Check for NULLs. */ - addi r4,r4,64 /* Adjust address for the next iteration. */ - beq cr6,L(loop) - -L(vmx_zero): - /* OK, we found a null byte. Let's look for it in the current 64-byte - block and mark it in its corresponding VR. */ - vcmpequb v1,v1,v0 - vcmpequb v2,v2,v0 - vcmpequb v3,v3,v0 - vcmpequb v4,v4,v0 - - /* We will now 'compress' the result into a single doubleword, so it - can be moved to a GPR for the final calculation. First, we - generate an appropriate mask for vbpermq, so we can permute bits into - the first halfword. */ - vspltisb v10,3 - lvsl v11,0,r0 /* RA field 0 is a literal zero base, not GPR0; r0 still - holds 0 here, so v11 = {0,1,...,15}. */ - vslb v10,v11,v10 /* Each byte index shifted left by 3: v10 = {0,8,...,120}. */ - - /* Permute the first bit of each byte into bits 48-63. */ - VBPERMQ(v1,v1,v10) - VBPERMQ(v2,v2,v10) - VBPERMQ(v3,v3,v10) - VBPERMQ(v4,v4,v10) - - /* Shift each component into its correct position for merging. */ -#ifdef __LITTLE_ENDIAN__ - vsldoi v2,v2,v2,2 - vsldoi v3,v3,v3,4 - vsldoi v4,v4,v4,6 -#else - vsldoi v1,v1,v1,6 - vsldoi v2,v2,v2,4 - vsldoi v3,v3,v3,2 -#endif - - /* Merge the results and move to a GPR. */ - vor v1,v2,v1 - vor v2,v3,v4 - vor v4,v1,v2 - MFVRD(r10,v4) - - /* Adjust address to the beginning of the current 64-byte block. */ - addi r4,r4,-64 - -#ifdef __LITTLE_ENDIAN__ - addi r9, r10,-1 /* Form a mask from trailing zeros.
*/ - andc r9, r9,r10 - popcntd r0, r9 /* Count the bits in the mask. */ -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - subf r5,r3,r4 - add r3,r5,r0 /* Compute final length. */ - blr - -END (STRLEN) -libc_hidden_builtin_def (strlen) diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S deleted file mode 100644 index 32e09e4d94..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strncase.S +++ /dev/null @@ -1,20 +0,0 @@ -/* Optimized strncasecmp implementation for POWER8. - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define USE_AS_STRNCASECMP 1 -#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S> diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S deleted file mode 100644 index 3d8df90538..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strncmp.S +++ /dev/null @@ -1,327 +0,0 @@ -/* Optimized strncmp implementation for PowerPC64/POWER8. - Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library.
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRNCMP -# define STRNCMP strncmp -#endif - -/* Implements the function - - int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) - - The implementation uses unaligned doubleword access to avoid specialized - code paths depending on data alignment. Although recent powerpc64 uses - 64K as default, the page cross handling assumes minimum page size of - 4k. */ - - .machine power7 -EALIGN (STRNCMP, 4, 0) - /* Check if size is 0. */ - mr. r10,r5 - beq cr0,L(ret0) - - /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using - the code: - - (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) - - with PAGE_SIZE being 4096 and ITER_SIZE being 16. */ - rldicl r8,r3,0,52 - cmpldi cr7,r8,4096-16 - bgt cr7,L(pagecross) - rldicl r9,r4,0,52 - cmpldi cr7,r9,4096-16 - bgt cr7,L(pagecross) - - /* For short string up to 16 bytes, load both s1 and s2 using - unaligned dwords and compare. */ - ld r7,0(r3) - ld r9,0(r4) - li r8,0 - cmpb r8,r7,r8 - cmpb r6,r7,r9 - orc. r8,r8,r6 - bne cr0,L(different1) - - /* If the string compared are equal, but size is less or equal - to 8, return 0. */ - cmpldi cr7,r10,8 - li r9,0 - ble cr7,L(ret1) - addi r5,r10,-8 - - ld r7,8(r3) - ld r9,8(r4) - cmpb r8,r7,r8 - cmpb r6,r7,r9 - orc.
r8,r8,r6 - bne cr0,L(different0) - - cmpldi cr7,r5,8 - mr r9,r8 - ble cr7,L(ret1) - - /* Update pointers and size. */ - addi r10,r10,-16 - addi r3,r3,16 - addi r4,r4,16 - - /* Now it has checked for first 16 bytes, align source1 to doubleword - and adjust source2 address. */ -L(align_8b): - rldicl r5,r3,0,61 - rldicr r3,r3,0,60 - subf r4,r5,r4 - add r10,r10,r5 - - /* At this point, source1 alignment is 0 and source2 alignment is - between 0 and 7. Check if source2 alignment is 0, meaning both - sources have the same alignment. */ - andi. r8,r4,0x7 - beq cr0,L(loop_eq_align_0) - - li r5,0 - b L(loop_ne_align_1) - - /* If source2 is unaligned to doubleword, the code needs to check - on each iteration if the unaligned doubleword access will cross - a 4k page boundary. */ - .align 4 -L(loop_ne_align_0): - ld r7,0(r3) - ld r9,0(r4) - cmpb r8,r7,r5 - cmpb r6,r7,r9 - orc. r8,r8,r6 - bne cr0,L(different1) - - cmpldi cr7,r10,8 - ble cr7,L(ret0) - addi r10,r10,-8 - addi r3,r3,8 - addi r4,r4,8 -L(loop_ne_align_1): - rldicl r9,r4,0,52 - cmpldi cr7,r9,4088 /* Result goes to cr7, tested by the ble below. */ - ble cr7,L(loop_ne_align_0) - cmpdi cr7,r10,0 - beq cr7,L(ret0) - - lbz r9,0(r3) - lbz r8,0(r4) - cmplw cr7,r9,r8 - bne cr7,L(byte_ne_4) - cmpdi cr7,r9,0 - beq cr7,L(size_reached_0) - - li r9,7 /* One byte was compared above; 7 remain in this doubleword. */ - addi r8,r3,1 - mtctr r9 - addi r4,r4,1 - addi r10,r10,-1 - addi r3,r3,8 - - /* The unaligned read of source2 will cross a 4K page boundary, - and the different byte or NULL may be in the remaining page - bytes. Since it can not use the unaligned load the algorithm - reads and compares 8 bytes to keep source1 doubleword aligned.
*/ - .align 4 -L(loop_ne_align_byte): - cmpdi cr7,r10,0 - addi r10,r10,-1 - beq cr7,L(ret0) - lbz r9,0(r8) - lbz r7,0(r4) - addi r8,r8,1 - addi r4,r4,1 - cmplw cr7,r9,r7 - cmpdi cr5,r9,0 - bne cr7,L(size_reached_2) - beq cr5,L(size_reached_0) - bdnz L(loop_ne_align_byte) - - cmpdi cr7,r10,0 - bne+ cr7,L(loop_ne_align_0) - - .align 4 -L(ret0): - li r9,0 -L(ret1): - mr r3,r9 - blr - - /* The code now check if r8 and r10 are different by issuing a - cmpb and shift the result based on its output: - - #ifdef __LITTLE_ENDIAN__ - leadzero = (__builtin_ffsl (z1) - 1); - leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero; - r1 = (r1 >> leadzero) & 0xFFUL; - r2 = (r2 >> leadzero) & 0xFFUL; - #else - leadzero = __builtin_clzl (z1); - leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero; - r1 = (r1 >> (56 - leadzero)) & 0xFFUL; - r2 = (r2 >> (56 - leadzero)) & 0xFFUL; - #endif - return r1 - r2; */ - - .align 4 -L(different0): - mr r10,r5 -#ifdef __LITTLE_ENDIAN__ -L(different1): - neg r11,r8 - sldi r10,r10,3 - and r8,r11,r8 - addi r10,r10,-8 - cntlzd r8,r8 - subfic r8,r8,63 - extsw r8,r8 - cmpld cr7,r8,r10 - ble cr7,L(different2) - mr r8,r10 -L(different2): - extsw r8,r8 -#else -L(different1): - addi r10,r10,-1 - cntlzd r8,r8 - sldi r10,r10,3 - cmpld cr7,r8,r10 - blt cr7,L(different2) - mr r8,r10 -L(different2): - subfic r8,r8,56 -#endif - srd r7,r7,r8 - srd r9,r9,r8 - rldicl r3,r7,0,56 - rldicl r9,r9,0,56 - subf r9,r9,r3 /* r9 = s1 byte (r3) - s2 byte (r9). */ - extsw r9,r9 - mr r3,r9 - blr - - /* If unaligned 16 bytes reads across a 4K page boundary, it uses - a simple byte a byte comparison until the page alignment for s1 - is reached.
*/ - .align 4 -L(pagecross): - lbz r7,0(r3) - lbz r9,0(r4) - subfic r8,r8,4095 - cmplw cr7,r9,r7 - bne cr7,L(byte_ne_3) - cmpdi cr7,r9,0 - beq cr7,L(byte_ne_0) - addi r10,r10,-1 - subf r7,r8,r10 - subf r9,r7,r10 - addi r9,r9,1 - mtctr r9 - b L(pagecross_loop1) - - .align 4 -L(pagecross_loop0): - beq cr7,L(ret0) - lbz r9,0(r3) - lbz r8,0(r4) - addi r10,r10,-1 - cmplw cr7,r9,r8 - cmpdi cr5,r9,0 - bne cr7,L(byte_ne_2) /* Bytes differ (cr7 from the cmplw above). */ - beq cr5,L(byte_ne_0) /* s1 byte is '\0' (cr5 from the cmpdi above). */ -L(pagecross_loop1): - cmpdi cr7,r10,0 - addi r3,r3,1 - addi r4,r4,1 - bdnz L(pagecross_loop0) - cmpdi cr7,r7,0 - li r9,0 - bne+ cr7,L(align_8b) - b L(ret1) - - /* If both source1 and source2 are doubleword aligned, there is no - need for page boundary cross checks. */ - .align 4 -L(loop_eq_align_0): - ld r7,0(r3) - ld r9,0(r4) - cmpb r8,r7,r8 - cmpb r6,r7,r9 - orc. r8,r8,r6 - bne cr0,L(different1) - - cmpldi cr7,r10,8 - ble cr7,L(ret0) - addi r9,r10,-9 - - li r5,0 - srdi r9,r9,3 - addi r9,r9,1 - mtctr r9 - b L(loop_eq_align_2) - - .align 4 -L(loop_eq_align_1): - bdz L(ret0) -L(loop_eq_align_2): - ldu r7,8(r3) - addi r10,r10,-8 - ldu r9,8(r4) - cmpb r8,r7,r5 - cmpb r6,r7,r9 - orc. r8,r8,r6 - beq cr0,L(loop_eq_align_1) - b L(different1) - - .align 4 -L(byte_ne_0): - li r7,0 -L(byte_ne_1): - subf r9,r9,r7 - extsw r9,r9 - b L(ret1) - - .align 4 -L(byte_ne_2): - extsw r7,r9 - mr r9,r8 - b L(byte_ne_1) -L(size_reached_0): - li r10,0 -L(size_reached_1): - subf r9,r9,r10 - extsw r9,r9 - b L(ret1) -L(size_reached_2): - extsw r10,r9 - mr r9,r7 - b L(size_reached_1) -L(byte_ne_3): - extsw r7,r7 - b L(byte_ne_1) -L(byte_ne_4): - extsw r10,r9 - mr r9,r8 - b L(size_reached_1) -END(STRNCMP) -libc_hidden_builtin_def(strncmp) diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S deleted file mode 100644 index 6d40f30ff7..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S +++ /dev/null @@ -1,465 +0,0 @@ -/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifdef USE_AS_STPNCPY -# ifndef STPNCPY -# define FUNC_NAME __stpncpy -# else -# define FUNC_NAME STPNCPY -# endif -#else -# ifndef STRNCPY -# define FUNC_NAME strncpy -# else -# define FUNC_NAME STRNCPY -# endif -#endif /* !USE_AS_STPNCPY */ - -#ifndef MEMSET -/* For builds without IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define MEMSET __GI_memset -# else -# define MEMSET memset -# endif -#endif - -#define FRAMESIZE (FRAME_MIN_SIZE+48) - -/* Implements the function - - char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) - - or - - char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) - - if USE_AS_STPCPY is defined. - - The implementation uses unaligned doubleword access to avoid specialized - code paths depending of data alignment. Although recent powerpc64 uses - 64K as default, the page cross handling assumes minimum page size of - 4k. */ - - .machine power7 -EALIGN (FUNC_NAME, 4, 0) - - /* Check if the [src]+15 will cross a 4K page by checking if the bit - indicating the page size changes. 
Basically: - - uint64_t srcin = (uint64_t)src; - uint64_t ob = srcin & 4096UL; - uint64_t nb = (srcin+15UL) & 4096UL; - if (ob ^ nb) - goto pagecross; */ - - addi r10,r4,16 - rlwinm r9,r4,0,19,19 - - /* Save some non-volatile registers on the stack. */ - std r26,-48(r1) - std r27,-40(r1) - - rlwinm r8,r10,0,19,19 - - std r28,-32(r1) - std r29,-24(r1) - - cmpld cr7,r9,r8 - - std r30,-16(r1) - std r31,-8(r1) - - /* Update CFI. */ - cfi_offset(r26, -48) - cfi_offset(r27, -40) - cfi_offset(r28, -32) - cfi_offset(r29, -24) - cfi_offset(r30, -16) - cfi_offset(r31, -8) - - beq cr7,L(unaligned_lt_16) - rldicl r9,r4,0,61 - subfic r8,r9,8 - cmpld cr7,r5,r8 - bgt cr7,L(pagecross) - - /* At this points there is 1 to 15 bytes to check and write. Since it could - be either from first unaligned 16 bytes access or from bulk copy, the code - uses an unrolled byte read/write instead of trying to analyze the cmpb - results. */ -L(short_path): - mr r9,r3 -L(short_path_1): - /* Return if there are no more bytes to be written. */ - cmpdi cr7,r5,0 - beq cr7,L(short_path_loop_end_1) -L(short_path_2): - /* Copy one char from src (r4) and write it to dest (r9). If it is the - end-of-string, start the null padding. Continue, otherwise. */ - lbz r10,0(r4) - cmpdi cr7,r10,0 - stb r10,0(r9) - beq cr7,L(zero_pad_start_1) - /* If there are no more bytes to be written, return. */ - cmpdi cr0,r5,1 - addi r8,r9,1 - addi r6,r5,-1 - beq cr0,L(short_path_loop_end_0) - /* Copy another char from src (r4) to dest (r9). Check again if it is - the end-of-string. If so, start the null padding. */ - lbz r10,1(r4) - cmpdi cr7,r10,0 - stb r10,1(r9) - beq cr7,L(zero_pad_start_prepare_1) - /* Eagerly decrement r5 by 3, which is the number of bytes already - written, plus one write that will be performed later on. 
*/ - addi r10,r5,-3 - b L(short_path_loop_1) - - .align 4 -L(short_path_loop): - /* At this point, the induction variable, r5, as well as the pointers - to dest and src (r9 and r4, respectivelly) have been updated. - - Note: The registers r7 and r10 are induction variables derived from - r5. They are used to determine if the total number of writes has - been reached at every other write. - - Copy one char from src (r4) and write it to dest (r9). If it is the - end-of-string, start the null padding. Continue, otherwise. */ - lbz r8,0(r4) - addi r7,r10,-2 - cmpdi cr5,r8,0 - stb r8,0(r9) - beq cr5,L(zero_pad_start_1) - beq cr7,L(short_path_loop_end_0) - /* Copy another char from src (r4) to dest (r9). Check again if it is - the end-of-string. If so, start the null padding. */ - lbz r8,1(r4) - cmpdi cr7,r8,0 - stb r8,1(r9) - beq cr7,L(zero_pad_start) - mr r10,r7 -L(short_path_loop_1): - /* This block is reached after two chars have been already written to - dest. Nevertheless, r5 (the induction variable), r9 (the pointer to - dest), and r4 (the pointer to src) have not yet been updated. - - At this point: - r5 holds the count of bytes yet to be written plus 2. - r9 points to the last two chars that were already written to dest. - r4 points to the last two chars that were already copied from src. - - The algorithm continues by decrementing r5, the induction variable, - so that it reflects the last two writes. The pointers to dest (r9) - and to src (r4) are increment by two, for the same reason. - - Note: Register r10 is another induction variable, derived from r5, - which determines if the total number of writes has been reached. */ - addic. r5,r5,-2 - addi r9,r9,2 - cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */ - addi r4,r4,2 - addi r6,r9,1 - bne cr0,L(short_path_loop) /* Check if the total number of writes - has been reached at every other - write. 
*/ -#ifdef USE_AS_STPNCPY - mr r3,r9 - b L(short_path_loop_end) -#endif - -L(short_path_loop_end_0): -#ifdef USE_AS_STPNCPY - addi r3,r9,1 - b L(short_path_loop_end) -#endif -L(short_path_loop_end_1): -#ifdef USE_AS_STPNCPY - mr r3,r9 -#endif -L(short_path_loop_end): - /* Restore non-volatile registers. */ - ld r26,-48(r1) - ld r27,-40(r1) - ld r28,-32(r1) - ld r29,-24(r1) - ld r30,-16(r1) - ld r31,-8(r1) - blr - - /* This code pads the remainder of dest with NULL bytes. The algorithm - calculates the remaining size and calls memset. */ - .align 4 -L(zero_pad_start): - mr r5,r10 - mr r9,r6 -L(zero_pad_start_1): - /* At this point: - - r5 holds the number of bytes that still have to be written to - dest. - - r9 points to the position, in dest, where the first null byte - will be written. - The above statements are true both when control reaches this label - from a branch or when falling through the previous lines. */ -#ifndef USE_AS_STPNCPY - mr r30,r3 /* Save the return value of strncpy. */ -#endif - /* Prepare the call to memset. */ - mr r3,r9 /* Pointer to the area to be zero-filled. */ - li r4,0 /* Byte to be written (zero). */ - - /* We delayed the creation of the stack frame, as well as the saving of - the link register, because only at this point, we are sure that - doing so is actually needed. */ - - /* Save the link register. */ - mflr r0 - std r0,16(r1) - cfi_offset(lr, 16) - - /* Create the stack frame. */ - stdu r1,-FRAMESIZE(r1) - cfi_adjust_cfa_offset(FRAMESIZE) - - bl MEMSET - nop - - /* Restore the stack frame. */ - addi r1,r1,FRAMESIZE - cfi_adjust_cfa_offset(-FRAMESIZE) - /* Restore the link register. */ - ld r0,16(r1) - mtlr r0 - -#ifndef USE_AS_STPNCPY - mr r3,r30 /* Restore the return value of strncpy, i.e.: - dest. For stpncpy, the return value is the - same as return value of memset. */ -#endif - - /* Restore non-volatile registers and return. 
*/ - ld r26,-48(r1) - ld r27,-40(r1) - ld r28,-32(r1) - ld r29,-24(r1) - ld r30,-16(r1) - ld r31,-8(r1) - blr - - /* The common case where [src]+16 will not cross a 4K page boundary. - In this case the code fast check the first 16 bytes by using doubleword - read/compares and update destiny if neither total size or null byte - is found in destiny. */ - .align 4 -L(unaligned_lt_16): - cmpldi cr7,r5,7 - ble cr7,L(short_path) - ld r7,0(r4) - li r8,0 - cmpb r8,r7,r8 - cmpdi cr7,r8,0 - bne cr7,L(short_path_prepare_2) - addi r6,r5,-8 - std r7,0(r3) - addi r9,r3,8 - cmpldi cr7,r6,7 - addi r7,r4,8 - ble cr7,L(short_path_prepare_1_1) - ld r4,8(r4) - cmpb r8,r4,r8 - cmpdi cr7,r8,0 - bne cr7,L(short_path_prepare_2_1) - std r4,8(r3) - addi r29,r3,16 - addi r5,r5,-16 - /* Neither the null byte was found or total length was reached, - align to 16 bytes and issue a bulk copy/compare. */ - b L(align_to_16b) - - /* In the case of 4k page boundary cross, the algorithm first align - the address to a doubleword, calculate a mask based on alignment - to ignore the bytes and continue using doubleword. */ - .align 4 -L(pagecross): - rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */ - li r6,-1 /* MASK = 0xffffffffffffffffUL. */ - sldi r9,r9,3 /* Calculate padding. */ - ld r7,0(r11) /* Load doubleword from memory. */ -#ifdef __LITTLE_ENDIAN__ - sld r9,r6,r9 /* MASK = MASK << padding. */ -#else - srd r9,r6,r9 /* MASK = MASK >> padding. */ -#endif - orc r9,r7,r9 /* Mask bits that are not part of the - string. */ - li r7,0 - cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */ - cmpdi cr7,r9,0 - bne cr7,L(short_path_prepare_2) - subf r8,r8,r5 /* Adjust total length. */ - cmpldi cr7,r8,8 /* Check if length was reached. */ - ble cr7,L(short_path_prepare_2) - - /* For next checks we have aligned address, so we check for more - three doublewords to make sure we can read 16 unaligned bytes - to start the bulk copy with 16 aligned addresses. 
*/ - ld r7,8(r11) - cmpb r9,r7,r9 - cmpdi cr7,r9,0 - bne cr7,L(short_path_prepare_2) - addi r7,r8,-8 - cmpldi cr7,r7,8 - ble cr7,L(short_path_prepare_2) - ld r7,16(r11) - cmpb r9,r7,r9 - cmpdi cr7,r9,0 - bne cr7,L(short_path_prepare_2) - addi r8,r8,-16 - cmpldi cr7,r8,8 - ble cr7,L(short_path_prepare_2) - ld r8,24(r11) - cmpb r9,r8,r9 - cmpdi cr7,r9,0 - bne cr7,L(short_path_prepare_2) - - /* No null byte found in the 32 bytes readed and length not reached, - read source again using unaligned loads and store them. */ - ld r9,0(r4) - addi r29,r3,16 - addi r5,r5,-16 - std r9,0(r3) - ld r9,8(r4) - std r9,8(r3) - - /* Align source to 16 bytes and adjust destiny and size. */ -L(align_to_16b): - rldicl r9,r10,0,60 - rldicr r28,r10,0,59 - add r12,r5,r9 - subf r29,r9,r29 - - /* The bulk read/compare/copy loads two doublewords, compare and merge - in a single register for speed. This is an attempt to speed up the - null-checking process for bigger strings. */ - - cmpldi cr7,r12,15 - ble cr7,L(short_path_prepare_1_2) - - /* Main loop for large sizes, unrolled 2 times to get better use of - pipeline. */ - ld r8,0(28) - ld r10,8(28) - li r9,0 - cmpb r7,r8,r9 - cmpb r9,r10,r9 - or. r6,r9,r7 - bne cr0,L(short_path_prepare_2_3) - addi r5,r12,-16 - addi r4,r28,16 - std r8,0(r29) - std r10,8(r29) - cmpldi cr7,r5,15 - addi r9,r29,16 - ble cr7,L(short_path_1) - mr r11,r28 - mr r6,r29 - li r30,0 - subfic r26,r4,48 - subfic r27,r9,48 - - b L(loop_16b) - - .align 4 -L(loop_start): - ld r31,0(r11) - ld r10,8(r11) - cmpb r0,r31,r7 - cmpb r8,r10,r7 - or. r7,r0,r8 - addi r5,r5,-32 - cmpldi cr7,r5,15 - add r4,r4,r26 - add r9,r9,r27 - bne cr0,L(short_path_prepare_2_2) - add r4,r28,r4 - std r31,0(r6) - add r9,r29,r9 - std r10,8(r6) - ble cr7,L(short_path_1) - -L(loop_16b): - ld r10,16(r11) - ld r0,24(r11) - cmpb r8,r10,r30 - cmpb r7,r0,r30 - or. 
r7,r8,r7 - addi r12,r12,-32 - cmpldi cr7,r12,15 - addi r11,r11,32 - bne cr0,L(short_path_2) - std r10,16(r6) - addi r6,r6,32 - std r0,-8(r6) - bgt cr7,L(loop_start) - - mr r5,r12 - mr r4,r11 - mr r9,r6 - b L(short_path_1) - - .align 4 -L(short_path_prepare_1_1): - mr r5,r6 - mr r4,r7 - b L(short_path_1) -L(short_path_prepare_1_2): - mr r5,r12 - mr r4,r28 - mr r9,r29 - b L(short_path_1) -L(short_path_prepare_2): - mr r9,r3 - b L(short_path_2) -L(short_path_prepare_2_1): - mr r5,r6 - mr r4,r7 - b L(short_path_2) -L(short_path_prepare_2_2): - mr r5,r12 - mr r4,r11 - mr r9,r6 - b L(short_path_2) -L(short_path_prepare_2_3): - mr r5,r12 - mr r4,r28 - mr r9,r29 - b L(short_path_2) -L(zero_pad_start_prepare_1): - mr r5,r6 - mr r9,r8 - b L(zero_pad_start_1) -END (FUNC_NAME) - -#ifndef USE_AS_STPNCPY -libc_hidden_builtin_def (strncpy) -#endif diff --git a/sysdeps/powerpc/powerpc64/power8/strnlen.S b/sysdeps/powerpc/powerpc64/power8/strnlen.S deleted file mode 100644 index 3eadbfb09e..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strnlen.S +++ /dev/null @@ -1,433 +0,0 @@ -/* Optimized strnlen implementation for POWER8 using a vmx loop. - - Copyright (C) 2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* It is implemented the following heuristic: - 1. 
Case maxlen <= 32: align the pointer to 8 bytes to loop through - reading doublewords. Uses the POWER7 algorithm. - 2. Case maxlen > 32: check for null bytes in the first 16 bytes using - unaligned accesses. Return length if found. Otherwise: - 2.1 Case maxlen < 64: deduct the bytes previously read, align - the pointer to 16 bytes and loop through reading quadwords - until find null bytes or reach maxlen. - 2.2 Case maxlen > 64: deduct the bytes previously read, align - the pointer to 64 bytes and set up a counter to loop through - reading in strides of 64 bytes. In case it finished the loop - with null bytes not found, process the remainder bytes by - switching to the loop to heuristic in 2.1. */ - -#include <sysdep.h> - -/* Define default page size to 4KB. */ -#define PAGE_SIZE 4096 - -/* The following macros implement Power ISA v2.07 opcodes - that could not be used directly into this code to the keep - compatibility with older binutils versions. */ - -/* Move from vector register doubleword. */ -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) - -/* Move to vector register doubleword. */ -#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) - -/* Vector Bit Permute Quadword. */ -#define VBPERMQ(t,a,b) .long (0x1000054c \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) - -/* Vector Population Count Halfword. */ -#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21))) - -/* Vector Count Leading Zeros Halfword. */ -#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21))) - - -/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */ -/* TODO: change to power8 when minimum required binutils allows it. */ - .machine power7 -ENTRY (__strnlen) - CALL_MCOUNT 2 - dcbt 0,r3 - - cmpldi r4,32 /* Check if maxlen <= 32. */ - ble L(small_range) /* If maxlen <= 32. 
*/ - - /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary - otherwise the processor may raise a memory access error. - Use the following check to detect when such accesses are unsafe: - ((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16) - If so, switch to the code that handles - the string when maxlen <= 32. */ - clrldi r10,r3,52 - cmpldi cr7,r10,PAGE_SIZE-16 - bgt cr7,L(small_range) /* If less than 16B of page end. */ - - /* Compute our permute constant r8. */ - li r7,0 - /* Compute a bpermd constant to move bit 0 of each word into - a halfword value, and count trailing zeros. */ -#ifdef __LITTLE_ENDIAN__ - li r8,0x2820 - oris r8,r8,0x3830 - sldi r8,r8,32 - ori r8,r8,0x0800 - oris r8,r8,0x1810 -#else - li r8,0x1018 - oris r8,r8,0x0008 - sldi r8,r8,32 - ori r8,r8,0x3038 - oris r8,r8,0x2028 -#endif - - /* maxlen > 32. Optimistically check for null bytes in the first - 16 bytes of the string using unaligned accesses. */ - ld r5,0(r3) - ld r6,8(r3) - cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */ - cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */ - or. r7,r10,r11 - bne cr0, L(early_find) /* If found null bytes. */ - - /* At this point maxlen > 32 and null bytes were not found in the first - 16 bytes. Prepare for loop using VMX. */ - - /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */ - - addi r5,r3,16 /* Align up, or just add the 16B we - already checked. */ - li r0,15 - and r7,r5,r0 /* Find offset into 16B alignment. */ - andc r5,r5,r0 /* Quadword align up s to the next quadword. */ - li r0,16 - subf r0,r7,r0 - subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */ - - - /* Compute offsets for vmx loads, and precompute the vbpermq - constants for both the 64B and 16B loops. */ - li r6,0 - vspltisb v0,0 - vspltisb v10,3 - lvsl v11,r6,r6 - vslb v10,v11,v10 - - cmpldi r4,64 /* Check maxlen < 64. */ - blt L(smaller) /* If maxlen < 64 */ - - /* In order to begin the 64B loop, it needs to be 64 - bytes aligned. 
So read quadwords until it is aligned or found null - bytes. At worst case it will be aligned after the fourth iteration, - so unroll the loop to avoid counter checking. */ - andi. r7,r5,63 /* Check if r5 is 64 bytes aligned. */ - beq cr0,L(preloop_64B) /* If it is already 64B aligned. */ - lvx v1,r5,r6 - vcmpequb. v1,v1,v0 - addi r5,r5,16 - addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */ - bne cr6,L(found_aligning64B) /* If found null bytes. */ - - /* Unroll 3x above code block until aligned or find null bytes. */ - andi. r7,r5,63 - beq cr0,L(preloop_64B) - lvx v1,r5,r6 - vcmpequb. v1,v1,v0 - addi r5,r5,16 - addi r4,r4,-16 - bne cr6,L(found_aligning64B) - - andi. r7,r5,63 - beq cr0,L(preloop_64B) - lvx v1,r5,r6 - vcmpequb. v1,v1,v0 - addi r5,r5,16 - addi r4,r4,-16 - bne cr6,L(found_aligning64B) - - andi. r7,r5,63 - beq cr0,L(preloop_64B) - lvx v1,r5,r6 - vcmpequb. v1,v1,v0 - addi r5,r5,16 - addi r4,r4,-16 - bne cr6,L(found_aligning64B) - - /* At this point r5 should be 64 bytes aligned. - Prepare for the 64B loop. */ - .p2align 4 -L(preloop_64B): - /* Check if maxlen became less than 64, therefore disallowing the - 64B loop. If so, switch to the 16B loop code. */ - cmpldi r4,64 /* Check if maxlen < 64. */ - blt L(smaller) /* If maxlen < 64. */ - /* Set some constant values. */ - li r7,16 - li r10,32 - li r9,48 - - /* Compute the number of 64 bytes iterations needed. */ - srdi r11,r4,6 /* Compute loop count (maxlen / 64). */ - andi. r4,r4,63 /* Set maxlen to the remainder (maxlen % 64). */ - mtctr r11 /* Move loop count to counter register. */ - - /* Handle maxlen >= 64. Loop over the bytes in strides of 64B. */ - .p2align 4 -L(loop_64B): - lvx v1,r5,r6 /* r5 is the pointer to s. */ - lvx v2,r5,r7 - lvx v3,r5,r10 - lvx v4,r5,r9 - /* Compare the four 16B vectors to obtain the least 16 values. - Null bytes should emerge into v7, then check for null bytes. */ - vminub v5,v1,v2 - vminub v6,v3,v4 - vminub v7,v5,v6 - vcmpequb. v7,v7,v0 /* Check for null bytes. 
*/ - addi r5,r5,64 /* Advance pointer to the next iteration. */ - bne cr6,L(found_64B) /* If found null bytes. */ - bdnz L(loop_64B) /* Continue the loop if count > 0. */ - -/* Hit loop end without null match. So fall through to handle the remainder. */ - - /* Prepare a 16B loop to handle two cases: - 1. If the remaining maxlen is < 64. - 2. If maxlen >= 64, and reached end of the 64B loop with null - bytes not found. Thus handle the remainder bytes here. */ - .p2align 4 -L(smaller): - cmpldi r4,0 /* Check if maxlen is zero. */ - beq L(done) /* If maxlen is zero. */ - - /* Place rounded up number of qw's to check into a vmx - register, and use some vector tricks to minimize - branching. */ - MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */ - vspltisb v5,1 - vspltisb v6,15 - vspltb v2,v7,7 - vaddubs v3,v5,v6 /* v3 = 1 + 15 = 16 in each byte. */ - -#ifdef __LITTLE_ENDIAN__ - vspltish v5,1 /* v5 = 1 in each halfword, used below to form the trailing-zero mask. */ -#endif - - /* Loop in 16B aligned increments now. */ - .p2align 4 -L(loop_16B): - lvx v1,r5,r6 /* Load quadword into vector register. */ - addi r5,r5,16 /* Increment address to next 16B block. */ - vor v7,v2,v2 /* Save loop count (v2) into v7. */ - vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */ - vminub v4,v1,v2 - vcmpequb. v4,v4,v0 /* Checking for null bytes. */ - beq cr6,L(loop_16B) /* If null bytes not found. */ - - vcmpequb v1,v1,v0 - VBPERMQ(v1,v1,v10) -#ifdef __LITTLE_ENDIAN__ - vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */ - vandc v2,v2,v1 - VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */ -#else - VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */ -#endif - /* Truncate to maximum allowable offset. */ - vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond - maxlen. */ - vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */ - - MFVRD(r0,v1) - addi r5,r5,-16 /* Undo speculative bump. */ - extsb r0,r0 /* Clear whatever gunk is in the high 56b. */ - add r5,r5,r0 /* Add the offset of whatever was found. 
*/ -L(done): - subf r3,r3,r5 /* Length is equal to the offset of null byte - matched minus the pointer to s. */ - blr /* Done. */ - - /* Handle case of maxlen > 64 and found null bytes in last block - of 64 bytes read. */ - .p2align 4 -L(found_64B): - /* A zero was found. Reduce the result. */ - vcmpequb v1,v1,v0 - vcmpequb v2,v2,v0 - vcmpequb v3,v3,v0 - vcmpequb v4,v4,v0 - - /* Permute the first bit of each byte into bits 48-63. */ - VBPERMQ(v1,v1,v10) - VBPERMQ(v2,v2,v10) - VBPERMQ(v3,v3,v10) - VBPERMQ(v4,v4,v10) - - /* Shift each component into its correct position for merging. */ -#ifdef __LITTLE_ENDIAN__ - vsldoi v2,v2,v2,2 - vsldoi v3,v3,v3,4 - vsldoi v4,v4,v4,6 -#else - vsldoi v1,v1,v1,6 - vsldoi v2,v2,v2,4 - vsldoi v3,v3,v3,2 -#endif - - /* Merge the results and move to a GPR. */ - vor v1,v2,v1 - vor v2,v3,v4 - vor v4,v1,v2 - - /* Adjust address to the start of the current 64B block. */ - addi r5,r5,-64 - - MFVRD(r10,v4) -#ifdef __LITTLE_ENDIAN__ - addi r9,r10,-1 /* Form a mask from trailing zeros. */ - andc r9,r9,r10 - popcntd r0,r9 /* Count the bits in the mask. */ -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - subf r5,r3,r5 - add r3,r5,r0 /* Compute final length. */ - blr /* Done. */ - - /* Handle case where null bytes were found while aligning - as a preparation for the 64B loop. */ - .p2align 4 -L(found_aligning64B): - VBPERMQ(v1,v1,v10) -#ifdef __LITTLE_ENDIAN__ - MFVRD(r10,v1) - addi r9,r10,-1 /* Form a mask from trailing zeros. */ - andc r9,r9,r10 - popcntd r0,r9 /* Count the bits in the mask. */ -#else - vsldoi v1,v1,v1,6 - MFVRD(r10,v1) - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - addi r5,r5,-16 /* Adjust address to offset of last 16 bytes - read. */ - /* Calculate length as subtracted the pointer to s of last 16 bytes - offset, added with the bytes before the match. */ - subf r5,r3,r5 - add r3,r5,r0 - blr /* Done. 
*/ - - /* Handle case of maxlen > 32 and found null bytes within the first - 16 bytes of s. */ - .p2align 4 -L(early_find): - bpermd r5,r8,r10 /* r8 contains the bit permute constants. */ - bpermd r6,r8,r11 - sldi r5,r5,8 - or r5,r5,r6 /* r5 should hold a 16-bit mask of - a potential 0. */ - cntlzd r5,r5 /* Count leading zeros. */ - addi r3,r5,-48 /* Deduct the 48 leading zeros always - present. */ - blr /* Done. */ - - /* Handle maxlen <= 32, or s too close to a page end. Use the POWER7 algorithm. */ - .p2align 4 -L(small_range): - clrrdi r8,r3,3 /* Align the pointer to 8B. */ - li r0,0 - /* Register contents at this point: - r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B. - r7 (last acceptable address) is computed below. */ - cmpldi r4,0 /* Check if maxlen is zero. */ - beq L(end_max) /* If maxlen is zero. */ - - /* Calculate the last acceptable address and check for possible - addition overflow by using saturated math: - r7 = r3 + r4 - r7 |= -(r7 < r3) */ - add r7,r3,r4 - subfc r6,r3,r7 - subfe r9,r9,r9 - extsw r6,r9 - or r7,r7,r6 - addi r7,r7,-1 - - clrrdi r7,r7,3 /* Align to 8B address of last - acceptable address. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load aligned doubleword. */ - cmpb r10,r12,r0 /* Check for null bytes. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - sld r10,r10,r6 -#else - sld r10,r10,r6 - srd r10,r10,r6 -#endif /* __LITTLE_ENDIAN__ */ - cmpldi cr7,r10,0 - bne cr7,L(done_small) /* If found null byte. */ - - cmpld r8,r7 /* Check if reached maxlen. */ - beq L(end_max) /* If reached maxlen. */ - - /* Still in the small-range path. Read aligned doublewords until - null bytes are found or maxlen is reached. */ - .p2align 4 -L(loop_small): - ldu r12,8(r8) /* Load next doubleword and update r8. */ - cmpb r10,r12,r0 /* Check for null bytes. */ - cmpldi cr6,r10,0 - bne cr6,L(done_small) /* If found null bytes. */ - cmpld r8,r7 /* Check if reached maxlen. */ - bne L(loop_small) /* If it has more bytes to read. 
*/ - mr r3,r4 /* Reached maxlen with null bytes not found. - Length is equal to maxlen. */ - blr /* Done. */ - - /* Still handling case of maxlen <= 32. Found null bytes. - Registers: r10 == match bits within doubleword, r8 == address of - last doubleword read, r3 == pointer to s, r4 == maxlen. */ - .p2align 4 -L(done_small): -#ifdef __LITTLE_ENDIAN__ - /* Count trailing zeros. */ - addi r0,r10,-1 - andc r0,r0,r10 - popcntd r0,r0 -#else - cntlzd r0,r10 /* Count leading zeros before the match. */ -#endif - sub r3,r8,r3 /* Calculate total of bytes before the match. */ - srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ - add r3,r3,r0 /* Length until the match. */ - cmpld r3,r4 /* Check length is greater than maxlen. */ - blelr - mr r3,r4 /* If length is greater than maxlen, return - maxlen. */ - blr - - /* Handle case of reached maxlen with null bytes not found. */ - .p2align 4 -L(end_max): - mr r3,r4 /* Length is equal to maxlen. */ - blr /* Done. */ - - -END (__strnlen) -libc_hidden_def (__strnlen) -weak_alias (__strnlen, strnlen) -libc_hidden_def (strnlen) diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S deleted file mode 100644 index 8eb74853c3..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strrchr.S +++ /dev/null @@ -1,464 +0,0 @@ -/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. - Copyright (C) 2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* char *[r3] strrchr (char *s [r3], int c [r4]) */ -/* TODO: change these to the actual instructions when the minimum required - binutils allows it. */ -#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define VBPERMQ(t,a,b) .long (0x1000054c \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) -#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21))) -#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21))) -#define VADDUQM(t,a,b) .long (0x10000100 \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) -#ifdef __LITTLE_ENDIAN__ -/* Find the match position from v6 and place result in r6. */ -# define CALCULATE_MATCH() \ - VBPERMQ(v6, v6, v10); \ - vsldoi v6, v6, v6, 6; \ - MFVRD(r7, v6); \ - cntlzd r6, r7; \ - subfic r6, r6, 15; -/* - * Find the first null position to mask bytes after null. - * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw. - * Result placed at v2. - */ -# define FIND_NULL_POS(reg) \ - vspltisb v11, -1; \ - VADDUQM(v11, reg, v11); \ - vandc v11, v11, reg; \ - VPOPCNTD(v2, v11); \ - vspltb v11, v2, 15; \ - vcmpequb. v11, v11, v9; \ - blt cr6, 1f; \ - vsldoi v9, v0, v9, 1; \ - vslo v2, v2, v9; \ -1: \ - vsumsws v2, v2, v0; -#else -# define CALCULATE_MATCH() \ - VBPERMQ(v6, v6, v10); \ - MFVRD(r7, v6); \ - addi r6, r7, -1; \ - andc r6, r6, r7; \ - popcntd r6, r6; \ - subfic r6, r6, 15; -# define FIND_NULL_POS(reg) \ - VCLZD(v2, reg); \ - vspltb v11, v2, 7; \ - vcmpequb. 
v11, v11, v9; \ - blt cr6, 1f; \ - vsldoi v9, v0, v9, 1; \ - vsro v2, v2, v9; \ -1: \ - vsumsws v2, v2, v0; -#endif /* !__LITTLE_ENDIAN__ */ - .machine power7 -ENTRY (strrchr) - CALL_MCOUNT 2 - dcbt 0,r3 - clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ - cmpdi cr7,r4,0 - ld r12,0(r8) /* Load doubleword from memory. */ - li r9,0 /* Address of the last occurrence found so far (0 if none). */ - li r0,0 /* Doubleword with null chars to use - with cmpb. */ - - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - - beq cr7,L(null_match) - - /* Replicate byte to doubleword. */ - insrdi r4,r4,8,48 - insrdi r4,r4,16,32 - insrdi r4,r4,32,0 - - /* r4 is changed now. If c was wider than one char, its low byte - may be null, so check for null again. */ - cmpdi cr7,r4,0 - beq cr7,L(null_match) - /* Now r4 has a doubleword of c bytes and r0 has - a doubleword of null bytes. */ - - cmpb r10,r12,r4 /* Compare each byte against c byte. */ - cmpb r11,r12,r0 /* Compare each byte against null byte. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r10,r10,r6 - srd r11,r11,r6 - sld r10,r10,r6 - sld r11,r11,r6 -#else - sld r10,r10,r6 - sld r11,r11,r6 - srd r10,r10,r6 - srd r11,r11,r6 -#endif - or r5,r10,r11 /* OR the results to speed things up. */ - cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes - have been found. */ - bne cr7,L(done) - -L(align): - andi. r12, r8, 15 - - /* Is r8 on a quadword boundary? If not, r8+8 is, so skip to - the main loop. Otherwise, go through the alignment code. */ - - bne cr0, L(loop) - - /* Handle the second doubleword of the pair. */ - ldu r12,8(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - or r5,r10,r11 - cmpdi cr7,r5,0 - bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - .p2align 5 -L(loop): - /* Load two doublewords, compare and merge in a - single register for speed. 
This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r7,16(r8) - cmpb r10,r12,r4 - cmpb r11,r12,r0 - cmpb r6,r7,r4 - cmpb r7,r7,r0 - or r12,r10,r11 - or r5,r6,r7 - or r5,r12,r5 - cmpdi cr7,r5,0 - beq cr7,L(vector) - - /* OK, one (or both) of the doublewords contains a c/null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a c/null byte. */ - cmpdi cr6,r12,0 - addi r8,r8,-8 - bne cr6,L(done) - - /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r10 so we can calculate - the pointer. */ - - mr r10,r6 - mr r11,r7 - addi r8,r8,8 - - /* r10/r11 have the output of the cmpb instructions, that is, - 0xff in the same position as the c/null byte in the original - doubleword from the string. Use that to calculate the pointer. */ - -L(done): - /* If there are more than one 0xff in r11, find the first position of - 0xff in r11 and fill r10 with 0 from that position. */ - cmpdi cr7,r11,0 - beq cr7,L(no_null) -#ifdef __LITTLE_ENDIAN__ - addi r3,r11,-1 - andc r3,r3,r11 - popcntd r0,r3 -#else - cntlzd r0,r11 -#endif - subfic r0,r0,63 - li r6,-1 -#ifdef __LITTLE_ENDIAN__ - srd r0,r6,r0 -#else - sld r0,r6,r0 -#endif - and r10,r0,r10 -L(no_null): -#ifdef __LITTLE_ENDIAN__ - cntlzd r0,r10 /* Count leading zeros before c matches. */ - addi r3,r10,-1 - andc r3,r3,r10 - addi r10,r11,-1 - andc r10,r10,r11 - cmpld cr7,r3,r10 - bgt cr7,L(no_match) -#else - addi r3,r10,-1 /* Count trailing zeros before c matches. */ - andc r3,r3,r10 - popcntd r0,r3 - cmpld cr7,r11,r10 - bgt cr7,L(no_match) -#endif - srdi r0,r0,3 /* Convert trailing zeros to bytes. */ - subfic r0,r0,7 - add r9,r8,r0 /* Return address of the matching c byte - or null in case c was not found. */ - li r0,0 - cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. 
*/ - beq cr7,L(align) - - .align 4 -L(no_match): - mr r3,r9 - blr - -/* Check the first 32B in GPR's and move to vectorized loop. */ - .p2align 5 -L(vector): - addi r3, r8, 8 - /* Make sure 32B aligned. */ - andi. r10, r3, 31 - bne cr0, L(loop) - vspltisb v0, 0 - /* Precompute vbpermq constant. */ - vspltisb v10, 3 - lvsl v11, r0, r0 - vslb v10, v11, v10 - MTVRD(v1, r4) - li r5, 16 - vspltb v1, v1, 7 - /* Compare 32 bytes in each loop. */ -L(continue): - lvx v4, 0, r3 - lvx v5, r3, r5 - vcmpequb v2, v0, v4 - vcmpequb v3, v0, v5 - vcmpequb v6, v1, v4 - vcmpequb v7, v1, v5 - vor v8, v2, v3 - vor v9, v6, v7 - vor v11, v8, v9 - vcmpequb. v11, v0, v11 - addi r3, r3, 32 - blt cr6, L(continue) - vcmpequb. v8, v0, v8 - blt cr6, L(match) - - /* One (or both) of the quadwords contains c/null. */ - vspltisb v8, 2 - vspltisb v9, 5 - /* Precompute values used for comparison. */ - vsl v9, v8, v9 /* v9 = 0x4040404040404040. */ - vaddubm v8, v9, v9 - vsldoi v8, v0, v8, 1 /* v8 = 0x80. */ - - /* Check if null is in second qw. */ - vcmpequb. v11, v0, v2 - blt cr6, L(secondqw) - - /* Null found in first qw. */ - addi r8, r3, -32 - /* Calculate the null position. */ - FIND_NULL_POS(v2) - /* Check if null is in the first byte. */ - vcmpequb. v11, v0, v2 - blt cr6, L(no_match) - vsububm v2, v8, v2 - /* Mask unwanted bytes after null. */ -#ifdef __LITTLE_ENDIAN__ - vslo v6, v6, v2 - vsro v6, v6, v2 -#else - vsro v6, v6, v2 - vslo v6, v6, v2 -#endif - vcmpequb. v11, v0, v6 - blt cr6, L(no_match) - /* Found a match before null. */ - CALCULATE_MATCH() - add r3, r8, r6 - blr - -L(secondqw): - addi r8, r3, -16 - FIND_NULL_POS(v3) - vcmpequb. v11, v0, v2 - blt cr6, L(no_match1) - vsububm v2, v8, v2 - /* Mask unwanted bytes after null. */ -#ifdef __LITTLE_ENDIAN__ - vslo v7, v7, v2 - vsro v7, v7, v2 -#else - vsro v7, v7, v2 - vslo v7, v7, v2 -#endif - vcmpequb. v11, v0, v7 - blt cr6, L(no_match1) - addi r8, r8, 16 - vor v6, v0, v7 -L(no_match1): - addi r8, r8, -16 - vcmpequb. 
v11, v0, v6 - blt cr6, L(no_match) - /* Found a match before null. */ - CALCULATE_MATCH() - add r3, r8, r6 - blr - -L(match): - /* One (or both) of the quadwords contains a match. */ - mr r8, r3 - vcmpequb. v8, v0, v7 - blt cr6, L(firstqw) - /* Match found in second qw. */ - addi r8, r8, 16 - vor v6, v0, v7 -L(firstqw): - addi r8, r8, -32 - CALCULATE_MATCH() - add r9, r8, r6 /* Record the address of the latest match in r9. */ - b L(continue) -/* We are here because strrchr was called with a null byte. */ - .align 4 -L(null_match): - /* r0 has a doubleword of null bytes. */ - - cmpb r5,r12,r0 /* Compare each byte against null bytes. */ - - /* Move the doublewords left and right to discard the bits that are - not part of the string and bring them back as zeros. */ -#ifdef __LITTLE_ENDIAN__ - srd r5,r5,r6 - sld r5,r5,r6 -#else - sld r5,r5,r6 - srd r5,r5,r6 -#endif - cmpdi cr7,r5,0 /* If r5 == 0, no null bytes - have been found. */ - bne cr7,L(done_null) - - andi. r12, r8, 15 - - /* Is r8 on a quadword boundary? If not, r8+8 is, so skip to - the main loop. Otherwise, go through the alignment code. */ - - bne cr0, L(loop_null) - - /* Handle the second doubleword of the pair. */ - ldu r12,8(r8) - cmpb r5,r12,r0 - cmpdi cr7,r5,0 - bne cr7,L(done_null) - b L(loop_null) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ - - /* Main loop to look for the end of the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 -L(loop_null): - /* Load two doublewords, compare and merge in a - single register for speed. This is an attempt - to speed up the null-checking process for bigger strings. */ - ld r12,8(r8) - ldu r11,16(r8) - cmpb r5,r12,r0 - cmpb r10,r11,r0 - or r6,r5,r10 - cmpdi cr7,r6,0 - beq cr7,L(vector1) - - /* OK, one (or both) of the doublewords contains a null byte. Check - the first doubleword and decrement the address in case the first - doubleword really contains a null byte. 
*/ - - cmpdi cr6,r5,0 - addi r8,r8,-8 - bne cr6,L(done_null) - - /* The null byte must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the - pointer. */ - - mr r5,r10 - addi r8,r8,8 - - /* r5 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the null byte in the original - doubleword from the string. Use that to calculate the pointer. */ -L(done_null): -#ifdef __LITTLE_ENDIAN__ - addi r0,r5,-1 - andc r0,r0,r5 - popcntd r0,r0 -#else - cntlzd r0,r5 /* Count leading zeros before the match. */ -#endif - srdi r0,r0,3 /* Convert trailing zeros to bytes. */ - add r3,r8,r0 /* Return address of the matching null byte. */ - blr -/* Check the first 32B in GPR's and move to vectorized loop. */ - .p2align 5 -L(vector1): - addi r3, r8, 8 - /* Make sure 32B aligned. */ - andi. r10, r3, 31 - bne cr0, L(loop_null) - vspltisb v0, 0 - /* Precompute vbpermq constant. */ - vspltisb v10, 3 - lvsl v11, r0, r0 - vslb v10, v11, v10 - li r5, 16 - /* Compare 32 bytes in each loop. */ -L(continue1): - lvx v4, 0, r3 - lvx v5, r3, r5 - vcmpequb v2, v0, v4 - vcmpequb v3, v0, v5 - vor v8, v2, v3 - vcmpequb. v11, v0, v8 - addi r3, r3, 32 - blt cr6, L(continue1) - addi r3, r3, -32 - VBPERMQ(v2, v2, v10) - VBPERMQ(v3, v3, v10) - /* Shift each component into its correct position for merging. */ -#ifdef __LITTLE_ENDIAN__ - vsldoi v3, v3, v3, 2 -#else - vsldoi v2, v2, v2, 6 - vsldoi v3, v3, v3, 4 -#endif - /* Merge the results and move to a GPR. */ - vor v4, v3, v2 - MFVRD(r5, v4) -#ifdef __LITTLE_ENDIAN__ - addi r6, r5, -1 - andc r6, r6, r5 - popcntd r6, r6 -#else - cntlzd r6, r5 /* Count leading zeros before the match. */ -#endif - add r3, r3, r6 /* Compute final length. 
*/ - blr -END (strrchr) -weak_alias (strrchr, rindex) -libc_hidden_builtin_def (strrchr) diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S deleted file mode 100644 index e9271898f2..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/strspn.S +++ /dev/null @@ -1,202 +0,0 @@ -/* Optimized strspn implementation for Power8. - - Copyright (C) 2016-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* size_t [r3] strspn (const char *string [r3], - const char *needleAccept [r4]) */ - -/* This takes a novel approach by computing a 256 bit mask whereby - each set bit implies the byte is "accepted". P8 vector hardware - has extremely efficient hardware for selecting bits from a mask. - - One might ask "why not use bpermd for short strings"? It is - so slow that its performance about matches the generic PPC64 - variant without any fancy masking, with the added expense of - making the mask. That was the first variant of this. 
*/ - - - -#include "sysdep.h" - -#ifndef USE_AS_STRCSPN -# define USE_AS_STRCSPN 0 -# ifndef STRSPN -# define STRSPN strspn -# endif -# define INITIAL_MASK 0 -# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB -#else -# ifndef STRSPN -# define STRSPN strcspn -# endif -# define INITIAL_MASK -1 -# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB -#endif - -/* Simple macro to use VSX instructions in overlapping VR's. */ -#define XXVR(insn, vrt, vra, vrb) \ - insn 32+vrt, 32+vra, 32+vrb - -/* ISA 2.07B instructions are not all defined for older binutils. - Macros are defined below for these newer instructions in order - to maintain compatibility. */ - -/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ -#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) -#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) - -#define VBPERMQ(t,a,b) .long (0x1000054c \ - | ((t)<<(32-11)) \ - | ((a)<<(32-16)) \ - | ((b)<<(32-21)) ) - - /* This can be updated to power8 once the minimum version of - binutils supports power8 and the above instructions. */ - .machine power7 -EALIGN(STRSPN, 4, 0) - CALL_MCOUNT 2 - - /* Generate useful constants for later on. */ - vspltisb v1, 7 - vspltisb v2, -1 - vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ - vspltisb v10, 0 - vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ - XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ - - /* Prepare to compute 256b mask. */ - addi r4, r4, -1 /* Bias r4 by -1; lbzu in L(next_needle) pre-increments it. */ - li r5, INITIAL_MASK - li r6, INITIAL_MASK - li r7, INITIAL_MASK - li r8, INITIAL_MASK - -#if USE_AS_STRCSPN - /* Ensure the null character never matches by clearing ISA bit 0 in - r5 which is the bit which will check for it in the later usage - of vbpermq. */ - srdi r5, r5, 1 -#endif - - li r11, 1 - sldi r11, r11, 63 /* r11 = 1 << 63; shifted right by each needle byte below. */ - - /* Start interleaved Mask computation. - This will eventually or 1's into ignored bits from vbpermq. */ - lvsr v11, 0, r3 - vspltb v11, v11, 0 /* Splat shift constant. 
*/ - - /* Build a 256b mask in r5-r8. */ - .align 4 -L(next_needle): - lbzu r9, 1(r4) - - cmpldi cr0, r9, 0 - cmpldi cr1, r9, 128 - - /* This is a little tricky. srd only uses the low 7 bits of the - shift count, and if the 0x40 bit is set the result is always 0. - So, we can effectively shift 128b in this case. */ - xori r12, r9, 0x40 /* Invert bit 6. */ - srd r10, r11, r9 /* Mask for bits 0-63. */ - srd r12, r11, r12 /* Mask for bits 64-127. */ - - beq cr0, L(start_cmp) /* Needle byte was null: the mask is complete. */ - - /* Now, or the value into the correct GPR. */ - bge cr1,L(needle_gt128) /* Byte values >= 128 update r7/r8 instead. */ - UPDATE_MASK (r5, r5, r10) /* 0 - 63. */ - UPDATE_MASK (r6, r6, r12) /* 64 - 127. */ - b L(next_needle) - - .align 4 -L(needle_gt128): - UPDATE_MASK (r7, r7, r10) /* 128 - 191. */ - UPDATE_MASK (r8, r8, r12) /* 192 - 255. */ - b L(next_needle) - - - .align 4 -L(start_cmp): - /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */ - mr r0, r3 /* Save r3 for final length computation. */ - MTVRD (v5, r5) - MTVRD (v6, r6) - MTVRD (v7, r7) - MTVRD (v8, r8) - - /* Continue interleaved mask generation. */ -#ifdef __LITTLE_ENDIAN__ - vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ - vsplth v11, v11, 0 /* Only care about the high 16 bits of v11. */ -#else - vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ - vsplth v11, v11, 1 /* Only care about the low 16 bits of v11. */ -#endif - lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */ - - /* Do the merging of the bitmask. */ - XXVR(xxmrghd, v5, v5, v6) - XXVR(xxmrghd, v6, v7, v8) - - /* Finish mask generation. */ - vand v11, v11, v4 /* Throwaway bits not in the mask. */ - - /* Compare the first 1-16B, while masking unwanted bytes. */ - clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ - vxor v9, v0, v1 /* Swap high bit. */ - VBPERMQ (v8, v5, v0) - VBPERMQ (v7, v6, v9) - vor v7, v7, v8 - vor v7, v7, v11 /* Ignore non-participating bytes. */ - vcmpequh. 
v8, v7, v4 - bnl cr6, L(done) /* Leave unless every halfword of v7 equals v4. */ - - addi r3, r3, 16 - - .align 4 -L(vec): - lvx v0, 0, r3 - addi r3, r3, 16 - vxor v9, v0, v1 /* Swap high bit. */ - VBPERMQ (v8, v5, v0) - VBPERMQ (v7, v6, v9) - vor v7, v7, v8 - vcmpequh. v8, v7, v4 - blt cr6, L(vec) /* Keep looping while every halfword of v7 equals v4. */ - - addi r3, r3, -16 /* Undo the post-load increment of the last iteration. */ -L(done): - subf r3, r0, r3 /* r3 = current qw address - original string pointer (r0). */ - MFVRD (r10, v7) - -#ifdef __LITTLE_ENDIAN__ - addi r0, r10, 1 /* Count the trailing 1's. */ - andc r10, r10, r0 - popcntd r10, r10 -#else - xori r10, r10, 0xffff /* Count leading 1's by inverting. */ - addi r3, r3, -48 /* Account for the extra leading zeros. */ - cntlzd r10, r10 -#endif - - add r3, r3, r10 /* Add the bit count computed above to form the return value. */ - blr - -END(STRSPN) -libc_hidden_builtin_def (STRSPN) |