diff options
-rw-r--r-- | sysdeps/powerpc/powerpc64/le/power10/strlen.S | 221 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S | 2 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strlen.c | 3 |
5 files changed, 230 insertions, 1 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S new file mode 100644 index 0000000000..ca7e9eb3d8 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S @@ -0,0 +1,221 @@ +/* Optimized strlen implementation for POWER10 LE. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRLEN +# define STRLEN __strlen +# define DEFINE_STRLEN_HIDDEN_DEF 1 +#endif + +/* TODO: Replace macros by the actual instructions when minimum binutils becomes + >= 2.35. This is used to keep compatibility with older versions. */ +#define VEXTRACTBM(rt,vrb) \ + .long(((4)<<(32-6)) \ + | ((rt)<<(32-11)) \ + | ((8)<<(32-16)) \ + | ((vrb)<<(32-21)) \ + | 1602) + +#define LXVP(xtp,dq,ra) \ + .long(((6)<<(32-6)) \ + | ((((xtp)-32)>>1)<<(32-10)) \ + | ((1)<<(32-11)) \ + | ((ra)<<(32-16)) \ + | dq) + +#define CHECK16(vreg,offset,addr,label) \ + lxv vreg+32,offset(addr); \ + vcmpequb. vreg,vreg,v18; \ + bne cr6,L(label); + +/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # + of bytes already checked. */ +#define CHECK64(offset,addr,label) \ + li r6,offset; \ + LXVP(v4+32,offset,addr); \ + LXVP(v6+32,offset+32,addr); \ + vminub v14,v4,v5; \ + vminub v15,v6,v7; \ + vminub v16,v14,v15; \ + vcmpequb. v0,v16,v18; \ + bne cr6,L(label) + +#define TAIL(vreg,increment) \ + vctzlsbb r4,vreg; \ + subf r3,r3,r5; \ + addi r4,r4,increment; \ + add r3,r3,r4; \ + blr + +/* Implements the function + + int [r3] strlen (const void *s [r3]) + + The implementation can load bytes past a matching byte, but only + up to the next 64B boundary, so it never crosses a page. */ + +.machine power9 + +ENTRY_TOCLESS (STRLEN, 4) + CALL_MCOUNT 1 + + vspltisb v18,0 + vspltisb v19,-1 + + /* Next 16B-aligned address. Prepare address for L(aligned). */ + addi r5,r3,16 + clrrdi r5,r5,4 + + /* Align data and fill bytes not loaded with non matching char. */ + lvx v0,0,r3 + lvsr v1,0,r3 + vperm v0,v19,v0,v1 + + vcmpequb. v6,v0,v18 + beq cr6,L(aligned) + + vctzlsbb r3,v6 + blr + + /* Test next 176B, 16B at a time. The main loop is optimized for longer + strings, so checking the first bytes in 16B chunks benefits a lot + small strings. */ + .p2align 5 +L(aligned): + /* Prepare address for the loop. */ + addi r4,r3,192 + clrrdi r4,r4,6 + + CHECK16(v0,0,r5,tail1) + CHECK16(v1,16,r5,tail2) + CHECK16(v2,32,r5,tail3) + CHECK16(v3,48,r5,tail4) + CHECK16(v4,64,r5,tail5) + CHECK16(v5,80,r5,tail6) + CHECK16(v6,96,r5,tail7) + CHECK16(v7,112,r5,tail8) + CHECK16(v8,128,r5,tail9) + CHECK16(v9,144,r5,tail10) + CHECK16(v10,160,r5,tail11) + + addi r5,r4,128 + + /* Switch to a more aggressive approach checking 64B each time. Use 2 + pointers 128B apart and unroll the loop once to make the pointer + updates and usages separated enough to avoid stalls waiting for + address calculation. */ + .p2align 5 +L(loop): + CHECK64(0,r4,pre_tail_64b) + CHECK64(64,r4,pre_tail_64b) + addi r4,r4,256 + + CHECK64(0,r5,tail_64b) + CHECK64(64,r5,tail_64b) + addi r5,r5,256 + + b L(loop) + + .p2align 5 +L(pre_tail_64b): + mr r5,r4 +L(tail_64b): + /* OK, we found a null byte. Let's look for it in the current 64-byte + block and mark it in its corresponding VR. lxvp vx,0(ry) puts the + low 16B bytes into vx+1, and the high into vx, so the order here is + v5, v4, v7, v6. */ + vcmpequb v1,v5,v18 + vcmpequb v2,v4,v18 + vcmpequb v3,v7,v18 + vcmpequb v4,v6,v18 + + /* Take into account the other 64B blocks we had already checked. */ + add r5,r5,r6 + + /* Extract first bit of each byte. */ + VEXTRACTBM(r7,v1) + VEXTRACTBM(r8,v2) + VEXTRACTBM(r9,v3) + VEXTRACTBM(r10,v4) + + /* Shift each value into their corresponding position. */ + sldi r8,r8,16 + sldi r9,r9,32 + sldi r10,r10,48 + + /* Merge the results. */ + or r7,r7,r8 + or r8,r9,r10 + or r10,r8,r7 + + cnttzd r0,r10 /* Count trailing zeros before the match. */ + subf r5,r3,r5 + add r3,r5,r0 /* Compute final length. */ + blr + + .p2align 5 +L(tail1): + TAIL(v0,0) + + .p2align 5 +L(tail2): + TAIL(v1,16) + + .p2align 5 +L(tail3): + TAIL(v2,32) + + .p2align 5 +L(tail4): + TAIL(v3,48) + + .p2align 5 +L(tail5): + TAIL(v4,64) + + .p2align 5 +L(tail6): + TAIL(v5,80) + + .p2align 5 +L(tail7): + TAIL(v6,96) + + .p2align 5 +L(tail8): + TAIL(v7,112) + + .p2align 5 +L(tail9): + TAIL(v8,128) + + .p2align 5 +L(tail10): + TAIL(v9,144) + + .p2align 5 +L(tail11): + TAIL(v10,160) + +END (STRLEN) + +#ifdef DEFINE_STRLEN_HIDDEN_DEF +weak_alias (__strlen, strlen) +libc_hidden_builtin_def (strlen) +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index f46bf50732..8aa46a3702 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ ifneq (,$(filter %le,$(config-machine))) sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ - rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ + strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 72f7f83e7e..1a6993616f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -112,6 +112,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1, + __strlen_power10) IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00, __strlen_power9) #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S new file mode 100644 index 0000000000..6a774fad58 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S @@ -0,0 +1,2 @@ +#define STRLEN __strlen_power10 +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c index c3bbc78df8..109c8a90bd 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c @@ -31,9 +31,12 @@ extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden; extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden; extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden; extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden; +extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden; libc_ifunc (__libc_strlen, # ifdef __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1) + ? __strlen_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00) ? __strlen_power9 : # endif |