diff options
author | Paul E. Murphy <murphyp@linux.vnet.ibm.com> | 2016-03-14 17:40:46 -0400 |
---|---|---|
committer | Paul E. Murphy <murphyp@linux.vnet.ibm.com> | 2016-04-07 15:51:28 -0500 |
commit | 25dba0ad054723196fb633ba5d8a463ef5cb775c (patch) | |
tree | 03f9259662b03ea65b00bc6721afba2d47104bef /sysdeps | |
parent | 1d2a8245ff7bcc1d8be54f01e26c49297e446aba (diff) | |
download | glibc-25dba0ad054723196fb633ba5d8a463ef5cb775c.tar.gz glibc-25dba0ad054723196fb633ba5d8a463ef5cb775c.tar.xz glibc-25dba0ad054723196fb633ba5d8a463ef5cb775c.zip |
powerpc: Add optimized P8 strspn
This utilizes vectors and bitmasks. For a small needle and a large haystack, the performance improvement is up to 8x. For short strings (0-4B), the cost of computing the bitmask dominates, and it is a tad slower.
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 8 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S | 40 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c | 25 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/strspn.c | 35 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/strspn.S | 179 |
6 files changed, 289 insertions, 1 deletions
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 3b0e3a0117..7ed56bf292 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -19,7 +19,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strcmp-power8 strcmp-power7 strcmp-ppc64 \ strcat-power8 strcat-power7 strcat-ppc64 \ memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \ - strncpy-power8 strstr-power7 strstr-ppc64 + strncpy-power8 strstr-power7 strstr-ppc64 \ + strspn-power8 strspn-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 11a8215ee4..3e1f099d71 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -322,6 +322,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strspn_power8) + IFUNC_IMPL_ADD (array, i, strspn, 1, + __strspn_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */ IFUNC_IMPL (i, name, strstr, IFUNC_IMPL_ADD (array, i, strstr, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S new file mode 100644 index 0000000000..86a4e090d7 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S @@ -0,0 +1,40 @@ +/* Optimized strspn implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strspn_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strspn_power8): \ + cfi_startproc; \ + LOCALENTRY(__strspn_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strspn_power8) \ + END_2(__strspn_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power8/strspn.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c new file mode 100644 index 0000000000..4c63665385 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c @@ -0,0 +1,25 @@ +/* Default strspn implementation for PowerPC64. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRSPN __strspn_ppc +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) +#endif + +#include <string/strspn.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c new file mode 100644 index 0000000000..0e653f3f6e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c @@ -0,0 +1,35 @@ +/* Multiple versions of strspn. PowerPC64 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +# include <string.h> +# include <shlib-compat.h> +# include "init-arch.h" + +#undef strspn +extern __typeof (strspn) __libc_strspn; + +extern __typeof (strspn) __strspn_ppc attribute_hidden; +extern __typeof (strspn) __strspn_power8 attribute_hidden; + +libc_ifunc (__libc_strspn, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? 
__strspn_power8 + : __strspn_ppc); + +weak_alias (__libc_strspn, strspn) +libc_hidden_builtin_def (strspn) diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S new file mode 100644 index 0000000000..0dda437a2e --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strspn.S @@ -0,0 +1,179 @@ +/* Optimized strspn implementation for Power8. + + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* size_t [r3] strspn (const char *string [r3], + const char *needleAccept [r4]) */ + +/* This takes a novel approach by computing a 256 bit mask whereby + each set bit implies the byte is "accepted". P8 vector hardware + has extremely efficient hardware for selecting bits from a mask. + + One might ask "why not use bpermd for short strings"? It is + so slow that its performance about matches the generic PPC64 + variant without any fancy masking, with the added expense of + making the mask. That was the first variant of this. */ + + + +#include "sysdep.h" + +/* Simple macro to use VSX instructions in overlapping VR's. */ +#define XXVR(insn, vrt, vra, vrb) \ + insn 32+vrt, 32+vra, 32+vrb + +/* ISA 2.07B instructions are not all defined for older binutils. 
+ Macros are defined below for these newer instructions in order + to maintain compatibility. */ + +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) + +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + + /* This can be updated to power8 once the minimum version of + binutils supports power8 and the above instructions. */ + .machine power7 +EALIGN(strspn, 4, 0) + CALL_MCOUNT 2 + + /* Generate useful constants for later on. */ + vspltisb v1, 7 + vspltisb v2, -1 + vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ + vspltisb v10, 0 + vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ + XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ + + /* Prepare to compute 256b mask. */ + addi r4, r4, -1 + li r5, 0 + li r6, 0 + li r7, 0 + li r8, 0 + li r11, 1 + sldi r11, r11, 63 + + /* Start interleaved Mask computation. + This will eventually or 1's into ignored bits from vbpermq. */ + lvsr v11, 0, r3 + vspltb v11, v11, 0 /* Splat shift constant. */ + + /* Build a 256b mask in r5-r8. */ + .align 4 +L(next_needle): + lbzu r9, 1(r4) + + cmpldi cr0, r9, 0 + cmpldi cr1, r9, 128 + + /* This is a little tricky. srd only uses the first 7 bits, + and if bit 7 is set, value is always 0. So, we can + effectively shift 128b in this case. */ + xori r12, r9, 0x40 /* Invert bit 6. */ + srd r10, r11, r9 /* Mask for bits 0-63. */ + srd r12, r11, r12 /* Mask for bits 64-127. */ + + beq cr0, L(start_cmp) + + /* Now, or the value into the correct GPR. */ + bge cr1,L(needle_gt128) + or r5, r5, r10 /* 0 - 63. */ + or r6, r6, r12 /* 64 - 127. */ + b L(next_needle) + + .align 4 +L(needle_gt128): + or r7, r7, r10 /* 128 - 191. */ + or r8, r8, r12 /* 192 - 255. */ + b L(next_needle) + + + .align 4 +L(start_cmp): + /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. 
*/ + mr r0, r3 /* Save r3 for final length computation. */ + MTVRD (v5, r5) + MTVRD (v6, r6) + MTVRD (v7, r7) + MTVRD (v8, r8) + + /* Continue interleaved mask generation. */ +#ifdef __LITTLE_ENDIAN__ + vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */ +#else + vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */ +#endif + lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */ + + /* Do the merging of the bitmask. */ + XXVR(xxmrghd, v5, v5, v6) + XXVR(xxmrghd, v6, v7, v8) + + /* Finish mask generation. */ + vand v11, v11, v4 /* Throwaway bits not in the mask. */ + + /* Compare the first 1-16B, while masking unwanted bytes. */ + clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vor v7, v7, v11 /* Ignore non-participating bytes. */ + vcmpequh. v8, v7, v4 + bnl cr6, L(done) + + addi r3, r3, 16 + + .align 4 +L(vec): + lvx v0, 0, r3 + addi r3, r3, 16 + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vcmpequh. v8, v7, v4 + blt cr6, L(vec) + + addi r3, r3, -16 +L(done): + subf r3, r0, r3 + MFVRD (r10, v7) + +#ifdef __LITTLE_ENDIAN__ + addi r0, r10, 1 /* Count the trailing 1's. */ + andc r10, r10, r0 + popcntd r10, r10 +#else + xori r10, r10, 0xffff /* Count leading 1's by inverting. */ + addi r3, r3, -48 /* Account for the extra leading zeros. */ + cntlzd r10, r10 +#endif + + add r3, r3, r10 + blr + +END(strspn) +libc_hidden_builtin_def (strspn) |