Diffstat (limited to 'sysdeps/x86_64/multiarch')
 -rw-r--r--  sysdeps/x86_64/multiarch/Makefile           |   1
 -rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |   7
 -rw-r--r--  sysdeps/x86_64/multiarch/memrchr-avx2.S     | 359
 -rw-r--r--  sysdeps/x86_64/multiarch/memrchr-sse2.S     |  26
 -rw-r--r--  sysdeps/x86_64/multiarch/memrchr.c          |  31
5 files changed, 424 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4523f51095..2fa390b3dd 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+		   memrchr-sse2 memrchr-avx2 \
 		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 8dda1b040a..5670eb7e9e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
 			      __memmove_sse2_unaligned_erms))
 
+  /* Support sysdeps/x86_64/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
new file mode 100644
index 0000000000..3ee02e1cc3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (__memrchr_avx2)
+	/* Broadcast CHAR to YMM0.  */
+	vmovd	%esi, %xmm0
+	vpbroadcastb %xmm0, %ymm0
+
+	subq	$VEC_SIZE, %rdx
+	jbe	L(last_vec_or_less)
+
+	addq	%rdx, %rdi
+
+	/* Check the last VEC_SIZE bytes.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	subq	$(VEC_SIZE * 4), %rdi
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(aligned_more)
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rdx
+	andq	$-VEC_SIZE, %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(aligned_more):
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpcmpeqb (%rdi), %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
+	   There are some overlaps with above if data isn't aligned
+	   to 4 * VEC_SIZE.  */
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	jz	L(loop_4x_vec)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	subq	$(VEC_SIZE * 4), %rdi
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	vmovdqa	(%rdi), %ymm1
+	vmovdqa	VEC_SIZE(%rdi), %ymm2
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+
+	vpcmpeqb %ymm1, %ymm0, %ymm1
+	vpcmpeqb %ymm2, %ymm0, %ymm2
+	vpcmpeqb %ymm3, %ymm0, %ymm3
+	vpcmpeqb %ymm4, %ymm0, %ymm4
+
+	vpor	%ymm1, %ymm2, %ymm5
+	vpor	%ymm3, %ymm4, %ymm6
+	vpor	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jz	L(loop_4x_vec)
+
+	/* There is a match.  */
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpmovmskb %ymm1, %eax
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_4x_vec_or_less):
+	addl	$(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
+	cmpl	$(VEC_SIZE * 3), %edx
+	jbe	L(zero)
+
+	vpcmpeqb (%rdi), %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3_check)
+	cmpl	$VEC_SIZE, %edx
+	jbe	L(zero)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 2), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x0):
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x1):
+	bsrl	%eax, %eax
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1_check):
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x3_check):
+	bsrl	%eax, %eax
+	subq	$VEC_SIZE, %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(zero):
+	VZEROUPPER
+L(null):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_vec_or_less_aligned):
+	movl	%edx, %ecx
+
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+
+	movl	$1, %edx
+	/* Support rdx << 32.  */
+	salq	%cl, %rdx
+	subq	$1, %rdx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_or_less):
+	addl	$VEC_SIZE, %edx
+
+	/* Check for zero length.  */
+	testl	%edx, %edx
+	jz	L(null)
+
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(last_vec_or_less_aligned)
+
+	movl	%ecx, %esi
+	movl	%ecx, %r8d
+	addl	%edx, %esi
+	andq	$-VEC_SIZE, %rdi
+
+	subl	$VEC_SIZE, %esi
+	ja	L(last_vec_2x_aligned)
+
+	/* Check the last VEC.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the leading and trailing bytes.  */
+	sarl	%cl, %eax
+	movl	%edx, %ecx
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_2x_aligned):
+	movl	%esi, %ecx
+
+	/* Check the last VEC.  */
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	/* Check the second last VEC.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+
+	movl	%r8d, %ecx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the leading bytes.  Must use unsigned right shift for
+	   bsrl below.  */
+	shrl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	VZEROUPPER
+	ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..f518819c87
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -0,0 +1,26 @@
+/* memrchr optimized with SSE2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define __memrchr __memrchr_sse2
+
+# undef weak_alias
+# define weak_alias(__memrchr, memrchr)
+#endif
+
+#include "../memrchr.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr.c b/sysdeps/x86_64/multiarch/memrchr.c
new file mode 100644
index 0000000000..003d403106
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.c
@@ -0,0 +1,31 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define memrchr __redirect_memrchr
+# include <string.h>
+# undef memrchr
+
+# define SYMBOL_NAME memrchr
+# include "ifunc-avx2.h"
+
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
+weak_alias (__memrchr, memrchr)
+#endif
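
Editorial note, not part of the commit: both new implementations must keep the usual GNU memrchr semantics, i.e. return a pointer to the last occurrence of (unsigned char) c within the first n bytes of s, or NULL if there is none.  The plain-C reference below is only a reading aid; the names memrchr_ref and the small test in main are illustrative and do not appear in the patch.  The AVX2 code above performs the same backward scan 32 (VEC_SIZE) or 128 (4 * VEC_SIZE) bytes at a time, using vpcmpeqb/vpmovmskb to build a match mask and bsrl to pick the highest matching byte.

#define _GNU_SOURCE		/* memrchr is a GNU extension.  */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Reference behaviour that __memrchr_avx2 and __memrchr_sse2 must
   reproduce: scan backwards, return the highest-address match or NULL.  */
static void *
memrchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s + n;

  while (n--)
    if (*--p == (unsigned char) c)
      return (void *) p;

  return NULL;
}

int
main (void)
{
  const char buf[] = "abcabc";

  /* Both calls should report offset 4, the second 'b'.  */
  printf ("%td\n", (char *) memrchr_ref (buf, 'b', strlen (buf)) - buf);
  printf ("%td\n", (char *) memrchr (buf, 'b', strlen (buf)) - buf);
  return 0;
}

At run time, memrchr.c above dispatches between the two variants through glibc's ifunc mechanism: libc_ifunc_redirected evaluates IFUNC_SELECTOR () from ifunc-avx2.h once at symbol resolution, binding memrchr to __memrchr_avx2 when the CPU reports AVX2 as usable (the HAS_ARCH_FEATURE (AVX2_Usable) condition listed in ifunc-impl-list.c) and to __memrchr_sse2 otherwise.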