Diffstat (limited to 'sysdeps/x86_64/multiarch/strchr-avx2.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strchr-avx2.S | 254 |
1 file changed, 254 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000000..e4292d33ab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+#  define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+#  define VPBROADCAST vpbroadcastd
+#  define VPCMPEQ vpcmpeqd
+#  define CHAR_REG esi
+# else
+#  define VPBROADCAST vpbroadcastb
+#  define VPCMPEQ vpcmpeqb
+#  define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+        .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+        movl %edi, %ecx
+        /* Broadcast CHAR to YMM0.  */
+        vmovd %esi, %xmm0
+        vpxor %xmm9, %xmm9, %xmm9
+        VPBROADCAST %xmm0, %ymm0
+        /* Check if we may cross page boundary with one vector load.  */
+        andl $(2 * VEC_SIZE - 1), %ecx
+        cmpl $VEC_SIZE, %ecx
+        ja L(cros_page_boundary)
+
+        /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+           null byte.  */
+        vmovdqu (%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x0)
+
+        /* Align data for aligned loads in the loop.  */
+        addq $VEC_SIZE, %rdi
+        andl $(VEC_SIZE - 1), %ecx
+        andq $-VEC_SIZE, %rdi
+
+        jmp L(more_4x_vec)
+
+        .p2align 4
+L(cros_page_boundary):
+        andl $(VEC_SIZE - 1), %ecx
+        andq $-VEC_SIZE, %rdi
+        vmovdqu (%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        /* Remove the leading bytes.  */
+        sarl %cl, %eax
+        testl %eax, %eax
+        jz L(aligned_more)
+        /* Found CHAR or the null byte.  */
+        tzcntl %eax, %eax
+        addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+        addq %rdi, %rax
+# else
+        xorl %edx, %edx
+        leaq (%rdi, %rax), %rax
+        cmp (%rax), %CHAR_REG
+        cmovne %rdx, %rax
+# endif
+        VZEROUPPER
+        ret
+
+        .p2align 4
+L(aligned_more):
+        addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+        /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+           since data is only aligned to VEC_SIZE.  */
+        vmovdqa (%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x0)
+
+        vmovdqa VEC_SIZE(%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x1)
+
+        vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x2)
+
+        vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+        VPCMPEQ %ymm8, %ymm0, %ymm1
+        VPCMPEQ %ymm8, %ymm9, %ymm2
+        vpor %ymm1, %ymm2, %ymm1
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x3)
+
+        addq $(VEC_SIZE * 4), %rdi
+
+        /* Align data to 4 * VEC_SIZE.  */
+        movq %rdi, %rcx
+        andl $(4 * VEC_SIZE - 1), %ecx
+        andq $-(4 * VEC_SIZE), %rdi
+
+        .p2align 4
+L(loop_4x_vec):
+        /* Compare 4 * VEC at a time forward.  */
+        vmovdqa (%rdi), %ymm5
+        vmovdqa VEC_SIZE(%rdi), %ymm6
+        vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+        vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+        VPCMPEQ %ymm5, %ymm0, %ymm1
+        VPCMPEQ %ymm6, %ymm0, %ymm2
+        VPCMPEQ %ymm7, %ymm0, %ymm3
+        VPCMPEQ %ymm8, %ymm0, %ymm4
+
+        VPCMPEQ %ymm5, %ymm9, %ymm5
+        VPCMPEQ %ymm6, %ymm9, %ymm6
+        VPCMPEQ %ymm7, %ymm9, %ymm7
+        VPCMPEQ %ymm8, %ymm9, %ymm8
+
+        vpor %ymm1, %ymm5, %ymm1
+        vpor %ymm2, %ymm6, %ymm2
+        vpor %ymm3, %ymm7, %ymm3
+        vpor %ymm4, %ymm8, %ymm4
+
+        vpor %ymm1, %ymm2, %ymm5
+        vpor %ymm3, %ymm4, %ymm6
+
+        vpor %ymm5, %ymm6, %ymm5
+
+        vpmovmskb %ymm5, %eax
+        testl %eax, %eax
+        jnz L(4x_vec_end)
+
+        addq $(VEC_SIZE * 4), %rdi
+
+        jmp L(loop_4x_vec)
+
+        .p2align 4
+L(first_vec_x0):
+        /* Found CHAR or the null byte.  */
+        tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+        addq %rdi, %rax
+# else
+        xorl %edx, %edx
+        leaq (%rdi, %rax), %rax
+        cmp (%rax), %CHAR_REG
+        cmovne %rdx, %rax
+# endif
+        VZEROUPPER
+        ret
+
+        .p2align 4
+L(first_vec_x1):
+        tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+        addq $VEC_SIZE, %rax
+        addq %rdi, %rax
+# else
+        xorl %edx, %edx
+        leaq VEC_SIZE(%rdi, %rax), %rax
+        cmp (%rax), %CHAR_REG
+        cmovne %rdx, %rax
+# endif
+        VZEROUPPER
+        ret
+
+        .p2align 4
+L(first_vec_x2):
+        tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+        addq $(VEC_SIZE * 2), %rax
+        addq %rdi, %rax
+# else
+        xorl %edx, %edx
+        leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+        cmp (%rax), %CHAR_REG
+        cmovne %rdx, %rax
+# endif
+        VZEROUPPER
+        ret
+
+        .p2align 4
+L(4x_vec_end):
+        vpmovmskb %ymm1, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x0)
+        vpmovmskb %ymm2, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x1)
+        vpmovmskb %ymm3, %eax
+        testl %eax, %eax
+        jnz L(first_vec_x2)
+        vpmovmskb %ymm4, %eax
+        testl %eax, %eax
+L(first_vec_x3):
+        tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+        addq $(VEC_SIZE * 3), %rax
+        addq %rdi, %rax
+# else
+        xorl %edx, %edx
+        leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+        cmp (%rax), %CHAR_REG
+        cmovne %rdx, %rax
+# endif
+        VZEROUPPER
+        ret
+
+END (STRCHR)
+#endif
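
For readers less fluent in AT&T assembly, the per-vector search step above can be sketched in C with AVX2 intrinsics. This is a hypothetical, simplified illustration (the function name strchr_avx2_sketch is invented, not part of the patch): it omits the page-boundary guard, the VEC_SIZE alignment code, and the 4 * VEC_SIZE unrolled loop, and its unaligned 32-byte loads could read past the end of a page, which is exactly what the L(cros_page_boundary) path in the real code prevents. Build with -mavx2.

/* Hypothetical C sketch of the search step used by __strchr_avx2.  */
#include <immintrin.h>
#include <stddef.h>

static char *
strchr_avx2_sketch (const char *s, int c)
{
  const __m256i vc = _mm256_set1_epi8 ((char) c); /* broadcast CHAR (ymm0) */
  const __m256i vz = _mm256_setzero_si256 ();     /* zero vector (ymm9) */

  for (size_t i = 0; ; i += 32)
    {
      __m256i chunk = _mm256_loadu_si256 ((const __m256i *) (s + i));
      __m256i eqc = _mm256_cmpeq_epi8 (chunk, vc); /* bytes equal to CHAR */
      __m256i eqz = _mm256_cmpeq_epi8 (chunk, vz); /* bytes equal to NUL */
      unsigned int mask
        = _mm256_movemask_epi8 (_mm256_or_si256 (eqc, eqz));
      if (mask != 0)
        {
          /* Lowest set bit is the first CHAR or NUL (tzcntl in the
             assembly).  */
          size_t pos = i + __builtin_ctz (mask);
          _mm256_zeroupper ();                     /* VZEROUPPER before return */
          /* strchrnul would return s + pos unconditionally; strchr must
             return NULL when the hit is the terminating NUL rather than
             CHAR (the cmp/cmovne pair in the assembly).  */
          return s[pos] == (char) c ? (char *) (s + pos) : NULL;
        }
    }
}

Comparing each 32-byte chunk against both CHAR and the null byte and OR-ing the results means a single vpmovmskb/tzcnt finds whichever occurs first, so one loop serves both strchr and strchrnul; only the return sequence differs, selected by USE_AS_STRCHRNUL in the assembly above.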