Diffstat (limited to 'sysdeps/x86_64/multiarch/memrchr-evex.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memrchr-evex.S | 337
1 file changed, 337 insertions, 0 deletions
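For context before reading the patch: memrchr returns a pointer to the last occurrence of a byte value within the first n bytes of a block, or NULL if it does not occur. The scalar sketch below only illustrates those semantics (the name ref_memrchr is ours, not glibc's fallback implementation); the new EVEX file implements the same contract using 32-byte vector compares.

    #include <stddef.h>

    /* Illustrative scalar reference of memrchr semantics: scan backwards
       from s[n - 1] and return a pointer to the last byte equal to
       (unsigned char) c, or NULL if none is found.  */
    static void *
    ref_memrchr (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      while (n--)
        if (p[n] == (unsigned char) c)
          return (void *) (p + n);
      return NULL;
    }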
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
new file mode 100644
index 0000000000..16bf8e02b1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -0,0 +1,337 @@
+/* memrchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define VMOVA		vmovdqa64
+
+# define YMMMATCH	ymm16
+
+# define VEC_SIZE 32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (__memrchr_evex)
+	/* Broadcast CHAR to YMMMATCH.  */
+	vpbroadcastb %esi, %YMMMATCH
+
+	sub	$VEC_SIZE, %RDX_LP
+	jbe	L(last_vec_or_less)
+
+	add	%RDX_LP, %RDI_LP
+
+	/* Check the last VEC_SIZE bytes.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	subq	$(VEC_SIZE * 4), %rdi
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(aligned_more)
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rdx
+	andq	$-VEC_SIZE, %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(aligned_more):
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
+	   There are some overlaps with above if data isn't aligned
+	   to 4 * VEC_SIZE.  */
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	jz	L(loop_4x_vec)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	subq	$(VEC_SIZE * 4), %rdi
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+	kord	%k1, %k2, %k5
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+
+	kord	%k3, %k4, %k6
+	kortestd %k5, %k6
+	jz	L(loop_4x_vec)
+
+	/* There is a match.  */
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	kmovd	%k1, %eax
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_4x_vec_or_less):
+	addl	$(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
+	cmpl	$(VEC_SIZE * 3), %edx
+	jbe	L(zero)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3_check)
+	cmpl	$VEC_SIZE, %edx
+	jbe	L(zero)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 2), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x0):
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1):
+	bsrl	%eax, %eax
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1_check):
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3_check):
+	bsrl	%eax, %eax
+	subq	$VEC_SIZE, %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_vec_or_less_aligned):
+	movl	%edx, %ecx
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+
+	movl	$1, %edx
+	/* Support rdx << 32.  */
+	salq	%cl, %rdx
+	subq	$1, %rdx
+
+	kmovd	%k1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_or_less):
+	addl	$VEC_SIZE, %edx
+
+	/* Check for zero length.  */
+	testl	%edx, %edx
+	jz	L(zero)
+
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(last_vec_or_less_aligned)
+
+	movl	%ecx, %esi
+	movl	%ecx, %r8d
+	addl	%edx, %esi
+	andq	$-VEC_SIZE, %rdi
+
+	subl	$VEC_SIZE, %esi
+	ja	L(last_vec_2x_aligned)
+
+	/* Check the last VEC.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+
+	/* Remove the leading and trailing bytes.  */
+	sarl	%cl, %eax
+	movl	%edx, %ecx
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	ret
+
+	.p2align 4
+L(last_vec_2x_aligned):
+	movl	%esi, %ecx
+
+	/* Check the last VEC.  */
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	kmovd	%k1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	/* Check the second last VEC.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+
+	movl	%r8d, %ecx
+
+	kmovd	%k1, %eax
+
+	/* Remove the leading bytes.  Must use unsigned right shift for
+	   bsrl below.  */
+	shrl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	ret
+END (__memrchr_evex)
+#endif
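The building block repeated throughout the new function is: vpcmpb with immediate 0 compares 32 bytes against the broadcast character and sets one match bit per byte in a mask register, kmovd copies that mask to a general-purpose register, and bsrl finds the highest set bit, i.e. the offset of the last matching byte in that vector. A rough C intrinsics sketch of this single step (assuming AVX512BW/AVX512VL, built with -mavx512bw -mavx512vl; the helper name last_match_in_vec is ours, not from the patch) looks like:

    #include <immintrin.h>
    #include <stddef.h>

    /* Sketch of one 32-byte step: compare against the search byte,
       extract the match mask, and take the highest set bit as the
       offset of the last match, as the bsrl in the .S file does.  */
    static const char *
    last_match_in_vec (const char *p, int c)
    {
      __m256i needle = _mm256_set1_epi8 ((char) c);         /* vpbroadcastb */
      __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
      __mmask32 k = _mm256_cmpeq_epi8_mask (data, needle);  /* vpcmpb $0 */
      if (k == 0)
        return NULL;                    /* no occurrence in this vector */
      int idx = 31 - __builtin_clz ((unsigned int) k);      /* bsrl */
      return p + idx;
    }

The full implementation layers bounds handling on top of this step: the main loop walks backwards four vectors at a time, folds the four masks with kord/kortestd to detect any match cheaply, and only then inspects the individual masks from the highest-addressed vector down.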