Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen-evex-base.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 |
1 file changed, 159 insertions, 221 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 7305b24e28..77dc89900a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/wcslen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */

-/* UNUSED. Exists purely as reference implementation.  */

 #include <isa-level.h>

@@ -26,272 +25,211 @@
 # ifdef USE_AS_WCSLEN
 # define VPCMPEQ	vpcmpeqd
+# define VPCMPNEQ	vpcmpneqd
 # define VPTESTN	vptestnmd
+# define VPTEST	vptestmd
 # define VPMINU	vpminud
 # define CHAR_SIZE	4
+# define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
 # define VPCMPEQ	vpcmpeqb
+# define VPCMPNEQ	vpcmpneqb
 # define VPTESTN	vptestnmb
+# define VPTEST	vptestmb
 # define VPMINU	vpminub
 # define CHAR_SIZE	1
+# define CHAR_SIZE_SHIFT_REG(reg)
+
+# define REG_WIDTH	VEC_SIZE
 # endif

-# define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test %RSI_LP, %RSI_LP
-	jz L(ret_max)
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl %esi, %esi
-# endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+# define TAIL_RETURN_LBL	first_vec_x2
+# define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+# define FALLTHROUGH_RETURN_LBL	first_vec_x3
+# define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+# define TAIL_RETURN_LBL	first_vec_x3
+# define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# define FALLTHROUGH_RETURN_LBL	first_vec_x2
+# define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
 	movl %edi, %eax
-	vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall $20, %eax
-	cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ (%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
-	KMOV %k0, %VRCX
-	/* Store max length in rax.  */
-	mov %rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq %rcx, %rax
-	cmp $CHAR_PER_VEC, %rax
-	ja L(align_more)
-	cmpq %rax, %rsi
-	cmovb %esi, %eax
-# else
+	vpxorq %XZERO, %XZERO, %XZERO
+	andl $(PAGE_SIZE - 1), %eax
+	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+	ja L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ (%rdi), %VZERO, %k0
 	KMOV %k0, %VRAX
 	test %VRAX, %VRAX
-	jz L(align_more)
+	jz L(aligned_more)
 	bsf %VRAX, %VRAX
-# endif
 	ret

-	/* At this point vector max length reached.  */
-# ifdef USE_AS_STRNLEN
-	.p2align 4,,3
-L(ret_max):
-	movq %rsi, %rax
+	.p2align 4,, 8
+L(first_vec_x4):
+	bsf %VRAX, %VRAX
+	subl %ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
+	leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 	ret
-# endif

-L(align_more):
-	mov %rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq $-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
-	movq %rdi, %rdx
-	subq %rax, %rdx
-# ifdef USE_AS_WCSLEN
-	shr $2, %VRDX
-# endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-# endif
-
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-	subq $-VEC_SIZE, %rax
-	KMOV %k0, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x1)
-# ifdef USE_AS_STRNLEN
-	subq $CHAR_PER_VEC, %rdx
-	jbe L(ret_max)
-# endif
-	VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV %k0, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x2)
+	/* Aligned more for strnlen compares remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
+L(aligned_more):
+	movq %rdi, %rcx
+	andq $(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+	VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x1)
-# ifdef USE_AS_STRNLEN
-	subq $CHAR_PER_VEC, %rdx
-	jbe L(ret_max)
-# endif
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x2)
-	VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV %k0, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x3)
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x3)
-# ifdef USE_AS_STRNLEN
-	subq $CHAR_PER_VEC, %rdx
-	jbe L(ret_max)
-# endif
+	VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x4)
-	VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV %k0, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x4)
+	subq $(VEC_SIZE * -1), %rdi
-# ifdef USE_AS_STRNLEN
-	subq $CHAR_PER_VEC, %rdx
-	jbe L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq %rax, %rcx
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on processors that we use evex512
+	   on and this saves code size.  */
+	xorb %dil, %dil
+# else
+	andq $-(VEC_SIZE * 4), %rdi
 # endif
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq $-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
-	subq %rax, %rcx
-# ifdef USE_AS_WCSLEN
-	shr $2, %VRCX
-# endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq %rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-# endif
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
 	VPTESTN %VMM(2), %VMM(2), %k0
-	VPTESTN %VMM(4), %VMM(4), %k1
+	VPTESTN %VMM(4), %VMM(4), %k2

-	subq $-(VEC_SIZE * 4), %rax
-	KORTEST %k0, %k1
+	subq $-(VEC_SIZE * 4), %rdi
+	KORTEST %k0, %k2
+	jz L(loop_4x_vec)

-# ifndef USE_AS_STRNLEN
-	jz L(loop)
+	VPTESTN %VMM(1), %VMM(1), %k1
+	KMOV %k1, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x0)
+
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x1)
+
+	VPTESTN %VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+	KMOV %k0, %VRAX
+	test %VRAX, %VRAX
+	jnz L(first_vec_x2)
+	KMOV %k2, %VRAX
 # else
-	jnz L(loopend)
-	subq $(CHAR_PER_VEC * 4), %rdx
-	ja L(loop)
-	mov %rsi, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
+	kmovd %k2, %edx
+	kmovd %k0, %eax
+	salq $CHAR_PER_VEC, %rdx
+	orq %rdx, %rax
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq %rax, %rax
+	subq %rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
-# endif
-L(loopend):
-
-	VPTESTN %VMM(1), %VMM(1), %k2
-	KMOV %k2, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x1)
-
-	KMOV %k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x2)
-
-	VPTESTN %VMM(3), %VMM(3), %k3
-	KMOV %k3, %VRCX
-	test %VRCX, %VRCX
-	jnz L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV %k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf %VRCX, %VRCX
-	subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq $-(VEC_SIZE * 3), %rax
-	shrq $2, %rax
-	addq %rcx, %rax
-# else
-	leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq %rsi, %rax
-	cmovnb %rsi, %rax
-# endif
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf %VRAX, %VRAX
+	sub %rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq %rdi, %rax
 	ret
-L(ret_vec_x3):
-	bsf %VRCX, %VRCX
-	subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq $-(VEC_SIZE * 2), %rax
-	shrq $2, %rax
-	addq %rcx, %rax
-# else
-	leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq %rsi, %rax
-	cmovnb %rsi, %rax
-# endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf %VRAX, %VRAX
+	sub %rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
-L(ret_vec_x2):
-	subq $-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf %VRCX, %VRCX
-	subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq $2, %rax
-# endif
-	addq %rcx, %rax
-# ifdef USE_AS_STRNLEN
-	cmpq %rsi, %rax
-	cmovnb %rsi, %rax
-# endif
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
+L(TAIL_RETURN_LBL):
+	bsf %VRAX, %VRAX
+	sub %VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
-L(page_cross):
-	mov %rdi, %rax
-	movl %edi, %ecx
-	andl $(VEC_SIZE - 1), %ecx
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq %rdi, %rcx
+	/* Align data to VEC_SIZE.  */
+	andq $-VEC_SIZE, %rdi
+
+	VPCMPEQ (%rdi), %VZERO, %k0
+
+	KMOV %k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	sarl $2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq $-VEC_SIZE, %rax
-	VPCMPEQ (%rax), %VMM(0), %k0
-	KMOV %k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr %cl, %VRDX
-# ifdef USE_AS_STRNLEN
-	jnz L(page_cross_end)
-	movl $CHAR_PER_VEC, %eax
-	sub %ecx, %eax
-	cmp %rax, %rsi
-	ja L(align_more)
+	movl %ecx, %edx
+	shrl $2, %edx
+	andl $(CHAR_PER_VEC - 1), %edx
+	shrx %edx, %eax, %eax
+	testl %eax, %eax
 # else
-	jz L(align_more)
-# endif
-
-L(page_cross_end):
-	bsf %VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq %rsi, %rax
-	cmovnb %esi, %eax
+	shr %cl, %VRAX
 # endif
+	jz L(cross_page_continue)
+	bsf %VRAX, %VRAX
 	ret
-END (STRLEN)
+END(STRLEN)
 #endif
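The new code's overall strategy can be summarized as: probe one vector-sized block at the start of the string, align the pointer down to the vector size, and then scan four blocks per step, only locating the exact null byte once some block is known to contain one. Below is a minimal, scalar C sketch of that control flow, written for illustration only; it is not glibc code. VEC_SIZE is reused here as an ordinary constant, and has_zero()/first_zero() are hypothetical stand-ins for the VPCMPEQ/KMOV mask test and the BSF on the mask. The page-cross entry path (L(cross_page_boundary)) and the extra realignment to 4 * VEC_SIZE are omitted.

/* sketch.c - illustrative model of the block-scan strategy.
   Build with: cc -std=c11 sketch.c  */
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VEC_SIZE 32	/* stand-in for the 256-bit EVEX vector width */

/* Model of VPCMPEQ + KMOV: does this VEC_SIZE-byte block hold a null?  */
static int has_zero(const unsigned char *p)
{
	for (size_t i = 0; i < VEC_SIZE; i++)
		if (p[i] == 0)
			return 1;
	return 0;
}

/* Model of BSF on the mask: index of the first null in a block that is
   known to contain one.  */
static size_t first_zero(const unsigned char *p)
{
	size_t i = 0;
	while (p[i] != 0)
		i++;
	return i;
}

static size_t strlen_sketch(const char *s)
{
	const unsigned char *start = (const unsigned char *) s;
	const unsigned char *p = start;

	/* Probe one unaligned block first; the assembly only does this
	   when the read cannot cross a page boundary.  */
	if (has_zero(p))
		return first_zero(p);

	/* Align down to VEC_SIZE, then scan the next four aligned blocks
	   per iteration, loosely mirroring L(aligned_more) followed by
	   L(loop_4x_vec).  Overlap with already-checked bytes is harmless.  */
	p = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) (VEC_SIZE - 1));
	for (;;) {
		for (int i = 1; i <= 4; i++) {
			const unsigned char *blk = p + i * VEC_SIZE;
			if (has_zero(blk))
				return (size_t) (blk - start) + first_zero(blk);
		}
		p += 4 * VEC_SIZE;
	}
}

int main(void)
{
	/* Aligned, padded buffer so every block read stays in bounds; the
	   real implementation instead relies on aligned vector loads never
	   crossing a page boundary.  */
	static alignas(VEC_SIZE) unsigned char buf[8 * VEC_SIZE];
	memset(buf, 'x', 150);	/* 150 non-null chars, rest stays zero */

	printf("%zu %zu\n", strlen_sketch((const char *) buf),
	       strlen((const char *) buf));
	return 0;
}

One deliberate simplification: the sketch tests each of the four blocks separately, whereas the assembly folds them with VPMINU and a single KORTEST so the hot loop takes only one branch per 4 * VEC_SIZE bytes, and only after leaving the loop does it work out which of the four vectors held the terminator.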