| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-10-03 12:00:53 -0700 |
|---|---|---|
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-10-30 13:09:56 -0700 |
| commit | e96971482de05eff92c1408b694c320cedd2d167 (patch) | |
| tree | a927b601d7a54a330e71c87c3f16b620653eea17 | |
| parent | 361d6454c034a920f2c96517c277990d390b9652 (diff) | |
x86-64: Improve evex512 version of strlen functions
This patch improves the following functionality:
- Replace VPCMP with VPCMPEQ.
- Replace the page cross check logic with sall.
- Remove an extra lea from align_more.
- Remove the unconditional loop jump.
- Use bsf to check the max length in the first vector.

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 |
1 file changed, 57 insertions, 34 deletions
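The "page cross check logic with sall" item trades the old andl/cmpl pair for sall/cmpl against a pre-shifted constant: shifting the address left by 20 pushes the 12-bit page offset into the top of a 32-bit register, where the same unsigned compare gives the same answer as masking first. Below is a minimal C sketch of that equivalence, assuming the evex512 parameters (4 KiB pages, 64-byte vectors); the helper names are illustrative, not glibc identifiers.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed evex512 parameters: 4 KiB pages and 64-byte vectors.  */
#define PAGE_SIZE 4096u
#define VEC_SIZE  64u

/* Old check: andl $(PAGE_SIZE - 1), %eax; cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja.  */
static int page_cross_and(uint32_t addr)
{
	return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* New check: sall $20, %eax; cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax; ja.
   The shift discards the high address bits and moves the 12-bit page
   offset to the top of the register; offset << 20 never wraps 32 bits,
   so the unsigned compare preserves the ordering.  */
static int page_cross_sall(uint32_t addr)
{
	return (uint32_t)(addr << 20) > ((PAGE_SIZE - VEC_SIZE) << 20);
}

int main(void)
{
	for (uint32_t off = 0; off < PAGE_SIZE; off++) {
		/* Only the page offset matters to either form, so cover
		   every offset with and without high address bits set.  */
		assert(page_cross_and(off) == page_cross_sall(off));
		assert(page_cross_and(0xdeadb000u | off)
		       == page_cross_sall(0xdeadb000u | off));
	}
	puts("sall-based page-cross check matches the and-based check");
	return 0;
}
```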
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index c832b15a48..fd6c770e6e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -25,12 +25,12 @@
 # include <sysdep.h>
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
@@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
 
 	movl	%edi, %eax
 	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM(0), %k0
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+# ifdef USE_AS_STRNLEN
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+# else
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
 	jz	L(align_more)
 
-	bsf	%VRAX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
 # endif
 	ret
 
@@ -81,25 +90,24 @@ L(ret_max):
 # endif
 
 L(align_more):
-	leaq	VEC_SIZE(%rdi), %rax
+	mov	%rdi, %rax
 	/* Align rax to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rax
 # ifdef USE_AS_STRNLEN
-	movq	%rax, %rdx
-	subq	%rdi, %rdx
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
 #  ifdef USE_AS_WCSLEN
 	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
-	subq	%rsi, %rdx
-	jae	L(ret_max)
-	negq	%rdx
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
 	/* At this point rdx contains number of w[char] needs to go.
 	   Now onwards rdx will keep decrementing with each compare.  */
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
@@ -109,7 +117,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
@@ -119,7 +127,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
@@ -129,7 +137,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
@@ -155,16 +163,10 @@ L(align_more):
 	addq	%rcx, %rdx
 	/* Need jump as we don't want to add/subtract rdx for first
 	   iteration of 4 x VEC_SIZE aligned loop.  */
-	jmp	L(loop_entry)
 # endif
 
 	.p2align 4,,11
 L(loop):
-# ifdef USE_AS_STRNLEN
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(ret_max)
-L(loop_entry):
-# endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
 	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
@@ -177,7 +179,18 @@ L(loop_entry):
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
-	jz	L(loop)
+
+# ifndef USE_AS_STRNLEN
+	jz	L(loop)
+# else
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+# endif
+
+L(loopend):
 
 	VPTESTN	%VMM(1), %VMM(1), %k2
 	KMOV	%k2, %VRCX
@@ -249,24 +262,34 @@ L(ret_vec_x1):
 	ret
 
 L(page_cross):
-	movl	%eax, %ecx
-# ifdef USE_AS_WCSLEN
+	mov	%rdi, %rax
+	movl	%edi, %ecx
 	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
 	sarl	$2, %ecx
 # endif
 	/* ecx contains number of w[char] to be skipped as a result
	   of address alignment.  */
-	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRAX
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
 	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRAX
+	shr	%cl, %VRDX
+# ifdef USE_AS_STRNLEN
+	jnz	L(page_cross_end)
+	movl	$CHAR_PER_VEC, %eax
+	sub	%ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+# else
 	jz	L(align_more)
+# endif
 
-	bsf	%VRAX, %VRAX
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
+	cmovnb	%esi, %eax
 # endif
 	ret
 
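The new strnlen fast path in the first hunk preloads rax with the maximum length before the bsfq, so a zero match mask falls through with the max length still in rax, exactly as the in-line comment in the patch describes. Below is a small C model of that first-vector control flow, with the bsf-keeps-destination-on-zero behavior made explicit; CHAR_PER_VEC is assumed to be 64 (the byte flavour) and all names are illustrative rather than glibc identifiers.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed byte-flavour parameter for the evex512 build.  */
#define CHAR_PER_VEC 64u
#define ALIGN_MORE   UINT64_MAX	/* sentinel: continue in the aligned loop */

/* Model of "mov %rsi, %rax; bsfq %rcx, %rax": bsf is treated as leaving
   its destination (preloaded with maxlen) untouched when the mask is 0.  */
static uint64_t bsf_or_keep(uint64_t mask, uint64_t preload)
{
	return mask ? (uint64_t)__builtin_ctzll(mask) : preload;
}

/* Model of the new strnlen first-vector path: each set bit of mask marks
   a character of the first (unaligned) vector that compared equal to NUL.  */
static uint64_t first_vec_strnlen(uint64_t mask, uint64_t maxlen)
{
	uint64_t pos = bsf_or_keep(mask, maxlen); /* first NUL index, or maxlen */
	if (pos > CHAR_PER_VEC)
		return ALIGN_MORE;                /* ja L(align_more) */
	return maxlen < pos ? maxlen : pos;       /* cmovb: clamp to maxlen */
}

int main(void)
{
	/* NUL at index 3, generous maxlen: length is 3.  */
	assert(first_vec_strnlen(1u << 3, 100) == 3);
	/* NUL at index 3, but maxlen 2 caps the result.  */
	assert(first_vec_strnlen(1u << 3, 2) == 2);
	/* No NUL in the vector and maxlen fits inside it: return maxlen.  */
	assert(first_vec_strnlen(0, 10) == 10);
	/* No NUL and maxlen larger than one vector: keep scanning.  */
	assert(first_vec_strnlen(0, 1000) == ALIGN_MORE);
	puts("first-vector strnlen model behaves as the patch describes");
	return 0;
}
```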