From c25eb94aed942761aabdcd05239b7e76dcdde098 Mon Sep 17 00:00:00 2001
From: Noah Goldstein
Date: Thu, 20 Oct 2022 10:26:46 -0700
Subject: x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S

commit b412213eee0afa3b51dfe92b736dfc7c981309f5
Author: Noah Goldstein
Date:   Tue Oct 18 17:44:07 2022 -0700

    x86: Optimize strrchr-evex.S and implement with VMM headers

Added `vpcompress{b|d}` to the page-cross logic, which is an
AVX512-VBMI2 instruction. This is not supported on SKX. Since the
page-cross logic is relatively cold and the benefit is minimal, revert
the page-cross case back to the old logic which is supported on SKX.

Tested on x86-64.
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 69 ++++++++++++++-------------------
 1 file changed, 29 insertions(+), 40 deletions(-)

(limited to 'sysdeps/x86_64/multiarch')

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 45487dc87a..26b3457875 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -29,9 +29,7 @@
 # include "x86-evex256-vecs.h"
 
 # ifdef USE_AS_WCSRCHR
-#  define RCX_M	cl
-#  define SHIFT_REG	rcx
-#  define VPCOMPRESS	vpcompressd
+#  define SHIFT_REG	rsi
 #  define kunpck_2x	kunpckbw
 #  define kmov_2x	kmovd
 #  define maskz_2x	ecx
@@ -46,9 +44,7 @@
 
 #  define USE_WIDE_CHAR
 # else
-#  define RCX_M	ecx
 #  define SHIFT_REG	rdi
-#  define VPCOMPRESS	vpcompressb
 #  define kunpck_2x	kunpckdq
 #  define kmov_2x	kmovq
 #  define maskz_2x	rcx
@@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
-
+L(page_cross_continue):
 	VMOVU	(%rdi), %VMM(1)
 	/* k0 has a 1 for each zero CHAR in VEC(1).  */
 	VPTESTN	%VMM(1), %VMM(1), %k0
@@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6)
 	test	%VRSI, %VRSI
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
 	/* K1 has a 1 for each search CHAR match in VEC(1).  */
 	VPCMPEQ	%VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VRAX
@@ -197,7 +192,6 @@ L(first_vec_x2):
 
 	.p2align 4,, 12
 L(aligned_more):
-L(page_cross_continue):
 	/* Need to keep original pointer incase VEC(1) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
@@ -353,53 +347,48 @@ L(return_new_match):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
 L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
 	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
-	KMOV	%k0, %VRSI
-
-# ifdef USE_AS_WCSRCHR
-	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
-	shrl	$2, %ecx
-# endif
-	shlx	%VGPR(SHIFT_REG), %VRDX, %VRDX
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
 
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	kmovb	%edx, %k1
-# else
-	KMOV	%VRDX, %k1
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
+	shrx	%VGPR(SHIFT_REG), %VRCX, %VRCX
 
-	/* Need to adjust result to VEC(1) so it can be re-used by
-	   L(return_vec_x0_test). The alternative is to collect VEC(1)
-	   will a page cross load which is far more expensive.  */
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%VGPR(SHIFT_REG), %VRSI, %VRSI
-	test	%VRSI, %VRSI
+	test	%VRCX, %VRCX
 	jz	L(page_cross_continue)
 
-	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
-	   next cache line so might as well copy it here.  */
-	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
 	KMOV	%k1, %VRAX
-	blsmsk	%VRSI, %VRSI
-	and	%VRSI, %VRAX
-	jz	L(ret_page_cross)
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRCX, %VRCX
+	and	%VRCX, %VRAX
+	jz	L(ret3)
 	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
-L(ret_page_cross):
+L(ret3):
 	ret
-	/* 1 byte till next cache line.  */
 END(STRRCHR)
 #endif
-- 
cgit 1.4.1
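
For readers who want a feel for what the removed instruction does, the short C
sketch below is an editor's illustration, not part of the patch: the function
names are invented and the use of compiler intrinsics is an assumption. It
contrasts the vpcompressb-based fixup being dropped (an AVX512-VBMI2
instruction, absent on SKX) with the mask-shift approach the patch restores,
which only needs a scalar shift over the zero/match bitmasks.

/* Editor's sketch (not glibc code).  Build the VBMI2 path with
   -mavx512vbmi2; the scalar path compiles anywhere.  */
#include <immintrin.h>
#include <stdint.h>

#ifdef __AVX512VBMI2__
/* Removed idea: vpcompressb (via _mm512_maskz_compress_epi8) packs the
   bytes selected by `keep` down to the low lanes and zeroes the rest, so
   a vector loaded from the last aligned block before the page end can be
   made to look as if it were loaded from the unaligned string start.
   Requires AVX512-VBMI2, which SKX lacks.  */
static __m512i
compress_to_string_start (__m512i vec, __mmask64 keep)
{
  return _mm512_maskz_compress_epi8 (keep, vec);
}
#endif

/* Restored idea: leave the vector alone and shift the zero/match bitmasks
   right by the string's misalignment instead, discarding bits for bytes
   that precede the string (the shrx in the new code).  */
static uint64_t
discard_before_string_start (uint64_t mask, unsigned int misalignment)
{
  return mask >> misalignment;
}

int
main (void)
{
  /* Example: the load starts 5 bytes before the string, so the NUL bit
     shows up at byte 5 of the loaded block.  */
  uint64_t zero_mask = UINT64_C (1) << 5;
  return discard_before_string_start (zero_mask, 5) == 1 ? 0 : 1;
}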