diff options
-rw-r--r-- | sysdeps/x86_64/multiarch/strchr-evex.S | 558 |
1 files changed, 340 insertions, 218 deletions
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S index a1c15c4419..c2a0d112f7 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex.S +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -26,48 +26,75 @@ # define STRCHR __strchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif # ifdef USE_AS_WCSCHR # define VPBROADCAST vpbroadcastd -# define VPCMP vpcmpd +# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd # define VPTESTN vptestnmd +# define VPTEST vptestmd # define VPMINU vpminud # define CHAR_REG esi -# define SHIFT_REG ecx +# define SHIFT_REG rcx # define CHAR_SIZE 4 + +# define USE_WIDE_CHAR # else # define VPBROADCAST vpbroadcastb -# define VPCMP vpcmpb +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb # define VPTESTN vptestnmb +# define VPTEST vptestmb # define VPMINU vpminub # define CHAR_REG sil -# define SHIFT_REG edx +# define SHIFT_REG rdi # define CHAR_SIZE 1 # endif -# define XMMZERO xmm16 - -# define YMMZERO ymm16 -# define YMM0 ymm17 -# define YMM1 ymm18 -# define YMM2 ymm19 -# define YMM3 ymm20 -# define YMM4 ymm21 -# define YMM5 ymm22 -# define YMM6 ymm23 -# define YMM7 ymm24 -# define YMM8 ymm25 - -# define VEC_SIZE 32 -# define PAGE_SIZE 4096 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section .text.evex,"ax",@progbits -ENTRY_P2ALIGN (STRCHR, 5) - /* Broadcast CHAR to YMM0. 
*/ - VPBROADCAST %esi, %YMM0 +# include "reg-macros.h" + +# if VEC_SIZE == 64 +# define MASK_GPR rcx +# define LOOP_REG rax + +# define COND_MASK(k_reg) {%k_reg} +# else +# define MASK_GPR rax +# define LOOP_REG rdi + +# define COND_MASK(k_reg) +# endif + +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + +# if CHAR_PER_VEC == 64 +# define LAST_VEC_OFFSET (VEC_SIZE * 3) +# define TESTZ(reg) incq %VGPR_SZ(reg, 64) +# else + +# if CHAR_PER_VEC == 32 +# define TESTZ(reg) incl %VGPR_SZ(reg, 32) +# elif CHAR_PER_VEC == 16 +# define TESTZ(reg) incw %VGPR_SZ(reg, 16) +# else +# define TESTZ(reg) incb %VGPR_SZ(reg, 8) +# endif + +# define LAST_VEC_OFFSET (VEC_SIZE * 2) +# endif + +# define VMATCH VMM(0) + +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (STRCHR, 6) + /* Broadcast CHAR to VEC_0. */ + VPBROADCAST %esi, %VMATCH movl %edi, %eax andl $(PAGE_SIZE - 1), %eax /* Check if we cross page boundary with one vector load. @@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5) cmpl $(PAGE_SIZE - VEC_SIZE), %eax ja L(cross_page_boundary) + /* Check the first VEC_SIZE bytes. Search for both CHAR and the null bytes. */ - VMOVU (%rdi), %YMM1 - + VMOVU (%rdi), %VMM(1) /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPTESTN %YMM2, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax + vpxorq %VMM(1), %VMATCH, %VMM(2) + VPMINU %VMM(2), %VMM(1), %VMM(2) + /* Each bit in K0 represents a CHAR or a null byte in VEC_1. */ + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRAX +# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL + /* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so + that all logic for match/null in first VEC fits in 1x cache + line. This has a slight cost to larger sizes. 
*/ + bsf %VRAX, %VRAX + jz L(aligned_more) +# else + test %VRAX, %VRAX jz L(aligned_more) - tzcntl %eax, %eax + bsf %VRAX, %VRAX +# endif # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG @@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5) # endif ret - - - .p2align 4,, 10 -L(first_vec_x4): -# ifndef USE_AS_STRCHRNUL - /* Check to see if first match was CHAR (k0) or null (k1). */ - kmovd %k0, %eax - tzcntl %eax, %eax - kmovd %k1, %ecx - /* bzhil will not be 0 if first match was null. */ - bzhil %eax, %ecx, %ecx - jne L(zero) -# else - /* Combine CHAR and null matches. */ - kord %k0, %k1, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax -# endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax - ret - # ifndef USE_AS_STRCHRNUL L(zero): xorl %eax, %eax ret # endif - - .p2align 4 + .p2align 4,, 2 +L(first_vec_x3): + subq $-(VEC_SIZE * 2), %rdi +# if VEC_SIZE == 32 + /* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32. + For VEC_SIZE == 64 the registers don't match. */ +L(last_vec_x2): +# endif L(first_vec_x1): /* Use bsf here to save 1-byte keeping keeping the block in 1x fetch block. eax guranteed non-zero. */ - bsfl %eax, %eax + bsf %VRCX, %VRCX # ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG jne L(zero) - # endif /* NB: Multiply sizeof char type (1 or 4) to get the number of bytes. */ - leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + leaq (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax ret - .p2align 4,, 10 + .p2align 4,, 2 +L(first_vec_x4): + subq $-(VEC_SIZE * 2), %rdi L(first_vec_x2): # ifndef USE_AS_STRCHRNUL /* Check to see if first match was CHAR (k0) or null (k1). 
*/ - kmovd %k0, %eax - tzcntl %eax, %eax - kmovd %k1, %ecx + KMOV %k0, %VRAX + tzcnt %VRAX, %VRAX + KMOV %k1, %VRCX /* bzhil will not be 0 if first match was null. */ - bzhil %eax, %ecx, %ecx + bzhi %VRAX, %VRCX, %VRCX jne L(zero) # else /* Combine CHAR and null matches. */ - kord %k0, %k1, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax + KOR %k0, %k1, %k0 + KMOV %k0, %VRAX + bsf %VRAX, %VRAX # endif /* NB: Multiply sizeof char type (1 or 4) to get the number of bytes. */ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4,, 10 -L(first_vec_x3): - /* Use bsf here to save 1-byte keeping keeping the block in 1x - fetch block. eax guranteed non-zero. */ - bsfl %eax, %eax -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG - jne L(zero) +# ifdef USE_AS_STRCHRNUL + /* We use this as a hook to get imm8 encoding for the jmp to + L(cross_page_boundary). This allows the hot case of a + match/null-term in first VEC to fit entirely in 1 cache + line. */ +L(cross_page_boundary): + jmp L(cross_page_boundary_real) # endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax - ret .p2align 4 L(aligned_more): +L(cross_page_continue): /* Align data to VEC_SIZE. */ andq $-VEC_SIZE, %rdi -L(cross_page_continue): - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since - data is only aligned to VEC_SIZE. Use two alternating methods - for checking VEC to balance latency and port contention. */ - /* This method has higher latency but has better port - distribution. */ - VMOVA (VEC_SIZE)(%rdi), %YMM1 + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. Use two alternating + methods for checking VEC to balance latency and port + contention. 
*/ + + /* Method(1) with 8c latency: + For VEC_SIZE == 32: + p0 * 1.83, p1 * 0.83, p5 * 1.33 + For VEC_SIZE == 64: + p0 * 2.50, p1 * 0.00, p5 * 1.50 */ + VMOVA (VEC_SIZE)(%rdi), %VMM(1) /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPTESTN %YMM2, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax + vpxorq %VMM(1), %VMATCH, %VMM(2) + VPMINU %VMM(2), %VMM(1), %VMM(2) + /* Each bit in K0 represents a CHAR or a null byte in VEC_1. */ + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x1) - /* This method has higher latency but has better port - distribution. */ - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 - /* Each bit in K0 represents a CHAR in YMM1. */ - VPCMP $0, %YMM1, %YMM0, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPTESTN %YMM1, %YMM1, %k1 - kortestd %k0, %k1 + /* Method(2) with 6c latency: + For VEC_SIZE == 32: + p0 * 1.00, p1 * 0.00, p5 * 2.00 + For VEC_SIZE == 64: + p0 * 1.00, p1 * 0.00, p5 * 2.00 */ + VMOVA (VEC_SIZE * 2)(%rdi), %VMM(1) + /* Each bit in K0 represents a CHAR in VEC_1. */ + VPCMPEQ %VMM(1), %VMATCH, %k0 + /* Each bit in K1 represents a null CHAR in VEC_1. */ + VPTESTN %VMM(1), %VMM(1), %k1 + KORTEST %k0, %k1 jnz L(first_vec_x2) - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* By swapping between Method 1/2 we get more fair port + distribution and better throughput. */ + + VMOVA (VEC_SIZE * 3)(%rdi), %VMM(1) /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPTESTN %YMM2, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax + vpxorq %VMM(1), %VMATCH, %VMM(2) + VPMINU %VMM(2), %VMM(1), %VMM(2) + /* Each bit in K0 represents a CHAR or a null byte in VEC_1. 
*/ + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x3) - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 - /* Each bit in K0 represents a CHAR in YMM1. */ - VPCMP $0, %YMM1, %YMM0, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPTESTN %YMM1, %YMM1, %k1 - kortestd %k0, %k1 + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + /* Each bit in K0 represents a CHAR in VEC_1. */ + VPCMPEQ %VMM(1), %VMATCH, %k0 + /* Each bit in K1 represents a null CHAR in VEC_1. */ + VPTESTN %VMM(1), %VMM(1), %k1 + KORTEST %k0, %k1 jnz L(first_vec_x4) /* Align data to VEC_SIZE * 4 for the loop. */ +# if VEC_SIZE == 64 + /* Use rax for the loop reg as it allows the loop to fit in + exactly 2-cache-lines. (more efficient imm32 + gpr + encoding). */ + leaq (VEC_SIZE)(%rdi), %rax + /* No partial register stalls on evex512 processors. */ + xorb %al, %al +# else + /* For VEC_SIZE == 32 continue using rdi for loop reg so we can + reuse more code and save space. */ addq $VEC_SIZE, %rdi andq $-(VEC_SIZE * 4), %rdi - +# endif .p2align 4 L(loop_4x_vec): - /* Check 4x VEC at a time. No penalty to imm32 offset with evex - encoding. */ - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 - VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 - VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 - VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 - - /* For YMM1 and YMM3 use xor to set the CHARs matching esi to + /* Check 4x VEC at a time. No penalty for imm32 offset with evex + encoding (if offset % VEC_SIZE == 0). */ + VMOVA (VEC_SIZE * 4)(%LOOP_REG), %VMM(1) + VMOVA (VEC_SIZE * 5)(%LOOP_REG), %VMM(2) + VMOVA (VEC_SIZE * 6)(%LOOP_REG), %VMM(3) + VMOVA (VEC_SIZE * 7)(%LOOP_REG), %VMM(4) + + /* Collect bits where VEC_1 does NOT match esi. This is later + used to mask off results (getting non-matches allows us to + save an instruction on combining). */ + VPCMP $4, %VMATCH, %VMM(1), %k1 + + /* Two methods for loop depending on VEC_SIZE. This is because + with zmm registers VPMINU can only run on p0 (as opposed to + p0/p1 for ymm) so it is less preferred. 
*/ +# if VEC_SIZE == 32 + /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to zero. */ - vpxorq %YMM1, %YMM0, %YMM5 - /* For YMM2 and YMM4 cmp not equals to CHAR and store result in - k register. Its possible to save either 1 or 2 instructions - using cmp no equals method for either YMM1 or YMM1 and YMM3 - respectively but bottleneck on p5 makes it not worth it. */ - VPCMP $4, %YMM0, %YMM2, %k2 - vpxorq %YMM3, %YMM0, %YMM7 - VPCMP $4, %YMM0, %YMM4, %k4 - - /* Use min to select all zeros from either xor or end of string). - */ - VPMINU %YMM1, %YMM5, %YMM1 - VPMINU %YMM3, %YMM7, %YMM3 + vpxorq %VMM(2), %VMATCH, %VMM(6) + vpxorq %VMM(3), %VMATCH, %VMM(7) - /* Use min + zeromask to select for zeros. Since k2 and k4 will - have 0 as positions that matched with CHAR which will set - zero in the corresponding destination bytes in YMM2 / YMM4. - */ - VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} - VPMINU %YMM3, %YMM4, %YMM4 - VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} - - VPTESTN %YMM4, %YMM4, %k1 - kmovd %k1, %ecx - subq $-(VEC_SIZE * 4), %rdi - testl %ecx, %ecx + /* Find non-matches in VEC_4 while combining with non-matches + from VEC_1. NB: Try and use masked predicate execution on + instructions that have mask result as it has no latency + penalty. */ + VPCMP $4, %VMATCH, %VMM(4), %k4{%k1} + + /* Combined zeros from VEC_1 / VEC_2 (search for null term). */ + VPMINU %VMM(1), %VMM(2), %VMM(2) + + /* Use min to select all zeros from either xor or end of + string). */ + VPMINU %VMM(3), %VMM(7), %VMM(3) + VPMINU %VMM(2), %VMM(6), %VMM(2) + + /* Combined zeros from VEC_2 / VEC_3 (search for null term). */ + VPMINU %VMM(3), %VMM(4), %VMM(4) + + /* Combined zeros from VEC_2 / VEC_4 (this has all null term and + esi matches for VEC_2 / VEC_3). */ + VPMINU %VMM(2), %VMM(4), %VMM(4) +# else + /* Collect non-matches for VEC_2. */ + VPCMP $4, %VMM(2), %VMATCH, %k2 + + /* Combined zeros from VEC_1 / VEC_2 (search for null term). 
*/ + VPMINU %VMM(1), %VMM(2), %VMM(2) + + /* Find non-matches in VEC_3/VEC_4 while combining with non- + matches from VEC_1/VEC_2 respectively. */ + VPCMP $4, %VMM(3), %VMATCH, %k3{%k1} + VPCMP $4, %VMM(4), %VMATCH, %k4{%k2} + + /* Finish combining zeros in all VECs. */ + VPMINU %VMM(3), %VMM(4), %VMM(4) + + /* Combine in esi matches for VEC_3 (if there was a match with + esi, the corresponding bit in %k3 is zero so the + VPMINU_MASKZ will have a zero in the result). NB: This makes + the VPMINU 3c latency. The only way to avoid it is to + create a 12c dependency chain on all the `VPCMP $4, ...` + which has higher total latency. */ + VPMINU %VMM(2), %VMM(4), %VMM(4){%k3}{z} +# endif + VPTEST %VMM(4), %VMM(4), %k0{%k4} + KMOV %k0, %VRDX + subq $-(VEC_SIZE * 4), %LOOP_REG + + /* TESTZ is inc using the proper register width depending on + CHAR_PER_VEC. An esi match or null-term match leaves a zero- + bit in rdx so inc won't overflow and won't be zero. */ + TESTZ (rdx) jz L(loop_4x_vec) - VPTESTN %YMM1, %YMM1, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x1) + VPTEST %VMM(1), %VMM(1), %k0{%k1} + KMOV %k0, %VGPR(MASK_GPR) + TESTZ (MASK_GPR) +# if VEC_SIZE == 32 + /* We can reuse the return code in page_cross logic for VEC_SIZE + == 32. */ + jnz L(last_vec_x1_vec_size32) +# else + jnz L(last_vec_x1_vec_size64) +# endif + - VPTESTN %YMM2, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax + /* COND_MASK integrates the esi matches for VEC_SIZE == 64. For + VEC_SIZE == 32 they are already integrated. */ + VPTEST %VMM(2), %VMM(2), %k0 COND_MASK(k2) + KMOV %k0, %VRCX + TESTZ (rcx) jnz L(last_vec_x2) - VPTESTN %YMM3, %YMM3, %k0 - kmovd %k0, %eax - /* Combine YMM3 matches (eax) with YMM4 matches (ecx). 
*/ -# ifdef USE_AS_WCSCHR - sall $8, %ecx - orl %ecx, %eax - bsfl %eax, %eax + VPTEST %VMM(3), %VMM(3), %k0 COND_MASK(k3) + KMOV %k0, %VRCX +# if CHAR_PER_VEC == 64 + TESTZ (rcx) + jnz L(last_vec_x3) # else - salq $32, %rcx - orq %rcx, %rax - bsfq %rax, %rax + salq $CHAR_PER_VEC, %rdx + TESTZ (rcx) + orq %rcx, %rdx # endif + + bsfq %rdx, %rdx + # ifndef USE_AS_STRCHRNUL /* Check if match was CHAR or null. */ - cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + cmp (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG jne L(zero_end) # endif /* NB: Multiply sizeof char type (1 or 4) to get the number of bytes. */ - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + leaq (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax ret - .p2align 4,, 8 -L(last_vec_x1): - bsfl %eax, %eax -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. - */ - leaq (%rdi, %rax, CHAR_SIZE), %rax -# else - addq %rdi, %rax +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + ret # endif -# ifndef USE_AS_STRCHRNUL + + /* Separate return label for last VEC1 because for VEC_SIZE == + 32 we can reuse return code in L(page_cross) but VEC_SIZE == + 64 has mismatched registers. */ +# if VEC_SIZE == 64 + .p2align 4,, 8 +L(last_vec_x1_vec_size64): + bsf %VRCX, %VRCX +# ifndef USE_AS_STRCHRNUL /* Check if match was null. */ - cmp (%rax), %CHAR_REG + cmp (%rax, %rcx, CHAR_SIZE), %CHAR_REG jne L(zero_end) -# endif - +# endif +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rax, %rcx, CHAR_SIZE), %rax +# else + addq %rcx, %rax +# endif ret + /* Since we can't combine the last 2x matches for CHAR_PER_VEC + == 64 we need a return label for last VEC3. */ +# if CHAR_PER_VEC == 64 .p2align 4,, 8 +L(last_vec_x3): + addq $VEC_SIZE, %LOOP_REG +# endif + + /* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't + reuse L(first_vec_x3) due to register mismatch. 
*/ L(last_vec_x2): - bsfl %eax, %eax -# ifndef USE_AS_STRCHRNUL + bsf %VGPR(MASK_GPR), %VGPR(MASK_GPR) +# ifndef USE_AS_STRCHRNUL /* Check if match was null. */ - cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + cmp (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG jne L(zero_end) -# endif +# endif /* NB: Multiply sizeof char type (1 or 4) to get the number of bytes. */ - leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + leaq (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax ret +# endif - /* Cold case for crossing page with first load. */ - .p2align 4,, 8 + /* Cold case for crossing page with first load. */ + .p2align 4,, 10 +# ifndef USE_AS_STRCHRNUL L(cross_page_boundary): - movq %rdi, %rdx +# endif +L(cross_page_boundary_real): /* Align rdi. */ - andq $-VEC_SIZE, %rdi - VMOVA (%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPTESTN %YMM2, %YMM2, %k0 - kmovd %k0, %eax + xorq %rdi, %rax + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) + /* Use high latency method of getting matches to save code size. + */ + + /* K1 has 1s where VEC(1) does NOT match esi. */ + VPCMP $4, %VMM(1), %VMATCH, %k1 + /* K0 has ones where K1 is 1 (non-match with esi), and non-zero + (non-null). */ + VPTEST %VMM(1), %VMM(1), %k0{%k1} + KMOV %k0, %VRAX /* Remove the leading bits. */ # ifdef USE_AS_WCSCHR - movl %edx, %SHIFT_REG + movl %edi, %VGPR_SZ(SHIFT_REG, 32) /* NB: Divide shift count by 4 since each bit in K1 represent 4 bytes. */ - sarl $2, %SHIFT_REG - andl $(CHAR_PER_VEC - 1), %SHIFT_REG + sarl $2, %VGPR_SZ(SHIFT_REG, 32) + andl $(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32) + + /* If wcschr we need to reverse matches as we can't rely on + signed shift to bring in ones. There is no sarx for + gpr8/16. Also note we can't use inc here as the lower bits + represent matches out of range so we can't rely on overflow. 
+ */ + xorl $((1 << CHAR_PER_VEC)- 1), %eax +# endif + /* Use arithmetic shift so that leading 1s are filled in. */ + sarx %VGPR(SHIFT_REG), %VRAX, %VRAX + /* If eax is all ones then no matches for esi or NULL. */ + +# ifdef USE_AS_WCSCHR + test %VRAX, %VRAX +# else + inc %VRAX # endif - sarxl %SHIFT_REG, %eax, %eax - /* If eax is zero continue. */ - testl %eax, %eax jz L(cross_page_continue) - bsfl %eax, %eax + .p2align 4,, 10 +L(last_vec_x1_vec_size32): + bsf %VRAX, %VRAX # ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of - bytes. */ - leaq (%rdx, %rax, CHAR_SIZE), %rax + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax # else - addq %rdx, %rax + addq %rdi, %rax # endif # ifndef USE_AS_STRCHRNUL /* Check to see if match was CHAR or null. */ cmp (%rax), %CHAR_REG - je L(cross_page_ret) -L(zero_end): - xorl %eax, %eax -L(cross_page_ret): + jne L(zero_end_0) # endif ret +# ifndef USE_AS_STRCHRNUL +L(zero_end_0): + xorl %eax, %eax + ret +# endif END (STRCHR) #endif |