-rw-r--r--  sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S |   1
-rw-r--r--  sysdeps/x86_64/multiarch/memrchr-avx2.S     | 534
2 files changed, 257 insertions(+), 278 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72d..5e9beeeef2 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ba2ce7cb03..9c83c76d3c 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR			__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
-# define VEC_SIZE 32
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will gurantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecissary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
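
The rewrite above leans on one idiom throughout: keep an end pointer in %rax, compare a vector that ends at the byte of interest, and subtract the leading-zero count of the byte mask from that end pointer. As a reading aid only (this C is not part of the commit, and the function name memrchr_small_sketch is made up here), the following sketch mirrors the len <= VEC_SIZE fast path using AVX2/LZCNT intrinsics. Like the assembly, it assumes the 32-byte load ending at the last byte does not cross a page; the real code branches to L(page_cross) when it would.

/* Illustrative sketch only -- not the committed code.  Mirrors the
   len <= VEC_SIZE path of the new __memrchr_avx2: one unaligned compare
   ending at the last byte, then lzcnt on the byte mask.  Assumes
   len <= 32 and that the 32 bytes ending at s + len - 1 are readable
   without crossing a page (the real code has a page-cross path).  */
#include <immintrin.h>
#include <stddef.h>

static void *
memrchr_small_sketch (const void *s, int c, size_t len)
{
  if (len == 0 || len > 32)
    return NULL;	/* Only the short path is modeled here.  */

  const unsigned char *end = (const unsigned char *) s + len - 1;
  __m256i vc = _mm256_set1_epi8 ((char) c);
  /* Like "vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1": compare the 32
     bytes whose last byte is *end against CHAR.  */
  __m256i chunk = _mm256_loadu_si256 ((const __m256i *) (end - 31));
  unsigned int mask
    = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (chunk, vc));
  /* Bit i of mask corresponds to end - 31 + i, so lzcnt (mask) is the
     distance from 'end' back to the last match.  lzcnt of 0 is 32, which
     is >= len here, so "no match" falls out of the same comparison
     (cmpl %ecx, %edx; jle L(zero_0) in the assembly).  */
  unsigned int dist = _lzcnt_u32 (mask);
  if (dist >= len)
    return NULL;
  return (void *) (end - dist);
}

Compiled with, e.g., gcc -O2 -mavx2 -mlzcnt, this shows why the new code can return with one lzcnt plus one subtraction from the end pointer, where the old code needed bsr followed by per-block address adjustments.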