/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_evex
# endif

# define PAGE_SIZE	4096
# define VECMATCH	VEC(0)
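
/* A rough scalar sketch of the semantics implemented below, assuming the
   standard memrchr contract (the C is purely illustrative):

	void *
	memrchr (const void *s, int c, size_t len)
	{
	  const unsigned char *p = (const unsigned char *) s + len;
	  while (len--)
	    if (*--p == (unsigned char) c)
	      return (void *) p;
	  return NULL;
	}

   The vector code keeps rax = s + len - 1, so that given a 32-bit match
   mask for a 32-byte load ending at rax, the address of the last match
   is simply rax - lzcnt (mask).  */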

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer.  Minus one for two reasons: 1) it is
	   necessary for a correct page cross check and 2) it sets up
	   the end ptr so that the match address can be computed
	   directly as end ptr minus lzcnt of the match mask.  */
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)

	/* Don't use rax for the pointer here because EVEX has better
	   encoding with offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches) lzcnt will set it to 32
	   (VEC_SIZE), which guarantees edx (len) is less than or equal
	   to it.  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for the pointer here because EVEX has better
	   encoding with offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt.  This will naturally add 32 to position.  */
	lzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Inexpensive place to put this regarding code size / target
	   alignments / ICache NLP.  Necessary for 2-byte encoding of
	   jump to page cross case which in turn is necessary for hot
	   path (len <= VEC_SIZE) to fit in first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from
	   endptr and working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero labels that fit in aligning bytes and
	   get 2-byte encoding / are in the same cache line as the
	   condition.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* This will naturally add 32 to position.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx.  NB: Can't just do while rax >
	   rdi because lengths that overflow can be valid and break the
	   comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx
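
	/* The loop below checks 4 VECs (128 bytes) per iteration and
	   folds three of the four compares into a single vptestnmb via
	   the zero-masked vpminub.  A minimal sketch of the detection
	   logic using AVX512VL/BW intrinsics from <immintrin.h>; the
	   function name and the pointer p are illustrative only:

		static int
		any_match_in_4_vecs (const unsigned char *p, __m256i match)
		{
		  __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 96));
		  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 64));
		  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
		  __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (p + 0));

		  // A 0-bit in k1 and a 0-byte in x1/x2 mark matching bytes.
		  __mmask32 k1 = _mm256_cmpneq_epi8_mask (v3, match);
		  __m256i x2 = _mm256_xor_si256 (v2, match);
		  __m256i x1 = _mm256_xor_si256 (v1, match);
		  __mmask32 k4 = _mm256_cmpeq_epi8_mask (v0, match);

		  // Zero-masked min: a byte of m is 0 iff that lane of
		  // v1, v2 or v3 equals the broadcast character.
		  __m256i m = _mm256_maskz_min_epu8 (k1, x2, x1);
		  __mmask32 k2 = _mm256_testn_epi8_mask (m, m);

		  return (k2 | k4) != 0;	// kortestd %k2, %k4
		}

	   On a hit, L(loop_end) walks the four results from the highest
	   address down to recover the last match.  */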

	.p2align 4
L(loop_4x_vec):
	/* Store 1 where not-equal and 0 where equal in k1 (used to
	   mask later on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
	   where CHAR is found and VEC(2/3) have zero-byte where CHAR is
	   found).  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR
	   in VEC3) then it won't affect the result in esi (VEC4).  If
	   ecx is non-zero then CHAR in VEC3 and bsrq will use that
	   position.  */
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret
END(MEMRCHR)
#endif