/* {wcs|str}ncat with 256/512-bit EVEX.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

	/* Use evex-masked stores for small sizes. Turned off at the
	   moment.  */
# define USE_EVEX_MASKED_STORE	0

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT	__strncat_evex
# endif


# ifdef USE_AS_WCSCPY
#  define MOVCHAR	movl
#  define VMOVU_MASK	vmovdqu32
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4

#  define REP_MOVS	rep movsd

#  define VMASK_REG	VR10
#  define FIND_FIRST_ONE(src, dst)	movl $CHAR_PER_VEC, %dst; bsf %src, %dst

#  define USE_WIDE_CHAR
# else
#  define MOVCHAR	movb
#  define VMOVU_MASK	vmovdqu8
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1

#  define REP_MOVS	rep movsb

#  define VMASK_REG	VRCX
#  define FIND_FIRST_ONE(src, dst)	tzcnt %src, %dst

# endif

# include "strncpy-or-cat-overflow-def.h"

# include "reg-macros.h"


# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif

	movq	%rdi, %rax

	/* NB: It's safe to filter out zero-length strings WITHOUT
	   setting null-term. Destination MUST be a null-terminated
	   string so essentially the work is already done.  */
# ifdef USE_AS_WCSCPY
	leaq	-1(%rdx), %rcx
	shrq	$56, %rcx
	jnz	L(zero_len)
# else
	test	%rdx, %rdx
	jle	L(zero_len)
# endif

# include "strcat-strlen-evex.h.S"

	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %ecx
	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
	ja	L(page_cross)
L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPTESTN	%VMM(0), %VMM(0), %k0

	/* If USE_EVEX_MASK_STORE is enabled then we just handle length
	   <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	KMOV	%k0, %VRCX
	FIND_FIRST_ONE (VRCX, VR8)
	cmpq	%r8, %rdx
	jbe	L(less_1x_vec)

	test	%VRCX, %VRCX
	jz	L(more_1x_vec)

	blsmsk	%VRCX, %VRCX
	KMOV	%VRCX, %k1
	VMOVU_MASK %VMM(0), (%rdi){%k1}
	ret

L(less_1x_vec):
	mov	$-1, %VRCX
	bzhi	%VRDX, %VRCX, %VRCX
	KMOV	%VRCX, %k1
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	VMOVU_MASK %VMM(0), (%rdi){%k1}

	ret
# else
	KMOV	%k0, %VMASK_REG
	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
	   %VMASK_REG, %VRCX` for wcsncat.  */
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpq	%rcx, %rdx
	jbe	L(less_1x_vec)

	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
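	/* In rough C-like pseudo-code the dispatch here is:
	       pos = ecx;   position of first zero-CHAR, or CHAR_PER_VEC
	                    if this VEC has none
	       n   = rdx;   remaining length
	       if (n <= pos)                 goto less_1x_vec;  (above)
	       else if (pos == CHAR_PER_VEC) goto more_1x_vec;
	       else { n = pos; fall through to less_1x_vec; }  */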
	cmpl	$CHAR_PER_VEC, %ecx
	je	L(more_1x_vec)

	movl	%ecx, %edx

L(less_1x_vec):
#  if VEC_SIZE == 64
	cmpl	$(32 / CHAR_SIZE), %edx
	jae	L(copy_32_63)
#  endif

	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(copy_16_31)

	cmpl	$(8 / CHAR_SIZE), %edx
	jae	L(copy_8_15)

#  ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  else

	cmpl	$4, %edx
	jae	L(copy_4_7)

	movzbl	(%rsi), %ecx
	cmpl	$1, %edx
	jbe	L(set_null_term)

	movzwl	1(%rsi), %esi
	movw	%si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb	%cl, (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
#  endif

#  if VEC_SIZE == 64
	.p2align 4,, 6
L(copy_32_63):
	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif

	.p2align 4,, 6
L(copy_16_31):
	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
	   and will save code size.  */
	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU	%VMM_128(0), (%rdi)
	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 2
L(copy_8_15):
	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
	vmovq	%VMM_128(0), (%rdi)
	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	ret

#  ifndef USE_AS_WCSCPY
	.p2align 4,, 12
L(copy_4_7):
	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd	%VMM_128(0), (%rdi)
	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	ret
#  endif
# endif

	.p2align 4,, 4
L(zero_len):
# ifdef USE_AS_WCSCPY
	test	%rdx, %rdx
# endif
	jne	OVERFLOW_STRCAT
	ret

	.p2align 4,, 8
L(more_1x_vec):
	VMOVU	%VMM(0), (%rdi)

	/* We are going to align rsi here so will need to be able to
	   re-adjust rdi/rdx afterwards. NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq	%rsi, %rdi
	andq	$-(VEC_SIZE), %rsi

L(loop_last_4x_vec):
	addq	%rsi, %rdi
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif

	/* Will need this regardless.  */
	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VMASK_REG

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	ja	L(more_2x_vec)

L(last_2x_vec):
	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len)

	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
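	/* The L(ret_vec_xN)/L(ret_vec_xN_len) blocks below finish the
	   copy with one unaligned VEC load/store whose last CHAR is the
	   final CHAR to copy (chosen by the zero-CHAR position in rcx
	   or the length limit in rdx), possibly overlapping CHARs that
	   earlier stores already wrote, and then store the null
	   terminator immediately after it.  */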
	cmpl	$CHAR_PER_VEC, %ecx
	jne	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	addl	$-CHAR_PER_VEC, %edx
	bzhi	%VRDX, %VRCX, %VR8
	jz	L(ret_vec_x2_len)
L(ret_vec_x2):
	bsf	%VRCX, %VRDX
L(ret_vec_x2_len):
	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x1_len):
	movl	%edx, %ecx
L(ret_vec_x1):
	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(last_4x_vec):
	addl	$-(CHAR_PER_VEC * 4), %edx
	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VMASK_REG

	subq	$-(VEC_SIZE * 4), %rsi
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(CHAR_PER_VEC * 2), %edx
	jbe	L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
# ifdef USE_AS_WCSCPY
	xorl	%ecx, %ecx
# endif
	bsf	%VMASK_REG, %VRCX
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VMASK_REG

	cmpq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(more_4x_vec)

	/* Adjust length before going to L(ret_vec_x3_len) or
	   L(ret_vec_x3).  */
	addl	$(CHAR_PER_VEC * -2), %edx

	FIND_FIRST_ONE (VMASK_REG, VRCX)
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len)

	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl	$CHAR_PER_VEC, %ecx
	jne	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	addl	$-CHAR_PER_VEC, %edx
	bzhi	%VRDX, %VRCX, %VR8
	jz	L(ret_vec_x4_len)
L(ret_vec_x4):
	bsf	%VRCX, %VRDX
L(ret_vec_x4_len):
	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl	%edx, %ecx
L(ret_vec_x3):
	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 8
L(more_4x_vec):
# ifdef USE_AS_WCSCPY
	xorl	%ecx, %ecx
# endif
	bsf	%VMASK_REG, %VRCX
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)

	/* Check if we are near the end before aligning.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	jbe	L(last_4x_vec)

	/* Add rsi to rdx (length) before aligning rsi. NB: Since we
	   filtered out huge lengths this cannot overflow.  */
# ifdef USE_AS_WCSCPY
	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rsi, %rdx
# endif

	/* Subtract rsi from rdi before aligning (add back will have
	   correct rdi for aligned rsi).  */
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 5), %rsi
	andq	$(VEC_SIZE * -4), %rsi

	/* Load first half of the loop before entry.  */
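	/* Each iteration of the 4x loop below first stores the four
	   VECs that the previous iteration (or this pre-entry check)
	   already verified contain no zero-CHAR, then loads the next
	   four and folds them pairwise with VPMIN (the unsigned min has
	   a zero CHAR exactly where at least one input does) so that
	   two VPTESTN plus one KORTEST test all four at once.  The loop
	   falls out to L(loop_4x_done) once a zero-CHAR is seen, or
	   branches to L(loop_last_4x_vec) when the next 4x VEC block
	   may cross the length limit.  */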
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4

	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq	$-(VEC_SIZE), %rsi
	KORTEST %k2, %k4
	jnz	L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq	-(VEC_SIZE * 5)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq	$(VEC_SIZE * -4), %rsi

	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz	L(loop_4x_vec)

L(loop_4x_done):
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX
	/* Restore rdi (dst).  */
	addq	%rsi, %rdi

	/* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
	   test with bsf.  */
	bsf	%VRCX, %VRCX
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)

	KMOV	%k2, %VRCX
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	bsf	%VRCX, %VRCX
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)

	KMOV	%k4, %VRCX
	bsf	%VRCX, %VRCX
	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 4
L(page_cross):
	movq	%rsi, %r8
	andq	$(VEC_SIZE * -1), %r8
	VPCMPEQ	(%r8), %VZERO, %k0

# ifdef USE_AS_WCSCPY
	KMOV	%k0, %VR9
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
	shrx	%VRCX, %VR9, %VRCX
# else
	KMOV	%k0, %VRCX
	shrx	%VRSI, %VRCX, %VRCX
# endif

	subl	%esi, %r8d
	andl	$(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY
	shrl	$2, %r8d
# endif
	cmpq	%r8, %rdx
	jbe	L(page_cross_small)

	/* Optimizing more for space as this is very cold code. This
	   saves 2x cache lines.  */

	/* This adds once to the later result which will get correct
	   copy bounds. NB: this can never zero-out a non-zero RCX as
	   to be in the page cross case rsi cannot be aligned and we
	   already right-shift rcx by the misalignment.  */
	shl	%VRCX
	jz	L(page_cross_continue)
	bsf	%VRCX, %VRCX
	REP_MOVS
	ret

L(page_cross_small):
	tzcnt	%VRCX, %VRCX
	jz	L(page_cross_setz)
	cmpl	%edx, %ecx
	cmova	%edx, %ecx

# ifdef USE_AS_WCSCPY
	rep	movsd
# else
	rep	movsb
# endif
L(page_cross_setz):
	MOVCHAR	$0, (%rdi)
	ret
END(STRNCAT)
#endif