/* strncat with AVX2
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT	__strncat_avx2
# endif

# ifdef USE_AS_WCSCPY
#  define MOVCHAR	movl
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define MOVCHAR	movb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return, very long strings are handled by
	   using the non-length variant {wcs|str}cat.  */
	movq	%rdi, %rax
# ifdef USE_AS_WCSCPY
	leaq	-1(%rdx), %rcx
	shr	$56, %rcx
	jnz	L(zero_len)
	salq	$2, %rdx
# else
	test	%rdx, %rdx
	jle	L(zero_len)
# endif

	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

# include "strcat-strlen-avx2.h.S"

	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %ecx
	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
	ja	L(page_cross)
L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	tzcnt	%ecx, %r8d
	cmpq	%r8, %rdx
	jbe	L(less_1x_vec)

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* Hoist this to save code size.  */
	movl	%r8d, %edx

L(less_1x_vec):
	COND_VZEROUPPER

	cmpl	$16, %edx
	jae	L(copy_16_31)
	cmpl	$8, %edx
	jae	L(copy_8_15)

# ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# else
	cmpl	$4, %edx
	jae	L(copy_4_7)

	movzbl	(%rsi), %ecx
	cmpl	$1, %edx
	jbe	L(set_null_term)

	/* NB: make this `vmovw` if support for AVX512-FP16 is added.  */
	movzwl	1(%rsi), %esi
	movw	%si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb	%cl, (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 11
L(copy_4_7):
	movl	-(4)(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -(4)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# endif

	.p2align 4,, 10
L(copy_16_31):
	VMOVU	-(16)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 10
L(copy_8_15):
	movq	-(8)(%rsi, %rdx), %rcx
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 8
	.p2align 6,, 14
L(more_1x_vec):
	VMOVU	%VMM(0), (%rdi)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
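	/* The sequence below converts rdx from a length into an end
	   pointer (src + len), rounds rsi up to the next VEC_SIZE
	   boundary, and rebases rdi by the same amount so matching
	   offsets in src and dst stay in sync.  L(loop_last_4x_vec)
	   then recovers the remaining length as rdx - rsi.  */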
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	incq	%rsi
	addq	%rsi, %rdi
L(loop_last_4x_vec):
	subq	%rsi, %rdx
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)

L(last_2x_vec):
	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x2_len)
L(ret_vec_x2):
	bsfl	%ecx, %edx
L(ret_vec_x2_len):
	VMOVU	(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rdx)
	VMOVU	%VMM(0), (%rdi, %rdx)
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 12
L(ret_vec_x1_len):
	movl	%edx, %ecx
L(ret_vec_x1):
	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
	MOVCHAR	$0, (%rdi, %rcx)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(last_4x_vec):
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	subq	$-(VEC_SIZE * 4), %rdi
	addl	$-(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	.p2align 4,, 8
L(more_2x_vec):
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* Check if length is greater than 4x VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	addl	$(VEC_SIZE * -2), %edx

	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x4_len)
L(ret_vec_x4):
	bsfl	%ecx, %edx
L(ret_vec_x4_len):
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl	%edx, %ecx
L(ret_vec_x3):
	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rcx)
	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_4x_vec):
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)

	/* Recheck length before aligning.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	andq	$(VEC_SIZE * -4), %rsi

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %r8d
	addq	%rsi, %rdi
	testl	%r8d, %r8d
	jnz	L(loop_4x_done)

	/* Use r9 for end of region before handling last 4x VEC
	   specially.  */
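	/* r9 = (src end) - 4 * VEC_SIZE.  Once rsi reaches r9, at most
	   4x VEC_SIZE bytes of the bounded region remain, so the main
	   loop exits to L(loop_last_4x_vec), which re-applies the
	   length checks for the tail.  */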
	leaq	-(VEC_SIZE * 4)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

	subq	$(VEC_SIZE * -4), %rdi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %r8d

	testl	%r8d, %r8d
	jz	L(loop_4x_vec)

L(loop_4x_done):
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)

	bsfl	%r8d, %r8d
	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(page_cross):
	movq	%rsi, %r8
	andq	$(VEC_SIZE * -1), %r8

	VPCMPEQ	(%r8), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx

	subl	%esi, %r8d
	andl	$(VEC_SIZE - 1), %r8d
	cmpq	%r8, %rdx
	jbe	L(page_cross_small)

	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines.  */

	/* This adds once to the later result which will get correct
	   copy bounds. NB: this can never zero-out a non-zero RCX as
	   to be in the page cross case rsi cannot be aligned and we
	   already right-shift rcx by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
	rep	movsb
	VZEROUPPER_RETURN

L(page_cross_small):
	tzcntl	%ecx, %ecx
	jz	L(page_cross_setz)
	cmpl	%edx, %ecx
	cmova	%edx, %ecx
	rep	movsb
L(page_cross_setz):
	MOVCHAR	$0, (%rdi)
	VZEROUPPER_RETURN

L(zero_len):
# ifdef USE_AS_WCSCPY
	test	%rdx, %rdx
# endif
	jnz	OVERFLOW_STRCAT
	ret

END(STRNCAT)
#endif