/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

	/* Use evex-masked stores for small sizes.  Turned off at the
	   moment.  */
# define USE_EVEX_MASKED_STORE	0

# include <sysdep.h>
# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifndef STRNCPY
#  define STRNCPY	__strncpy_evex
# endif

# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK	vmovdqu32
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define CHAR_SIZE	4

#  define REP_MOVS	rep movsd
#  define REP_STOS	rep stosl

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK	vmovdqu8
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define CHAR_SIZE	1

#  define REP_MOVS	rep movsb
#  define REP_STOS	rep stosb
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

# define VZERO	VMM(7)
# define VZERO_256	VMM_256(7)
# define VZERO_128	VMM_128(7)

# if VEC_SIZE == 64
#  define VZERO_HALF	VZERO_256
# else
#  define VZERO_HALF	VZERO_128
# endif

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return, very long strings are handled by
	   just running rep stos{b|l} to zero-set the buffer (which will
	   almost certainly segfault); if that succeeds then
	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy) finishes the
	   copy.  */
# ifdef USE_AS_WCSCPY
	decq	%rdx
	movq	%rdx, %rax
	/* 56 is end of max supported address space.  */
	shr	$56, %rax
	jnz	L(zero_len)
# else
	decq	%rdx
	/* If the flag needs to become `jb` replace `dec` with `sub`.  */
	jl	L(zero_len)
# endif

	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
	movl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX

	/* If no STPCPY just save end ahead of time.  */
# ifndef USE_AS_STPCPY
	movq	%rdi, %rax
# endif

	cmpq	$(CHAR_PER_VEC), %rdx

	/* If USE_EVEX_MASK_STORE is enabled then we just handle length
	   <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */
	jae	L(more_1x_vec)

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset but that's fine since any oversets were
	   at zero-positions anyway.  */

# ifdef USE_AS_STPCPY
	tzcnt	%VRCX, %VRAX
	cmpl	%eax, %edx
	cmovb	%edx, %eax
# ifdef USE_AS_WCSCPY
	adcl	$0, %eax
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	adcq	%rdi, %rax
# endif
# endif
	dec	%VRCX
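	/* Subtracting one from the match mask sets every bit below the
	   first zero-CHAR match (and clears that match's bit), so the
	   zero-masked move below keeps only the CHARs before the first
	   null and zeroes everything at or after it.  */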
	/* Zero out all non-zero CHAR's after the first zero match.  */
	KMOV	%VRCX, %k1

	/* Use VZERO as destination so this can be reused for
	   L(zfill_less_vec) (which if jumped to by subsequent logic
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
L(zfill_less_vec):
	/* Get mask for what we need to set.  */
	incl	%edx
	mov	$-1, %VRCX
	bzhi	%VRDX, %VRCX, %VRCX
	KMOV	%VRCX, %k1
	VMOVU_MASK %VZERO, (%rdi){%k1}
	ret

	.p2align 4,, 4
L(zero_len):
	cmpq	$-1, %rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret

	.p2align 4,, 8
L(more_1x_vec):
# else
	/* `jb` because length rdx is now length - 1.  */
	jb	L(less_1x_vec)
# endif

	/* This may overset but that's fine because we still need to
	   zero fill.  */
	VMOVU	%VMM(0), (%rdi)

	/* Length must be >= CHAR_PER_VEC so match here means we must
	   zero-fill.  */
	test	%VRCX, %VRCX
	jnz	L(zfill)

	/* We are going to align rsi here so will need to be able to
	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	subq	%rsi, %rdi
	andq	$-(VEC_SIZE), %rsi

L(loop_last_4x_vec):
	addq	%rsi, %rdi
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(more_2x_vec)

L(last_2x_vec):
	/* This will need to be computed no matter what.  We do it ahead
	   of time for CHAR_PER_VEC == 64 because we can't adjust the
	   value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64
	tzcntq	%rcx, %rcx
# endif

	cmpl	$(CHAR_PER_VEC), %edx
	jb	L(ret_vec_x1_len)

	/* Separate logic for CHAR_PER_VEC == 64 because we already did
	   `tzcnt` on VRCX.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb	$CHAR_PER_VEC, %cl
	jnz	L(ret_vec_x1_no_bsf)
# else
	test	%VRCX, %VRCX
	jnz	L(ret_vec_x1)
# endif

	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	KMOV	%k0, %VRCX

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the computed result.  */
	shlq	$CHAR_PER_VEC, %rcx
# else
	tzcntq	%rcx, %rcx
	addl	$CHAR_PER_VEC, %ecx
# endif

	.p2align 4,, 4
L(ret_vec_x1_len):
	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
	   already been done.  */
# if CHAR_PER_VEC < 64
	tzcntq	%rcx, %rcx
# endif
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal	(VEC_SIZE)(%rdx), %eax
	adcq	%rdi, %rax
# endif
# endif
	ret

	.p2align 4,, 10
L(ret_vec_x1):
	bsf	%VRCX, %VRCX
L(ret_vec_x1_no_bsf):
	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	cmpl	$CHAR_PER_VEC, %edx
	jb	L(ret_vec_x1_len_no_zfill_mov)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret
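	/* Tail when the total length is at most 8x VEC: reduce the
	   length modulo (CHAR_PER_VEC * 4), advance src/dst by 4x VEC,
	   then either finish in L(last_2x_vec) or fall through to
	   L(more_2x_vec).  */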
	.p2align 4,, 8
L(last_4x_vec):
	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
	   using `movzbl`.  */
# if CHAR_PER_VEC == 64
	movzbl	%dl, %edx
# else
	andl	$(CHAR_PER_VEC * 4 - 1), %edx
# endif
	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0
	KMOV	%k0, %VRCX

	subq	$-(VEC_SIZE * 4), %rsi
	subq	$-(VEC_SIZE * 4), %rdi

	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
	jbe	L(last_2x_vec)

	.p2align 4,, 8
L(more_2x_vec):
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	test	%VRCX, %VRCX
	/* Must fill at least 2x VEC.  */
	jnz	L(zfill_vec1)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	/* Must fill at least 1x VEC.  */
	jnz	L(zfill_vec2)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN	%VMM(3), %VMM(3), %k0
	KMOV	%k0, %VRCX

	/* Check if len is more than 4x VEC.  -1 because rdx is
	   len - 1.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
	ja	L(more_4x_vec)

	subl	$(CHAR_PER_VEC * 3), %edx
	jb	L(ret_vec_x3_len)

	test	%VRCX, %VRCX
	jnz	L(ret_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	KMOV	%k0, %VRCX
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x4_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
	movl	%ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
	adcq	%rdi, %rax
# endif
# endif
	ret

L(ret_vec_x3_len):
	addl	$(CHAR_PER_VEC * 1), %edx
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len_no_zfill)
	/* Fall through (expectation) is copy len < buffer len.  */
	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* clear flags.  */
	xorl	%ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
# else
	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
	adcq	%rdi, %rax
# endif
# endif
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsf	%VRCX, %VRCX
	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
	subl	%ecx, %edx
	jl	L(ret_vec_x3_len_no_zfill_mov)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 8
L(more_4x_vec):
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec4)

	/* Recheck length before aligning.  */
	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rsi, %rdx
# endif
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 5), %rsi
	andq	$(VEC_SIZE * -4), %rsi
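	/* rdx now points at the last CHAR of the length-n source region
	   and rdi holds dst - src, so the copy loop below only advances
	   rsi and forms destination addresses as (%rdi, %rsi).  */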
	/* Load first half of the loop before entry.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4

	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq	$-(VEC_SIZE), %rsi
	KORTEST	%k2, %k4
	jnz	L(loop_4x_done)

	/* Store loop end in r9.  */
	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq	$(VEC_SIZE * -4), %rsi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4
	KORTEST	%k2, %k4
	jz	L(loop_4x_vec)

L(loop_4x_done):
	/* Restore rdx (length).  */
	subq	%rsi, %rdx
# ifdef USE_AS_WCSCPY
	shrq	$2, %rdx
# endif
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */
	addq	%rsi, %rdi
	VPTESTN	%VMM(0), %VMM(0), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec1)

	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
	KMOV	%k2, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec2)

	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0
	KMOV	%k0, %VRCX
	test	%VRCX, %VRCX
	jnz	L(zfill_vec3)

	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
	KMOV	%k4, %VRCX
	// Zfill more....

	.p2align 4,, 4
L(zfill_vec4):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
L(zfill_vec2):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -1), %rdx
L(zfill):
	/* VRCX must be non-zero.  */
	bsf	%VRCX, %VRCX

	/* Adjust length / dst for zfill.  */
	subq	%rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_from_page_cross):
	/* From here on out it's just memset(rdi, 0, rdx).  */
	cmpq	$CHAR_PER_VEC, %rdx
	jb	L(zfill_less_vec)

L(zfill_more_1x_vec):
	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
	ja	L(zfill_more_2x_vec)
L(zfill_done0):
	ret

	/* Coming from vec1/vec2 we must be able to zfill at least 2x
	   VEC.  */
	.p2align 4,, 8
L(zfill_vec3):
	subq	$(VEC_SIZE * -2), %rdi
	addq	$(CHAR_PER_VEC * -2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfq	%rcx, %rcx
	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.  */
	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif

	VMOVU	%VZERO, (%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jb	L(zfill_done0)

L(zfill_more_2x_vec):
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
	jbe	L(zfill_done)

# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
# else
	addq	%rdi, %rdx
# endif

	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)

	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	jbe	L(zfill_done)
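	/* The aligned stores in the loop below may re-cover bytes
	   already zeroed by the unaligned stores above; re-zeroing
	   them is harmless.  */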
	/* Align rdi and zfill loop.  */
	andq	$-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(zfill_loop_4x_vec)

L(zfill_done):
	ret

	/* Less than 1x VEC case if we are not using evex masked
	   store.  */
# if !USE_EVEX_MASKED_STORE
	.p2align 4,, 8
L(copy_1x):
	/* Special case for copy 1x.  It can be handled quickly and many
	   buffer sizes have convenient alignment.  */
	VMOVU	%VMM(0), (%rdi)
	/* If no zeros then we are done.  */
	testl	%ecx, %ecx
	jz	L(ret_1x_1x)

	/* Need to zfill, note we know that length <= CHAR_PER_VEC so we
	   only handle the small case here.  */
	bsf	%VRCX, %VRCX
L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
	subq	%rcx, %rdx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif

L(zfill_less_vec):
	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb	L(zfill_less_half)

	VMOVU	%VZERO_HALF, (%rdi)
	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	ret
# ifdef USE_AS_STPCPY
L(ret_1x_1x):
	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
	ret
# endif

# if VEC_SIZE == 64
	.p2align 4,, 4
L(copy_32_63):
	/* Overfill to avoid branches.  */
	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

	/* We are taking advantage of the fact that to be here we must
	   be writing the null-term at (%rdi, %rcx), so we have a byte
	   of leeway for overwriting.  */
	cmpl	%ecx, %edx
	ja	L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl	%edx, %eax
	adcq	%rdi, %rax
# endif
# endif
	ret
# endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU	%VMM_128(0), (%rdi)
	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx

	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
	   we have a larger copy block for 32-63 so this just falls
	   through to zfill 16-31.  If VEC_SIZE == 32 then we check for
	   full zfill of less than 1x VEC.  */
# if VEC_SIZE == 64
	jbe	L(ret_16_31)
	subl	%ecx, %edx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_less_half):
L(zfill_less_32):
	cmpl	$(16 / CHAR_SIZE), %edx
	jb	L(zfill_less_16)
	VMOVU	%VZERO_128, (%rdi)
	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	ret
# endif
L(ret_16_31):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl	%edx, %eax
	adcq	%rdi, %rax
# endif
# endif
	ret
# else
	/* VEC_SIZE == 32 begins.  */
	ja	L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_1x_1x):
# else
# ifdef USE_AS_WCSCPY
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl	%edx, %eax
	adcq	%rdi, %rax
# endif
# endif
	ret
# endif
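	/* This range is copied with two possibly-overlapping 8-byte
	   moves: the first 8 bytes of the region and the last 8 bytes,
	   so no branch on the exact length is needed.  */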
	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	%VMM_128(0), (%rdi)
	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpl	%ecx, %edx
	jbe	L(ret_8_15)
	subl	%ecx, %edx
# ifdef USE_AS_WCSCPY
	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
# else
	addq	%rcx, %rdi
# endif
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	.p2align 4,, 8
# if VEC_SIZE == 32
L(zfill_less_half):
# endif
L(zfill_less_16):
	xorl	%ecx, %ecx
	cmpl	$(8 / CHAR_SIZE), %edx
	jb	L(zfill_less_8)
	movq	%rcx, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
	ret

	.p2align 4,, 8
L(less_1x_vec):
	je	L(copy_1x)

	/* We will need `tzcnt` result for all other copy sizes.  */
	tzcnt	%VRCX, %VRCX
# if VEC_SIZE == 64
	cmpl	$(32 / CHAR_SIZE), %edx
	jae	L(copy_32_63)
# endif

	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(copy_16_31)

	cmpl	$(8 / CHAR_SIZE), %edx
	jae	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	testl	%ecx, %ecx
	jz	L(zfill_less_8_set_ret)

	movl	(%rsi, %rdx, CHAR_SIZE), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	cmpl	%ecx, %edx
L(ret_8_15):
	adcq	$0, %rdx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
# endif
	ret
L(zfill_less_8_set_ret):
	xorl	%ecx, %ecx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_less_8):
	movl	%ecx, (%rdi)
	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
	ret
# else
	cmpl	$3, %edx
	jb	L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl	-3(%rsi, %rdx), %esi
	vmovd	%VMM_128(0), (%rdi)
	movl	%esi, -3(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_4_7)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	xorl	%ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl	$3, %edx
	jb	L(zfill_less_3)
	movl	%ecx, (%rdi)
	movl	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	ret
# endif

L(ret_4_7):
# ifdef USE_AS_STPCPY
L(ret_8_15):
	movl	%edx, %eax
	adcq	%rdi, %rax
# endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl	%edx, %edx
	jz	L(zfill_1)
	movw	%cx, (%rdi)
L(zfill_1):
	movb	%cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd	%VMM_128(0), %r8d
	testl	%edx, %edx
	jz	L(copy_1)
	movw	%r8w, (%rdi)
	cmpl	%ecx, %edx
	ja	L(zfill_from_1)
	movzbl	(%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY
	movl	%edx, %eax
	adcq	%rdi, %rax
	movb	%r8b, (%rdi, %rdx)
	ret
# endif

L(copy_1):
# ifdef USE_AS_STPCPY
	movl	%edx, %eax
	cmpl	%ecx, %edx
	adcq	%rdi, %rax
# endif
# ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
# else
	movb	%r8b, (%rdi, %rdx)
# endif
	ret
# endif

# ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
# endif
	movw	$0, -1(%rdi, %rdx)
	ret
# endif

	.p2align 4,, 4
L(zero_len):
	incq	%rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
# endif

	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
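	/* An unaligned VEC load from rsi would cross into the next page
	   and might fault.  Compare the VEC_SIZE-aligned block
	   containing rsi instead (it cannot cross the page) and shift
	   the match mask right by rsi's misalignment so only CHARs at
	   or after rsi are considered.  */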
	movq	%rsi, %rax
	andq	$(VEC_SIZE * -1), %rax
	VPCMPEQ	(%rax), %VZERO, %k0
	KMOV	%k0, %VRCX
# ifdef USE_AS_WCSCPY
	movl	%esi, %r8d
	shrl	$2, %r8d
	andl	$(CHAR_PER_VEC - 1), %r8d
	shrx	%VR8, %VRCX, %VRCX
# else
	shrx	%VRSI, %VRCX, %VRCX
# endif

	/* Compute the number of bytes we checked.  */
	subl	%esi, %eax
	andl	$(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY
	shrl	$2, %eax
# endif

	/* If rax > rdx then we are finishing the copy at the end of the
	   page.  */
	cmpq	%rax, %rdx
	jb	L(page_cross_small)

	/* If rcx is non-zero then continue.  */
	test	%VRCX, %VRCX
	jz	L(page_cross_continue)

	/* We found a zero-CHAR so need to copy then zfill (we know we
	   didn't cover all of length here).  */
	bsf	%VRCX, %VRCX
L(movsb_and_zfill):
	incl	%ecx
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# else
	movq	%rdi, %rax
# endif

	REP_MOVS
# ifdef USE_AS_WCSCPY
	movl	$0, (%rdi)
# else
	movb	$0, (%rdi)
# endif
	jmp	L(zfill_from_page_cross)

L(page_cross_small):
	tzcnt	%VRCX, %VRCX
	cmpl	%ecx, %edx
	jbe	L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq	%rdi, %r9
	xorl	%eax, %eax

	movl	%ecx, %r8d

	subl	%ecx, %edx
	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
	movl	%edx, %ecx
	REP_STOS

	movq	%r9, %rdi
	movl	%r8d, %edx
L(page_cross_copy_only):
	leal	1(%rdx), %ecx
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	adcl	$0, %edx
	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
# else
	movl	%edx, %eax
	adcq	%rdi, %rax
# endif
# else
	movq	%rdi, %rax
# endif
	REP_MOVS
	ret

L(best_effort_strncpy):
	movq	%rdx, %rcx
	xorl	%eax, %eax
	movq	%rdi, %r8
	/* The length is >= 2^63.  We very much expect to segfault at
	   rep stos.  If that doesn't happen then just strcpy to
	   finish.  */
	REP_STOS
	movq	%r8, %rdi
	jmp	OVERFLOW_STRCPY
END(STRNCPY)
#endif