Diffstat (limited to 'sysdeps/x86_64/multiarch/strcpy-evex.S')
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-evex.S | 1282
1 file changed, 409 insertions, 873 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S index 82e45ac675..932129ab40 100644 --- a/sysdeps/x86_64/multiarch/strcpy-evex.S +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -1,4 +1,4 @@ -/* strcpy with 256-bit EVEX instructions. +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. Copyright (C) 2021-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,990 +17,526 @@ <https://www.gnu.org/licenses/>. */ #include <isa-level.h> - #if ISA_SHOULD_BUILD (4) -# ifndef USE_AS_STRCAT -# include <sysdep.h> + /* Use evex-masked stores for small sizes. Turned off at the + moment. */ +# define USE_EVEX_MASKED_STORE 0 + /* Use movsb in page cross case to save code size. */ +# define USE_MOVSB_IN_PAGE_CROSS 1 -# ifndef STRCPY -# define STRCPY __strcpy_evex -# endif +# include <sysdep.h> +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -/* Number of bytes in a vector register */ -# ifndef VEC_SIZE -# define VEC_SIZE 32 +# ifndef STRCPY +# define STRCPY __strcpy_evex # endif -# define XMM2 xmm18 -# define XMM3 xmm19 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 -# define YMM7 ymm23 +# ifdef USE_AS_WCSCPY +# define VMOVU_MASK vmovdqu32 +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 -# ifndef USE_AS_STRCAT +# define REP_MOVS rep movsd -/* zero register */ -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM1 ymm17 - - .section .text.evex,"ax",@progbits -ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP - test %R8_LP, %R8_LP - jz L(ExitZero) -# endif - mov %rsi, %rcx -# ifndef USE_AS_STPCPY - mov %rdi, %rax /* save result */ -# endif +# define USE_WIDE_CHAR +# else +# define VMOVU_MASK vmovdqu8 +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 - vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# define REP_MOVS rep movsb # endif - and $((VEC_SIZE * 4) - 1), %ecx - cmp $(VEC_SIZE * 2), %ecx - jbe L(SourceStringAlignmentLessTwoVecSize) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - - vpcmpb $0, (%rsi), %YMMZERO, %k0 - kmovd %k0, %edx - shr %cl, %rdx +# include "reg-macros.h" -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - mov $VEC_SIZE, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# else - mov $(VEC_SIZE + 1), %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# endif - jbe L(CopyVecSizeTailCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyVecSizeTail) - - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 - kmovd %k1, %edx -# ifdef USE_AS_STRNCPY - add $VEC_SIZE, %r10 - cmp %r10, %r8 - jbe L(CopyTwoVecSizeCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize) - - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ - VMOVU %YMM2, (%rdi) - -/* If source address alignment != destination address alignment */ - .p2align 4 -L(UnalignVecSizeBoth): - sub %rcx, %rdi -# ifdef USE_AS_STRNCPY - add %rcx, %r8 - sbb %rcx, %rcx - or %rcx, %r8 -# endif - mov $VEC_SIZE, %rcx - VMOVA (%rsi, %rcx), %YMM2 - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 3), %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) +# 
ifdef USE_AS_STPCPY +# define END_REG rax # else - jnz L(CopyVecSize) +# define END_REG rdi, %rdx, CHAR_SIZE # endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) +# ifdef USE_AS_STRCAT +# define PAGE_ALIGN_REG edx +# define PAGE_ALIGN_REG_64 rdx # else - jnz L(CopyVecSize) +# define PAGE_ALIGN_REG eax +# define PAGE_ALIGN_REG_64 rax # endif - VMOVU %YMM3, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 - vpcmpb $0, %YMM4, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) - VMOVU %YMM4, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - VMOVU %YMM2, (%rdi, %rcx) - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM3, (%rdi, %rcx) - mov %rsi, %rdx - lea VEC_SIZE(%rsi, %rcx), %rsi - and $-(VEC_SIZE * 4), %rsi - sub %rsi, %rdx - sub %rdx, %rdi -# ifdef USE_AS_STRNCPY - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -# endif -L(UnalignedFourVecSizeLoop): - VMOVA (%rsi), %YMM4 - VMOVA VEC_SIZE(%rsi), %YMM5 - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM5, %YMM4, %YMM2 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. 
*/ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) + .section SECTION(.text), "ax", @progbits +ENTRY(STRCPY) +# ifdef USE_AS_STRCAT + movq %rdi, %rax +# include "strcat-strlen-evex.h.S" # endif - test %edx, %edx - jnz L(UnalignedFourVecSizeLeave) - -L(UnalignedFourVecSizeLoop_start): - add $(VEC_SIZE * 4), %rdi - add $(VEC_SIZE * 4), %rsi - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) - VMOVA (%rsi), %YMM4 - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) - VMOVA VEC_SIZE(%rsi), %YMM5 - vpminub %YMM5, %YMM4, %YMM2 - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVU %YMM7, -VEC_SIZE(%rdi) - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. */ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) + + movl %esi, %PAGE_ALIGN_REG + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG + ja L(page_cross) +L(page_cross_continue): + VMOVU (%rsi), %VMM(0) +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax # endif - test %edx, %edx - jz L(UnalignedFourVecSizeLoop_start) -L(UnalignedFourVecSizeLeave): - vpcmpb $0, %YMM4, %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_0) - vpcmpb $0, %YMM5, %YMMZERO, %k2 - kmovd %k2, %ecx - test %ecx, %ecx - jnz L(CopyVecSizeUnaligned_16) + /* Two short string implementations. One with traditional + branching approach and one with masked instructions (which + have potential for dramatically bad perf if dst splits a + page and is not in the TLB). */ +# if USE_EVEX_MASKED_STORE + VPTEST %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WCSCPY + subl $((1 << CHAR_PER_VEC)- 1), %VRCX +# else + inc %VRCX +# endif + jz L(more_1x_vec) + KMOV %VRCX, %k1 + KXOR %k0, %k1, %k1 - vpcmpb $0, %YMM6, %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_32) - - vpcmpb $0, %YMM7, %YMMZERO, %k4 - kmovd %k4, %ecx - bsf %ecx, %edx - VMOVU %YMM4, (%rdi) - VMOVU %YMM5, VEC_SIZE(%rdi) - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax -# endif - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $(VEC_SIZE * 3), %rsi - add $(VEC_SIZE * 3), %rdi - jmp L(CopyVecSizeExit) -# endif + VMOVU_MASK %VMM(0), (%rdi){%k1} -/* If source address alignment == destination address alignment */ +# ifdef USE_AS_STPCPY + bsf %VRCX, %VRCX + leaq (%rdi, %rcx, CHAR_SIZE), %rax +# endif + ret -L(SourceStringAlignmentLessTwoVecSize): - VMOVU (%rsi), %YMM3 - VMOVU VEC_SIZE(%rsi), %YMM2 - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx +# else + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jz L(more_1x_vec) -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $VEC_SIZE, %r8 + xorl %edx, %edx + bsf %VRCX, %VRDX +# ifdef USE_AS_STPCPY + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# endif + + /* Use mask bits in rcx to detect which copy we need. If the low + mask is zero then there must be a bit set in the upper half. + I.e if rcx != 0 and ecx == 0, then match must be upper 32 + bits so we use L(copy_32_63). 
*/ +# if VEC_SIZE == 64 +# ifdef USE_AS_WCSCPY + testb %cl, %cl +# else + testl %ecx, %ecx +# endif + jz L(copy_32_63) +# endif + +# ifdef USE_AS_WCSCPY + testb $0xf, %cl # else - cmp $(VEC_SIZE + 1), %r8 + testw %cx, %cx # endif - jbe L(CopyVecSizeTail1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyVecSizeTail1) + jz L(copy_16_31) - VMOVU %YMM3, (%rdi) - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $(VEC_SIZE * 2), %r8 +# ifdef USE_AS_WCSCPY + testb $0x3, %cl # else - cmp $((VEC_SIZE * 2) + 1), %r8 + testb %cl, %cl # endif - jbe L(CopyTwoVecSize1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize1) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - jmp L(UnalignVecSizeBoth) + jz L(copy_8_15) -/*------End of main part with loops---------------------*/ -/* Case1 */ +# ifdef USE_AS_WCSCPY + vmovd %VMM_128(0), (%rdi) + /* No need to copy, we know its zero. */ + movl $0, (%END_REG) -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) - .p2align 4 -L(CopyVecSize): - add %rcx, %rdi -# endif -L(CopyVecSizeTail): - add %rcx, %rsi -L(CopyVecSizeTail1): - bsf %edx, %edx -L(CopyVecSizeExit): - cmp $32, %edx - jae L(Exit32_63) - cmp $16, %edx - jae L(Exit16_31) - cmp $8, %edx - jae L(Exit8_15) - cmp $4, %edx - jae L(Exit4_7) - cmp $3, %edx - je L(Exit3) - cmp $1, %edx - ja L(Exit2) - je L(Exit1) - movb $0, (%rdi) -# ifdef USE_AS_STPCPY - lea (%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $1, %r8 - lea 1(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif ret +# else - .p2align 4 -L(CopyTwoVecSize1): - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $VEC_SIZE, %r8 -# endif - jmp L(CopyVecSizeTail1) - - .p2align 4 -L(CopyTwoVecSize): - bsf %edx, %edx - add %rcx, %rsi - add $VEC_SIZE, %edx - sub %ecx, %edx - jmp L(CopyVecSizeExit) - - .p2align 4 -L(CopyVecSizeUnaligned_0): - bsf %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - VMOVU %YMM4, (%rdi) - add $((VEC_SIZE * 4) - 1), %r8 - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - jmp L(CopyVecSizeExit) -# endif + testb $0x7, %cl + jz L(copy_4_7) - .p2align 4 -L(CopyVecSizeUnaligned_16): - bsf %ecx, %edx - VMOVU %YMM4, (%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea VEC_SIZE(%rdi, %rdx), %rax -# endif - VMOVU %YMM5, VEC_SIZE(%rdi) - add $((VEC_SIZE * 3) - 1), %r8 - sub %rdx, %r8 - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi - jmp L(CopyVecSizeExit) -# endif - .p2align 4 -L(CopyVecSizeUnaligned_32): - bsf %edx, %edx - VMOVU %YMM4, (%rdi) - VMOVU %YMM5, VEC_SIZE(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax -# endif - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - add $((VEC_SIZE * 2) - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $(VEC_SIZE * 2), %rsi - add $(VEC_SIZE * 2), %rdi - jmp L(CopyVecSizeExit) -# endif + test %edx, %edx + jz L(set_null_term) -# ifdef USE_AS_STRNCPY -# ifndef USE_AS_STRCAT - .p2align 4 -L(CopyVecSizeUnalignedVec6): - VMOVU %YMM6, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec5): - VMOVU %YMM5, (%rdi, %rcx) - jmp 
L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec4): - VMOVU %YMM4, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec3): - VMOVU %YMM3, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) + /* NB: make this `vmovw` if support for AVX512-FP16 is added. + */ + vmovd %VMM_128(0), %esi + movw %si, (%rdi) + + .p2align 4,, 1 +L(set_null_term): + /* No need to copy, we know its zero. */ + movb $0, (%END_REG) + ret # endif -/* Case2 */ - - .p2align 4 -L(CopyVecSizeCase2): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2): - add %rcx, %rsi - bsf %edx, %edx - add $VEC_SIZE, %edx - sub %ecx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTailCase2): - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTail1Case2): - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -/* Case2 or Case3, Case3 */ - - .p2align 4 -L(CopyVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeCase2) -L(CopyVecSizeCase3): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyTwoVecSizeCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyVecSizeTailCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTailCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSize1Case2OrCase3): - add $VEC_SIZE, %rdi - add $VEC_SIZE, %rsi - sub $VEC_SIZE, %r8 -L(CopyVecSizeTail1Case2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTail1Case2) - jmp L(StrncpyExit) +# if VEC_SIZE == 64 + .p2align 4,, 6 +L(copy_32_63): + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) + ret +# endif + + + .p2align 4,, 6 +L(copy_16_31): + /* Use xmm1 explicitly here as it won't require a `vzeroupper` + and will save code size. */ + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 + VMOVU %VMM_128(0), (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) + ret + + .p2align 4,, 8 +L(copy_8_15): +# ifdef USE_AS_WCSCPY + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx +# else + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx +# endif + vmovq %VMM_128(0), (%rdi) + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) + ret # endif -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ - .p2align 4 -L(Exit1): - movzwl (%rsi), %edx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax +# ifndef USE_AS_WCSCPY + .p2align 4,, 12 +L(copy_4_7): + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx + vmovd %VMM_128(0), (%rdi) + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) + ret # endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $2, %r8 - lea 2(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + + + .p2align 4,, 8 +L(more_1x_vec): +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + VMOVU %VMM(0), (%rdi) # endif - ret + subq %rsi, %rdi + andq $-(VEC_SIZE), %rsi + addq %rsi, %rdi + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) - .p2align 4 -L(Exit2): - movzwl (%rsi), %ecx - mov %cx, (%rdi) - movb $0, 2(%rdi) + /* Ideally we store after moves to minimize impact of potential + false-dependencies. 
*/ +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + VMOVU %VMM(0), (%rax) +# endif + + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) + VMOVU %VMM(1), VEC_SIZE(%rdi) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) + + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x4) + + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) + + + /* Align for 4x loop. */ + subq %rsi, %rdi + + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE + we covered before aligning. */ + subq $-(VEC_SIZE * 5), %rsi + andq $-(VEC_SIZE * 4), %rsi + + + /* Load first half of the loop before entry. */ + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jnz L(loop_4x_done) + + .p2align 4,, 11 +L(loop_4x_vec): + + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) + + subq $(VEC_SIZE * -4), %rsi + + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jz L(loop_4x_vec) + +L(loop_4x_done): + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + /* Restore rdi (%rdi). */ + addq %rsi, %rdi + test %VRCX, %VRCX + jnz L(ret_vec_x0_end) + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) + + KMOV %k2, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) + /* Place L(ret_vec_x4) here to save code size. We get a + meaningfuly benefit doing this for stpcpy. 
*/ + KMOV %k4, %VRDX +L(ret_vec_x3): + bsf %VRDX, %VRDX + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $3, %r8 - lea 3(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax # endif +L(return_end): ret - .p2align 4 -L(Exit3): - mov (%rsi), %edx - mov %edx, (%rdi) + .p2align 4,, 6 +L(ret_vec_x0_end): + bsf %VRCX, %VRCX # ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $4, %r8 - lea 4(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (%rdi, %rcx, CHAR_SIZE), %rax # endif + inc %VRCX + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) ret - .p2align 4 -L(Exit4_7): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov -3(%rsi, %rdx), %ecx - mov %ecx, -3(%rdi, %rdx) + .p2align 4,, 8 +L(ret_vec_x1): + bsf %VRCX, %VRCX + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit8_15): - mov (%rsi), %rcx - mov -7(%rsi, %rdx), %r9 - mov %rcx, (%rdi) - mov %r9, -7(%rdi, %rdx) + .p2align 4,, 4 +L(ret_vec_x2): + bsf %VRCX, %VRCX + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit16_31): - VMOVU (%rsi), %XMM2 - VMOVU -15(%rsi, %rdx), %XMM3 - VMOVU %XMM2, (%rdi) - VMOVU %XMM3, -15(%rdi, %rdx) + /* ret_vec_x3 reuses return code after the loop. 
*/ + .p2align 4,, 6 +L(ret_vec_x4): + bsf %VRCX, %VRCX + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit32_63): - VMOVU (%rsi), %YMM2 - VMOVU -31(%rsi, %rdx), %YMM3 - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, -31(%rdi, %rdx) -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax + + .p2align 4,, 4 +L(page_cross): +# ifndef USE_AS_STRCAT + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 # endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + movq %rsi, %rcx + andq $(VEC_SIZE * -1), %rcx + + VPCMPEQ (%rcx), %VZERO, %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WCSCPY + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG + shrl $2, %PAGE_ALIGN_REG # endif - ret + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX -# ifdef USE_AS_STRNCPY +# if USE_MOVSB_IN_PAGE_CROSS + /* Optimizing more aggressively for space as this is very cold + code. This saves 2x cache lines. */ - .p2align 4 -L(StrncpyExit1): - movzbl (%rsi), %edx - mov %dl, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 1(%rdi) + /* This adds once to the later result which will get correct + copy bounds. NB: this can never zero-out a non-zero RCX as + to be in the page cross case rsi cannot be aligned and we + already right-shift rcx by the misalignment. */ + shl %VRCX + jz L(page_cross_continue) +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax # endif - ret + bsf %VRCX, %VRCX + REP_MOVS - .p2align 4 -L(StrncpyExit2): - movzwl (%rsi), %edx - mov %dx, (%rdi) # ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 2(%rdi) + leaq -CHAR_SIZE(%rdi), %rax # endif ret - .p2align 4 -L(StrncpyExit3_4): - movzwl (%rsi), %ecx - movzwl -2(%rsi, %r8), %edx - mov %cx, (%rdi) - mov %dx, -2(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret - .p2align 4 -L(StrncpyExit5_8): - mov (%rsi), %ecx - mov -4(%rsi, %r8), %edx - mov %ecx, (%rdi) - mov %edx, -4(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret +# else + /* Check if we found zero-char before end of page. */ + test %VRCX, %VRCX + jz L(page_cross_continue) - .p2align 4 -L(StrncpyExit9_16): - mov (%rsi), %rcx - mov -8(%rsi, %r8), %rdx - mov %rcx, (%rdi) - mov %rdx, -8(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret + /* Traditional copy case, essentially same as used in non-page- + cross case but since we can't reuse VMM(0) we need twice as + many loads from rsi. 
*/ - .p2align 4 -L(StrncpyExit17_32): - VMOVU (%rsi), %XMM2 - VMOVU -16(%rsi, %r8), %XMM3 - VMOVU %XMM2, (%rdi) - VMOVU %XMM3, -16(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) +# ifndef USE_AS_STRCAT + xorl %edx, %edx # endif - ret - - .p2align 4 -L(StrncpyExit33_64): - /* 0/32, 31/16 */ - VMOVU (%rsi), %YMM2 - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) + /* Dependency on rdi must already have been satisfied. */ + bsf %VRCX, %VRDX # ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# elif !defined USE_AS_STRCAT + movq %rdi, %rax # endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret - .p2align 4 -L(StrncpyExit65): - /* 0/32, 32/32, 64/1 */ - VMOVU (%rsi), %YMM2 - VMOVU 32(%rsi), %YMM3 - mov 64(%rsi), %cl - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, 32(%rdi) - mov %cl, 64(%rdi) -# ifdef USE_AS_STPCPY - lea 65(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 65(%rdi) +# if VEC_SIZE == 64 +# ifdef USE_AS_WCSCPY + testb %cl, %cl +# else + test %ecx, %ecx +# endif + jz L(page_cross_copy_32_63) # endif - ret - -# ifndef USE_AS_STRCAT - .p2align 4 -L(Fill1): - mov %dl, (%rdi) - ret +# ifdef USE_AS_WCSCPY + testb $0xf, %cl +# else + testw %cx, %cx +# endif + jz L(page_cross_copy_16_31) - .p2align 4 -L(Fill2): - mov %dx, (%rdi) - ret +# ifdef USE_AS_WCSCPY + testb $0x3, %cl +# else + testb %cl, %cl +# endif + jz L(page_cross_copy_8_15) - .p2align 4 -L(Fill3_4): - mov %dx, (%rdi) - mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_WCSCPY + movl (%rsi), %esi + movl %esi, (%rdi) + movl $0, (%END_REG) ret +# else - .p2align 4 -L(Fill5_8): - mov %edx, (%rdi) - mov %edx, -4(%rdi, %r8) - ret + testb $0x7, %cl + jz L(page_cross_copy_4_7) - .p2align 4 -L(Fill9_16): - mov %rdx, (%rdi) - mov %rdx, -8(%rdi, %r8) + test %edx, %edx + jz L(page_cross_set_null_term) + movzwl (%rsi), %ecx + movw %cx, (%rdi) +L(page_cross_set_null_term): + movb $0, (%END_REG) ret - .p2align 4 -L(Fill17_32): - VMOVU %XMMZERO, (%rdi) - VMOVU %XMMZERO, -16(%rdi, %r8) - ret - .p2align 4 -L(CopyVecSizeUnalignedVec2): - VMOVU %YMM2, (%rdi, %rcx) - - .p2align 4 -L(CopyVecSizeVecExit): - bsf %edx, %edx - add $(VEC_SIZE - 1), %r8 - add %rcx, %rdi -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - - .p2align 4 -L(StrncpyFillTailWithZero): - xor %edx, %edx - sub $VEC_SIZE, %r8 - jbe L(StrncpyFillExit) - - VMOVU %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - - mov %rdi, %rsi - and $(VEC_SIZE - 1), %esi - sub %rsi, %rdi - add %rsi, %r8 - sub $(VEC_SIZE * 4), %r8 - jb L(StrncpyFillLessFourVecSize) - -L(StrncpyFillLoopVmovdqa): - VMOVA %YMMZERO, (%rdi) - VMOVA %YMMZERO, VEC_SIZE(%rdi) - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE * 4), %rdi - sub $(VEC_SIZE * 4), %r8 - jae L(StrncpyFillLoopVmovdqa) - -L(StrncpyFillLessFourVecSize): - add $(VEC_SIZE * 2), %r8 - jl L(StrncpyFillLessTwoVecSize) - VMOVA %YMMZERO, (%rdi) - VMOVA %YMMZERO, VEC_SIZE(%rdi) - add $(VEC_SIZE * 2), %rdi - sub $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - VMOVA %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillLessTwoVecSize): - add $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - VMOVA %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillExit): - add $VEC_SIZE, %r8 -L(Fill): - cmp $17, %r8d - jae L(Fill17_32) - cmp $9, %r8d - jae L(Fill9_16) - cmp $5, %r8d - jae L(Fill5_8) - cmp $3, %r8d - 
jae L(Fill3_4) - cmp $1, %r8d - ja L(Fill2) - je L(Fill1) + .p2align 4,, 4 +L(page_cross_copy_4_7): + movl (%rsi), %ecx + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi + movl %ecx, (%rdi) + movl %esi, -(4 - CHAR_SIZE)(%END_REG) ret - -/* end of ifndef USE_AS_STRCAT */ # endif - .p2align 4 -L(UnalignedLeaveCase2OrCase3): - test %rdx, %rdx - jnz L(UnalignedFourVecSizeLeaveCase2) -L(UnalignedFourVecSizeLeaveCase3): - lea (VEC_SIZE * 4)(%r8), %rcx - and $-VEC_SIZE, %rcx - add $(VEC_SIZE * 3), %r8 - jl L(CopyVecSizeCase3) - VMOVU %YMM4, (%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM5, VEC_SIZE(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 4)(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (VEC_SIZE * 4)(%rdi) -# endif +# if VEC_SIZE == 64 + .p2align 4,, 4 +L(page_cross_copy_32_63): + VMOVU (%rsi), %VMM_256(0) + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) ret - - .p2align 4 -L(UnalignedFourVecSizeLeaveCase2): - xor %ecx, %ecx - vpcmpb $0, %YMM4, %YMMZERO, %k1 - kmovd %k1, %edx - add $(VEC_SIZE * 3), %r8 - jle L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif - vpcmpb $0, %YMM5, %YMMZERO, %k2 - kmovd %k2, %edx - VMOVU %YMM4, (%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec5) -# else - jnz L(CopyVecSize) # endif - vpcmpb $0, %YMM6, %YMMZERO, %k3 - kmovd %k3, %edx - VMOVU %YMM5, VEC_SIZE(%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec6) -# else - jnz L(CopyVecSize) -# endif - - vpcmpb $0, %YMM7, %YMMZERO, %k4 - kmovd %k4, %edx - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - lea VEC_SIZE(%rdi, %rcx), %rdi - lea VEC_SIZE(%rsi, %rcx), %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) -L(StrncpyExit): - cmp $65, %r8d - je L(StrncpyExit65) - cmp $33, %r8d - jae L(StrncpyExit33_64) - cmp $17, %r8d - jae L(StrncpyExit17_32) - cmp $9, %r8d - jae L(StrncpyExit9_16) - cmp $5, %r8d - jae L(StrncpyExit5_8) - cmp $3, %r8d - jae L(StrncpyExit3_4) - cmp $1, %r8d - ja L(StrncpyExit2) - je L(StrncpyExit1) -# ifdef USE_AS_STPCPY - mov %rdi, %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi) -# endif + .p2align 4,, 4 +L(page_cross_copy_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) ret - .p2align 4 -L(ExitZero): -# ifndef USE_AS_STRCAT - mov %rdi, %rax -# endif + .p2align 4,, 4 +L(page_cross_copy_8_15): + movq (%rsi), %rcx + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi + movq %rcx, (%rdi) + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) ret - -# endif - -# ifndef USE_AS_STRCAT -END (STRCPY) -# else -END (STRCAT) # endif +END(STRCPY) #endif |
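
The short-string path added by this patch keys every copy off the null-terminator bitmask: VPTESTN/KMOV produce a mask of zero characters, bsf finds the first one, and the testl/testw/testb ladder picks a copy width, after which the head of the string and an overlapping tail that ends exactly at the null are stored. As a rough illustration of that idea only, and not code from this patch, here is a C sketch: copy_upto_32 is a hypothetical name, a plain byte scan stands in for the vector compare, memcpy stands in for the vmovd/vmovq/vmovdqu stores, and it assumes at least 32 readable bytes at src, which the real code guarantees with its page-cross check.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch, not part of the patch.  */
    static char *copy_upto_32 (char *dst, const char *src)
    {
      uint32_t mask = 0;
      for (int i = 0; i < 32; i++)        /* stand-in for VPTESTN + KMOV */
        if (src[i] == '\0')
          {
            mask = 1u << i;               /* lowest set bit == first null */
            break;
          }
      if (mask == 0)
        return NULL;                      /* no null in the first vector:
                                             the real code goes to L(more_1x_vec) */

      unsigned len = __builtin_ctz (mask); /* bsf: index of the null byte */

      /* Pick a copy width, then store the head plus an overlapping tail that
         ends exactly at the null (cf. L(copy_16_31)/L(copy_8_15)/L(copy_4_7)).  */
      if (len >= 16)
        {
          memcpy (dst, src, 16);
          memcpy (dst + len - 15, src + len - 15, 16);
        }
      else if (len >= 8)
        {
          memcpy (dst, src, 8);
          memcpy (dst + len - 7, src + len - 7, 8);
        }
      else if (len >= 4)
        {
          memcpy (dst, src, 4);
          memcpy (dst + len - 3, src + len - 3, 4);
        }
      else
        {
          memcpy (dst, src, len);
          dst[len] = '\0';
        }
      return dst + len;                   /* what the stpcpy variant returns */
    }

The overlapping tail store is what lets each size bucket finish with exactly two stores, regardless of where inside the bucket the null byte lands.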
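L(page_cross) deals with the case where an unaligned VEC_SIZE load could touch the next page: it rounds rsi down to a vector boundary so the load stays on the current page, builds the null mask for that aligned vector, and shifts the mask right by the misalignment so bits before the true start of the string are discarded; only if a null survives does it take the short rep-movs copy, otherwise it resumes at L(page_cross_continue). A minimal C sketch of that check, assuming VEC_SIZE of 32 and byte-sized characters; vec_null_mask and page_cross_has_null are hypothetical stand-ins for the VPCMPEQ/KMOV sequence, not glibc functions:

    #include <stdint.h>
    #include <stddef.h>

    #define VEC_SIZE 32

    /* Hypothetical stand-in for VPCMPEQ + KMOV: bit i is set iff byte i of
       the VEC_SIZE block at p (p must be VEC_SIZE-aligned) is zero.  Reading
       the whole aligned block mirrors the aligned vector load, which can
       never cross a page boundary.  */
    static uint32_t vec_null_mask (const char *p)
    {
      uint32_t m = 0;
      for (int i = 0; i < VEC_SIZE; i++)
        if (p[i] == '\0')
          m |= 1u << i;
      return m;
    }

    /* Illustrative sketch, not part of the patch.  Returns 0 when no null
       lies between src and the end of the aligned vector; otherwise stores
       the number of bytes to copy, null included, as rep movsb would use.  */
    static int page_cross_has_null (const char *src, size_t *len_out)
    {
      const char *aligned
        = (const char *) ((uintptr_t) src & -(uintptr_t) VEC_SIZE);
      uint32_t mask = vec_null_mask (aligned);

      /* Drop match bits that precede the real start of the string, like the
         shrx by the misalignment.  */
      mask >>= (uintptr_t) src & (VEC_SIZE - 1);
      if (mask == 0)
        return 0;
      *len_out = (size_t) __builtin_ctz (mask) + 1;
      return 1;
    }

The +1 in the returned length mirrors the patch's shl on the mask register, which biases the bit index so bsf directly yields the count of characters, null terminator included, for REP_MOVS.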