diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch/strcpy-ssse3.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3721 |
1 files changed, 3721 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..efbd3bfccb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -0,0 +1,3721 @@ +/* strcpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCPY) + mov %rsi, %rcx +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 +# endif + mov %rdi, %rdx +# ifdef USE_AS_STRNCPY + test %r8, %r8 + jz L(Exit0) + cmp $8, %r8 + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%rcx) + jz L(Exit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + je L(Exit16) +# endif + cmpb $0, 15(%rcx) + jz L(Exit16) + +# ifdef USE_AS_STRNCPY + mov %rcx, %rsi + and $0xf, %rsi + +/* add 16 bytes rcx_shift to r8 */ + + add %rsi, %r8 +# endif + lea 16(%rcx), %rsi +/* Now: + rsi = alignment_16(rcx) + rcx_shift + 16; + rcx_shift = rcx - alignment_16(rcx) +*/ + and $-16, %rsi +/* Now: + rsi = alignment_16(rcx) + 16 +*/ + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) +/* + look if there is zero symbol in next 16 bytes of string + from rsi to rsi + 15 and form mask in xmm0 +*/ + pcmpeqb (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + +/* rsi = 16 - rcx_shift */ + +/* rax = 0: there isn't end of string from position rsi to rsi+15 */ + +# ifdef USE_AS_STRNCPY + sub $32, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx +/* Now: + rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16 +*/ + 
and $-16, %rdx + +/* Now: rdx = alignment_16(rdx) + 16 */ + + sub %rdx, %rax + +/* Now: rax = rdx_shift - 16 */ + +# ifdef USE_AS_STRNCPY + add %rax, %rsi + lea -1(%rsi), %rsi + and $1<<31, %esi + test %rsi, %rsi + jnz L(ContinueCopy) + lea 16(%r8), %r8 + +L(ContinueCopy): +# endif + sub %rax, %rcx +/* Now: + case rcx_shift >= rdx_shift: + rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16 + case rcx_shift < rdx_shift: + rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift) +*/ + mov %rcx, %rax + and $0xf, %rax +/* Now: + case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift + case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift) + rax can be 0, 1, ..., 15 +*/ + mov $0, %rsi + +/* case: rcx_shift == rdx_shift */ + + jz L(Align16Both) + + cmp $8, %rax + jae L(ShlHigh8) + cmp $1, %rax + je L(Shl1) + cmp $2, %rax + je L(Shl2) + cmp $3, %rax + je L(Shl3) + cmp $4, %rax + je L(Shl4) + cmp $5, %rax + je L(Shl5) + cmp $6, %rax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %rax + je L(Shl9) + cmp $10, %rax + je L(Shl10) + cmp $11, %rax + je L(Shl11) + cmp $12, %rax + je L(Shl12) + cmp $13, %rax + je L(Shl13) + cmp $14, %rax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz 
L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + lea 48+64(%r8, %rax), %r8 +# endif + mov $-0x40, %rsi + +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%r8), %r8 +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + 
lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%rcx), %xmm1 + movaps 15(%rcx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 31(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -15(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -1(%rcx), %xmm1 + +L(Shl1LoopStart): + movaps 15(%rcx), %xmm2 + movaps 31(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%rcx), %xmm4 
+ movaps %xmm4, %xmm7 + movaps 63(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %rax, %rax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movaps (%rdx), %xmm6 + psrldq $15, %xmm6 + mov $15, %rsi + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%rcx), %xmm1 + movaps 14(%rcx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + 
jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 30(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -14(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -2(%rcx), %xmm1 + +L(Shl2LoopStart): + movaps 14(%rcx), %xmm2 + movaps 30(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %rax, %rax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movaps (%rdx), %xmm6 + psrldq $14, %xmm6 + mov $14, %rsi + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%rcx), %xmm1 + movaps 13(%rcx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef 
USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 29(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -13(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -3(%rcx), %xmm1 + +L(Shl3LoopStart): + movaps 13(%rcx), %xmm2 + movaps 29(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %rax, %rax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movaps (%rdx), %xmm6 + psrldq $13, %xmm6 + mov $13, %rsi + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 
16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -4(%rcx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movaps (%rdx), %xmm6 + psrldq $12, %xmm6 + mov $12, %rsi + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp 
L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%rcx), %xmm1 + movaps 11(%rcx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 27(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -11(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -5(%rcx), %xmm1 + +L(Shl5LoopStart): + movaps 11(%rcx), %xmm2 + movaps 27(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %rax, %rax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY 
+ sub $64, %r8 + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movaps (%rdx), %xmm6 + psrldq $11, %xmm6 + mov $11, %rsi + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%rcx), %xmm1 + movaps 10(%rcx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 26(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -10(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -6(%rcx), 
%xmm1 + +L(Shl6LoopStart): + movaps 10(%rcx), %xmm2 + movaps 26(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %rax, %rax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + movaps (%rdx), %xmm6 + psrldq $10, %xmm6 + mov $10, %rsi + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%rcx), %xmm1 + movaps 9(%rcx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + 
movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 25(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -9(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -7(%rcx), %xmm1 + +L(Shl7LoopStart): + movaps 9(%rcx), %xmm2 + movaps 25(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %rax, %rax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + movaps (%rdx), %xmm6 + psrldq $9, %xmm6 + mov $9, %rsi + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + 
pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -8(%rcx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movaps (%rdx), %xmm6 + psrldq $8, %xmm6 + mov $8, %rsi + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%rcx), %xmm1 + movaps 7(%rcx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, 
%xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 23(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -7(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -9(%rcx), %xmm1 + +L(Shl9LoopStart): + movaps 7(%rcx), %xmm2 + movaps 23(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %rax, %rax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + movaps 
(%rdx), %xmm6 + psrldq $7, %xmm6 + mov $7, %rsi + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%rcx), %xmm1 + movaps 6(%rcx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 22(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -6(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -10(%rcx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%rcx), %xmm2 + movaps 22(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, 
%xmm7 + palignr $10, %xmm4, %xmm5 + test %rax, %rax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movaps (%rdx), %xmm6 + psrldq $6, %xmm6 + mov $6, %rsi + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%rcx), %xmm1 + movaps 5(%rcx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 21(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax 
+ and $-0x40, %rcx + sub %rcx, %rax + lea -5(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -11(%rcx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%rcx), %xmm2 + movaps 21(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %rax, %rax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): +/* Copy only the 5 source bytes that precede the 16-byte block holding the terminator. A 16-byte movaps read-modify-write of (%rdx) would also rewrite the 11 destination bytes after them, which may lie outside the destination object. %rsi is free as scratch here because it is loaded with the byte count right after. */ + movl (%rcx), %esi + movl %esi, (%rdx) + movb 4(%rcx), %sil + movb %sil, 4(%rdx) + mov $5, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -12(%rcx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): +/* Copy only the 4 source bytes that precede the 16-byte block holding the terminator. A 16-byte movaps read-modify-write of (%rdx) would also rewrite the 12 destination bytes after them, which may lie outside the destination object. %rsi is free as scratch here because it is loaded with the byte count right after. */ + movl (%rcx), %esi + movl %esi, (%rdx) + mov $4, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%rcx), %xmm1 + movaps 3(%rcx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe
L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 19(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -3(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -13(%rcx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%rcx), %xmm2 + movaps 19(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %rax, %rax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): +/* Copy only the 3 source bytes that precede the 16-byte block holding the terminator. A 16-byte movaps read-modify-write of (%rdx) would also rewrite the 13 destination bytes after them, which may lie outside the destination object. %rsi is free as scratch here because it is loaded with the byte count right after. */ + movw (%rcx), %si + movw %si, (%rdx) + movb 2(%rcx), %sil + movb %sil, 2(%rdx) + mov $3, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%rcx), %xmm1 + movaps 2(%rcx), %xmm2 +L(Shl14Start): +
pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 18(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -2(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -14(%rcx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%rcx), %xmm2 + movaps 18(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %rax, %rax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%rcx), %rcx + 
palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): +/* Copy only the 2 source bytes that precede the 16-byte block holding the terminator. A 16-byte movaps read-modify-write of (%rdx) would also rewrite the 14 destination bytes after them, which may lie outside the destination object. %rsi is free as scratch here because it is loaded with the byte count right after. */ + movw (%rcx), %si + movw %si, (%rdx) + mov $2, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%rcx), %xmm1 + movaps 1(%rcx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 17(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -1(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -15(%rcx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%rcx), %xmm2 + movaps 17(%rcx), %xmm3 +
movaps %xmm3, %xmm6 + movaps 33(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %rax, %rax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): +/* Copy only the single source byte that precedes the 16-byte block holding the terminator. A 16-byte movaps read-modify-write of (%rdx) would also rewrite the 15 destination bytes after it, which may lie outside the destination object. %rsi is free as scratch here because it is loaded with the byte count right after. */ + movb (%rcx), %sil + movb %sil, (%rdx) + mov $1, %rsi +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %r8 +# endif + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 15(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY
+ sub $16, %r8 + lea 16(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + cmp $1, %r8 + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %r8 + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %r8 + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %r8 + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %r8 + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %r8 + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %r8 + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %r8 + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %r8 + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %r8 + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %r8 + je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %r8 + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %r8 + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp $15, %r8 + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $16, %r8 + je L(Exit16) + cmp $8, %r8 + je L(Exit8) + jg L(More8Case3) + cmp $4, %r8 + je L(Exit4) + jg L(More4Case3) + cmp $2, %r8 + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %r8 + je L(Exit12) + jl L(Less12Case3) + cmp $14, %r8 + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %r8 + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %r8 + jl L(Exit9) 
+ je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) +# ifdef USE_AS_STPCPY + lea (%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) +# ifdef USE_AS_STPCPY + lea 1(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) +# ifdef USE_AS_STPCPY + lea 2(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) +# ifdef USE_AS_STPCPY + lea 3(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit5): + movl (%rcx), %eax + movl %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 4(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit6): + movl (%rcx), %eax + movl %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 5(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb 
$1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit7): + movl (%rcx), %eax + movl %eax, (%rdx) + movl 3(%rcx), %eax + movl %eax, 3(%rdx) +# ifdef USE_AS_STPCPY + lea 6(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %eax + mov %eax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 8(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %eax + mov %eax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 9(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 10(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 11(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %rax + mov %rax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 12(%rdx), %rax +# else + mov 
%rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %rax + mov %rax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 13(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + movb %dl, (%rcx) + ret + + .p2align 4 +L(Fill2): + movw %dx, (%rcx) + ret + + .p2align 4 +L(Fill3): + movw %dx, (%rcx) + movb %dl, 2(%rcx) + ret + + .p2align 4 +L(Fill4): + movl %edx, (%rcx) + ret + + .p2align 4 +L(Fill5): + movl %edx, (%rcx) + movb %dl, 4(%rcx) + ret + + .p2align 4 +L(Fill6): + movl %edx, (%rcx) + movw %dx, 4(%rcx) + ret + + .p2align 4 +L(Fill7): + movl %edx, (%rcx) + movl %edx, 3(%rcx) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rcx) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rcx) + movb %dl, 8(%rcx) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rcx) + movw %dx, 8(%rcx) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rcx) + movl %edx, 7(%rcx) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rcx) + movl %edx, 8(%rcx) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rcx) + mov %rdx, 5(%rcx) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rcx) + mov %rdx, 6(%rcx) + ret + + .p2align 4 +L(Fill15): + mov %rdx, (%rcx) + mov %rdx, 7(%rcx) + ret + + .p2align 4 +L(Fill16): + mov 
%rdx, (%rcx) + mov %rdx, 8(%rcx) + ret + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%r8), %r8 +L(FillFrom1To16Bytes): + test %r8, %r8 + jz L(Fill0) + cmp $16, %r8 + je L(Fill16) + cmp $8, %r8 + je L(Fill8) + jg L(FillMore8) + cmp $4, %r8 + je L(Fill4) + jg L(FillMore4) + cmp $2, %r8 + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %r8 + je L(Fill12) + jl L(FillLess12) + cmp $14, %r8 + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %r8 + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %r8 + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + .p2align 4 +L(StrncpyFillTailWithZero1): + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit1) + + pxor %xmm0, %xmm0 + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + + lea 16(%rcx), %rcx + + mov %rcx, %rdx + and $0xf, %rdx + sub %rdx, %rcx + add %rdx, %r8 + xor %rdx, %rdx + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + movdqa %xmm0, 32(%rcx) + movdqa %xmm0, 48(%rcx) + lea 64(%rcx), %rcx + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + lea 32(%rcx), %rcx + sub $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + + .p2align 4 +L(Exit0): + mov %rdx, %rax + ret + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %r8 + je L(Exit9) + cmpb $0, 8(%rcx) + jz L(Exit9) + cmp $10, %r8 + je L(Exit10) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $11, %r8 + je L(Exit11) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $12, %r8 + je L(Exit12) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $13, %r8 + je L(Exit13) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $14, %r8 + je L(Exit14) + 
cmpb $0, 13(%rcx) + jz L(Exit14) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %r8 + je L(Exit1) + cmpb $0, (%rcx) + jz L(Exit1) + cmp $2, %r8 + je L(Exit2) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $3, %r8 + je L(Exit3) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $4, %r8 + je L(Exit4) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $5, %r8 + je L(Exit5) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $6, %r8 + je L(Exit6) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $7, %r8 + je L(Exit7) + cmpb $0, 6(%rcx) + jz L(Exit7) + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + +# endif + +# ifdef USE_AS_STRNCPY + +L(StrncpyLeaveCase2OrCase3): + test %rax, %rax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + lea 64(%r8), %r8 + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm6, -32(%rdx) + lea 
16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase2) +/*--------------------------------------------------*/ +/* Each StrncpyExitKCase2OrCase3 below stores exactly the 16-K source bytes found at (%rcx) to (%rdx) with plain integer moves, then loads %rsi with 16-K. %rsi doubles as the scratch register because its old value is dead at that point. A 16-byte movaps read-modify-write of (%rdx) would also rewrite K destination bytes that may lie beyond what strncpy is allowed to touch. */ +L(StrncpyExit1Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + mov 7(%rcx), %rsi + mov %rsi, 7(%rdx) + mov $15, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit2Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + mov 6(%rcx), %rsi + mov %rsi, 6(%rdx) + mov $14, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit3Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + mov 5(%rcx), %rsi + mov %rsi, 5(%rdx) + mov $13, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit4Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + movl 8(%rcx), %esi + movl %esi, 8(%rdx) + mov $12, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit5Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + movl 7(%rcx), %esi + movl %esi, 7(%rdx) + mov $11, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit6Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + movw 8(%rcx), %si + movw %si, 8(%rdx) + mov $10, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit7Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + movb 8(%rcx), %sil + movb %sil, 8(%rdx) + mov $9, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit8Case2OrCase3): + mov (%rcx), %rsi + mov %rsi, (%rdx) + mov $8, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit9Case2OrCase3): + movl (%rcx), %esi + movl %esi, (%rdx) + movl 3(%rcx), %esi + movl %esi, 3(%rdx) + mov
$7, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit10Case2OrCase3): + movl (%rcx), %esi + movl %esi, (%rdx) + movw 4(%rcx), %si + movw %si, 4(%rdx) + mov $6, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit11Case2OrCase3): + movl (%rcx), %esi + movl %esi, (%rdx) + movb 4(%rcx), %sil + movb %sil, 4(%rdx) + mov $5, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit12Case2OrCase3): + movl (%rcx), %esi + movl %esi, (%rdx) + mov $4, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit13Case2OrCase3): + movw (%rcx), %si + movw %si, (%rdx) + movb 2(%rcx), %sil + movb %sil, 2(%rdx) + mov $3, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit14Case2OrCase3): + movw (%rcx), %si + movw %si, (%rdx) + mov $2, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit15Case2OrCase3): + movb (%rcx), %sil + movb %sil, (%rdx) + mov $1, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 31+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe
L(StrncpyExit1) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit1): + movaps (%rdx, %rsi), %xmm6 + psrldq $15, %xmm6 + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 15(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 30+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit2): + movaps (%rdx, %rsi), %xmm6 + psrldq $14, %xmm6 + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 14(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 29+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit3): + movaps (%rdx, %rsi), %xmm6 + psrldq $13, %xmm6 + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 13(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps 
%xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 28+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit4: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 12 followed by palignr by 4 puts bytes 4 to 15 of xmm1 in the low 12 byte positions and keeps the top 4 bytes already in memory. rsi then grows by 12 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit4): + movaps (%rdx, %rsi), %xmm6 + psrldq $12, %xmm6 + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 12(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave5: adds 48 to r8 and goes to StrncpyExit5 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 5 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 27, then rcx plus 43), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit5 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 27 being 16-byte aligned - confirm. */ +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 27+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit5: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 11 followed by palignr by 5 puts bytes 5 to 15 of xmm1 in the low 11 byte positions and keeps the top 5 bytes already in memory. rsi then grows by 11 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit5): + movaps (%rdx, %rsi), %xmm6 + psrldq $11, %xmm6 + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 11(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave6: adds 48 to r8 and goes to StrncpyExit6 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 6 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 26, then rcx plus 42), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit6 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 26 being 16-byte aligned - confirm. */ +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 26+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm7, %xmm1 + movaps %xmm5,
48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit6: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 10 followed by palignr by 6 puts bytes 6 to 15 of xmm1 in the low 10 byte positions and keeps the top 6 bytes already in memory. rsi then grows by 10 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit6): + movaps (%rdx, %rsi), %xmm6 + psrldq $10, %xmm6 + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 10(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave7: adds 48 to r8 and goes to StrncpyExit7 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 7 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 25, then rcx plus 41), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit7 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 25 being 16-byte aligned - confirm. */ +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 25+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit7: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 9 followed by palignr by 7 puts bytes 7 to 15 of xmm1 in the low 9 byte positions and keeps the top 7 bytes already in memory. rsi then grows by 9 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit7): + movaps (%rdx, %rsi), %xmm6 + psrldq $9, %xmm6 + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 9(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave8: adds 48 to r8 and goes to StrncpyExit8 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 8 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 24, then rcx plus 40), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit8 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 24 being 16-byte aligned - confirm. */ +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 24+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit8: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 8 followed by palignr by 8 puts bytes 8 to 15 of xmm1 in the low 8 byte positions and keeps the top 8 bytes already in memory. rsi then grows by 8 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit8): + movaps (%rdx, %rsi), %xmm6 + psrldq $8, %xmm6 + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 8(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave9: adds 48 to r8 and goes to StrncpyExit9 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 9 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 23, then rcx plus 39), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit9 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 23 being 16-byte aligned - confirm. */ +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + lea 16(%rsi),
%rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 23+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit9: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 7 followed by palignr by 9 puts bytes 9 to 15 of xmm1 in the low 7 byte positions and keeps the top 9 bytes already in memory. rsi then grows by 7 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit9): + movaps (%rdx, %rsi), %xmm6 + psrldq $7, %xmm6 + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 7(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave10: adds 48 to r8 and goes to StrncpyExit10 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 10 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 22, then rcx plus 38), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit10 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 22 being 16-byte aligned - confirm. */ +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 22+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit10: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 6 followed by palignr by 10 puts bytes 10 to 15 of xmm1 in the low 6 byte positions and keeps the top 10 bytes already in memory. rsi then grows by 6 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit10): + movaps (%rdx, %rsi), %xmm6 + psrldq $6, %xmm6 + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 6(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave11: adds 48 to r8 and goes to StrncpyExit11 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 11 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 21, then rcx plus 37), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit11 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 21 being 16-byte aligned - confirm. */ +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 21+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea
-16(%r8), %r8 + /* StrncpyExit11: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 5 followed by palignr by 11 puts bytes 11 to 15 of xmm1 in the low 5 byte positions and keeps the top 11 bytes already in memory. rsi then grows by 5 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit11): + movaps (%rdx, %rsi), %xmm6 + psrldq $5, %xmm6 + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 5(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave12: adds 48 to r8 and goes to StrncpyExit12 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 12 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 20, then rcx plus 36), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit12 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 20 being 16-byte aligned - confirm. */ +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 20+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit12: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 4 followed by palignr by 12 puts bytes 12 to 15 of xmm1 in the low 4 byte positions and keeps the top 12 bytes already in memory. rsi then grows by 4 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit12): + movaps (%rdx, %rsi), %xmm6 + psrldq $4, %xmm6 + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 4(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave13: adds 48 to r8 and goes to StrncpyExit13 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 13 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 19, then rcx plus 35), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit13 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 19 being 16-byte aligned - confirm. */ +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 19+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit13: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 3 followed by palignr by 13 puts bytes 13 to 15 of xmm1 in the low 3 byte positions and keeps the top 13 bytes already in memory. rsi then grows by 3 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit13): + movaps (%rdx, %rsi), %xmm6 + psrldq $3, %xmm6 + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 3(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave14: adds 48 to r8 and goes to StrncpyExit14 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 14 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 18, then rcx plus 34), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit14 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 18 being 16-byte aligned - confirm. */ +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps
%xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 18+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit14: merges bytes from xmm1 into the 16 bytes at rdx plus rsi. psrldq by 2 followed by palignr by 14 puts bytes 14 to 15 of xmm1 in the low 2 byte positions and keeps the top 14 bytes already in memory. rsi then grows by 2 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit14): + movaps (%rdx, %rsi), %xmm6 + psrldq $2, %xmm6 + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 2(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + /* StrncpyLeave15: adds 48 to r8 and goes to StrncpyExit15 when the signed result is not positive. Otherwise stores up to four 16-byte chunks at rdx: two formed by palignr with shift 15 from xmm1 and xmm2 (xmm2 reloaded from rcx plus 17, then rcx plus 33), then xmm4 and xmm5 as they are. After each store rsi grows by 16 and r8 shrinks by 16, and the first three decrements branch to StrncpyExit15 on zero or borrow. NOTE(review): xmm1, xmm2, xmm4, xmm5 and xmm7 are set up outside this chunk, and movaps presumably relies on rcx plus 17 being 16-byte aligned - confirm. */ +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 17+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + /* StrncpyExit15: merges the top byte of xmm1 into the 16 bytes at rdx plus rsi. psrldq by 1 followed by palignr by 15 puts byte 15 of xmm1 in the lowest byte position and keeps the top 15 bytes already in memory. rsi then grows by 1 before the jump to CopyFrom1To16BytesCase3, which is defined outside this chunk. */ +L(StrncpyExit15): + movaps (%rdx, %rsi), %xmm6 + psrldq $1, %xmm6 + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 1(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) +# endif + /* NOTE(review): the conditional closed just above is presumably a USE_AS_STRNCPY guard opened before this chunk - confirm. END below closes the ENTRY (STRCPY) opened at the top of the file. */ +END (STRCPY) + +#endif |