| author | Ondřej Bílka <neleai@seznam.cz> | 2015-06-17 15:32:54 +0200 |
|---|---|---|
| committer | Ondřej Bílka <neleai@seznam.cz> | 2015-06-20 08:32:10 +0200 |
| commit | d0731dac4e35206d4cd7a512e357ef66353b3581 (patch) | |
| tree | 37ba6a1128fb33b77600ae1aaf7d1012777e8a20 | |
| parent | c10b9b13f7471b08273effc8cd7e51b119df9348 (diff) | |
| download | glibc-d0731dac4e35206d4cd7a512e357ef66353b3581.{tar.gz, tar.xz, zip} | |
new sse2 and avx2 strcpy and stpcpy
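The patch wires the new implementations into the existing IFUNC machinery: the `strcpy.S`/`stpcpy.S` dispatchers gain an AVX2 branch guarded by the `AVX_Fast_Unaligned_Load` feature bit, and `ifunc-impl-list.c` advertises `__strcpy_avx2`/`__stpcpy_avx2` (see those hunks in the diff below). As a rough C sketch of the selection order — not glibc's actual resolver — the following uses GCC's `__builtin_cpu_supports` only as a stand-in for the internal `__cpu_features` bit tests, and `select_strcpy`/`cpu_has_fast_unaligned_load` are hypothetical names:

```c
typedef char *(*strcpy_fn) (char *, const char *);

/* Entry points named by the patch.  */
extern char *__strcpy_avx2 (char *, const char *);
extern char *__strcpy_sse2_unaligned (char *, const char *);
extern char *__strcpy_sse2 (char *, const char *);

/* Hypothetical stand-in: glibc derives Fast_Unaligned_Load from the CPU
   model in __init_cpu_features rather than from a single CPUID bit.  */
static int
cpu_has_fast_unaligned_load (void)
{
  return 1;
}

static strcpy_fn
select_strcpy (void)
{
  __builtin_cpu_init ();
  /* Approximates the bit_AVX_Fast_Unaligned_Load test in strcpy.S.  */
  if (__builtin_cpu_supports ("avx2"))
    return __strcpy_avx2;
  /* Approximates the bit_Fast_Unaligned_Load test.  */
  if (cpu_has_fast_unaligned_load ())
    return __strcpy_sse2_unaligned;
  return __strcpy_sse2;
}
```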
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | math/Makefile | 2 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 2 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy-avx2.S | 3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 439 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S | 3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy.S | 5 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 2 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strchrnul_avx2.S | 3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-avx2.S | 4 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 1890 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy.S | 22 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S | 1891 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy.S | 88 |
14 files changed, 2435 insertions, 1921 deletions
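The core of the patch is the rewritten `stpcpy-sse2-unaligned.S` (which `stpcpy-avx2.S`, `strcpy-avx2.S` and `strcpy-sse2-unaligned.S` now include with different macros, see the diff below). It takes the fast path only when reading up to 128 bytes of the source with unaligned loads cannot cross a page boundary, and copies strings of at most two characters branchlessly, using the trick spelled out in the `L(less_4_bytes)` comment. A minimal C sketch of those two ideas, assuming 4096-byte pages as the assembly does; `fast_path_ok` and `copy_upto_two_chars` are hypothetical names, and `n` is the index of the NUL terminator (0–2):

```c
#include <stdint.h>
#include <stddef.h>

/* Fast-path guard from the patch: the unrolled head may read up to 128
   bytes from SRC with unaligned loads, so it is only used when that
   read stays within the current 4096-byte page (4096 - 128 = 3968).  */
static int
fast_path_ok (const char *src)
{
  return ((uintptr_t) src & 4095) <= 3968;
}

/* Branchless copy of a string whose NUL terminator is at index n,
   n in {0, 1, 2}; mirrors the comment in L(less_4_bytes).  Reading
   s[1] is safe in the assembly because 8 bytes of SRC were already
   loaded on the fast path.  */
static void
copy_upto_two_chars (char *d, const char *s, size_t n)
{
  d[n] = 0;          /* write the terminator                        */
  d[n / 2] = s[1];   /* lands on d[1] only when n == 2; otherwise it
                        hits d[0] and is overwritten by the next store  */
  d[0] = s[0];       /* for n == 0 this byte is the NUL itself      */
}
```

The same three stores are what the assembly performs with `shr $1, %ecx` and the byte moves from `%edx`; it avoids a branch on the exact length at the cost of one redundant store.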
diff --git a/math/Makefile b/math/Makefile index 7f6b85ec0d..143fa47ab0 100644 --- a/math/Makefile +++ b/math/Makefile @@ -115,7 +115,7 @@ tests-static = test-fpucw-static test-fpucw-ieee-static test-longdouble-yes = test-ldouble test-ildoubl ifneq (no,$(PERL)) -libm-vec-tests = $(addprefix test-,$(libmvec-tests)) +#libm-vec-tests = $(addprefix test-,$(libmvec-tests)) libm-tests = test-float test-double $(test-longdouble-$(long-double-fcts)) \ test-ifloat test-idouble $(libm-vec-tests) libm-tests.o = $(addsuffix .o,$(libm-tests)) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d7002a9df3..c57374454e 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -29,7 +29,7 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq (yes,$(config-cflags-avx2)) -sysdep_routines += memset-avx2 +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 endif endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index b64e4f1532..d398e43d29 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -88,6 +88,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_AVX2, __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) @@ -137,6 +138,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcpy.S. */ IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_AVX2, __strcpy_avx2) IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S new file mode 100644 index 0000000000..bd30ef6b55 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S @@ -0,0 +1,3 @@ +#define USE_AVX2 +#define STPCPY __stpcpy_avx2 +#include "stpcpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S index 34231f8b46..695a23688b 100644 --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -1,3 +1,436 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_sse2_unaligned -#include "strcpy-sse2-unaligned.S" +/* stpcpy with SSE2 and unaligned load + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#ifndef STPCPY +# define STPCPY __stpcpy_sse2_unaligned +#endif + +ENTRY(STPCPY) + mov %esi, %edx +#ifdef AS_STRCPY + movq %rdi, %rax +#endif + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + andl $4095, %edx + cmp $3968, %edx + ja L(cross_page) + + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm4 + pmovmskb %xmm4, %edx + testl %edx, %edx + je L(more16bytes) + bsf %edx, %ecx +#ifndef AS_STRCPY + lea (%rdi, %rcx), %rax +#endif + cmp $7, %ecx + movq (%rsi), %rdx + jb L(less_8_bytesb) +L(8bytes_from_cross): + movq -7(%rsi, %rcx), %rsi + movq %rdx, (%rdi) +#ifdef AS_STRCPY + movq %rsi, -7(%rdi, %rcx) +#else + movq %rsi, -7(%rax) +#endif + ret + + .p2align 4 +L(less_8_bytesb): + cmp $2, %ecx + jbe L(less_4_bytes) +L(4bytes_from_cross): + mov -3(%rsi, %rcx), %esi + mov %edx, (%rdi) +#ifdef AS_STRCPY + mov %esi, -3(%rdi, %rcx) +#else + mov %esi, -3(%rax) +#endif + ret + +.p2align 4 + L(less_4_bytes): + /* + Test branch vs this branchless that works for i 0,1,2 + d[i] = 0; + d[i/2] = s[1]; + d[0] = s[0]; + */ +#ifdef AS_STRCPY + movb $0, (%rdi, %rcx) +#endif + + shr $1, %ecx + mov %edx, %esi + shr $8, %edx + movb %dl, (%rdi, %rcx) +#ifndef AS_STRCPY + movb $0, (%rax) +#endif + movb %sil, (%rdi) + ret + + + + + + .p2align 4 +L(more16bytes): + pxor %xmm6, %xmm6 + movdqu 16(%rsi), %xmm1 + pxor %xmm7, %xmm7 + pcmpeqb %xmm1, %xmm5 + pmovmskb %xmm5, %edx + testl %edx, %edx + je L(more32bytes) + bsf %edx, %edx +#ifdef AS_STRCPY + movdqu 1(%rsi, %rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, 1(%rdi, %rdx) +#else + lea 16(%rdi, %rdx), %rax + movdqu 1(%rsi, %rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, -15(%rax) +#endif + ret + + .p2align 4 +L(more32bytes): + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm3 + + pcmpeqb %xmm2, %xmm6 + pcmpeqb %xmm3, %xmm7 + pmovmskb %xmm7, %edx + shl $16, %edx + pmovmskb %xmm6, %ecx + or %ecx, %edx + je L(more64bytes) + bsf %edx, %edx +#ifndef AS_STRCPY + lea 32(%rdi, %rdx), %rax +#endif + movdqu 1(%rsi, %rdx), %xmm2 + movdqu 17(%rsi, %rdx), %xmm3 + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) +#ifdef AS_STRCPY + movdqu %xmm2, 1(%rdi, %rdx) + movdqu %xmm3, 17(%rdi, %rdx) +#else + movdqu %xmm2, -31(%rax) + movdqu %xmm3, -15(%rax) +#endif + ret + + .p2align 4 +L(more64bytes): + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu 64(%rsi), %xmm0 + movdqu 80(%rsi), %xmm1 + movdqu 96(%rsi), %xmm2 + movdqu 112(%rsi), %xmm3 + + pcmpeqb %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm2, %xmm6 + pcmpeqb %xmm3, %xmm7 + pmovmskb %xmm4, %ecx + pmovmskb %xmm5, %edx + pmovmskb %xmm6, %r8d + pmovmskb %xmm7, %r9d + shl $16, %edx + or %ecx, %edx + shl $32, %r8 + shl $48, %r9 + or %r8, %rdx + or %r9, %rdx + test %rdx, %rdx + je L(prepare_loop) + bsf %rdx, %rdx +#ifndef AS_STRCPY + lea 64(%rdi, %rdx), %rax +#endif + movdqu 1(%rsi, %rdx), %xmm0 + movdqu 17(%rsi, %rdx), %xmm1 + movdqu 33(%rsi, %rdx), %xmm2 + movdqu 49(%rsi, %rdx), %xmm3 +#ifdef AS_STRCPY + movdqu %xmm0, 1(%rdi, %rdx) + movdqu %xmm1, 17(%rdi, %rdx) + movdqu %xmm2, 33(%rdi, %rdx) + movdqu %xmm3, 49(%rdi, %rdx) +#else + movdqu %xmm0, -63(%rax) + movdqu %xmm1, -47(%rax) + movdqu %xmm2, -31(%rax) + movdqu %xmm3, -15(%rax) +#endif + ret + + + .p2align 4 +L(prepare_loop): + movdqu %xmm0, 64(%rdi) + movdqu %xmm1, 80(%rdi) + movdqu %xmm2, 96(%rdi) + movdqu %xmm3, 112(%rdi) + + subq %rsi, %rdi + add $64, %rsi + andq $-64, %rsi + addq %rsi, %rdi + jmp L(loop_entry) + +#ifdef USE_AVX2 + .p2align 4 +L(loop): + vmovdqu %ymm1, (%rdi) + vmovdqu %ymm3, 32(%rdi) 
+L(loop_entry): + vmovdqa 96(%rsi), %ymm3 + vmovdqa 64(%rsi), %ymm1 + vpminub %ymm3, %ymm1, %ymm2 + addq $64, %rsi + addq $64, %rdi + vpcmpeqb %ymm5, %ymm2, %ymm0 + vpmovmskb %ymm0, %edx + test %edx, %edx + je L(loop) + salq $32, %rdx + vpcmpeqb %ymm5, %ymm1, %ymm4 + vpmovmskb %ymm4, %ecx + or %rcx, %rdx + bsfq %rdx, %rdx +#ifndef AS_STRCPY + lea (%rdi, %rdx), %rax +#endif + vmovdqu -63(%rsi, %rdx), %ymm0 + vmovdqu -31(%rsi, %rdx), %ymm2 +#ifdef AS_STRCPY + vmovdqu %ymm0, -63(%rdi, %rdx) + vmovdqu %ymm2, -31(%rdi, %rdx) +#else + vmovdqu %ymm0, -63(%rax) + vmovdqu %ymm2, -31(%rax) +#endif + vzeroupper + ret +#else + .p2align 4 +L(loop): + movdqu %xmm1, (%rdi) + movdqu %xmm2, 16(%rdi) + movdqu %xmm3, 32(%rdi) + movdqu %xmm4, 48(%rdi) +L(loop_entry): + movdqa 96(%rsi), %xmm3 + movdqa 112(%rsi), %xmm4 + movdqa %xmm3, %xmm0 + movdqa 80(%rsi), %xmm2 + pminub %xmm4, %xmm0 + movdqa 64(%rsi), %xmm1 + pminub %xmm2, %xmm0 + pminub %xmm1, %xmm0 + addq $64, %rsi + addq $64, %rdi + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + je L(loop) + salq $48, %rdx + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm2, %xmm6 + pmovmskb %xmm5, %ecx +#ifdef AS_STRCPY + pmovmskb %xmm6, %r8d + pcmpeqb %xmm3, %xmm7 + pmovmskb %xmm7, %r9d + sal $16, %r8d + or %r8d, %ecx +#else + pmovmskb %xmm6, %eax + pcmpeqb %xmm3, %xmm7 + pmovmskb %xmm7, %r9d + sal $16, %eax + or %eax, %ecx +#endif + salq $32, %r9 + orq %rcx, %rdx + orq %r9, %rdx + bsfq %rdx, %rdx +#ifndef AS_STRCPY + lea (%rdi, %rdx), %rax +#endif + movdqu -63(%rsi, %rdx), %xmm0 + movdqu -47(%rsi, %rdx), %xmm1 + movdqu -31(%rsi, %rdx), %xmm2 + movdqu -15(%rsi, %rdx), %xmm3 +#ifdef AS_STRCPY + movdqu %xmm0, -63(%rdi, %rdx) + movdqu %xmm1, -47(%rdi, %rdx) + movdqu %xmm2, -31(%rdi, %rdx) + movdqu %xmm3, -15(%rdi, %rdx) +#else + movdqu %xmm0, -63(%rax) + movdqu %xmm1, -47(%rax) + movdqu %xmm2, -31(%rax) + movdqu %xmm3, -15(%rax) +#endif + ret +#endif + + .p2align 4 +L(cross_page): + movq %rsi, %rcx + pxor %xmm0, %xmm0 + and $15, %ecx + movq %rsi, %r9 + movq %rdi, %r10 + subq %rcx, %rsi + subq %rcx, %rdi + movdqa (%rsi), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + shl %cl, %edx + test %edx, %edx + jne L(less_32_cross) + + addq $16, %rsi + addq $16, %rdi + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jne L(less_32_cross) + movdqu %xmm1, (%rdi) + + movdqu (%r9), %xmm0 + movdqu %xmm0, (%r10) + + mov $8, %rcx +L(cross_loop): + addq $16, %rsi + addq $16, %rdi + pxor %xmm0, %xmm0 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jne L(return_cross) + movdqu %xmm1, (%rdi) + sub $1, %rcx + ja L(cross_loop) + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + lea -64(%rsi), %rdx + andq $-64, %rdx + addq %rdx, %rdi + subq %rsi, %rdi + movq %rdx, %rsi + jmp L(loop_entry) + + .p2align 4 +L(return_cross): + bsf %edx, %edx +#ifdef AS_STRCPY + movdqu -15(%rsi, %rdx), %xmm0 + movdqu %xmm0, -15(%rdi, %rdx) +#else + lea (%rdi, %rdx), %rax + movdqu -15(%rsi, %rdx), %xmm0 + movdqu %xmm0, -15(%rax) +#endif + ret + + .p2align 4 +L(less_32_cross): + bsf %rdx, %rdx + lea (%rdi, %rdx), %rcx +#ifndef AS_STRCPY + mov %rcx, %rax +#endif + mov %r9, %rsi + mov %r10, %rdi + sub %rdi, %rcx + cmp $15, %ecx + jb L(less_16_cross) + movdqu (%rsi), %xmm0 + movdqu -15(%rsi, %rcx), %xmm1 + movdqu %xmm0, (%rdi) +#ifdef AS_STRCPY + movdqu %xmm1, -15(%rdi, %rcx) +#else + movdqu %xmm1, -15(%rax) +#endif + ret + +L(less_16_cross): + cmp $7, %ecx + jb L(less_8_bytes_cross) + movq (%rsi), %rdx + 
jmp L(8bytes_from_cross) + +L(less_8_bytes_cross): + cmp $2, %ecx + jbe L(3_bytes_cross) + mov (%rsi), %edx + jmp L(4bytes_from_cross) + +L(3_bytes_cross): + jb L(1_2bytes_cross) + movzwl (%rsi), %edx + jmp L(_3_bytesb) + +L(1_2bytes_cross): + movb (%rsi), %dl + jmp L(0_2bytes_from_cross) + + .p2align 4 +L(less_4_bytesb): + je L(_3_bytesb) +L(0_2bytes_from_cross): + movb %dl, (%rdi) +#ifdef AS_STRCPY + movb $0, (%rdi, %rcx) +#else + movb $0, (%rax) +#endif + ret + + .p2align 4 +L(_3_bytesb): + movw %dx, (%rdi) + movb $0, 2(%rdi) + ret + +END(STPCPY) diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S index 658520f78f..3f35068fb3 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S @@ -1,4 +1,3 @@ #define USE_AS_STPCPY -#define USE_AS_STRNCPY #define STRCPY __stpncpy_sse2_unaligned -#include "strcpy-sse2-unaligned.S" +#include "strncpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S index 2698ca6a8c..159604a616 100644 --- a/sysdeps/x86_64/multiarch/stpncpy.S +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -1,8 +1,7 @@ /* Multiple versions of stpncpy All versions must be listed in ifunc-impl-list.c. */ -#define STRCPY __stpncpy +#define STRNCPY __stpncpy #define USE_AS_STPCPY -#define USE_AS_STRNCPY -#include "strcpy.S" +#include "strncpy.S" weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 81f1b40ef6..1faa49d44d 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -275,5 +275,5 @@ L(StartStrcpyPart): # define USE_AS_STRNCPY # endif -# include "strcpy-sse2-unaligned.S" +# include "strncpy-sse2-unaligned.S" #endif diff --git a/sysdeps/x86_64/multiarch/strchrnul_avx2.S b/sysdeps/x86_64/multiarch/strchrnul_avx2.S new file mode 100644 index 0000000000..4dcb981ab1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul_avx2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRCHRNUL +#define __strchr_avx2 __strchrnul_avx2 +#include "strchr_avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S new file mode 100644 index 0000000000..a3133a4134 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -0,0 +1,4 @@ +#define USE_AVX2 +#define AS_STRCPY +#define STPCPY __strcpy_avx2 +#include "stpcpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index 8f03d1db24..310e4fac0a 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -1,1887 +1,3 @@ -/* strcpy with SSE2 and unaligned load - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_sse2_unaligned -# endif - -# endif - -# define JMPTBL(I, B) I - B -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), %rcx; \ - lea (%r11, %rcx), %rcx; \ - jmp *%rcx - -# ifndef USE_AS_STRCAT - -.text -ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY - mov %rdx, %r8 - test %r8, %r8 - jz L(ExitZero) -# endif - mov %rsi, %rcx -# ifndef USE_AS_STPCPY - mov %rdi, %rax /* save result */ -# endif - -# endif - - and $63, %rcx - cmp $32, %rcx - jbe L(SourceStringAlignmentLess32) - - and $-16, %rsi - and $15, %rcx - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - - pcmpeqb (%rsi), %xmm1 - pmovmskb %xmm1, %rdx - shr %cl, %rdx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - mov $16, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# else - mov $17, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# endif - jbe L(CopyFrom1To16BytesTailCase2OrCase3) -# endif - test %rdx, %rdx - jnz L(CopyFrom1To16BytesTail) - - pcmpeqb 16(%rsi), %xmm0 - pmovmskb %xmm0, %rdx - -# ifdef USE_AS_STRNCPY - add $16, %r10 - cmp %r10, %r8 - jbe L(CopyFrom1To32BytesCase2OrCase3) -# endif - test %rdx, %rdx - jnz L(CopyFrom1To32Bytes) - - movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ - movdqu %xmm1, (%rdi) - -/* If source address alignment != destination address alignment */ - .p2align 4 -L(Unalign16Both): - sub %rcx, %rdi -# ifdef USE_AS_STRNCPY - add %rcx, %r8 -# endif - mov $16, %rcx - movdqa (%rsi, %rcx), %xmm1 - movaps 16(%rsi, %rcx), %xmm2 - movdqu %xmm1, (%rdi, %rcx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $48, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm2) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - movaps 16(%rsi, %rcx), %xmm3 - movdqu %xmm2, (%rdi, %rcx) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm3) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - movaps 16(%rsi, %rcx), %xmm4 - movdqu %xmm3, (%rdi, %rcx) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm4) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - movaps 16(%rsi, %rcx), %xmm1 - movdqu %xmm4, (%rdi, %rcx) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm1) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - movaps 16(%rsi, %rcx), %xmm2 - movdqu %xmm1, (%rdi, %rcx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm2) -# else - jnz 
L(CopyFrom1To16Bytes) -# endif - - movaps 16(%rsi, %rcx), %xmm3 - movdqu %xmm2, (%rdi, %rcx) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rdx - add $16, %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm3) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - movdqu %xmm3, (%rdi, %rcx) - mov %rsi, %rdx - lea 16(%rsi, %rcx), %rsi - and $-0x40, %rsi - sub %rsi, %rdx - sub %rdx, %rdi -# ifdef USE_AS_STRNCPY - lea 128(%r8, %rdx), %r8 -# endif -L(Unaligned64Loop): - movaps (%rsi), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rsi), %xmm5 - movaps 32(%rsi), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rsi), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rdx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %rdx, %rdx - jnz L(Unaligned64Leave) - -L(Unaligned64Loop_start): - add $64, %rdi - add $64, %rsi - movdqu %xmm4, -64(%rdi) - movaps (%rsi), %xmm2 - movdqa %xmm2, %xmm4 - movdqu %xmm5, -48(%rdi) - movaps 16(%rsi), %xmm5 - pminub %xmm5, %xmm2 - movaps 32(%rsi), %xmm3 - movdqu %xmm6, -32(%rdi) - movaps %xmm3, %xmm6 - movdqu %xmm7, -16(%rdi) - movaps 48(%rsi), %xmm7 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rdx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %rdx, %rdx - jz L(Unaligned64Loop_start) - -L(Unaligned64Leave): - pxor %xmm1, %xmm1 - - pcmpeqb %xmm4, %xmm0 - pcmpeqb %xmm5, %xmm1 - pmovmskb %xmm0, %rdx - pmovmskb %xmm1, %rcx - test %rdx, %rdx - jnz L(CopyFrom1To16BytesUnaligned_0) - test %rcx, %rcx - jnz L(CopyFrom1To16BytesUnaligned_16) - - pcmpeqb %xmm6, %xmm0 - pcmpeqb %xmm7, %xmm1 - pmovmskb %xmm0, %rdx - pmovmskb %xmm1, %rcx - test %rdx, %rdx - jnz L(CopyFrom1To16BytesUnaligned_32) - - bsf %rcx, %rdx - movdqu %xmm4, (%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm6, 32(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea 48(%rdi, %rdx), %rax -# endif - movdqu %xmm7, 48(%rdi) - add $15, %r8 - sub %rdx, %r8 - lea 49(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $48, %rsi - add $48, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) -# endif - -/* If source address alignment == destination address alignment */ - -L(SourceStringAlignmentLess32): - pxor %xmm0, %xmm0 - movdqu (%rsi), %xmm1 - movdqu 16(%rsi), %xmm2 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rdx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $16, %r8 -# else - cmp $17, %r8 -# endif - jbe L(CopyFrom1To16BytesTail1Case2OrCase3) -# endif - test %rdx, %rdx - jnz L(CopyFrom1To16BytesTail1) - - pcmpeqb %xmm2, %xmm0 - movdqu %xmm1, (%rdi) - pmovmskb %xmm0, %rdx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $32, %r8 -# else - cmp $33, %r8 -# endif - jbe L(CopyFrom1To32Bytes1Case2OrCase3) -# endif - test %rdx, %rdx - jnz L(CopyFrom1To32Bytes1) - - and $-16, %rsi - and $15, %rcx - jmp L(Unalign16Both) - -/*------End of main part with loops---------------------*/ - -/* Case1 */ - -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) - .p2align 4 -L(CopyFrom1To16Bytes): - add %rcx, %rdi - add %rcx, %rsi - bsf %rdx, %rdx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) -# endif - .p2align 4 -L(CopyFrom1To16BytesTail): - add %rcx, %rsi - bsf %rdx, %rdx - BRANCH_TO_JMPTBL_ENTRY 
(L(ExitTable), %rdx, 4) - - .p2align 4 -L(CopyFrom1To32Bytes1): - add $16, %rsi - add $16, %rdi -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $16, %r8 -# endif -L(CopyFrom1To16BytesTail1): - bsf %rdx, %rdx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) - - .p2align 4 -L(CopyFrom1To32Bytes): - bsf %rdx, %rdx - add %rcx, %rsi - add $16, %rdx - sub %rcx, %rdx - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) - - .p2align 4 -L(CopyFrom1To16BytesUnaligned_0): - bsf %rdx, %rdx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - movdqu %xmm4, (%rdi) - add $63, %r8 - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) -# endif - - .p2align 4 -L(CopyFrom1To16BytesUnaligned_16): - bsf %rcx, %rdx - movdqu %xmm4, (%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea 16(%rdi, %rdx), %rax -# endif - movdqu %xmm5, 16(%rdi) - add $47, %r8 - sub %rdx, %r8 - lea 17(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $16, %rsi - add $16, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) -# endif - - .p2align 4 -L(CopyFrom1To16BytesUnaligned_32): - bsf %rdx, %rdx - movdqu %xmm4, (%rdi) - movdqu %xmm5, 16(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea 32(%rdi, %rdx), %rax -# endif - movdqu %xmm6, 32(%rdi) - add $31, %r8 - sub %rdx, %r8 - lea 33(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $32, %rsi - add $32, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) -# endif - -# ifdef USE_AS_STRNCPY -# ifndef USE_AS_STRCAT - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm6): - movdqu %xmm6, (%rdi, %rcx) - jmp L(CopyFrom1To16BytesXmmExit) - - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm5): - movdqu %xmm5, (%rdi, %rcx) - jmp L(CopyFrom1To16BytesXmmExit) - - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm4): - movdqu %xmm4, (%rdi, %rcx) - jmp L(CopyFrom1To16BytesXmmExit) - - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm3): - movdqu %xmm3, (%rdi, %rcx) - jmp L(CopyFrom1To16BytesXmmExit) - - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm1): - movdqu %xmm1, (%rdi, %rcx) - jmp L(CopyFrom1To16BytesXmmExit) -# endif - - .p2align 4 -L(CopyFrom1To16BytesExit): - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) - -/* Case2 */ - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rcx, %rdi - add %rcx, %rsi - bsf %rdx, %rdx - cmp %r8, %rdx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - - .p2align 4 -L(CopyFrom1To32BytesCase2): - add %rcx, %rsi - bsf %rdx, %rdx - add $16, %rdx - sub %rcx, %rdx - cmp %r8, %rdx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - -L(CopyFrom1To16BytesTailCase2): - add %rcx, %rsi - bsf %rdx, %rdx - cmp %r8, %rdx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - -L(CopyFrom1To16BytesTail1Case2): - bsf %rdx, %rdx - cmp %r8, %rdx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - -/* Case2 or Case3, Case3 */ - - .p2align 4 -L(CopyFrom1To16BytesCase2OrCase3): - test %rdx, %rdx - jnz L(CopyFrom1To16BytesCase2) -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rcx, %rdi - add %rcx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - - .p2align 4 -L(CopyFrom1To32BytesCase2OrCase3): - test %rdx, %rdx - jnz L(CopyFrom1To32BytesCase2) - add %rcx, %rsi - BRANCH_TO_JMPTBL_ENTRY 
(L(ExitStrncpyTable), %r8, 4) - - .p2align 4 -L(CopyFrom1To16BytesTailCase2OrCase3): - test %rdx, %rdx - jnz L(CopyFrom1To16BytesTailCase2) - add %rcx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - - .p2align 4 -L(CopyFrom1To32Bytes1Case2OrCase3): - add $16, %rdi - add $16, %rsi - sub $16, %r8 -L(CopyFrom1To16BytesTail1Case2OrCase3): - test %rdx, %rdx - jnz L(CopyFrom1To16BytesTail1Case2) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - -# endif - -/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ - - .p2align 4 -L(Exit1): - mov %dh, (%rdi) -# ifdef USE_AS_STPCPY - lea (%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $1, %r8 - lea 1(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit2): - mov (%rsi), %dx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $2, %r8 - lea 2(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit3): - mov (%rsi), %cx - mov %cx, (%rdi) - mov %dh, 2(%rdi) -# ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $3, %r8 - lea 3(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit4): - mov (%rsi), %edx - mov %edx, (%rdi) -# ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $4, %r8 - lea 4(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit5): - mov (%rsi), %ecx - mov %dh, 4(%rdi) - mov %ecx, (%rdi) -# ifdef USE_AS_STPCPY - lea 4(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $5, %r8 - lea 5(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit6): - mov (%rsi), %ecx - mov 4(%rsi), %dx - mov %ecx, (%rdi) - mov %dx, 4(%rdi) -# ifdef USE_AS_STPCPY - lea 5(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $6, %r8 - lea 6(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit7): - mov (%rsi), %ecx - mov 3(%rsi), %edx - mov %ecx, (%rdi) - mov %edx, 3(%rdi) -# ifdef USE_AS_STPCPY - lea 6(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $7, %r8 - lea 7(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit8): - mov (%rsi), %rdx - mov %rdx, (%rdi) -# ifdef USE_AS_STPCPY - lea 7(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $8, %r8 - lea 8(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rsi), %rcx - mov %dh, 8(%rdi) - mov %rcx, (%rdi) -# ifdef USE_AS_STPCPY - lea 8(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $9, %r8 - lea 9(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rsi), %rcx - mov 8(%rsi), %dx - mov %rcx, (%rdi) - mov %dx, 8(%rdi) -# ifdef USE_AS_STPCPY - lea 9(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $10, %r8 - lea 10(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rsi), %rcx - mov 7(%rsi), %edx - mov %rcx, (%rdi) - mov %edx, 7(%rdi) -# ifdef USE_AS_STPCPY - lea 10(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $11, %r8 - lea 11(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 
-L(Exit12): - mov (%rsi), %rcx - mov 8(%rsi), %edx - mov %rcx, (%rdi) - mov %edx, 8(%rdi) -# ifdef USE_AS_STPCPY - lea 11(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $12, %r8 - lea 12(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rsi), %rcx - mov 5(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 5(%rdi) -# ifdef USE_AS_STPCPY - lea 12(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $13, %r8 - lea 13(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rsi), %rcx - mov 6(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 6(%rdi) -# ifdef USE_AS_STPCPY - lea 13(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $14, %r8 - lea 14(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rsi), %rcx - mov 7(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 7(%rdi) -# ifdef USE_AS_STPCPY - lea 14(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $15, %r8 - lea 15(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit16): - movdqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -# ifdef USE_AS_STPCPY - lea 15(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $16, %r8 - lea 16(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit17): - movdqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) - mov %dh, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 16(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $17, %r8 - lea 17(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit18): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %cx - movdqu %xmm0, (%rdi) - mov %cx, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 17(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $18, %r8 - lea 18(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit19): - movdqu (%rsi), %xmm0 - mov 15(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %ecx, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 18(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $19, %r8 - lea 19(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit20): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %ecx, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 19(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $20, %r8 - lea 20(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit21): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %ecx, 16(%rdi) - mov %dh, 20(%rdi) -# ifdef USE_AS_STPCPY - lea 20(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $21, %r8 - lea 21(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit22): - movdqu (%rsi), %xmm0 - mov 14(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 14(%rdi) -# ifdef USE_AS_STPCPY - lea 21(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $22, %r8 - lea 22(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit23): - movdqu (%rsi), %xmm0 - mov 15(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 22(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $23, %r8 - lea 23(%rdi), %rdi - jnz 
L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit24): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 23(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $24, %r8 - lea 24(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit25): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 16(%rdi) - mov %dh, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 24(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $25, %r8 - lea 25(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit26): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 24(%rsi), %cx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %cx, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 25(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $26, %r8 - lea 26(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit27): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 23(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %ecx, 23(%rdi) -# ifdef USE_AS_STPCPY - lea 26(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $27, %r8 - lea 27(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit28): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 24(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %ecx, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 27(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $28, %r8 - lea 28(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit29): - movdqu (%rsi), %xmm0 - movdqu 13(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 13(%rdi) -# ifdef USE_AS_STPCPY - lea 28(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $29, %r8 - lea 29(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit30): - movdqu (%rsi), %xmm0 - movdqu 14(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 14(%rdi) -# ifdef USE_AS_STPCPY - lea 29(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $30, %r8 - lea 30(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit31): - movdqu (%rsi), %xmm0 - movdqu 15(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 30(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $31, %r8 - lea 31(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - - .p2align 4 -L(Exit32): - movdqu (%rsi), %xmm0 - movdqu 16(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 31(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $32, %r8 - lea 32(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(StrncpyExit0): -# ifdef USE_AS_STPCPY - mov %rdi, %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, (%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit1): - mov (%rsi), %dl - mov %dl, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 1(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit2): - mov (%rsi), %dx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 2(%rdi) -# endif - ret - - 
.p2align 4 -L(StrncpyExit3): - mov (%rsi), %cx - mov 2(%rsi), %dl - mov %cx, (%rdi) - mov %dl, 2(%rdi) -# ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 3(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit4): - mov (%rsi), %edx - mov %edx, (%rdi) -# ifdef USE_AS_STPCPY - lea 4(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 4(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit5): - mov (%rsi), %ecx - mov 4(%rsi), %dl - mov %ecx, (%rdi) - mov %dl, 4(%rdi) -# ifdef USE_AS_STPCPY - lea 5(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 5(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit6): - mov (%rsi), %ecx - mov 4(%rsi), %dx - mov %ecx, (%rdi) - mov %dx, 4(%rdi) -# ifdef USE_AS_STPCPY - lea 6(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 6(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit7): - mov (%rsi), %ecx - mov 3(%rsi), %edx - mov %ecx, (%rdi) - mov %edx, 3(%rdi) -# ifdef USE_AS_STPCPY - lea 7(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 7(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit8): - mov (%rsi), %rdx - mov %rdx, (%rdi) -# ifdef USE_AS_STPCPY - lea 8(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 8(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit9): - mov (%rsi), %rcx - mov 8(%rsi), %dl - mov %rcx, (%rdi) - mov %dl, 8(%rdi) -# ifdef USE_AS_STPCPY - lea 9(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 9(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit10): - mov (%rsi), %rcx - mov 8(%rsi), %dx - mov %rcx, (%rdi) - mov %dx, 8(%rdi) -# ifdef USE_AS_STPCPY - lea 10(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 10(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit11): - mov (%rsi), %rcx - mov 7(%rsi), %edx - mov %rcx, (%rdi) - mov %edx, 7(%rdi) -# ifdef USE_AS_STPCPY - lea 11(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 11(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit12): - mov (%rsi), %rcx - mov 8(%rsi), %edx - mov %rcx, (%rdi) - mov %edx, 8(%rdi) -# ifdef USE_AS_STPCPY - lea 12(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 12(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit13): - mov (%rsi), %rcx - mov 5(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 5(%rdi) -# ifdef USE_AS_STPCPY - lea 13(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 13(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit14): - mov (%rsi), %rcx - mov 6(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 6(%rdi) -# ifdef USE_AS_STPCPY - lea 14(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 14(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit15): - mov (%rsi), %rcx - mov 7(%rsi), %rdx - mov %rcx, (%rdi) - mov %rdx, 7(%rdi) -# ifdef USE_AS_STPCPY - lea 15(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 15(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit16): - movdqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -# ifdef USE_AS_STPCPY - lea 16(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 16(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit17): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %cl - movdqu %xmm0, (%rdi) - mov %cl, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 17(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 17(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit18): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %cx - movdqu %xmm0, (%rdi) - mov %cx, 
16(%rdi) -# ifdef USE_AS_STPCPY - lea 18(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 18(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit19): - movdqu (%rsi), %xmm0 - mov 15(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %ecx, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 19(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 19(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit20): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %ecx, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 20(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 20(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit21): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %ecx - mov 20(%rsi), %dl - movdqu %xmm0, (%rdi) - mov %ecx, 16(%rdi) - mov %dl, 20(%rdi) -# ifdef USE_AS_STPCPY - lea 21(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 21(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit22): - movdqu (%rsi), %xmm0 - mov 14(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 14(%rdi) -# ifdef USE_AS_STPCPY - lea 22(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 22(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit23): - movdqu (%rsi), %xmm0 - mov 15(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 23(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 23(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit24): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rcx - movdqu %xmm0, (%rdi) - mov %rcx, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 24(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 24(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit25): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 24(%rsi), %cl - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %cl, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 25(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 25(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit26): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 24(%rsi), %cx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %cx, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 26(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 26(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit27): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 23(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %ecx, 23(%rdi) -# ifdef USE_AS_STPCPY - lea 27(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 27(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit28): - movdqu (%rsi), %xmm0 - mov 16(%rsi), %rdx - mov 24(%rsi), %ecx - movdqu %xmm0, (%rdi) - mov %rdx, 16(%rdi) - mov %ecx, 24(%rdi) -# ifdef USE_AS_STPCPY - lea 28(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 28(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit29): - movdqu (%rsi), %xmm0 - movdqu 13(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 13(%rdi) -# ifdef USE_AS_STPCPY - lea 29(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 29(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit30): - movdqu (%rsi), %xmm0 - movdqu 14(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 14(%rdi) -# ifdef USE_AS_STPCPY - lea 30(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 30(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit31): - movdqu (%rsi), %xmm0 - movdqu 15(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 15(%rdi) -# ifdef USE_AS_STPCPY - lea 31(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - 
xor %ch, %ch - movb %ch, 31(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit32): - movdqu (%rsi), %xmm0 - movdqu 16(%rsi), %xmm2 - movdqu %xmm0, (%rdi) - movdqu %xmm2, 16(%rdi) -# ifdef USE_AS_STPCPY - lea 32(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 32(%rdi) -# endif - ret - - .p2align 4 -L(StrncpyExit33): - movdqu (%rsi), %xmm0 - movdqu 16(%rsi), %xmm2 - mov 32(%rsi), %cl - movdqu %xmm0, (%rdi) - movdqu %xmm2, 16(%rdi) - mov %cl, 32(%rdi) -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 33(%rdi) -# endif - ret - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - mov %dl, (%rdi) - ret - - .p2align 4 -L(Fill2): - mov %dx, (%rdi) - ret - - .p2align 4 -L(Fill3): - mov %edx, -1(%rdi) - ret - - .p2align 4 -L(Fill4): - mov %edx, (%rdi) - ret - - .p2align 4 -L(Fill5): - mov %edx, (%rdi) - mov %dl, 4(%rdi) - ret - - .p2align 4 -L(Fill6): - mov %edx, (%rdi) - mov %dx, 4(%rdi) - ret - - .p2align 4 -L(Fill7): - mov %rdx, -1(%rdi) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rdi) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rdi) - mov %dl, 8(%rdi) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rdi) - mov %dx, 8(%rdi) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rdi) - mov %edx, 7(%rdi) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rdi) - mov %edx, 8(%rdi) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rdi) - mov %rdx, 5(%rdi) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rdi) - mov %rdx, 6(%rdi) - ret - - .p2align 4 -L(Fill15): - movdqu %xmm0, -1(%rdi) - ret - - .p2align 4 -L(Fill16): - movdqu %xmm0, (%rdi) - ret - - .p2align 4 -L(CopyFrom1To16BytesUnalignedXmm2): - movdqu %xmm2, (%rdi, %rcx) - - .p2align 4 -L(CopyFrom1To16BytesXmmExit): - bsf %rdx, %rdx - add $15, %r8 - add %rcx, %rdi -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - - .p2align 4 -L(StrncpyFillTailWithZero): - pxor %xmm0, %xmm0 - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit) - - movdqu %xmm0, (%rdi) - add $16, %rdi - - mov %rdi, %rsi - and $0xf, %rsi - sub %rsi, %rdi - add %rsi, %r8 - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rdi) - movdqa %xmm0, 16(%rdi) - movdqa %xmm0, 32(%rdi) - movdqa %xmm0, 48(%rdi) - add $64, %rdi - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rdi) - movdqa %xmm0, 16(%rdi) - add $32, %rdi - sub $16, %r8 - jl L(StrncpyFillExit) - movdqa %xmm0, (%rdi) - add $16, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit) - movdqa %xmm0, (%rdi) - add $16, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) - -L(StrncpyFillExit): - add $16, %r8 - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) - -/* end of ifndef USE_AS_STRCAT */ -# endif - - .p2align 4 -L(UnalignedLeaveCase2OrCase3): - test %rdx, %rdx - jnz L(Unaligned64LeaveCase2) -L(Unaligned64LeaveCase3): - lea 64(%r8), %rcx - and $-16, %rcx - add $48, %r8 - jl L(CopyFrom1To16BytesCase3) - movdqu %xmm4, (%rdi) - sub $16, %r8 - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm5, 16(%rdi) - sub $16, %r8 - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm6, 32(%rdi) - sub $16, %r8 - jb L(CopyFrom1To16BytesCase3) - movdqu %xmm7, 48(%rdi) -# ifdef USE_AS_STPCPY - lea 64(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - xor %ch, %ch - movb %ch, 64(%rdi) -# endif - ret - - .p2align 4 -L(Unaligned64LeaveCase2): - xor %rcx, %rcx - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rdx - 
add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rdx, %rdx -# ifndef USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm4) -# else - jnz L(CopyFrom1To16Bytes) -# endif - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rdx - movdqu %xmm4, (%rdi) - add $16, %rcx - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rdx, %rdx -# ifndef USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm5) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rdx - movdqu %xmm5, 16(%rdi) - add $16, %rcx - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rdx, %rdx -# ifndef USE_AS_STRCAT - jnz L(CopyFrom1To16BytesUnalignedXmm6) -# else - jnz L(CopyFrom1To16Bytes) -# endif - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rdx - movdqu %xmm6, 32(%rdi) - lea 16(%rdi, %rcx), %rdi - lea 16(%rsi, %rcx), %rsi - bsf %rdx, %rdx - cmp %r8, %rdx - jb L(CopyFrom1To16BytesExit) - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) - - .p2align 4 -L(ExitZero): -# ifndef USE_AS_STRCAT - mov %rdi, %rax -# endif - ret - -# endif - -# ifndef USE_AS_STRCAT -END (STRCPY) -# else -END (STRCAT) -# endif - .p2align 4 - .section .rodata -L(ExitTable): - .int JMPTBL(L(Exit1), L(ExitTable)) - .int JMPTBL(L(Exit2), L(ExitTable)) - .int JMPTBL(L(Exit3), L(ExitTable)) - .int JMPTBL(L(Exit4), L(ExitTable)) - .int JMPTBL(L(Exit5), L(ExitTable)) - .int JMPTBL(L(Exit6), L(ExitTable)) - .int JMPTBL(L(Exit7), L(ExitTable)) - .int JMPTBL(L(Exit8), L(ExitTable)) - .int JMPTBL(L(Exit9), L(ExitTable)) - .int JMPTBL(L(Exit10), L(ExitTable)) - .int JMPTBL(L(Exit11), L(ExitTable)) - .int JMPTBL(L(Exit12), L(ExitTable)) - .int JMPTBL(L(Exit13), L(ExitTable)) - .int JMPTBL(L(Exit14), L(ExitTable)) - .int JMPTBL(L(Exit15), L(ExitTable)) - .int JMPTBL(L(Exit16), L(ExitTable)) - .int JMPTBL(L(Exit17), L(ExitTable)) - .int JMPTBL(L(Exit18), L(ExitTable)) - .int JMPTBL(L(Exit19), L(ExitTable)) - .int JMPTBL(L(Exit20), L(ExitTable)) - .int JMPTBL(L(Exit21), L(ExitTable)) - .int JMPTBL(L(Exit22), L(ExitTable)) - .int JMPTBL(L(Exit23), L(ExitTable)) - .int JMPTBL(L(Exit24), L(ExitTable)) - .int JMPTBL(L(Exit25), L(ExitTable)) - .int JMPTBL(L(Exit26), L(ExitTable)) - .int JMPTBL(L(Exit27), L(ExitTable)) - .int JMPTBL(L(Exit28), L(ExitTable)) - .int JMPTBL(L(Exit29), L(ExitTable)) - .int JMPTBL(L(Exit30), L(ExitTable)) - .int JMPTBL(L(Exit31), L(ExitTable)) - .int JMPTBL(L(Exit32), L(ExitTable)) -# ifdef USE_AS_STRNCPY -L(ExitStrncpyTable): - .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit20), 
L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) - .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) -# ifndef USE_AS_STRCAT - .p2align 4 -L(FillTable): - .int JMPTBL(L(Fill0), L(FillTable)) - .int JMPTBL(L(Fill1), L(FillTable)) - .int JMPTBL(L(Fill2), L(FillTable)) - .int JMPTBL(L(Fill3), L(FillTable)) - .int JMPTBL(L(Fill4), L(FillTable)) - .int JMPTBL(L(Fill5), L(FillTable)) - .int JMPTBL(L(Fill6), L(FillTable)) - .int JMPTBL(L(Fill7), L(FillTable)) - .int JMPTBL(L(Fill8), L(FillTable)) - .int JMPTBL(L(Fill9), L(FillTable)) - .int JMPTBL(L(Fill10), L(FillTable)) - .int JMPTBL(L(Fill11), L(FillTable)) - .int JMPTBL(L(Fill12), L(FillTable)) - .int JMPTBL(L(Fill13), L(FillTable)) - .int JMPTBL(L(Fill14), L(FillTable)) - .int JMPTBL(L(Fill15), L(FillTable)) - .int JMPTBL(L(Fill16), L(FillTable)) -# endif -# endif -#endif +#define AS_STRCPY +#define STPCPY __strcpy_sse2_unaligned +#include "stpcpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 9464ee8b63..92be04c2cd 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -28,31 +28,18 @@ #endif #ifdef USE_AS_STPCPY -# ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __stpncpy_ssse3 -# define STRCPY_SSE2 __stpncpy_sse2 -# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned -# define __GI_STRCPY __GI_stpncpy -# define __GI___STRCPY __GI___stpncpy -# else # define STRCPY_SSSE3 __stpcpy_ssse3 # define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_AVX2 __stpcpy_avx2 # define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned # define __GI_STRCPY __GI_stpcpy # define __GI___STRCPY __GI___stpcpy -# endif #else -# ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __strncpy_ssse3 -# define STRCPY_SSE2 __strncpy_sse2 -# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned -# define __GI_STRCPY __GI_strncpy -# else # define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_AVX2 __strcpy_avx2 # define STRCPY_SSE2 __strcpy_sse2 # define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned # define __GI_STRCPY __GI_strcpy -# endif #endif @@ -64,7 +51,10 @@ ENTRY(STRCPY) cmpl $0, __cpu_features+KIND_OFFSET(%rip) jne 1f call __init_cpu_features -1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax +1: leaq STRCPY_AVX2(%rip), %rax + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) + jnz 2f + leaq STRCPY_SSE2_UNALIGNED(%rip), %rax testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 2f leaq STRCPY_SSE2(%rip), %rax diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S index fcc23a754a..e4c98e7702 100644 --- a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S @@ -1,3 +1,1888 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_sse2_unaligned -#include "strcpy-sse2-unaligned.S" +/* strcpy with SSE2 
and unaligned load + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strncpy_sse2_unaligned +# endif + +# define USE_AS_STRNCPY +# endif + +# define JMPTBL(I, B) I - B +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +# ifndef USE_AS_STRCAT + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 +# endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 
16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +# endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu 
%xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $-16, %rsi + and $15, %rcx + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 +# endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + 
add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +# endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# if defined 
USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 
7(%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 18(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +# endif + ret + + .p2align 4 
+L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + ret + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + ret + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + ret + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + ret + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + ret + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + ret + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + ret + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef 
USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +# ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +# endif + ret + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int 
JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S index 6d87a0ba35..afbd87096c 100644 --- a/sysdeps/x86_64/multiarch/strncpy.S +++ b/sysdeps/x86_64/multiarch/strncpy.S @@ -1,5 +1,85 @@ -/* Multiple versions of strncpy - All versions must be listed in ifunc-impl-list.c. */ -#define STRCPY strncpy +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + #define USE_AS_STRNCPY -#include "strcpy.S" +#ifndef STRNCPY +#define STRNCPY strncpy +#endif + +#ifdef USE_AS_STPCPY +# define STRNCPY_SSSE3 __stpncpy_ssse3 +# define STRNCPY_SSE2 __stpncpy_sse2 +# define STRNCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned +# define __GI_STRNCPY __GI_stpncpy +# define __GI___STRNCPY __GI___stpncpy +#else +# define STRNCPY_SSSE3 __strncpy_ssse3 +# define STRNCPY_SSE2 __strncpy_sse2 +# define STRNCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned +# define __GI_STRNCPY __GI_strncpy +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRNCPY) + .type STRNCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRNCPY_SSE2_UNALIGNED(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 2f + leaq STRNCPY_SSE2(%rip), %rax + testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jz 2f + leaq STRNCPY_SSSE3(%rip), %rax +2: ret +END(STRNCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRNCPY_SSE2, @function; \ + .align 16; \ + .globl STRNCPY_SSE2; \ + .hidden STRNCPY_SSE2; \ + STRNCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRNCPY_SSE2, .-STRNCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRNCPY; __GI_STRNCPY = STRNCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRNCPY; __GI___STRNCPY = STRNCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif |
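
The new SSE2/strncpy bodies above repeatedly apply one idiom: load a 16-byte chunk (movdqu/movdqa), compare it byte-wise against zero (pcmpeqb), collapse the comparison into a bit mask (pmovmskb), and use bsf on that mask to locate the NUL terminator, branching through a jump table (BRANCH_TO_JMPTBL_ENTRY) to a tail-copy exit of exactly the right length. The following is a minimal C sketch of that terminator-detection step only, written with SSE2 intrinsics; the function name first_nul_in_16 is a hypothetical helper for illustration and is not part of this patch or of glibc.

```c
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdio.h>

/* Hypothetical helper: report the offset of the first NUL byte in a
   16-byte block, mirroring the pcmpeqb/pmovmskb/bsf idiom used on each
   chunk in the assembly above.  Returns 16 if the block has no NUL.  */
static int
first_nul_in_16 (const char *p)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p); /* movdqu   */
  __m128i zeros = _mm_setzero_si128 ();                  /* pxor     */
  __m128i eq    = _mm_cmpeq_epi8 (chunk, zeros);         /* pcmpeqb  */
  unsigned mask = (unsigned) _mm_movemask_epi8 (eq);     /* pmovmskb */
  return mask ? __builtin_ctz (mask) : 16;               /* bsf      */
}

int
main (void)
{
  char buf[32] = "hello, world";
  printf ("NUL at offset %d\n", first_nul_in_16 (buf)); /* prints 12 */
  return 0;
}
```

The main loop amortizes this test over 64-byte blocks by folding four chunks together with pminub before a single pcmpeqb/pmovmskb, and the strncpy variants additionally track the remaining count in %r8 so a short count diverts to the StrncpyExit/Fill tables instead of the terminator exits. The AVX2 variants added by this commit (stpcpy-avx2.S, strcpy-avx2.S) reuse the same source under USE_AVX2, presumably widening the comparisons to 32-byte registers; the exact AVX2 code paths are not shown in this hunk.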