-rw-r--r--   ChangeLog                                                                                              |   21
-rw-r--r--   sysdeps/x86_64/multiarch/Makefile                                                                      |    3
-rw-r--r--   sysdeps/x86_64/multiarch/ifunc-impl-list.c                                                             |   12
-rw-r--r--   sysdeps/x86_64/multiarch/ifunc-strcpy.h (renamed from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h) |    6
-rw-r--r--   sysdeps/x86_64/multiarch/stpcpy-avx2.S                                                                  |    3
-rw-r--r--   sysdeps/x86_64/multiarch/stpcpy.c                                                                       |    2
-rw-r--r--   sysdeps/x86_64/multiarch/stpncpy-avx2.S                                                                 |    4
-rw-r--r--   sysdeps/x86_64/multiarch/stpncpy.c                                                                      |    2
-rw-r--r--   sysdeps/x86_64/multiarch/strcat-avx2.S                                                                  |  275
-rw-r--r--   sysdeps/x86_64/multiarch/strcat.c                                                                       |    2
-rw-r--r--   sysdeps/x86_64/multiarch/strcpy-avx2.S                                                                  | 1022
-rw-r--r--   sysdeps/x86_64/multiarch/strcpy.c                                                                       |    2
-rw-r--r--   sysdeps/x86_64/multiarch/strncat-avx2.S                                                                 |    3
-rw-r--r--   sysdeps/x86_64/multiarch/strncat.c                                                                      |    2
-rw-r--r--   sysdeps/x86_64/multiarch/strncpy-avx2.S                                                                 |    3
-rw-r--r--   sysdeps/x86_64/multiarch/strncpy.c                                                                      |    2
16 files changed, 1358 insertions(+), 6 deletions(-)
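Beyond the new assembly files, the functional change is small: each stub (strcpy.c, stpcpy.c, strncpy.c, strcat.c, strncat.c, stpncpy.c) now includes ifunc-strcpy.h, whose IFUNC_SELECTOR returns the avx2 implementation when AVX2 is usable, unaligned AVX loads are fast, and the CPU does not prefer avoiding VZEROUPPER. The following is a rough, application-level sketch of the same IFUNC dispatch pattern, not code from the patch: it uses GCC's ifunc attribute and __builtin_cpu_supports, which is a much coarser check than the cpu_features flags glibc tests internally, and all my_strcpy* names are hypothetical.

/* Hedged sketch, not from the patch: IFUNC dispatch at application level.
   Build with GCC on an ELF/glibc target, e.g. gcc -O2 ifunc-sketch.c.  */
#include <stdio.h>

static char *my_strcpy_scalar (char *dst, const char *src);
static char *my_strcpy_avx2 (char *dst, const char *src);

typedef char *(*strcpy_fn) (char *, const char *);

/* The resolver runs once, when the symbol is first resolved, and the
   implementation it returns is what every later call goes to.  */
strcpy_fn
my_strcpy_resolver (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return my_strcpy_avx2;
  return my_strcpy_scalar;
}

char *my_strcpy (char *dst, const char *src)
  __attribute__ ((ifunc ("my_strcpy_resolver")));

static char *
my_strcpy_scalar (char *dst, const char *src)
{
  char *ret = dst;
  while ((*dst++ = *src++) != '\0')
    ;
  return ret;
}

static char *
my_strcpy_avx2 (char *dst, const char *src)
{
  /* A real AVX2 body lives in assembly (like strcpy-avx2.S below);
     reuse the scalar copy here so the sketch stays portable C.  */
  return my_strcpy_scalar (dst, src);
}

int
main (void)
{
  char buf[32];
  puts (my_strcpy (buf, "hello"));
  return 0;
}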
diff --git a/ChangeLog b/ChangeLog
index 101a267ebc..3a94b71fe7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2019-01-14  Leonardo Sandoval  <leonardo.sandoval.gonzalez@intel.com>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
+	stpcpy-avx2 and stpncpy-avx2.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c:
+	(__libc_ifunc_impl_list): Add tests for __strcat_avx2,
+	__strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
+	and __stpncpy_avx2.
+	* sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
+	ifunc-strcpy.h}: rename header for a more generic name.
+	* sysdeps/x86_64/multiarch/ifunc-strcpy.h:
+	(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX 2 machines if
+	AVX unaligned load is fast and vzeroupper is preferred.
+	* sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
+
 2019-01-12  Dmitry V. Levin  <ldv@altlinux.org>
 
 	* argp/argp-help.c: Fix typo in comment.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e970735..395e432c09 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
 		   strrchr-sse2 strrchr-avx2 \
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+		   strcat-avx2 strncat-avx2 \
 		   strcat-ssse3 strncat-ssse3\
+		   strcpy-avx2 strncpy-avx2 \
 		   strcpy-sse2 stpcpy-sse2 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   stpcpy-avx2 stpncpy-avx2 \
 		   strcat-sse2 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cee2aebaee..39a45d0742 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpncpy,
	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpncpy_avx2)
	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
			      __stpncpy_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpcpy,
	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpcpy_avx2)
	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcat_avx2)
	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
			      __strcat_ssse3)
	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcpy_avx2)
	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
			      __strcpy_ssse3)
	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncat_avx2)
	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
			      __strncat_ssse3)
	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncpy_avx2)
	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
			      __strncpy_ssse3)
	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 4ddca9f15f..bc6a70fc7c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000000..f0bd3029fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 0b12f5d7a1..fabe03204b 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
 # undef __stpcpy
 
 # define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000000..032b0407d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 71edc291e4..3b618e7491 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++
b/sysdeps/x86_64/multiarch/stpncpy.c @@ -26,7 +26,7 @@ # undef __stpncpy # define SYMBOL_NAME stpncpy -# include "ifunc-unaligned-ssse3.h" +# include "ifunc-strcpy.h" libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ()); diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S new file mode 100644 index 0000000000..b062356427 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -0,0 +1,275 @@ +/* strcat with AVX2 + Copyright (C) 2011-2018 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_avx2 +# endif + +# define USE_AS_STRCAT + +/* Number of bytes in a vector register */ +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + xor %eax, %eax + mov %edi, %ecx + and $((VEC_SIZE * 4) - 1), %ecx + vpxor %xmm6, %xmm6, %xmm6 + cmp $(VEC_SIZE * 3), %ecx + ja L(fourth_vector_boundary) + vpcmpeqb (%rdi), %ymm6, %ymm0 + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_first_vector) + mov %rdi, %rax + and $-VEC_SIZE, %rax + jmp L(align_vec_size_start) +L(fourth_vector_boundary): + mov %rdi, %rax + and $-VEC_SIZE, %rax + vpcmpeqb (%rax), %ymm6, %ymm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + vpmovmskb %ymm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align_vec_size_start): + vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 + vpmovmskb %ymm2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 + vpmovmskb %ymm3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 + add $(VEC_SIZE * 4), %rax + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 + vpmovmskb %ymm2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 + vpmovmskb %ymm3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 + add $(VEC_SIZE * 4), %rax + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 + vpmovmskb %ymm2, 
%edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 + vpmovmskb %ymm3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 + add $(VEC_SIZE * 4), %rax + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 + vpmovmskb %ymm2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 + vpmovmskb %ymm3, %edx + test %edx, %edx + jnz L(exit_null_on_fifth_vector) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 + add $(VEC_SIZE * 5), %rax + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 + add $VEC_SIZE, %rax + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 + add $VEC_SIZE, %rax + vpmovmskb %ymm2, %edx + test %edx, %edx + jnz L(exit) + + test $((VEC_SIZE * 4) - 1), %rax + jz L(align_four_vec_loop) + + vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 + add $VEC_SIZE, %rax + vpmovmskb %ymm3, %edx + test %edx, %edx + jnz L(exit) + + add $VEC_SIZE, %rax + + .p2align 4 +L(align_four_vec_loop): + vmovaps (%rax), %ymm4 + vpminub VEC_SIZE(%rax), %ymm4, %ymm4 + vmovaps (VEC_SIZE * 2)(%rax), %ymm5 + vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 + add $(VEC_SIZE * 4), %rax + vpminub %ymm4, %ymm5, %ymm5 + vpcmpeqb %ymm5, %ymm6, %ymm5 + vpmovmskb %ymm5, %edx + test %edx, %edx + jz L(align_four_vec_loop) + + vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 + sub $(VEC_SIZE * 5), %rax + vpmovmskb %ymm0, %edx + test %edx, %edx + jnz L(exit_null_on_second_vector) + + vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 + vpmovmskb %ymm1, %edx + test %edx, %edx + jnz L(exit_null_on_third_vector) + + vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 + vpmovmskb %ymm2, %edx + test %edx, %edx + jnz L(exit_null_on_fourth_vector) + + vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 + vpmovmskb %ymm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_null_on_first_vector): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_second_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $VEC_SIZE, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_third_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 2), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fourth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 3), %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_null_on_fifth_vector): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $(VEC_SIZE * 4), %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-avx2.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c index a0341b9514..92c360afc4 100644 --- a/sysdeps/x86_64/multiarch/strcat.c +++ 
b/sysdeps/x86_64/multiarch/strcat.c @@ -24,7 +24,7 @@ # undef strcat # define SYMBOL_NAME strcat -# include "ifunc-unaligned-ssse3.h" +# include "ifunc-strcpy.h" libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ()); diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S new file mode 100644 index 0000000000..81677f9060 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -0,0 +1,1022 @@ +/* strcpy with AVX2 + Copyright (C) 2011-2018 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_avx2 +# endif + +# endif + +/* Number of bytes in a vector register */ +# ifndef VEC_SIZE +# define VEC_SIZE 32 +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +/* zero register */ +#define xmmZ xmm0 +#define ymmZ ymm0 + +/* mask register */ +#define ymmM ymm1 + +# ifndef USE_AS_STRCAT + + .section .text.avx,"ax",@progbits +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + vpxor %xmmZ, %xmmZ, %xmmZ + + and $((VEC_SIZE * 4) - 1), %ecx + cmp $(VEC_SIZE * 2), %ecx + jbe L(SourceStringAlignmentLessTwoVecSize) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + + vpcmpeqb (%rsi), %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $VEC_SIZE, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $(VEC_SIZE + 1), %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyVecSizeTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail) + + vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 + vpmovmskb %ymm2, %edx + +# ifdef USE_AS_STRNCPY + add $VEC_SIZE, %r10 + cmp %r10, %r8 + jbe L(CopyTwoVecSizeCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize) + + vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ + vmovdqu %ymm2, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(UnalignVecSizeBoth): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $VEC_SIZE, %rcx + vmovdqa (%rsi, %rcx), %ymm2 + vmovdqu %ymm2, (%rdi, %rcx) + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 + vpcmpeqb %ymm2, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 3), %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + vmovdqu %ymm2, (%rdi, %rcx) + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 + vpcmpeqb %ymm3, %ymmZ, %ymmM + 
vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + vmovdqu %ymm3, (%rdi, %rcx) + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 + vpcmpeqb %ymm4, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + + vmovdqu %ymm4, (%rdi, %rcx) + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 + vpcmpeqb %ymm2, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + vmovdqu %ymm2, (%rdi, %rcx) + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 + vpcmpeqb %ymm2, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec2) +# else + jnz L(CopyVecSize) +# endif + + vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 + vmovdqu %ymm2, (%rdi, %rcx) + vpcmpeqb %ymm3, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $VEC_SIZE, %rcx +# ifdef USE_AS_STRNCPY + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) +# endif + test %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec3) +# else + jnz L(CopyVecSize) +# endif + + vmovdqu %ymm3, (%rdi, %rcx) + mov %rsi, %rdx + lea VEC_SIZE(%rsi, %rcx), %rsi + and $-(VEC_SIZE * 4), %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea (VEC_SIZE * 8)(%r8, %rdx), %r8 +# endif +L(UnalignedFourVecSizeLoop): + vmovdqa (%rsi), %ymm4 + vmovdqa VEC_SIZE(%rsi), %ymm5 + vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 + vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 + vpminub %ymm5, %ymm4, %ymm2 + vpminub %ymm7, %ymm6, %ymm3 + vpminub %ymm2, %ymm3, %ymm3 + vpcmpeqb %ymmM, %ymm3, %ymm3 + vpmovmskb %ymm3, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(UnalignedFourVecSizeLeave) + +L(UnalignedFourVecSizeLoop_start): + add $(VEC_SIZE * 4), %rdi + add $(VEC_SIZE * 4), %rsi + vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) + vmovdqa (%rsi), %ymm4 + vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) + vmovdqa VEC_SIZE(%rsi), %ymm5 + vpminub %ymm5, %ymm4, %ymm2 + vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) + vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 + vmovdqu %ymm7, -VEC_SIZE(%rdi) + vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 + vpminub %ymm7, %ymm6, %ymm3 + vpminub %ymm2, %ymm3, %ymm3 + vpcmpeqb %ymmM, %ymm3, %ymm3 + vpmovmskb %ymm3, %edx +# ifdef USE_AS_STRNCPY + sub $(VEC_SIZE * 4), %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(UnalignedFourVecSizeLoop_start) + +L(UnalignedFourVecSizeLeave): + vpcmpeqb %ymm4, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_0) + + vpcmpeqb %ymm5, %ymmZ, %ymmM + vpmovmskb %ymmM, %ecx + test %ecx, %ecx + jnz L(CopyVecSizeUnaligned_16) + + vpcmpeqb %ymm6, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + test %edx, %edx + jnz L(CopyVecSizeUnaligned_32) + + vpcmpeqb %ymm7, %ymmZ, %ymmM + vpmovmskb %ymmM, %ecx + bsf 
%ecx, %edx + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, VEC_SIZE(%rdi) + vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 3)(%rdi, %rdx), %rax +# endif + vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 3), %rsi + add $(VEC_SIZE * 3), %rdi + jmp L(CopyVecSizeExit) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLessTwoVecSize): + vmovdqu (%rsi), %ymm3 + vmovdqu VEC_SIZE(%rsi), %ymm2 + vpcmpeqb %ymm3, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $VEC_SIZE, %r8 +# else + cmp $(VEC_SIZE + 1), %r8 +# endif + jbe L(CopyVecSizeTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyVecSizeTail1) + + vmovdqu %ymm3, (%rdi) + vpcmpeqb %ymm2, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $(VEC_SIZE * 2), %r8 +# else + cmp $((VEC_SIZE * 2) + 1), %r8 +# endif + jbe L(CopyTwoVecSize1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyTwoVecSize1) + + and $-VEC_SIZE, %rsi + and $(VEC_SIZE - 1), %ecx + jmp L(UnalignVecSizeBoth) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyVecSize): + add %rcx, %rdi +# endif +L(CopyVecSizeTail): + add %rcx, %rsi +L(CopyVecSizeTail1): + bsf %edx, %edx +L(CopyVecSizeExit): + cmp $32, %edx + jae L(Exit32_63) + cmp $16, %edx + jae L(Exit16_31) + cmp $8, %edx + jae L(Exit8_15) + cmp $4, %edx + jae L(Exit4_7) + cmp $3, %edx + je L(Exit3) + cmp $1, %edx + ja L(Exit2) + je L(Exit1) + movb $0, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(CopyTwoVecSize1): + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $VEC_SIZE, %r8 +# endif + jmp L(CopyVecSizeTail1) + + .p2align 4 +L(CopyTwoVecSize): + bsf %edx, %edx + add %rcx, %rsi + add $VEC_SIZE, %edx + sub %ecx, %edx + jmp L(CopyVecSizeExit) + + .p2align 4 +L(CopyVecSizeUnaligned_0): + bsf %edx, %edx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + vmovdqu %ymm4, (%rdi) + add $((VEC_SIZE * 4) - 1), %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_16): + bsf %ecx, %edx + vmovdqu %ymm4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea VEC_SIZE(%rdi, %rdx), %rax +# endif + vmovdqu %ymm5, VEC_SIZE(%rdi) + add $((VEC_SIZE * 3) - 1), %r8 + sub %rdx, %r8 + lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $VEC_SIZE, %rsi + add $VEC_SIZE, %rdi + jmp L(CopyVecSizeExit) +# endif + + .p2align 4 +L(CopyVecSizeUnaligned_32): + bsf %edx, %edx + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, VEC_SIZE(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 2)(%rdi, %rdx), %rax +# endif + vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) + add $((VEC_SIZE * 2) - 1), %r8 + sub %rdx, %r8 + lea ((VEC_SIZE * 2) + 1)(%rdi, 
%rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $(VEC_SIZE * 2), %rsi + add $(VEC_SIZE * 2), %rdi + jmp L(CopyVecSizeExit) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyVecSizeUnalignedVec6): + vmovdqu %ymm6, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec5): + vmovdqu %ymm5, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec4): + vmovdqu %ymm4, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) + + .p2align 4 +L(CopyVecSizeUnalignedVec3): + vmovdqu %ymm3, (%rdi, %rcx) + jmp L(CopyVecSizeVecExit) +# endif + +/* Case2 */ + + .p2align 4 +L(CopyVecSizeCase2): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2): + add %rcx, %rsi + bsf %edx, %edx + add $VEC_SIZE, %edx + sub %ecx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTailCase2): + add %rcx, %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +L(CopyVecSizeTail1Case2): + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) + jmp L(StrncpyExit) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeCase2) +L(CopyVecSizeCase3): + add $VEC_SIZE, %r8 + add %rcx, %rdi + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSizeCase2OrCase3): + test %rdx, %rdx + jnz L(CopyTwoVecSizeCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyVecSizeTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTailCase2) + add %rcx, %rsi + jmp L(StrncpyExit) + + .p2align 4 +L(CopyTwoVecSize1Case2OrCase3): + add $VEC_SIZE, %rdi + add $VEC_SIZE, %rsi + sub $VEC_SIZE, %r8 +L(CopyVecSizeTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyVecSizeTail1Case2) + jmp L(StrncpyExit) +# endif + +/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ + + .p2align 4 +L(Exit1): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(Exit2): + movzwl (%rsi), %ecx + mov %cx, (%rdi) + movb $0, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(Exit4_7): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov -3(%rsi, %rdx), %ecx + mov %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(Exit8_15): + mov (%rsi), %rcx + mov -7(%rsi, %rdx), %r9 + mov %rcx, (%rdi) + mov %r9, -7(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 
+L(Exit16_31): + vmovdqu (%rsi), %xmm2 + vmovdqu -15(%rsi, %rdx), %xmm3 + vmovdqu %xmm2, (%rdi) + vmovdqu %xmm3, -15(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + + .p2align 4 +L(Exit32_63): + vmovdqu (%rsi), %ymm2 + vmovdqu -31(%rsi, %rdx), %ymm3 + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, -31(%rdi, %rdx) +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub %rdx, %r8 + sub $1, %r8 + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + VZEROUPPER + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit1): + movzbl (%rsi), %edx + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 1(%rdi) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit2): + movzwl (%rsi), %edx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 2(%rdi) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit3_4): + movzwl (%rsi), %ecx + movzwl -2(%rsi, %r8), %edx + mov %cx, (%rdi) + mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit5_8): + mov (%rsi), %ecx + mov -4(%rsi, %r8), %edx + mov %ecx, (%rdi) + mov %edx, -4(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit9_16): + mov (%rsi), %rcx + mov -8(%rsi, %r8), %rdx + mov %rcx, (%rdi) + mov %rdx, -8(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit17_32): + vmovdqu (%rsi), %xmm2 + vmovdqu -16(%rsi, %r8), %xmm3 + vmovdqu %xmm2, (%rdi) + vmovdqu %xmm3, -16(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit33_64): + /* 0/32, 31/16 */ + vmovdqu (%rsi), %ymm2 + vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) +# ifdef USE_AS_STPCPY + lea (%rdi, %r8), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) +# endif + VZEROUPPER + ret + + .p2align 4 +L(StrncpyExit65): + /* 0/32, 32/32, 64/1 */ + vmovdqu (%rsi), %ymm2 + vmovdqu 32(%rsi), %ymm3 + mov 64(%rsi), %cl + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm3, 32(%rdi) + mov %cl, 64(%rdi) +# ifdef USE_AS_STPCPY + lea 65(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, 65(%rdi) +# endif + VZEROUPPER + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + VZEROUPPER + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + VZEROUPPER + ret + + .p2align 4 +L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) + VZEROUPPER + ret + + .p2align 4 +L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) + VZEROUPPER + ret + + .p2align 4 +L(Fill9_16): + mov %rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) + VZEROUPPER + ret + + .p2align 4 +L(Fill17_32): + vmovdqu %xmmZ, (%rdi) + vmovdqu %xmmZ, -16(%rdi, %r8) + VZEROUPPER + ret + + .p2align 4 +L(CopyVecSizeUnalignedVec2): + vmovdqu %ymm2, (%rdi, %rcx) + + .p2align 4 +L(CopyVecSizeVecExit): + bsf %edx, %edx + add $(VEC_SIZE - 1), %r8 + add %rcx, %rdi +# ifdef 
USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + xor %edx, %edx + sub $VEC_SIZE, %r8 + jbe L(StrncpyFillExit) + + vmovdqu %ymmZ, (%rdi) + add $VEC_SIZE, %rdi + + mov %rdi, %rsi + and $(VEC_SIZE - 1), %esi + sub %rsi, %rdi + add %rsi, %r8 + sub $(VEC_SIZE * 4), %r8 + jb L(StrncpyFillLessFourVecSize) + +L(StrncpyFillLoopVmovdqa): + vmovdqa %ymmZ, (%rdi) + vmovdqa %ymmZ, VEC_SIZE(%rdi) + vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) + vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) + add $(VEC_SIZE * 4), %rdi + sub $(VEC_SIZE * 4), %r8 + jae L(StrncpyFillLoopVmovdqa) + +L(StrncpyFillLessFourVecSize): + add $(VEC_SIZE * 2), %r8 + jl L(StrncpyFillLessTwoVecSize) + vmovdqa %ymmZ, (%rdi) + vmovdqa %ymmZ, VEC_SIZE(%rdi) + add $(VEC_SIZE * 2), %rdi + sub $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + vmovdqa %ymmZ, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillLessTwoVecSize): + add $VEC_SIZE, %r8 + jl L(StrncpyFillExit) + vmovdqa %ymmZ, (%rdi) + add $VEC_SIZE, %rdi + jmp L(Fill) + + .p2align 4 +L(StrncpyFillExit): + add $VEC_SIZE, %r8 +L(Fill): + cmp $17, %r8d + jae L(Fill17_32) + cmp $9, %r8d + jae L(Fill9_16) + cmp $5, %r8d + jae L(Fill5_8) + cmp $3, %r8d + jae L(Fill3_4) + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) + VZEROUPPER + ret + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(UnalignedFourVecSizeLeaveCase2) +L(UnalignedFourVecSizeLeaveCase3): + lea (VEC_SIZE * 4)(%r8), %rcx + and $-VEC_SIZE, %rcx + add $(VEC_SIZE * 3), %r8 + jl L(CopyVecSizeCase3) + vmovdqu %ymm4, (%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + vmovdqu %ymm5, VEC_SIZE(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) + sub $VEC_SIZE, %r8 + jb L(CopyVecSizeCase3) + vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) +# ifdef USE_AS_STPCPY + lea (VEC_SIZE * 4)(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) +# endif + VZEROUPPER + ret + + .p2align 4 +L(UnalignedFourVecSizeLeaveCase2): + xor %ecx, %ecx + vpcmpeqb %ymm4, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + add $(VEC_SIZE * 3), %r8 + jle L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec4) +# else + jnz L(CopyVecSize) +# endif + vpcmpeqb %ymm5, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + vmovdqu %ymm4, (%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec5) +# else + jnz L(CopyVecSize) +# endif + + vpcmpeqb %ymm6, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + vmovdqu %ymm5, VEC_SIZE(%rdi) + add $VEC_SIZE, %rcx + sub $VEC_SIZE, %r8 + jbe L(CopyVecSizeCase2OrCase3) + test %edx, %edx +# ifndef USE_AS_STRCAT + jnz L(CopyVecSizeUnalignedVec6) +# else + jnz L(CopyVecSize) +# endif + + vpcmpeqb %ymm7, %ymmZ, %ymmM + vpmovmskb %ymmM, %edx + vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) + lea VEC_SIZE(%rdi, %rcx), %rdi + lea VEC_SIZE(%rsi, %rcx), %rsi + bsf %edx, %edx + cmp %r8d, %edx + jb L(CopyVecSizeExit) +L(StrncpyExit): + cmp $65, %r8d + je L(StrncpyExit65) + cmp $33, %r8d + jae L(StrncpyExit33_64) + cmp $17, %r8d + jae L(StrncpyExit17_32) + cmp $9, %r8d + jae L(StrncpyExit9_16) + cmp $5, %r8d + jae L(StrncpyExit5_8) + cmp $3, %r8d + jae L(StrncpyExit3_4) + cmp $1, %r8d + ja L(StrncpyExit2) + je L(StrncpyExit1) +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + movb $0, (%rdi) +# endif + VZEROUPPER + ret + + .p2align 4 
+L(ExitZero):
+# ifndef USE_AS_STRCAT
+	mov %rdi, %rax
+# endif
+	VZEROUPPER
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index f125fd12f9..5e079edfca 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
 # undef strcpy
 
 # define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
new file mode 100644
index 0000000000..bfefa659bb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index e95f62acac..b6cae40263 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
 # undef strncat
 
 # define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
 strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
new file mode 100644
index 0000000000..9ef8c87627
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index daa150cee3..d6fd34671d 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
 # undef strncpy
 
 # define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
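The commit adds no new test cases of its own; glibc's existing string tests cover these entry points. As an independent sanity check, a small standalone program along the following lines (not part of the patch, plain ISO C plus POSIX stpcpy) exercises copy lengths that straddle the 32-byte VEC_SIZE boundaries the AVX2 code special-cases, relying only on the documented semantics of strcpy, stpcpy and strncpy.

#define _GNU_SOURCE
#include <assert.h>
#include <stddef.h>
#include <string.h>

int
main (void)
{
  /* Lengths 0..512 sweep across 1, 31, 32, 33, 63, 64, 65, 128, ...  */
  enum { MAX = 512 };
  char src[MAX + 1], dst[MAX + 64];

  for (size_t len = 0; len <= MAX; ++len)
    {
      memset (src, 'a', len);
      src[len] = '\0';

      memset (dst, 'x', sizeof dst);
      assert (strcpy (dst, src) == dst);
      assert (strlen (dst) == len);

      memset (dst, 'x', sizeof dst);
      assert (stpcpy (dst, src) == dst + len);	/* returns end of string */

      memset (dst, 'x', sizeof dst);
      strncpy (dst, src, len + 3);
      assert (memcmp (dst, src, len + 1) == 0);
      /* strncpy must pad with NUL bytes up to n.  */
      assert (dst[len + 1] == '\0' && dst[len + 2] == '\0');
    }
  return 0;
}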