diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch/memset.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memset.S | 34 |
1 files changed, 21 insertions, 13 deletions
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index 8e3b9b9764..4e52d8f8c4 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -26,35 +26,43 @@ ENTRY(memset) .type memset, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX - leaq __memset_sse2(%rip), %rax + lea __memset_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_sse2_unaligned(%rip), %RAX_LP +1: HAS_ARCH_FEATURE (AVX2_Usable) jz 2f - leaq __memset_avx2(%rip), %rax -#ifdef HAVE_AVX512_ASM_SUPPORT + lea __memset_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 2f + lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 2f - leaq __memset_avx512_no_vzeroupper(%rip), %rax -#endif + jnz 2f + lea __memset_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_avx512_unaligned(%rip), %RAX_LP +# endif 2: ret END(memset) #endif #if IS_IN (libc) -# undef memset -# define memset __memset_sse2 - -# undef __memset_chk -# define __memset_chk __memset_chk_sse2 +# define MEMSET_SYMBOL(p,s) p##_sse2_##s # ifdef SHARED # undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. - The speedup we get from using GPR instruction is likely eaten away + The speedup we get from using SSE2 instructions is likely eaten away by the indirect call in the PLT. */ # define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2 + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned # endif # undef strong_alias |