diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch/mempcpy.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy.S | 74
1 file changed, 29 insertions(+), 45 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623565..f9c6df301c 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -25,62 +25,46 @@
    DSO.  In static binaries we need mempcpy before the initialization
    happened.  */
 #if defined SHARED && IS_IN (libc)
+	.text
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq	__mempcpy_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__mempcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_avx_unaligned(%rip), %rax
+	lea	__mempcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_ssse3(%rip), %RAX_LP
2:	ret
 END(__mempcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __mempcpy_sse2, @function; \
-	.p2align 4; \
-	.globl __mempcpy_sse2; \
-	.hidden __mempcpy_sse2; \
-	__mempcpy_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __mempcpy_chk_sse2, @function; \
-	.globl __mempcpy_chk_sse2; \
-	.p2align 4; \
-	__mempcpy_chk_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
-
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+weak_alias (__mempcpy, mempcpy)
 #endif
-
-#include "../mempcpy.S"