From c867597bff2562180a18da4b8dba89d24e8b65c4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 8 Jun 2016 13:57:50 -0700 Subject: X86-64: Remove previous default/SSE2/AVX2 memcpy/memmove Since the new SSE2/AVX2 memcpy/memmove are faster than the previous ones, we can remove the previous SSE2/AVX2 memcpy/memmove and replace them with the new ones. No change in IFUNC selection if SSE2 and AVX2 memcpy/memmove weren't used before. If SSE2 or AVX2 memcpy/memmove were used, the new SSE2 or AVX2 memcpy/memmove optimized with Enhanced REP MOVSB will be used for processors with ERMS. The new AVX512 memcpy/memmove will be used for processors with AVX512 which prefer vzeroupper. Since the new SSE2 memcpy/memmove are faster than the previous default memcpy/memmove used in libc.a and ld.so, we also remove the previous default memcpy/memmove and make them the default memcpy/memmove, except that non-temporal store isn't used in ld.so. Together, it reduces the size of libc.so by about 6 KB and the size of ld.so by about 2 KB. [BZ #19776] * sysdeps/x86_64/memcpy.S: Make it dummy. * sysdeps/x86_64/mempcpy.S: Likewise. * sysdeps/x86_64/memmove.S: New file. * sysdeps/x86_64/memmove_chk.S: Likewise. * sysdeps/x86_64/multiarch/memmove.S: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.S: Likewise. * sysdeps/x86_64/memmove.c: Removed. * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S: Likewise. * sysdeps/x86_64/multiarch/memmove.c: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove memcpy-sse2-unaligned, memmove-avx-unaligned, memcpy-avx-unaligned and memmove-sse2-unaligned-erms. 
* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Replace __memmove_chk_avx512_unaligned_2 with __memmove_chk_avx512_unaligned. Remove __memmove_chk_avx_unaligned_2. Replace __memmove_chk_sse2_unaligned_2 with __memmove_chk_sse2_unaligned. Remove __memmove_chk_sse2 and __memmove_avx_unaligned_2. Replace __memmove_avx512_unaligned_2 with __memmove_avx512_unaligned. Replace __memmove_sse2_unaligned_2 with __memmove_sse2_unaligned. Remove __memmove_sse2. Replace __memcpy_chk_avx512_unaligned_2 with __memcpy_chk_avx512_unaligned. Remove __memcpy_chk_avx_unaligned_2. Replace __memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned. Remove __memcpy_chk_sse2. Remove __memcpy_avx_unaligned_2. Replace __memcpy_avx512_unaligned_2 with __memcpy_avx512_unaligned. Remove __memcpy_sse2_unaligned_2 and __memcpy_sse2. Replace __mempcpy_chk_avx512_unaligned_2 with __mempcpy_chk_avx512_unaligned. Remove __mempcpy_chk_avx_unaligned_2. Replace __mempcpy_chk_sse2_unaligned_2 with __mempcpy_chk_sse2_unaligned. Remove __mempcpy_chk_sse2. Replace __mempcpy_avx512_unaligned_2 with __mempcpy_avx512_unaligned. Remove __mempcpy_avx_unaligned_2. Replace __mempcpy_sse2_unaligned_2 with __mempcpy_sse2_unaligned. Remove __mempcpy_sse2. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support __memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned. Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms if processor has ERMS. Default to __memcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. (libc_hidden_builtin_def): Likewise. Don't include ../memcpy.S. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support __memcpy_chk_avx512_unaligned_erms and __memcpy_chk_avx512_unaligned. Use __memcpy_chk_avx_unaligned_erms and __memcpy_chk_sse2_unaligned_erms if processor has ERMS. Default to __memcpy_chk_sse2_unaligned. 
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Change function suffix from unaligned_2 to unaligned. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support __mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned. Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms if processor has ERMS. Default to __mempcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. (libc_hidden_builtin_def): Likewise. Don't include ../mempcpy.S. (mempcpy): New. Add a weak alias. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support __mempcpy_chk_avx512_unaligned_erms and __mempcpy_chk_avx512_unaligned. Use __mempcpy_chk_avx_unaligned_erms and __mempcpy_chk_sse2_unaligned_erms if processor has ERMS. Default to __mempcpy_chk_sse2_unaligned. --- sysdeps/x86_64/multiarch/Makefile | 6 +- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 63 +--- sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 391 --------------------- sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 175 --------- sysdeps/x86_64/multiarch/memcpy.S | 62 ++-- sysdeps/x86_64/multiarch/memcpy_chk.S | 40 ++- sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 22 -- .../x86_64/multiarch/memmove-sse2-unaligned-erms.S | 13 - .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 24 +- sysdeps/x86_64/multiarch/memmove.S | 98 ++++++ sysdeps/x86_64/multiarch/memmove.c | 73 ---- sysdeps/x86_64/multiarch/memmove_chk.S | 71 ++++ sysdeps/x86_64/multiarch/memmove_chk.c | 46 --- sysdeps/x86_64/multiarch/mempcpy.S | 74 ++-- sysdeps/x86_64/multiarch/mempcpy_chk.S | 38 +- 15 files changed, 306 insertions(+), 890 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S delete mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S create mode 100644 sysdeps/x86_64/multiarch/memmove.S delete mode 100644 
sysdeps/x86_64/multiarch/memmove.c create mode 100644 sysdeps/x86_64/multiarch/memmove_chk.S delete mode 100644 sysdeps/x86_64/multiarch/memmove_chk.c (limited to 'sysdeps/x86_64/multiarch') diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d78e667566..3736f54ce4 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -6,10 +6,9 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ - memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \ + memcmp-sse4 memcpy-ssse3 \ memmove-ssse3 \ - memcpy-ssse3-back memmove-avx-unaligned \ - memcpy-avx-unaligned \ + memcpy-ssse3-back \ memmove-ssse3-back \ memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ @@ -20,7 +19,6 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ strcspn-c strpbrk-c strspn-c varshift \ memset-avx512-no-vzeroupper \ - memmove-sse2-unaligned-erms \ memmove-avx-unaligned-erms \ memmove-avx512-unaligned-erms \ memset-avx2-unaligned-erms \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index ca05ff6ebf..449b04647e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -54,7 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX512F_Usable), - __memmove_chk_avx512_unaligned_2) + __memmove_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_chk_avx512_unaligned_erms) @@ -62,9 +62,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned) - IFUNC_IMPL_ADD 
(array, i, __memmove_chk, - HAS_ARCH_FEATURE (AVX_Usable), - __memmove_chk_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned_erms) @@ -75,20 +72,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, - __memmove_chk_sse2_unaligned_2) + __memmove_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, - __memmove_chk_sse2_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, - __memmove_chk_sse2)) + __memmove_chk_sse2_unaligned_erms)) /* Support sysdeps/x86_64/multiarch/memmove.S. */ IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned) - IFUNC_IMPL_ADD (array, i, memmove, - HAS_ARCH_FEATURE (AVX_Usable), - __memmove_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned_erms) @@ -98,7 +90,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), - __memmove_avx512_unaligned_2) + __memmove_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_unaligned_erms) @@ -109,10 +101,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, - __memmove_sse2_unaligned_2) + __memmove_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memmove, 1, - __memmove_sse2_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) + __memmove_sse2_unaligned_erms)) /* Support sysdeps/x86_64/multiarch/memset_chk.S. 
*/ IFUNC_IMPL (i, name, __memset_chk, @@ -326,7 +317,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), - __memcpy_chk_avx512_unaligned_2) + __memcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_chk_avx512_unaligned_erms) @@ -334,9 +325,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - HAS_ARCH_FEATURE (AVX_Usable), - __memcpy_chk_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned_erms) @@ -347,20 +335,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, - __memcpy_chk_sse2_unaligned_2) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, - __memcpy_chk_sse2_unaligned_erms) + __memcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, - __memcpy_chk_sse2)) + __memcpy_chk_sse2_unaligned_erms)) /* Support sysdeps/x86_64/multiarch/memcpy.S. 
*/ IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, - HAS_ARCH_FEATURE (AVX_Usable), - __memcpy_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned_erms) @@ -374,18 +357,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX512F_Usable), - __memcpy_avx512_unaligned_2) + __memcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, 1, - __memcpy_sse2_unaligned_2) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2)) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)) /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. 
*/ IFUNC_IMPL (i, name, __mempcpy_chk, @@ -395,7 +375,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_chk_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), - __mempcpy_chk_avx512_unaligned_2) + __mempcpy_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_chk_avx512_unaligned_erms) @@ -403,9 +383,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - HAS_ARCH_FEATURE (AVX_Usable), - __mempcpy_chk_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned_erms) @@ -416,11 +393,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, - __mempcpy_chk_sse2_unaligned_2) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, - __mempcpy_chk_sse2_unaligned_erms) + __mempcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, - __mempcpy_chk_sse2)) + __mempcpy_chk_sse2_unaligned_erms)) /* Support sysdeps/x86_64/multiarch/mempcpy.S. 
*/ IFUNC_IMPL (i, name, mempcpy, @@ -430,7 +405,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_avx512_no_vzeroupper) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX512F_Usable), - __mempcpy_avx512_unaligned_2) + __mempcpy_avx512_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_avx512_unaligned_erms) @@ -438,9 +413,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned) - IFUNC_IMPL_ADD (array, i, mempcpy, - HAS_ARCH_FEATURE (AVX_Usable), - __mempcpy_avx_unaligned_2) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned_erms) @@ -449,11 +421,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, - __mempcpy_sse2_unaligned_2) + __mempcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)) /* Support sysdeps/x86_64/multiarch/strncmp.S. */ IFUNC_IMPL (i, name, strncmp, diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S deleted file mode 100644 index dd4187fa36..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S +++ /dev/null @@ -1,391 +0,0 @@ -/* memcpy with AVX - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" -#ifndef MEMCPY -# define MEMCPY __memcpy_avx_unaligned -# define MEMCPY_CHK __memcpy_chk_avx_unaligned -# define MEMPCPY __mempcpy_avx_unaligned -# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned -#endif - - .section .text.avx,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - movq %rdi, %rax - addq %rdx, %rax - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %rdi, %rax -#ifdef USE_AS_MEMPCPY - add %rdx, %rax -#endif -L(start): - cmp $256, %rdx - jae L(256bytesormore) - cmp $16, %dl - jb L(less_16bytes) - cmp $128, %dl - jb L(less_128bytes) - vmovdqu (%rsi), %xmm0 - lea (%rsi, %rdx), %rcx - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu 0x20(%rsi), %xmm2 - vmovdqu 0x30(%rsi), %xmm3 - vmovdqu 0x40(%rsi), %xmm4 - vmovdqu 0x50(%rsi), %xmm5 - vmovdqu 0x60(%rsi), %xmm6 - vmovdqu 0x70(%rsi), %xmm7 - vmovdqu -0x80(%rcx), %xmm8 - vmovdqu -0x70(%rcx), %xmm9 - vmovdqu -0x60(%rcx), %xmm10 - vmovdqu -0x50(%rcx), %xmm11 - vmovdqu -0x40(%rcx), %xmm12 - vmovdqu -0x30(%rcx), %xmm13 - vmovdqu -0x20(%rcx), %xmm14 - vmovdqu -0x10(%rcx), %xmm15 - lea (%rdi, %rdx), %rdx - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm2, 0x20(%rdi) - vmovdqu %xmm3, 0x30(%rdi) - vmovdqu %xmm4, 0x40(%rdi) - 
vmovdqu %xmm5, 0x50(%rdi) - vmovdqu %xmm6, 0x60(%rdi) - vmovdqu %xmm7, 0x70(%rdi) - vmovdqu %xmm8, -0x80(%rdx) - vmovdqu %xmm9, -0x70(%rdx) - vmovdqu %xmm10, -0x60(%rdx) - vmovdqu %xmm11, -0x50(%rdx) - vmovdqu %xmm12, -0x40(%rdx) - vmovdqu %xmm13, -0x30(%rdx) - vmovdqu %xmm14, -0x20(%rdx) - vmovdqu %xmm15, -0x10(%rdx) - ret - .p2align 4 -L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu (%rsi), %xmm0 - lea (%rsi, %rdx), %rcx - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu 0x20(%rsi), %xmm2 - lea (%rdi, %rdx), %rdx - vmovdqu 0x30(%rsi), %xmm3 - vmovdqu -0x40(%rcx), %xmm4 - vmovdqu -0x30(%rcx), %xmm5 - vmovdqu -0x20(%rcx), %xmm6 - vmovdqu -0x10(%rcx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm2, 0x20(%rdi) - vmovdqu %xmm3, 0x30(%rdi) - vmovdqu %xmm4, -0x40(%rdx) - vmovdqu %xmm5, -0x30(%rdx) - vmovdqu %xmm6, -0x20(%rdx) - vmovdqu %xmm7, -0x10(%rdx) - ret - - .p2align 4 -L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu (%rsi), %xmm0 - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu -0x20(%rsi, %rdx), %xmm6 - vmovdqu -0x10(%rsi, %rdx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm6, -0x20(%rdi, %rdx) - vmovdqu %xmm7, -0x10(%rdi, %rdx) - ret - - .p2align 4 -L(less_32bytes): - vmovdqu (%rsi), %xmm0 - vmovdqu -0x10(%rsi, %rdx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm7, -0x10(%rdi, %rdx) - ret - - .p2align 4 -L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - movq -0x08(%rsi, %rdx), %rcx - movq (%rsi), %rsi - movq %rsi, (%rdi) - movq %rcx, -0x08(%rdi, %rdx) - ret - - .p2align 4 -L(less_8bytes): - cmp $4, %dl - jb L(less_4bytes) - mov -0x04(%rsi, %rdx), %ecx - mov (%rsi), %esi - mov %esi, (%rdi) - mov %ecx, -0x04(%rdi, %rdx) - ret - -L(less_4bytes): - cmp $1, %dl - jbe L(less_2bytes) - mov -0x02(%rsi, %rdx), %cx - mov (%rsi), %si - mov %si, (%rdi) - mov %cx, -0x02(%rdi, %rdx) - ret - -L(less_2bytes): - jb L(less_0bytes) - mov (%rsi), %cl - mov %cl, (%rdi) -L(less_0bytes): - ret - - .p2align 4 
-L(256bytesormore): -#ifdef USE_AS_MEMMOVE - mov %rdi, %rcx - sub %rsi, %rcx - cmp %rdx, %rcx - jc L(copy_backward) -#endif - cmp $2048, %rdx - jae L(gobble_data_movsb) - mov %rax, %r8 - lea (%rsi, %rdx), %rcx - mov %rdi, %r10 - vmovdqu -0x80(%rcx), %xmm5 - vmovdqu -0x70(%rcx), %xmm6 - mov $0x80, %rax - and $-32, %rdi - add $32, %rdi - vmovdqu -0x60(%rcx), %xmm7 - vmovdqu -0x50(%rcx), %xmm8 - mov %rdi, %r11 - sub %r10, %r11 - vmovdqu -0x40(%rcx), %xmm9 - vmovdqu -0x30(%rcx), %xmm10 - sub %r11, %rdx - vmovdqu -0x20(%rcx), %xmm11 - vmovdqu -0x10(%rcx), %xmm12 - vmovdqu (%rsi), %ymm4 - add %r11, %rsi - sub %eax, %edx -L(goble_128_loop): - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu 0x40(%rsi), %ymm2 - vmovdqu 0x60(%rsi), %ymm3 - add %rax, %rsi - vmovdqa %ymm0, (%rdi) - vmovdqa %ymm1, 0x20(%rdi) - vmovdqa %ymm2, 0x40(%rdi) - vmovdqa %ymm3, 0x60(%rdi) - add %rax, %rdi - sub %eax, %edx - jae L(goble_128_loop) - add %eax, %edx - add %rdi, %rdx - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, -0x80(%rdx) - vmovdqu %xmm6, -0x70(%rdx) - vmovdqu %xmm7, -0x60(%rdx) - vmovdqu %xmm8, -0x50(%rdx) - vmovdqu %xmm9, -0x40(%rdx) - vmovdqu %xmm10, -0x30(%rdx) - vmovdqu %xmm11, -0x20(%rdx) - vmovdqu %xmm12, -0x10(%rdx) - mov %r8, %rax - ret - - .p2align 4 -L(gobble_data_movsb): -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %rcx -#else - mov __x86_shared_cache_size_half(%rip), %rcx -#endif - shl $3, %rcx - cmp %rcx, %rdx - jae L(gobble_big_data_fwd) - mov %rdx, %rcx - rep movsb - ret - - .p2align 4 -L(gobble_big_data_fwd): - lea (%rsi, %rdx), %rcx - vmovdqu (%rsi), %ymm4 - vmovdqu -0x80(%rsi,%rdx), %xmm5 - vmovdqu -0x70(%rcx), %xmm6 - vmovdqu -0x60(%rcx), %xmm7 - vmovdqu -0x50(%rcx), %xmm8 - vmovdqu -0x40(%rcx), %xmm9 - vmovdqu -0x30(%rcx), %xmm10 - vmovdqu -0x20(%rcx), %xmm11 - vmovdqu -0x10(%rcx), %xmm12 - mov %rdi, %r8 - and $-32, %rdi - add $32, %rdi - mov %rdi, %r10 - sub %r8, %r10 - sub %r10, %rdx - add %r10, %rsi - lea (%rdi, %rdx), %rcx - add 
$-0x80, %rdx -L(gobble_mem_fwd_loop): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu 0x40(%rsi), %ymm2 - vmovdqu 0x60(%rsi), %ymm3 - sub $-0x80, %rsi - vmovntdq %ymm0, (%rdi) - vmovntdq %ymm1, 0x20(%rdi) - vmovntdq %ymm2, 0x40(%rdi) - vmovntdq %ymm3, 0x60(%rdi) - sub $-0x80, %rdi - add $-0x80, %rdx - jb L(gobble_mem_fwd_loop) - sfence - vmovdqu %ymm4, (%r8) - vzeroupper - vmovdqu %xmm5, -0x80(%rcx) - vmovdqu %xmm6, -0x70(%rcx) - vmovdqu %xmm7, -0x60(%rcx) - vmovdqu %xmm8, -0x50(%rcx) - vmovdqu %xmm9, -0x40(%rcx) - vmovdqu %xmm10, -0x30(%rcx) - vmovdqu %xmm11, -0x20(%rcx) - vmovdqu %xmm12, -0x10(%rcx) - ret - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(copy_backward): -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %rcx -#else - mov __x86_shared_cache_size_half(%rip), %rcx -#endif - shl $3, %rcx - vmovdqu (%rsi), %xmm5 - vmovdqu 0x10(%rsi), %xmm6 - add %rdx, %rdi - vmovdqu 0x20(%rsi), %xmm7 - vmovdqu 0x30(%rsi), %xmm8 - lea -0x20(%rdi), %r10 - mov %rdi, %r11 - vmovdqu 0x40(%rsi), %xmm9 - vmovdqu 0x50(%rsi), %xmm10 - and $0x1f, %r11 - vmovdqu 0x60(%rsi), %xmm11 - vmovdqu 0x70(%rsi), %xmm12 - xor %r11, %rdi - add %rdx, %rsi - vmovdqu -0x20(%rsi), %ymm4 - sub %r11, %rsi - sub %r11, %rdx - cmp %rcx, %rdx - ja L(gobble_big_data_bwd) - add $-0x80, %rdx -L(gobble_mem_bwd_llc): - vmovdqu -0x20(%rsi), %ymm0 - vmovdqu -0x40(%rsi), %ymm1 - vmovdqu -0x60(%rsi), %ymm2 - vmovdqu -0x80(%rsi), %ymm3 - lea -0x80(%rsi), %rsi - vmovdqa %ymm0, -0x20(%rdi) - vmovdqa %ymm1, -0x40(%rdi) - vmovdqa %ymm2, -0x60(%rdi) - vmovdqa %ymm3, -0x80(%rdi) - lea -0x80(%rdi), %rdi - add $-0x80, %rdx - jb L(gobble_mem_bwd_llc) - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, (%rax) - vmovdqu %xmm6, 0x10(%rax) - vmovdqu %xmm7, 0x20(%rax) - vmovdqu %xmm8, 0x30(%rax) - vmovdqu %xmm9, 0x40(%rax) - vmovdqu %xmm10, 0x50(%rax) - vmovdqu %xmm11, 0x60(%rax) - vmovdqu %xmm12, 0x70(%rax) - ret - - .p2align 4 -L(gobble_big_data_bwd): - 
add $-0x80, %rdx -L(gobble_mem_bwd_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - vmovdqu -0x20(%rsi), %ymm0 - vmovdqu -0x40(%rsi), %ymm1 - vmovdqu -0x60(%rsi), %ymm2 - vmovdqu -0x80(%rsi), %ymm3 - lea -0x80(%rsi), %rsi - vmovntdq %ymm0, -0x20(%rdi) - vmovntdq %ymm1, -0x40(%rdi) - vmovntdq %ymm2, -0x60(%rdi) - vmovntdq %ymm3, -0x80(%rdi) - lea -0x80(%rdi), %rdi - add $-0x80, %rdx - jb L(gobble_mem_bwd_loop) - sfence - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, (%rax) - vmovdqu %xmm6, 0x10(%rax) - vmovdqu %xmm7, 0x20(%rax) - vmovdqu %xmm8, 0x30(%rax) - vmovdqu %xmm9, 0x40(%rax) - vmovdqu %xmm10, 0x50(%rax) - vmovdqu %xmm11, 0x60(%rax) - vmovdqu %xmm12, 0x70(%rax) - ret -#endif -END (MEMCPY) -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S deleted file mode 100644 index c4509831fa..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ /dev/null @@ -1,175 +0,0 @@ -/* memcpy with unaliged loads - Copyright (C) 2013-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#if IS_IN (libc) - -#include - -#include "asm-syntax.h" - - -ENTRY(__memcpy_sse2_unaligned) - movq %rsi, %rax - leaq (%rdx,%rdx), %rcx - subq %rdi, %rax - subq %rdx, %rax - cmpq %rcx, %rax - jb L(overlapping) - cmpq $16, %rdx - jbe L(less_16) - movdqu (%rsi), %xmm8 - cmpq $32, %rdx - movdqu %xmm8, (%rdi) - movdqu -16(%rsi,%rdx), %xmm8 - movdqu %xmm8, -16(%rdi,%rdx) - ja .L31 -L(return): - movq %rdi, %rax - ret - .p2align 4,,10 - .p2align 4 -.L31: - movdqu 16(%rsi), %xmm8 - cmpq $64, %rdx - movdqu %xmm8, 16(%rdi) - movdqu -32(%rsi,%rdx), %xmm8 - movdqu %xmm8, -32(%rdi,%rdx) - jbe L(return) - movdqu 32(%rsi), %xmm8 - cmpq $128, %rdx - movdqu %xmm8, 32(%rdi) - movdqu -48(%rsi,%rdx), %xmm8 - movdqu %xmm8, -48(%rdi,%rdx) - movdqu 48(%rsi), %xmm8 - movdqu %xmm8, 48(%rdi) - movdqu -64(%rsi,%rdx), %xmm8 - movdqu %xmm8, -64(%rdi,%rdx) - jbe L(return) - leaq 64(%rdi), %rcx - addq %rdi, %rdx - andq $-64, %rdx - andq $-64, %rcx - movq %rcx, %rax - subq %rdi, %rax - addq %rax, %rsi - cmpq %rdx, %rcx - je L(return) - movq %rsi, %r10 - subq %rcx, %r10 - leaq 16(%r10), %r9 - leaq 32(%r10), %r8 - leaq 48(%r10), %rax - .p2align 4,,10 - .p2align 4 -L(loop): - movdqu (%rcx,%r10), %xmm8 - movdqa %xmm8, (%rcx) - movdqu (%rcx,%r9), %xmm8 - movdqa %xmm8, 16(%rcx) - movdqu (%rcx,%r8), %xmm8 - movdqa %xmm8, 32(%rcx) - movdqu (%rcx,%rax), %xmm8 - movdqa %xmm8, 48(%rcx) - addq $64, %rcx - cmpq %rcx, %rdx - jne L(loop) - jmp L(return) -L(overlapping): - cmpq %rsi, %rdi - jae .L3 - testq %rdx, %rdx - .p2align 4,,5 - je L(return) - movq %rdx, %r9 - leaq 16(%rsi), %rcx - leaq 16(%rdi), %r8 - shrq $4, %r9 - movq %r9, %rax - salq $4, %rax - cmpq %rcx, %rdi - setae %cl - cmpq %r8, %rsi - setae %r8b - orl %r8d, %ecx - cmpq $15, %rdx - seta %r8b - testb %r8b, %cl - je .L16 - testq %rax, %rax - je .L16 - xorl %ecx, %ecx - xorl %r8d, %r8d -.L7: - movdqu (%rsi,%rcx), %xmm8 - addq $1, %r8 - movdqu %xmm8, (%rdi,%rcx) - addq $16, %rcx - cmpq %r8, %r9 - ja .L7 - cmpq %rax, %rdx - je L(return) -.L21: - 
movzbl (%rsi,%rax), %ecx - movb %cl, (%rdi,%rax) - addq $1, %rax - cmpq %rax, %rdx - ja .L21 - jmp L(return) -L(less_16): - testb $24, %dl - jne L(between_9_16) - testb $4, %dl - .p2align 4,,5 - jne L(between_5_8) - testq %rdx, %rdx - .p2align 4,,2 - je L(return) - movzbl (%rsi), %eax - testb $2, %dl - movb %al, (%rdi) - je L(return) - movzwl -2(%rsi,%rdx), %eax - movw %ax, -2(%rdi,%rdx) - jmp L(return) -.L3: - leaq -1(%rdx), %rax - .p2align 4,,10 - .p2align 4 -.L11: - movzbl (%rsi,%rax), %edx - movb %dl, (%rdi,%rax) - subq $1, %rax - jmp .L11 -L(between_9_16): - movq (%rsi), %rax - movq %rax, (%rdi) - movq -8(%rsi,%rdx), %rax - movq %rax, -8(%rdi,%rdx) - jmp L(return) -.L16: - xorl %eax, %eax - jmp .L21 -L(between_5_8): - movl (%rsi), %eax - movl %eax, (%rdi) - movl -4(%rsi,%rdx), %eax - movl %eax, -4(%rdi,%rdx) - jmp L(return) -END(__memcpy_sse2_unaligned) - -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 5b045d7847..f6771a4696 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -19,7 +19,6 @@ . 
*/ #include -#include #include /* Define multiple versions only for the definition in lib and for @@ -30,21 +29,34 @@ ENTRY(__new_memcpy) .type __new_memcpy, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 1f + lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP + jnz 2f + lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_avx512_unaligned(%rip), %RAX_LP ret -#endif +# endif 1: lea __memcpy_avx_unaligned(%rip), %RAX_LP HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jnz 2f + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): lea __memcpy_sse2_unaligned(%rip), %RAX_LP HAS_ARCH_FEATURE (Fast_Unaligned_Copy) - jnz 2f - lea __memcpy_sse2(%rip), %RAX_LP + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): HAS_CPU_FEATURE (SSSE3) jz 2f lea __memcpy_ssse3_back(%rip), %RAX_LP @@ -54,37 +66,7 @@ ENTRY(__new_memcpy) 2: ret END(__new_memcpy) -# undef ENTRY -# define ENTRY(name) \ - .type __memcpy_sse2, @function; \ - .globl __memcpy_sse2; \ - .hidden __memcpy_sse2; \ - .p2align 4; \ - __memcpy_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memcpy_chk_sse2, @function; \ - .globl __memcpy_chk_sse2; \ - .p2align 4; \ - __memcpy_chk_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2 - -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal memcpy calls through a PLT. 
- The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2 - +# undef memcpy +# include versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); #endif - -#include "../memcpy.S" diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 648217e971..11f13104c2 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -30,24 +30,40 @@ ENTRY(__memcpy_chk) .type __memcpy_chk, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) - jz 1f + jz 1f + lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax + jnz 2f + lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP ret -#endif -1: leaq __memcpy_chk_sse2(%rip), %rax +# endif +1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __memcpy_chk_ssse3(%rip), %rax + lea __memcpy_chk_ssse3_back(%rip), %RAX_LP HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __memcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __memcpy_chk_avx_unaligned(%rip), %rax + jnz 2f + lea __memcpy_chk_ssse3(%rip), %RAX_LP 2: ret END(__memcpy_chk) # else diff --git 
a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S deleted file mode 100644 index 75e35f2957..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S +++ /dev/null @@ -1,22 +0,0 @@ -/* memmove with AVX - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_avx_unaligned -#define MEMCPY_CHK __memmove_chk_avx_unaligned -#include "memcpy-avx-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S deleted file mode 100644 index d7edb18923..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S +++ /dev/null @@ -1,13 +0,0 @@ -#if IS_IN (libc) -# define VEC_SIZE 16 -# define VEC(i) xmm##i -# define VMOVNT movntdq -/* Use movups and movaps for smaller code sizes. 
*/ -# define VMOVU movups -# define VMOVA movaps - -# define SECTION(p) p -# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s - -# include "memmove-vec-unaligned-erms.S" -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 3742c106eb..a2cce39a16 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -106,28 +106,28 @@ .section SECTION(.text),"ax",@progbits #if defined SHARED && IS_IN (libc) -ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2)) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2)) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) #endif #if VEC_SIZE == 16 || defined SHARED -ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2)) +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) movq %rdi, %rax addq %rdx, %rax jmp L(start) -END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2)) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) #endif #if defined SHARED && IS_IN (libc) -ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2)) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2)) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) #endif -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2)) +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) movq %rdi, %rax L(start): cmpq $VEC_SIZE, %rdx @@ -148,7 +148,7 @@ L(nop): #endif ret #if defined USE_MULTIARCH && IS_IN (libc) -END (MEMMOVE_SYMBOL (__memmove, unaligned_2)) +END (MEMMOVE_SYMBOL (__memmove, unaligned)) # if VEC_SIZE == 16 && defined SHARED /* Only used to measure performance of REP MOVSB. 
*/ @@ -539,11 +539,11 @@ strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) # endif -strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2), - MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2)) +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) # endif #endif #if VEC_SIZE == 16 || defined SHARED -strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2), - MEMCPY_SYMBOL (__memcpy, unaligned_2)) +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) #endif diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S new file mode 100644 index 0000000000..25c3586ee9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove.S @@ -0,0 +1,98 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO.
*/ +#if IS_IN (libc) + .text +ENTRY(__libc_memmove) + .type __libc_memmove, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +# ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_avx512_unaligned(%rip), %RAX_LP + ret +# endif +1: lea __memmove_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_ssse3(%rip), %RAX_LP +2: ret +END(__libc_memmove) +#endif + +#if IS_IN (libc) +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +libc_hidden_ver (__memmove_sse2_unaligned, memmove) +libc_hidden_ver (__memcpy_sse2_unaligned, memcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy) + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memmove calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def +# endif +strong_alias (__libc_memmove, memmove) +#endif + +#if !defined SHARED || !IS_IN (libc) +weak_alias (__mempcpy, mempcpy) +#endif + +#include "../memmove.S" + +#if defined SHARED && IS_IN (libc) +# include <shlib-compat.h> +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses.
*/ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c deleted file mode 100644 index 8da5640bb0..0000000000 --- a/sysdeps/x86_64/multiarch/memmove.c +++ /dev/null @@ -1,73 +0,0 @@ -/* Multiple versions of memmove. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) -# define MEMMOVE __memmove_sse2 -# ifdef SHARED -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2); -# endif - -/* Redefine memmove so that the compiler won't complain about the type - mismatch with the IFUNC selector in strong_alias, below.
*/ -# undef memmove -# define memmove __redirect_memmove -# include <string.h> -# undef memmove - -extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; -# ifdef HAVE_AVX512_ASM_SUPPORT - extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden; -# endif - -#endif - -#include "string/memmove.c" - -#if IS_IN (libc) -# include <shlib-compat.h> -# include "init-arch.h" - -/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle - ifunc symbol properly. */ -extern __typeof (__redirect_memmove) __libc_memmove; -libc_ifunc (__libc_memmove, -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - ? __memmove_avx512_no_vzeroupper - : -#endif - (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - ? __memmove_avx_unaligned - : (HAS_CPU_FEATURE (SSSE3) - ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) - ? __memmove_ssse3_back : __memmove_ssse3) - : __memmove_sse2))); - -strong_alias (__libc_memmove, memmove) - -# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) -compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/memmove_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S new file mode 100644 index 0000000000..cd639b8862 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove_chk.S @@ -0,0 +1,71 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memmove functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +# ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP + ret +# endif +1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memmove_chk) +# else +# include "../memmove_chk.S" +# endif +#endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c deleted file mode 100644 index f64da63180..0000000000 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ /dev/null @@ -1,46 +0,0 @@ -/* Multiple versions of __memmove_chk. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <string.h> -#include "init-arch.h" - -#define MEMMOVE_CHK __memmove_chk_sse2 - -extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; -# ifdef HAVE_AVX512_ASM_SUPPORT - extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden; -# endif - -#include "debug/memmove_chk.c" - -libc_ifunc (__memmove_chk, -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - ? __memmove_chk_avx512_no_vzeroupper - : -#endif - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned : - (HAS_CPU_FEATURE (SSSE3) - ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) - ?
__memmove_chk_ssse3_back : __memmove_chk_ssse3) - : __memmove_chk_sse2)); diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index ed78623565..f9c6df301c 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -25,62 +25,46 @@ DSO. In static binaries we need mempcpy before the initialization happened. */ #if defined SHARED && IS_IN (libc) + .text ENTRY(__mempcpy) .type __mempcpy, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 1f + lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax + jnz 2f + lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_avx512_unaligned(%rip), %RAX_LP ret -#endif -1: leaq __mempcpy_sse2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) +# endif +1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) jz 2f - leaq __mempcpy_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) + lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) jz 2f - leaq __mempcpy_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __mempcpy_avx_unaligned(%rip), %rax + lea __mempcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_ssse3(%rip), %RAX_LP 2: ret END(__mempcpy) -# undef ENTRY -# define ENTRY(name) \ - .type __mempcpy_sse2, @function; \ - .p2align 4; \ - .globl __mempcpy_sse2; \ - .hidden __mempcpy_sse2; \ - __mempcpy_sse2: cfi_startproc; \ - CALL_MCOUNT -# 
undef END -# define END(name) \ - cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __mempcpy_chk_sse2, @function; \ - .globl __mempcpy_chk_sse2; \ - .p2align 4; \ - __mempcpy_chk_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2 - -# undef libc_hidden_def -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal mempcpy calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_def(name) \ - .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2 -# define libc_hidden_builtin_def(name) \ - .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2 +weak_alias (__mempcpy, mempcpy) #endif - -#include "../mempcpy.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index 6e8a89d38c..80f460fd01 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -30,24 +30,40 @@ ENTRY(__mempcpy_chk) .type __mempcpy_chk, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 1f + lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax + jnz 2f + lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP ret -#endif -1: leaq __mempcpy_chk_sse2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) +# endif +1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) jz 2f - leaq __mempcpy_chk_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) + lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + 
ret +L(Fast_Unaligned_Load): + lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) jz 2f - leaq __mempcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __mempcpy_chk_avx_unaligned(%rip), %rax + lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_chk_ssse3(%rip), %RAX_LP 2: ret END(__mempcpy_chk) # else -- cgit 1.4.1