diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 3 |
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 |
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-avx2.S | 168 |
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 20 |
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 |
-rw-r--r-- | sysdeps/x86_64/multiarch/memset.S | 34 |
-rw-r--r-- | sysdeps/x86_64/multiarch/memset_chk.S | 20 |
7 files changed, 39 insertions, 217 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d305145bf0..d78e667566 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -18,12 +18,11 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ - strcspn-c strpbrk-c strspn-c varshift memset-avx2 \ + strcspn-c strpbrk-c strspn-c varshift \ memset-avx512-no-vzeroupper \ memmove-sse2-unaligned-erms \ memmove-avx-unaligned-erms \ memmove-avx512-unaligned-erms \ - memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 1e880f6edc..ca05ff6ebf 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -117,16 +117,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ IFUNC_IMPL (i, name, __memset_chk, IFUNC_IMPL_ADD (array, i, __memset_chk, 1, - __memset_chk_sse2) - IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_chk_avx2) - IFUNC_IMPL_ADD (array, i, __memset_chk, - HAS_ARCH_FEATURE (AVX2_Usable), __memset_chk_avx2_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), @@ -146,7 +141,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memset.S. 
*/ IFUNC_IMPL (i, name, memset, - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memset, 1, @@ -154,9 +148,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_avx2) - IFUNC_IMPL_ADD (array, i, memset, - HAS_ARCH_FEATURE (AVX2_Usable), __memset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S deleted file mode 100644 index df634728d4..0000000000 --- a/sysdeps/x86_64/multiarch/memset-avx2.S +++ /dev/null @@ -1,168 +0,0 @@ -/* memset with AVX2 - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" -#ifndef MEMSET -# define MEMSET __memset_avx2 -# define MEMSET_CHK __memset_chk_avx2 -#endif - - .section .text.avx2,"ax",@progbits -#if defined PIC -ENTRY (MEMSET_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMSET_CHK) -#endif - -ENTRY (MEMSET) - vpxor %xmm0, %xmm0, %xmm0 - vmovd %esi, %xmm1 - lea (%rdi, %rdx), %rsi - mov %rdi, %rax - vpshufb %xmm0, %xmm1, %xmm0 - cmp $16, %rdx - jb L(less_16bytes) - cmp $256, %rdx - jae L(256bytesormore) - cmp $128, %dl - jb L(less_128bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, 0x20(%rdi) - vmovdqu %xmm0, 0x30(%rdi) - vmovdqu %xmm0, 0x40(%rdi) - vmovdqu %xmm0, 0x50(%rdi) - vmovdqu %xmm0, 0x60(%rdi) - vmovdqu %xmm0, 0x70(%rdi) - vmovdqu %xmm0, -0x80(%rsi) - vmovdqu %xmm0, -0x70(%rsi) - vmovdqu %xmm0, -0x60(%rsi) - vmovdqu %xmm0, -0x50(%rsi) - vmovdqu %xmm0, -0x40(%rsi) - vmovdqu %xmm0, -0x30(%rsi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, 0x20(%rdi) - vmovdqu %xmm0, 0x30(%rdi) - vmovdqu %xmm0, -0x40(%rsi) - vmovdqu %xmm0, -0x30(%rsi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_32bytes): - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - vmovq %xmm0, (%rdi) - vmovq %xmm0, -0x08(%rsi) - ret - - .p2align 4 -L(less_8bytes): - vmovd %xmm0, %ecx - cmp $4, %dl - jb L(less_4bytes) - mov %ecx, (%rdi) - mov %ecx, -0x04(%rsi) - ret - - .p2align 4 -L(less_4bytes): - cmp $2, %dl - jb L(less_2bytes) - mov %cx, (%rdi) - mov %cx, -0x02(%rsi) - ret - - .p2align 4 
-L(less_2bytes): - cmp $1, %dl - jb L(less_1bytes) - mov %cl, (%rdi) -L(less_1bytes): - ret - - .p2align 4 -L(256bytesormore): - vinserti128 $1, %xmm0, %ymm0, %ymm0 - and $-0x20, %rdi - add $0x20, %rdi - vmovdqu %ymm0, (%rax) - sub %rdi, %rax - lea -0x80(%rax, %rdx), %rcx - cmp $4096, %rcx - ja L(gobble_data) -L(gobble_128_loop): - vmovdqa %ymm0, (%rdi) - vmovdqa %ymm0, 0x20(%rdi) - vmovdqa %ymm0, 0x40(%rdi) - vmovdqa %ymm0, 0x60(%rdi) - sub $-0x80, %rdi - add $-0x80, %ecx - jb L(gobble_128_loop) - mov %rsi, %rax - vmovdqu %ymm0, -0x80(%rsi) - vmovdqu %ymm0, -0x60(%rsi) - vmovdqu %ymm0, -0x40(%rsi) - vmovdqu %ymm0, -0x20(%rsi) - sub %rdx, %rax - vzeroupper - ret - - .p2align 4 -L(gobble_data): - sub $-0x80, %rcx - vmovd %xmm0, %eax - rep stosb - mov %rsi, %rax - sub %rdx, %rax - vzeroupper - ret - -END (MEMSET) -#endif diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S deleted file mode 100644 index 4bf3d36428..0000000000 --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +++ /dev/null @@ -1,20 +0,0 @@ -#if IS_IN (libc) -# define VEC_SIZE 16 -# define VEC(i) xmm##i -/* Don't use movups and movaps since it will get larger nop paddings - for alignment. 
*/ -# define VMOVU movdqu -# define VMOVA movdqa - -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movd d, %xmm0; \ - movq r, %rax; \ - punpcklbw %xmm0, %xmm0; \ - punpcklwd %xmm0, %xmm0; \ - pshufd $0, %xmm0, %xmm0 - -# define SECTION(p) p -# define MEMSET_SYMBOL(p,s) p##_sse2_##s - -# include "memset-vec-unaligned-erms.S" -#endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index b1df228413..28e71fd576 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -69,7 +69,7 @@ #endif .section SECTION(.text),"ax",@progbits -#if VEC_SIZE == 16 && IS_IN (libc) && 0 +#if VEC_SIZE == 16 && IS_IN (libc) ENTRY (__bzero) movq %rdi, %rax /* Set return value. */ movq %rsi, %rdx /* Set n. */ diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index 8e3b9b9764..4e52d8f8c4 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -26,35 +26,43 @@ ENTRY(memset) .type memset, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX - leaq __memset_sse2(%rip), %rax + lea __memset_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_sse2_unaligned(%rip), %RAX_LP +1: HAS_ARCH_FEATURE (AVX2_Usable) jz 2f - leaq __memset_avx2(%rip), %rax -#ifdef HAVE_AVX512_ASM_SUPPORT + lea __memset_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): +# ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 2f + lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 2f - leaq __memset_avx512_no_vzeroupper(%rip), %rax -#endif + jnz 2f + lea __memset_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_avx512_unaligned(%rip), %RAX_LP +# endif 2: ret END(memset) #endif #if IS_IN (libc) -# undef memset -# define 
memset __memset_sse2 - -# undef __memset_chk -# define __memset_chk __memset_chk_sse2 +# define MEMSET_SYMBOL(p,s) p##_sse2_##s # ifdef SHARED # undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. - The speedup we get from using GPR instruction is likely eaten away + The speedup we get from using SSE2 instructions is likely eaten away by the indirect call in the PLT. */ # define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2 + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned # endif # undef strong_alias diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S index 9a7b270274..8517cfc073 100644 --- a/sysdeps/x86_64/multiarch/memset_chk.S +++ b/sysdeps/x86_64/multiarch/memset_chk.S @@ -26,16 +26,28 @@ ENTRY(__memset_chk) .type __memset_chk, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX - leaq __memset_chk_sse2(%rip), %rax + lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_chk_sse2_unaligned(%rip), %RAX_LP +1: HAS_ARCH_FEATURE (AVX2_Usable) jz 2f - leaq __memset_chk_avx2(%rip), %rax + lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_chk_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): #ifdef HAVE_AVX512_ASM_SUPPORT HAS_ARCH_FEATURE (AVX512F_Usable) jz 2f + lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 2f - leaq __memset_chk_avx512_no_vzeroupper(%rip), %rax + jnz 2f + lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_chk_avx512_unaligned(%rip), %RAX_LP #endif 2: ret END(__memset_chk) |