From e05a252da92a4dd15d4be40a855d31bd864804e9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 28 Aug 2015 05:40:35 -0700 Subject: Correct x86-64 memcpy/mempcpy multiarch selector For x86-64 memcpy/mempcpy, we choose the best implementation by the order: 1. __memcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set. 2. __memcpy_sse2_unaligned if Fast_Unaligned_Load bit is set. 3. __memcpy_sse2 if SSSE3 isn't available. 4. __memcpy_ssse3_back if Fast_Copy_Backward bit it set. 5. __memcpy_ssse3 In libc.a and ld.so, we choose __memcpy_sse2_unaligned which is optimized for current Intel and AMD x86-64 processors. [BZ #18880] * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Moved to ... * sysdeps/x86_64/memcpy.S: Here. Remove "#if !IS_IN (libc)". Add libc_hidden_builtin_def and versioned_symbol. (__memcpy_chk): New. (__memcpy_sse2_unaligned): Renamed to ... (memcpy): This. Support USE_AS_MEMPCPY. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add mempcpy-sse2. * sysdeps/x86_64/memcpy.S: Moved to ... sysdeps/x86_64/multiarch/memcpy-sse2.S: Here. (__memcpy_chk): Renamed to ... (__memcpy_chk_sse2): This. (memcpy): Renamed to ... (__memcpy_sse2): This. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Properly select the best implementation. (ENTRY): Replace __memcpy_sse2 with __memcpy_sse2_unaligned. (END): Likewise. (libc_hidden_builtin_def): Likewise. (ENTRY_CHK): Replace __memcpy_chk_sse2 with __memcpy_chk_sse2_unaligned. (END_CHK): Likewise. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Properly select the best implementation. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Properly select the best implementation. (ENTRY): Replace __mempcpy_sse2 with __mempcpy_sse2_unaligned. (END): Likewise. (libc_hidden_def): Likewise. (libc_hidden_builtin_def): Likewise. (ENTRY_CHK): Replace __mempcpy_chk_sse2 with __mempcpy_chk_sse2_unaligned. (END_CHK): Likewise. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Properly select the best implementation. --- sysdeps/x86_64/memcpy.S | 692 +++++------------------ sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 175 ------ sysdeps/x86_64/multiarch/memcpy-sse2.S | 569 +++++++++++++++++++ sysdeps/x86_64/multiarch/memcpy.S | 37 +- sysdeps/x86_64/multiarch/memcpy_chk.S | 17 +- sysdeps/x86_64/multiarch/mempcpy-sse2.S | 4 + sysdeps/x86_64/multiarch/mempcpy.S | 39 +- sysdeps/x86_64/multiarch/mempcpy_chk.S | 17 +- 9 files changed, 789 insertions(+), 763 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S create mode 100644 sysdeps/x86_64/multiarch/memcpy-sse2.S create mode 100644 sysdeps/x86_64/multiarch/mempcpy-sse2.S diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S index eea8c2a5af..00c17a4287 100644 --- a/sysdeps/x86_64/memcpy.S +++ b/sysdeps/x86_64/memcpy.S @@ -1,9 +1,5 @@ -/* - Optimized memcpy for x86-64. - - Copyright (C) 2007-2015 Free Software Foundation, Inc. - Contributed by Evandro Menezes , 2007. - +/* Optimized memcpy for x86-64 + Copyright (C) 2013-2015 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,33 +14,20 @@ You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see - . -*/ + . */ #include -#include "asm-syntax.h" -/* Stack slots in the red-zone. 
*/ +#include "asm-syntax.h" -#ifdef USE_AS_MEMPCPY -# define RETVAL (0) -#else -# define RETVAL (-8) -# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc) -# define memcpy __memcpy -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ +#if defined SHARED && IS_IN (libc) +# if !defined USE_AS_MEMPCPY && !defined USE_MULTIARCH +# define memcpy __memcpy +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ .globl __GI_memcpy; __GI_memcpy = __memcpy -# endif -#endif -#define SAVE0 (RETVAL - 8) -#define SAVE1 (SAVE0 - 8) -#define SAVE2 (SAVE1 - 8) -#define SAVE3 (SAVE2 - 8) - - .text +# endif -#if defined PIC && IS_IN (libc) ENTRY_CHK (__memcpy_chk) cmpq %rdx, %rcx @@ -53,525 +36,160 @@ ENTRY_CHK (__memcpy_chk) END_CHK (__memcpy_chk) #endif -ENTRY(memcpy) /* (void *, const void*, size_t) */ - -/* Handle tiny blocks. */ - -L(1try): /* up to 32B */ - cmpq $32, %rdx -#ifndef USE_AS_MEMPCPY - movq %rdi, %rax /* save return value */ -#endif - jae L(1after) - -L(1): /* 1-byte once */ - testb $1, %dl - jz L(1a) - - movzbl (%rsi), %ecx - movb %cl, (%rdi) - - incq %rsi - incq %rdi - - .p2align 4,, 4 - -L(1a): /* 2-byte once */ - testb $2, %dl - jz L(1b) - - movzwl (%rsi), %ecx - movw %cx, (%rdi) - - addq $2, %rsi - addq $2, %rdi - - .p2align 4,, 4 - -L(1b): /* 4-byte once */ - testb $4, %dl - jz L(1c) - - movl (%rsi), %ecx - movl %ecx, (%rdi) - - addq $4, %rsi - addq $4, %rdi - - .p2align 4,, 4 - -L(1c): /* 8-byte once */ - testb $8, %dl - jz L(1d) - - movq (%rsi), %rcx - movq %rcx, (%rdi) - - addq $8, %rsi - addq $8, %rdi - - .p2align 4,, 4 - -L(1d): /* 16-byte loop */ - andl $0xf0, %edx - jz L(exit) - - .p2align 4 - -L(1loop): - movq (%rsi), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - - subl $16, %edx - - leaq 16(%rsi), %rsi - leaq 16(%rdi), %rdi - - jnz L(1loop) - - .p2align 4,, 4 - -L(exit): /* exit */ +ENTRY(memcpy) + movq %rsi, %rax #ifdef USE_AS_MEMPCPY - movq %rdi, %rax /* return value */ -#else - rep + leaq (%rdi,%rdx), %r11 #endif - retq - - .p2align 4 - -L(1after): -#ifndef USE_AS_MEMPCPY - movq %rax, RETVAL(%rsp) /* save return value */ -#endif - -/* Align to the natural word size. */ - -L(aligntry): - movl %esi, %ecx /* align by source */ - - andl $7, %ecx - jz L(alignafter) /* already aligned */ - -L(align): /* align */ - leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */ - subl $8, %ecx - - .p2align 4 - -L(alignloop): /* 1-byte alignment loop */ - movzbl (%rsi), %eax - movb %al, (%rdi) - - incl %ecx - - leaq 1(%rsi), %rsi - leaq 1(%rdi), %rdi - - jnz L(alignloop) - - .p2align 4 - -L(alignafter): - -/* Handle mid-sized blocks. 
*/ - -L(32try): /* up to 1KB */ - cmpq $1024, %rdx - ja L(32after) - -L(32): /* 32-byte loop */ - movl %edx, %ecx - shrl $5, %ecx - jz L(32skip) - - .p2align 4 - -L(32loop): - decl %ecx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - - movq %rax, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - - leaq 32(%rsi), %rsi - leaq 32(%rdi), %rdi - - jz L(32skip) /* help out smaller blocks */ - - decl %ecx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - - movq %rax, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - - leaq 32(%rsi), %rsi - leaq 32(%rdi), %rdi - - jnz L(32loop) - - .p2align 4 - -L(32skip): - andl $31, %edx /* check for left overs */ + leaq (%rdx,%rdx), %rcx + subq %rdi, %rax + subq %rdx, %rax + cmpq %rcx, %rax + jb L(overlapping) + cmpq $16, %rdx + jbe L(less_16) + movdqu (%rsi), %xmm8 + cmpq $32, %rdx + movdqu %xmm8, (%rdi) + movdqu -16(%rsi,%rdx), %xmm8 + movdqu %xmm8, -16(%rdi,%rdx) + ja .L31 +L(return): #ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - - .p2align 4 - -L(32after): - -/* - In order to minimize code-size in RTLD, algorithms specific for - larger blocks are excluded when building for RTLD. -*/ - -/* Handle blocks smaller than 1/2 L1. */ - -L(fasttry): /* first 1/2 L1 */ -#if IS_IN (libc) /* only up to this algorithm outside of libc.so */ - mov __x86_data_cache_size_half(%rip), %R11_LP - cmpq %rdx, %r11 /* calculate the smaller of */ - cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ -#endif - -L(fast): /* good ol' MOVS */ -#if IS_IN (libc) - movq %r11, %rcx - andq $-8, %r11 + movq %r11, %rax #else - movq %rdx, %rcx -#endif - shrq $3, %rcx - jz L(fastskip) - - rep - movsq - - .p2align 4,, 4 - -L(fastskip): -#if IS_IN (libc) - subq %r11, %rdx /* check for more */ - testq $-8, %rdx - jnz L(fastafter) -#endif - - andl $7, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep #endif - retq /* exit */ - -#if IS_IN (libc) /* none of the algorithms below for RTLD */ - - .p2align 4 - -L(fastafter): - -/* Handle large blocks smaller than 1/2 L2. */ - -L(pretry): /* first 1/2 L2 */ - mov __x86_shared_cache_size_half (%rip), %R8_LP - cmpq %rdx, %r8 /* calculate the lesser of */ - cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */ - -L(pre): /* 64-byte with prefetching */ - movq %r8, %rcx - andq $-64, %r8 - shrq $6, %rcx - jz L(preskip) - - movq %r14, SAVE0(%rsp) - cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1(%rsp) - cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2(%rsp) - cfi_rel_offset (%r12, SAVE2) - movq %rbx, SAVE3(%rsp) - cfi_rel_offset (%rbx, SAVE3) - - cmpl $0, __x86_prefetchw(%rip) - jz L(preloop) /* check if PREFETCHW OK */ - + ret + .p2align 4,,10 .p2align 4 - -/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). 
*/ - -L(prewloop): /* cache-line in state M */ - decq %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - prefetcht0 0 + 896 (%rsi) - prefetcht0 64 + 896 (%rsi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jz L(prebail) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - prefetchw 896 - 64(%rdi) - prefetchw 896 - 0(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz L(prewloop) - jmp L(prebail) - +.L31: + movdqu 16(%rsi), %xmm8 + cmpq $64, %rdx + movdqu %xmm8, 16(%rdi) + movdqu -32(%rsi,%rdx), %xmm8 + movdqu %xmm8, -32(%rdi,%rdx) + jbe L(return) + movdqu 32(%rsi), %xmm8 + cmpq $128, %rdx + movdqu %xmm8, 32(%rdi) + movdqu -48(%rsi,%rdx), %xmm8 + movdqu %xmm8, -48(%rdi,%rdx) + movdqu 48(%rsi), %xmm8 + movdqu %xmm8, 48(%rdi) + movdqu -64(%rsi,%rdx), %xmm8 + movdqu %xmm8, -64(%rdi,%rdx) + jbe L(return) + leaq 64(%rdi), %rcx + addq %rdi, %rdx + andq $-64, %rdx + andq $-64, %rcx + movq %rcx, %rax + subq %rdi, %rax + addq %rax, %rsi + cmpq %rdx, %rcx + je L(return) + movq %rsi, %r10 + subq %rcx, %r10 + leaq 16(%r10), %r9 + leaq 32(%r10), %r8 + leaq 48(%r10), %rax + .p2align 4,,10 .p2align 4 - -/* ... when PREFETCHW is not available. 
*/ - -L(preloop): /* cache-line in state E */ - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - prefetcht0 896 + 0(%rsi) - prefetcht0 896 + 64(%rsi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi - - jz L(prebail) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - prefetcht0 896 - 64(%rdi) - prefetcht0 896 - 0(%rdi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz L(preloop) - -L(prebail): - movq SAVE3(%rsp), %rbx - cfi_restore (%rbx) - movq SAVE2(%rsp), %r12 - cfi_restore (%r12) - movq SAVE1(%rsp), %r13 - cfi_restore (%r13) - movq SAVE0(%rsp), %r14 - cfi_restore (%r14) - -/* .p2align 4 */ - -L(preskip): - subq %r8, %rdx /* check for more */ - testq $-64, %rdx - jnz L(preafter) - - andl $63, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - +L(loop): + movdqu (%rcx,%r10), %xmm8 + movdqa %xmm8, (%rcx) + movdqu (%rcx,%r9), %xmm8 + movdqa %xmm8, 16(%rcx) + movdqu (%rcx,%r8), %xmm8 + movdqa %xmm8, 32(%rcx) + movdqu (%rcx,%rax), %xmm8 + movdqa %xmm8, 48(%rcx) + addq $64, %rcx + cmpq %rcx, %rdx + jne L(loop) + jmp L(return) +L(overlapping): + cmpq %rsi, %rdi + jae .L3 + testq %rdx, %rdx + .p2align 4,,5 + je L(return) + movq %rdx, %r9 + leaq 16(%rsi), %rcx + leaq 16(%rdi), %r8 + shrq $4, %r9 + movq %r9, %rax + salq $4, %rax + cmpq %rcx, %rdi + setae %cl + cmpq %r8, %rsi + setae %r8b + orl %r8d, %ecx + cmpq $15, %rdx + seta %r8b + testb %r8b, %cl + je .L16 + testq %rax, %rax + je .L16 + xorl %ecx, %ecx + xorl %r8d, %r8d +.L7: + movdqu (%rsi,%rcx), %xmm8 + addq $1, %r8 + movdqu %xmm8, (%rdi,%rcx) + addq $16, %rcx + cmpq %r8, %r9 + ja .L7 + cmpq %rax, %rdx + je L(return) +.L21: + movzbl (%rsi,%rax), %ecx + movb %cl, (%rdi,%rax) + addq $1, %rax + cmpq %rax, %rdx + ja .L21 + jmp L(return) +L(less_16): + testb $24, %dl + jne L(between_9_16) + testb $4, %dl + .p2align 4,,5 + jne L(between_5_8) + testq %rdx, %rdx + .p2align 4,,2 + je L(return) + movzbl (%rsi), %eax + testb $2, %dl + movb %al, (%rdi) + je L(return) + movzwl -2(%rsi,%rdx), %eax + movw %ax, -2(%rdi,%rdx) + jmp L(return) +.L3: + leaq -1(%rdx), %rax + .p2align 4,,10 .p2align 4 - -L(preafter): - -/* Handle huge blocks. 
*/ - -L(NTtry): - -L(NT): /* non-temporal 128-byte */ - movq %rdx, %rcx - shrq $7, %rcx - jz L(NTskip) - - movq %r14, SAVE0(%rsp) - cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1(%rsp) - cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2(%rsp) - cfi_rel_offset (%r12, SAVE2) - - .p2align 4 - -L(NTloop): - prefetchnta 768(%rsi) - prefetchnta 832(%rsi) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - movntiq %rax, (%rdi) - movntiq %r8, 8(%rdi) - movntiq %r9, 16(%rdi) - movntiq %r10, 24(%rdi) - movntiq %r11, 32(%rdi) - movntiq %r12, 40(%rdi) - movntiq %r13, 48(%rdi) - movntiq %r14, 56(%rdi) - - movq 64(%rsi), %rax - movq 72(%rsi), %r8 - movq 80(%rsi), %r9 - movq 88(%rsi), %r10 - movq 96(%rsi), %r11 - movq 104(%rsi), %r12 - movq 112(%rsi), %r13 - movq 120(%rsi), %r14 - - movntiq %rax, 64(%rdi) - movntiq %r8, 72(%rdi) - movntiq %r9, 80(%rdi) - movntiq %r10, 88(%rdi) - movntiq %r11, 96(%rdi) - movntiq %r12, 104(%rdi) - movntiq %r13, 112(%rdi) - movntiq %r14, 120(%rdi) - - leaq 128(%rsi), %rsi - leaq 128(%rdi), %rdi - - jnz L(NTloop) - - sfence /* serialize memory stores */ - - movq SAVE2(%rsp), %r12 - cfi_restore (%r12) - movq SAVE1(%rsp), %r13 - cfi_restore (%r13) - movq SAVE0(%rsp), %r14 - cfi_restore (%r14) - -L(NTskip): - andl $127, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - -#endif /* IS_IN (libc) */ - +.L11: + movzbl (%rsi,%rax), %edx + movb %dl, (%rdi,%rax) + subq $1, %rax + jmp .L11 +L(between_9_16): + movq (%rsi), %rax + movq %rax, (%rdi) + movq -8(%rsi,%rdx), %rax + movq %rax, -8(%rdi,%rdx) + jmp L(return) +.L16: + xorl %eax, %eax + jmp .L21 +L(between_5_8): + movl (%rsi), %eax + movl %eax, (%rdi) + movl -4(%rsi,%rdx), %eax + movl %eax, -4(%rdi,%rdx) + jmp L(return) END(memcpy) #ifndef USE_AS_MEMPCPY diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d10b4d4fb3..917e1a0c79 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -8,7 +8,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ memcmp-sse4 memcpy-ssse3 \ - memcpy-sse2-unaligned mempcpy-ssse3 \ + memcpy-sse2 mempcpy-sse2 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ memmove-ssse3-back strcasecmp_l-ssse3 \ diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S deleted file mode 100644 index 5693ba7395..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ /dev/null @@ -1,175 +0,0 @@ -/* memcpy with unaliged loads - Copyright (C) 2013-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -#include - -#include "asm-syntax.h" - - -ENTRY(__memcpy_sse2_unaligned) - movq %rsi, %rax - leaq (%rdx,%rdx), %rcx - subq %rdi, %rax - subq %rdx, %rax - cmpq %rcx, %rax - jb L(overlapping) - cmpq $16, %rdx - jbe L(less_16) - movdqu (%rsi), %xmm8 - cmpq $32, %rdx - movdqu %xmm8, (%rdi) - movdqu -16(%rsi,%rdx), %xmm8 - movdqu %xmm8, -16(%rdi,%rdx) - ja .L31 -L(return): - movq %rdi, %rax - ret - .p2align 4,,10 - .p2align 4 -.L31: - movdqu 16(%rsi), %xmm8 - cmpq $64, %rdx - movdqu %xmm8, 16(%rdi) - movdqu -32(%rsi,%rdx), %xmm8 - movdqu %xmm8, -32(%rdi,%rdx) - jbe L(return) - movdqu 32(%rsi), %xmm8 - cmpq $128, %rdx - movdqu %xmm8, 32(%rdi) - movdqu -48(%rsi,%rdx), %xmm8 - movdqu %xmm8, -48(%rdi,%rdx) - movdqu 48(%rsi), %xmm8 - movdqu %xmm8, 48(%rdi) - movdqu -64(%rsi,%rdx), %xmm8 - movdqu %xmm8, -64(%rdi,%rdx) - jbe L(return) - leaq 64(%rdi), %rcx - addq %rdi, %rdx - andq $-64, %rdx - andq $-64, %rcx - movq %rcx, %rax - subq %rdi, %rax - addq %rax, %rsi - cmpq %rdx, %rcx - je L(return) - movq %rsi, %r10 - subq %rcx, %r10 - leaq 16(%r10), %r9 - leaq 32(%r10), %r8 - leaq 48(%r10), %rax - .p2align 4,,10 - .p2align 4 -L(loop): - movdqu (%rcx,%r10), %xmm8 - movdqa %xmm8, (%rcx) - movdqu (%rcx,%r9), %xmm8 - movdqa %xmm8, 16(%rcx) - movdqu (%rcx,%r8), %xmm8 - movdqa %xmm8, 32(%rcx) - movdqu (%rcx,%rax), %xmm8 - movdqa %xmm8, 48(%rcx) - addq $64, %rcx - cmpq %rcx, %rdx - jne L(loop) - jmp L(return) -L(overlapping): - cmpq %rsi, %rdi - jae .L3 - testq %rdx, %rdx - .p2align 4,,5 - je L(return) - movq %rdx, %r9 - leaq 16(%rsi), %rcx - leaq 16(%rdi), %r8 - shrq $4, %r9 - movq %r9, %rax - salq $4, %rax - cmpq %rcx, %rdi - setae %cl - cmpq %r8, %rsi - setae %r8b - orl %r8d, %ecx - cmpq $15, %rdx - seta %r8b - testb %r8b, %cl - je .L16 - testq %rax, %rax - je .L16 - xorl %ecx, %ecx - xorl %r8d, %r8d -.L7: - movdqu (%rsi,%rcx), %xmm8 - addq $1, %r8 - movdqu %xmm8, (%rdi,%rcx) - addq $16, %rcx - cmpq %r8, %r9 - ja .L7 - cmpq %rax, %rdx - je L(return) -.L21: - movzbl (%rsi,%rax), %ecx - movb %cl, (%rdi,%rax) - addq $1, %rax - cmpq %rax, %rdx - ja .L21 - jmp L(return) -L(less_16): - testb $24, %dl - jne L(between_9_16) - testb $4, %dl - .p2align 4,,5 - jne L(between_5_8) - testq %rdx, %rdx - .p2align 4,,2 - je L(return) - movzbl (%rsi), %eax - testb $2, %dl - movb %al, (%rdi) - je L(return) - movzwl -2(%rsi,%rdx), %eax - movw %ax, -2(%rdi,%rdx) - jmp L(return) -.L3: - leaq -1(%rdx), %rax - .p2align 4,,10 - .p2align 4 -.L11: - movzbl (%rsi,%rax), %edx - movb %dl, (%rdi,%rax) - subq $1, %rax - jmp .L11 -L(between_9_16): - movq (%rsi), %rax - movq %rax, (%rdi) - movq -8(%rsi,%rdx), %rax - movq %rax, -8(%rdi,%rdx) - jmp L(return) -.L16: - xorl %eax, %eax - jmp .L21 -L(between_5_8): - movl (%rsi), %eax - movl %eax, (%rdi) - movl -4(%rsi,%rdx), %eax - movl %eax, -4(%rdi,%rdx) - jmp L(return) -END(__memcpy_sse2_unaligned) - -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2.S b/sysdeps/x86_64/multiarch/memcpy-sse2.S new file mode 100644 index 0000000000..9585b1fd03 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy-sse2.S @@ -0,0 +1,569 @@ +/* + Optimized memcpy for x86-64. + + Copyright (C) 2007-2015 Free Software Foundation, Inc. + Contributed by Evandro Menezes , 2007. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . +*/ + +#include +#include "asm-syntax.h" + +/* Stack slots in the red-zone. */ + +#ifdef USE_AS_MEMPCPY +# define RETVAL (0) +#else +# define RETVAL (-8) +#endif +#define SAVE0 (RETVAL - 8) +#define SAVE1 (SAVE0 - 8) +#define SAVE2 (SAVE1 - 8) +#define SAVE3 (SAVE2 - 8) + + .text + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (__memcpy_chk_sse2) + + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) + +END_CHK (__memcpy_chk_sse2) +#endif + +ENTRY(__memcpy_sse2) /* (void *, const void*, size_t) */ + +/* Handle tiny blocks. */ + +L(1try): /* up to 32B */ + cmpq $32, %rdx +#ifndef USE_AS_MEMPCPY + movq %rdi, %rax /* save return value */ +#endif + jae L(1after) + +L(1): /* 1-byte once */ + testb $1, %dl + jz L(1a) + + movzbl (%rsi), %ecx + movb %cl, (%rdi) + + incq %rsi + incq %rdi + + .p2align 4,, 4 + +L(1a): /* 2-byte once */ + testb $2, %dl + jz L(1b) + + movzwl (%rsi), %ecx + movw %cx, (%rdi) + + addq $2, %rsi + addq $2, %rdi + + .p2align 4,, 4 + +L(1b): /* 4-byte once */ + testb $4, %dl + jz L(1c) + + movl (%rsi), %ecx + movl %ecx, (%rdi) + + addq $4, %rsi + addq $4, %rdi + + .p2align 4,, 4 + +L(1c): /* 8-byte once */ + testb $8, %dl + jz L(1d) + + movq (%rsi), %rcx + movq %rcx, (%rdi) + + addq $8, %rsi + addq $8, %rdi + + .p2align 4,, 4 + +L(1d): /* 16-byte loop */ + andl $0xf0, %edx + jz L(exit) + + .p2align 4 + +L(1loop): + movq (%rsi), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + + subl $16, %edx + + leaq 16(%rsi), %rsi + leaq 16(%rdi), %rdi + + jnz L(1loop) + + .p2align 4,, 4 + +L(exit): /* exit */ +#ifdef USE_AS_MEMPCPY + movq %rdi, %rax /* return value */ +#else + rep +#endif + retq + + .p2align 4 + +L(1after): +#ifndef USE_AS_MEMPCPY + movq %rax, RETVAL(%rsp) /* save return value */ +#endif + +/* Align to the natural word size. */ + +L(aligntry): + movl %esi, %ecx /* align by source */ + + andl $7, %ecx + jz L(alignafter) /* already aligned */ + +L(align): /* align */ + leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */ + subl $8, %ecx + + .p2align 4 + +L(alignloop): /* 1-byte alignment loop */ + movzbl (%rsi), %eax + movb %al, (%rdi) + + incl %ecx + + leaq 1(%rsi), %rsi + leaq 1(%rdi), %rdi + + jnz L(alignloop) + + .p2align 4 + +L(alignafter): + +/* Handle mid-sized blocks. 
*/ + +L(32try): /* up to 1KB */ + cmpq $1024, %rdx + ja L(32after) + +L(32): /* 32-byte loop */ + movl %edx, %ecx + shrl $5, %ecx + jz L(32skip) + + .p2align 4 + +L(32loop): + decl %ecx + + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + + leaq 32(%rsi), %rsi + leaq 32(%rdi), %rdi + + jz L(32skip) /* help out smaller blocks */ + + decl %ecx + + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + + leaq 32(%rsi), %rsi + leaq 32(%rdi), %rdi + + jnz L(32loop) + + .p2align 4 + +L(32skip): + andl $31, %edx /* check for left overs */ +#ifdef USE_AS_MEMPCPY + jnz L(1) + + movq %rdi, %rax +#else + movq RETVAL(%rsp), %rax + jnz L(1) + + rep +#endif + retq /* exit */ + + .p2align 4 + +L(32after): + +/* + In order to minimize code-size in RTLD, algorithms specific for + larger blocks are excluded when building for RTLD. +*/ + +/* Handle blocks smaller than 1/2 L1. */ + +L(fasttry): /* first 1/2 L1 */ +#if IS_IN (libc) /* only up to this algorithm outside of libc.so */ + mov __x86_data_cache_size_half(%rip), %R11_LP + cmpq %rdx, %r11 /* calculate the smaller of */ + cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ +#endif + +L(fast): /* good ol' MOVS */ +#if IS_IN (libc) + movq %r11, %rcx + andq $-8, %r11 +#else + movq %rdx, %rcx +#endif + shrq $3, %rcx + jz L(fastskip) + + rep + movsq + + .p2align 4,, 4 + +L(fastskip): +#if IS_IN (libc) + subq %r11, %rdx /* check for more */ + testq $-8, %rdx + jnz L(fastafter) +#endif + + andl $7, %edx /* check for left overs */ +#ifdef USE_AS_MEMPCPY + jnz L(1) + + movq %rdi, %rax +#else + movq RETVAL(%rsp), %rax + jnz L(1) + + rep +#endif + retq /* exit */ + +#if IS_IN (libc) /* none of the algorithms below for RTLD */ + + .p2align 4 + +L(fastafter): + +/* Handle large blocks smaller than 1/2 L2. */ + +L(pretry): /* first 1/2 L2 */ + mov __x86_shared_cache_size_half (%rip), %R8_LP + cmpq %rdx, %r8 /* calculate the lesser of */ + cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */ + +L(pre): /* 64-byte with prefetching */ + movq %r8, %rcx + andq $-64, %r8 + shrq $6, %rcx + jz L(preskip) + + movq %r14, SAVE0(%rsp) + cfi_rel_offset (%r14, SAVE0) + movq %r13, SAVE1(%rsp) + cfi_rel_offset (%r13, SAVE1) + movq %r12, SAVE2(%rsp) + cfi_rel_offset (%r12, SAVE2) + movq %rbx, SAVE3(%rsp) + cfi_rel_offset (%rbx, SAVE3) + + cmpl $0, __x86_prefetchw(%rip) + jz L(preloop) /* check if PREFETCHW OK */ + + .p2align 4 + +/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). 
*/ + +L(prewloop): /* cache-line in state M */ + decq %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %r9 + movq 24 (%rsi), %r10 + movq 32 (%rsi), %r11 + movq 40 (%rsi), %r12 + movq 48 (%rsi), %r13 + movq 56 (%rsi), %r14 + + prefetcht0 0 + 896 (%rsi) + prefetcht0 64 + 896 (%rsi) + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + + jz L(prebail) + + decq %rcx + + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + prefetchw 896 - 64(%rdi) + prefetchw 896 - 0(%rdi) + + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + + jnz L(prewloop) + jmp L(prebail) + + .p2align 4 + +/* ... when PREFETCHW is not available. */ + +L(preloop): /* cache-line in state E */ + decq %rcx + + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + prefetcht0 896 + 0(%rsi) + prefetcht0 896 + 64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jz L(prebail) + + decq %rcx + + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + prefetcht0 896 - 64(%rdi) + prefetcht0 896 - 0(%rdi) + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + + jnz L(preloop) + +L(prebail): + movq SAVE3(%rsp), %rbx + cfi_restore (%rbx) + movq SAVE2(%rsp), %r12 + cfi_restore (%r12) + movq SAVE1(%rsp), %r13 + cfi_restore (%r13) + movq SAVE0(%rsp), %r14 + cfi_restore (%r14) + +/* .p2align 4 */ + +L(preskip): + subq %r8, %rdx /* check for more */ + testq $-64, %rdx + jnz L(preafter) + + andl $63, %edx /* check for left overs */ +#ifdef USE_AS_MEMPCPY + jnz L(1) + + movq %rdi, %rax +#else + movq RETVAL(%rsp), %rax + jnz L(1) + + rep +#endif + retq /* exit */ + + .p2align 4 + +L(preafter): + +/* Handle huge blocks. 
*/ + +L(NTtry): + +L(NT): /* non-temporal 128-byte */ + movq %rdx, %rcx + shrq $7, %rcx + jz L(NTskip) + + movq %r14, SAVE0(%rsp) + cfi_rel_offset (%r14, SAVE0) + movq %r13, SAVE1(%rsp) + cfi_rel_offset (%r13, SAVE1) + movq %r12, SAVE2(%rsp) + cfi_rel_offset (%r12, SAVE2) + + .p2align 4 + +L(NTloop): + prefetchnta 768(%rsi) + prefetchnta 832(%rsi) + + decq %rcx + + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + movntiq %rax, (%rdi) + movntiq %r8, 8(%rdi) + movntiq %r9, 16(%rdi) + movntiq %r10, 24(%rdi) + movntiq %r11, 32(%rdi) + movntiq %r12, 40(%rdi) + movntiq %r13, 48(%rdi) + movntiq %r14, 56(%rdi) + + movq 64(%rsi), %rax + movq 72(%rsi), %r8 + movq 80(%rsi), %r9 + movq 88(%rsi), %r10 + movq 96(%rsi), %r11 + movq 104(%rsi), %r12 + movq 112(%rsi), %r13 + movq 120(%rsi), %r14 + + movntiq %rax, 64(%rdi) + movntiq %r8, 72(%rdi) + movntiq %r9, 80(%rdi) + movntiq %r10, 88(%rdi) + movntiq %r11, 96(%rdi) + movntiq %r12, 104(%rdi) + movntiq %r13, 112(%rdi) + movntiq %r14, 120(%rdi) + + leaq 128(%rsi), %rsi + leaq 128(%rdi), %rdi + + jnz L(NTloop) + + sfence /* serialize memory stores */ + + movq SAVE2(%rsp), %r12 + cfi_restore (%r12) + movq SAVE1(%rsp), %r13 + cfi_restore (%r13) + movq SAVE0(%rsp), %r14 + cfi_restore (%r14) + +L(NTskip): + andl $127, %edx /* check for left overs */ +#ifdef USE_AS_MEMPCPY + jnz L(1) + + movq %rdi, %rax +#else + movq RETVAL(%rsp), %rax + jnz L(1) + + rep +#endif + retq /* exit */ + +#endif /* IS_IN (libc) */ + +END(__memcpy_sse2) diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 7e119d30e5..99c481c866 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -32,48 +32,49 @@ ENTRY(__new_memcpy) LOAD_RTLD_GLOBAL_RO_RDX leaq __memcpy_avx_unaligned(%rip), %rax HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 1f - ret -1: leaq __memcpy_sse2(%rip), %rax - HAS_ARCH_FEATURE (Slow_BSF) - jnz 2f + jnz 3f leaq __memcpy_sse2_unaligned(%rip), %rax - ret -2: HAS_CPU_FEATURE (SSSE3) - jz 3f + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 3f + leaq __memcpy_sse2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 3f + leaq __memcpy_ssse3_back(%rip), %rax + HAS_CPU_FEATURE (Fast_Copy_Backward) + jnz 3f leaq __memcpy_ssse3(%rip), %rax 3: ret END(__new_memcpy) # undef ENTRY # define ENTRY(name) \ - .type __memcpy_sse2, @function; \ - .globl __memcpy_sse2; \ - .hidden __memcpy_sse2; \ + .type __memcpy_sse2_unaligned, @function; \ + .globl __memcpy_sse2_unaligned; \ + .hidden __memcpy_sse2_unaligned; \ .p2align 4; \ - __memcpy_sse2: cfi_startproc; \ + __memcpy_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END # define END(name) \ - cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2 + cfi_endproc; .size __memcpy_sse2_unaligned, .-__memcpy_sse2_unaligned # undef ENTRY_CHK # define ENTRY_CHK(name) \ - .type __memcpy_chk_sse2, @function; \ - .globl __memcpy_chk_sse2; \ + .type __memcpy_chk_sse2_unaligned, @function; \ + .globl __memcpy_chk_sse2_unaligned; \ .p2align 4; \ - __memcpy_chk_sse2: cfi_startproc; \ + __memcpy_chk_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END_CHK # define END_CHK(name) \ - cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2 + cfi_endproc; .size __memcpy_chk_sse2_unaligned, .-__memcpy_chk_sse2_unaligned # undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memcpy calls through a PLT. 
The speedup we get from using SSSE3 instruction is likely eaten away by the indirect call in the PLT. */ # define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2 + .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2_unaligned versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); #endif diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 81f83ddb71..591d18a0c1 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -30,16 +30,19 @@ ENTRY(__memcpy_chk) .type __memcpy_chk, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX + leaq __memcpy_chk_avx_unaligned(%rip), %rax + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jnz 2f + leaq __memcpy_chk_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f leaq __memcpy_chk_sse2(%rip), %rax HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __memcpy_chk_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __memcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __memcpy_chk_avx_unaligned(%rip), %rax + leaq __memcpy_chk_ssse3_back(%rip), %rax + HAS_CPU_FEATURE (Fast_Copy_Backward) + jnz 2f + leaq __memcpy_chk_ssse3(%rip), %rax 2: ret END(__memcpy_chk) # else diff --git a/sysdeps/x86_64/multiarch/mempcpy-sse2.S b/sysdeps/x86_64/multiarch/mempcpy-sse2.S new file mode 100644 index 0000000000..e8bde29dc1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/mempcpy-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define __memcpy_sse2 __mempcpy_sse2 +#define __memcpy_chk_sse2 __mempcpy_chk_sse2 +#include "memcpy-sse2.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index ad36840d54..450915f60f 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -28,41 +28,44 @@ ENTRY(__mempcpy) .type __mempcpy, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX + leaq __mempcpy_avx_unaligned(%rip), %rax + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jnz 2f + leaq __mempcpy_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f leaq __mempcpy_sse2(%rip), %rax HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __mempcpy_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __mempcpy_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __mempcpy_avx_unaligned(%rip), %rax + leaq __mempcpy_ssse3_back(%rip), %rax + HAS_CPU_FEATURE (Fast_Copy_Backward) + jnz 2f + leaq __mempcpy_ssse3(%rip), %rax 2: ret END(__mempcpy) # undef ENTRY # define ENTRY(name) \ - .type __mempcpy_sse2, @function; \ + .type __mempcpy_sse2_unaligned, @function; \ .p2align 4; \ - .globl __mempcpy_sse2; \ - .hidden __mempcpy_sse2; \ - __mempcpy_sse2: cfi_startproc; \ + .globl __mempcpy_sse2_unaligned; \ + .hidden __mempcpy_sse2_unaligned; \ + __mempcpy_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END # define END(name) \ - cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2 + cfi_endproc; .size __mempcpy_sse2_unaligned, .-__mempcpy_sse2_unaligned # undef ENTRY_CHK # define ENTRY_CHK(name) \ - .type __mempcpy_chk_sse2, @function; \ - .globl __mempcpy_chk_sse2; \ + .type __mempcpy_chk_sse2_unaligned, @function; \ + .globl __mempcpy_chk_sse2_unaligned; \ .p2align 4; \ - __mempcpy_chk_sse2: cfi_startproc; \ + __mempcpy_chk_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END_CHK # define END_CHK(name) \ - cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2 + cfi_endproc; .size 
__mempcpy_chk_sse2_unaligned, .-__mempcpy_chk_sse2_unaligned # undef libc_hidden_def # undef libc_hidden_builtin_def @@ -70,9 +73,9 @@ END(__mempcpy) The speedup we get from using SSSE3 instruction is likely eaten away by the indirect call in the PLT. */ # define libc_hidden_def(name) \ - .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2 + .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2_unaligned # define libc_hidden_builtin_def(name) \ - .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2 + .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2_unaligned #endif #include "../mempcpy.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index 0a46b567ec..9dc7dc80b7 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -30,16 +30,19 @@ ENTRY(__mempcpy_chk) .type __mempcpy_chk, @gnu_indirect_function LOAD_RTLD_GLOBAL_RO_RDX + leaq __mempcpy_chk_avx_unaligned(%rip), %rax + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jnz 2f + leaq __mempcpy_chk_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f leaq __mempcpy_chk_sse2(%rip), %rax HAS_CPU_FEATURE (SSSE3) jz 2f - leaq __mempcpy_chk_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __mempcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __mempcpy_chk_avx_unaligned(%rip), %rax + leaq __mempcpy_chk_ssse3_back(%rip), %rax + HAS_CPU_FEATURE (Fast_Copy_Backward) + jnz 2f + leaq __mempcpy_chk_ssse3(%rip), %rax 2: ret END(__mempcpy_chk) # else -- cgit 1.4.1
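
Note on the selection order: the ifunc bodies in multiarch/memcpy.S, memcpy_chk.S, mempcpy.S and mempcpy_chk.S above all encode the priority listed in the commit message. The standalone C program below is an illustrative sketch only, not part of the patch: the struct and flag names are hypothetical stand-ins for the cpu-features bits that the assembly tests with HAS_ARCH_FEATURE/HAS_CPU_FEATURE, and the returned strings stand in for the implementation addresses loaded into %rax.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the cpu-features bits that the ifunc
   selectors test with HAS_ARCH_FEATURE/HAS_CPU_FEATURE.  */
struct features
{
  bool avx_fast_unaligned_load;
  bool fast_unaligned_load;
  bool ssse3;
  bool fast_copy_backward;
};

/* Same cascade as the commit message: AVX unaligned first, then SSE2
   unaligned, then plain SSE2 when SSSE3 is missing, then the two SSSE3
   variants.  Returns the name of the implementation that would win.  */
static const char *
select_memcpy (const struct features *f)
{
  if (f->avx_fast_unaligned_load)
    return "__memcpy_avx_unaligned";	/* 1. AVX_Fast_Unaligned_Load set.  */
  if (f->fast_unaligned_load)
    return "__memcpy_sse2_unaligned";	/* 2. Fast_Unaligned_Load set.  */
  if (!f->ssse3)
    return "__memcpy_sse2";		/* 3. SSSE3 not available.  */
  if (f->fast_copy_backward)
    return "__memcpy_ssse3_back";	/* 4. Fast_Copy_Backward set.  */
  return "__memcpy_ssse3";		/* 5. Default SSSE3 version.  */
}

int
main (void)
{
  /* Example: SSSE3-capable CPU with fast unaligned loads but without the
     AVX_Fast_Unaligned_Load bit; the SSE2 unaligned copy is chosen.  */
  struct features f = { false, true, true, false };
  puts (select_memcpy (&f));
  return 0;
}

In libc.a and in ld.so there is no ifunc dispatch, which is why the patch makes __memcpy_sse2_unaligned the default there, matching case 2 of the cascade above.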