diff options
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 175 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy.S | 15 |
4 files changed, 185 insertions, 8 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 86787ee6ea..203d16eed3 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -7,7 +7,7 @@ endif ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ + strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 05315fdd7a..28d35793c5 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -227,6 +227,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2)) /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S new file mode 100644 index 0000000000..efdfea238f --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S @@ -0,0 +1,175 @@ +/* memcpy with unaligned loads + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#include "asm-syntax.h" + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + + +ENTRY(__memcpy_sse2_unaligned) /* As used below: %rdi = dst, %rsi = src, %rdx = n (byte count). Every exit goes through L(return), which returns dst in %rax. */ + movq %rsi, %rax + leaq (%rdx,%rdx), %rcx + subq %rdi, %rax + subq %rdx, %rax + cmpq %rcx, %rax + jb L(overlapping) /* taken when (src - dst - n), compared as unsigned, is below 2*n */ + cmpq $16, %rdx + jbe L(less_16) /* n <= 16 is handled with scalar moves */ + movdqu (%rsi), %xmm8 + cmpq $32, %rdx + movdqu %xmm8, (%rdi) + movdqu -16(%rsi,%rdx), %xmm8 + movdqu %xmm8, -16(%rdi,%rdx) + ja .L31 /* fall through for 16 < n <= 32: the first-16 and last-16 byte copies above (which may overlap) already cover the buffer */ +L(return): + movq %rdi, %rax + ret + .p2align 4,,10 + ALIGN(4) +.L31: /* n > 32: also copy bytes 16..31 from the start and the 16 bytes ending 16 before the end; done if n <= 64 */ + movdqu 16(%rsi), %xmm8 + cmpq $64, %rdx + movdqu %xmm8, 16(%rdi) + movdqu -32(%rsi,%rdx), %xmm8 + movdqu %xmm8, -32(%rdi,%rdx) + jbe L(return) + movdqu 32(%rsi), %xmm8 + cmpq $128, %rdx /* flags from this compare are consumed by the jbe after the four copies below; the SSE moves in between do not touch them */ + movdqu %xmm8, 32(%rdi) + movdqu -48(%rsi,%rdx), %xmm8 + movdqu %xmm8, -48(%rdi,%rdx) + movdqu 48(%rsi), %xmm8 + movdqu %xmm8, 48(%rdi) + movdqu -64(%rsi,%rdx), %xmm8 + movdqu %xmm8, -64(%rdi,%rdx) + jbe L(return) /* n <= 128: first 64 and last 64 bytes are now copied */ + leaq 64(%rdi), %rcx + addq %rdi, %rdx + andq $-64, %rdx + andq $-64, %rcx /* %rcx = first 64-byte boundary above dst; %rdx = (dst + n) rounded down to a 64-byte boundary */ + movq %rcx, %rax + subq %rdi, %rax + addq %rax, %rsi /* advance src by the same distance (%rcx - dst) */ + cmpq %rdx, %rcx + je L(return) + movq %rsi, %r10 + subq %rcx, %r10 /* %r10 = src - %rcx, so (%rcx,%r10) addresses the source byte matching destination %rcx; %r9, %r8, %rax are that offset plus 16, 32, 48 */ + leaq 16(%r10), %r9 + leaq 32(%r10), %r8 + leaq 48(%r10), %rax + .p2align 4,,10 + ALIGN(4) +L(loop): /* 64 bytes per iteration: unaligned loads (movdqu) from the source, aligned stores (movdqa) to the 64-byte-aligned %rcx, until %rcx reaches %rdx */ + movdqu (%rcx,%r10), %xmm8 + movdqa %xmm8, (%rcx) + movdqu (%rcx,%r9), %xmm8 + movdqa %xmm8, 16(%rcx) + movdqu (%rcx,%r8), %xmm8 + movdqa %xmm8, 32(%rcx) + movdqu (%rcx,%rax), %xmm8 + movdqa %xmm8, 48(%rcx) + addq $64, %rcx + cmpq %rcx, %rdx + jne L(loop) + jmp L(return) +L(overlapping): /* dst >= src (unsigned) goes to the backward byte loop at .L3; otherwise copy forward, returning at once when n == 0 */ + cmpq %rsi, %rdi + jae .L3 + testq %rdx, %rdx + .p2align 4,,5 + je L(return) + movq %rdx, %r9 
+ leaq 16(%rsi), %rcx + leaq 16(%rdi), %r8 + shrq $4, %r9 + movq %r9, %rax + salq $4, %rax /* %r9 = n / 16 (number of 16-byte chunks); %rax = n rounded down to a multiple of 16 */ + cmpq %rcx, %rdi + setae %cl + cmpq %r8, %rsi + setae %r8b + orl %r8d, %ecx + cmpq $15, %rdx + seta %r8b + testb %r8b, %cl + je .L16 /* use the 16-byte loop only when (dst >= src + 16 or src >= dst + 16) and n > 15; otherwise copy every byte via .L16/.L21 */ + testq %rax, %rax + je .L16 + xorl %ecx, %ecx + xorl %r8d, %r8d +.L7: /* forward copy, 16 bytes per iteration, %r9 iterations */ + movdqu (%rsi,%rcx), %xmm8 + addq $1, %r8 + movdqu %xmm8, (%rdi,%rcx) + addq $16, %rcx + cmpq %r8, %r9 + ja .L7 + cmpq %rax, %rdx + je L(return) +.L21: /* forward copy of the bytes from index %rax up to n, one byte at a time */ + movzbl (%rsi,%rax), %ecx + movb %cl, (%rdi,%rax) + addq $1, %rax + cmpq %rax, %rdx + ja .L21 + jmp L(return) +L(less_16): /* n <= 16 here; dispatch on the bits of n */ + testb $24, %dl + jne L(between_9_16) /* bit 3 or bit 4 of n set, i.e. 8 <= n <= 16 on this path */ + testb $4, %dl + .p2align 4,,5 + jne L(between_5_8) /* bit 2 of n set, i.e. 4 <= n <= 7 on this path */ + testq %rdx, %rdx + .p2align 4,,2 + je L(return) + movzbl (%rsi), %eax /* n is 1..3 here: copy the first byte, then the last two bytes when bit 1 of n is set */ + testb $2, %dl + movb %al, (%rdi) + je L(return) + movzwl -2(%rsi,%rdx), %eax + movw %ax, -2(%rdi,%rdx) + jmp L(return) +.L3: /* backward byte copy starting at index n - 1 */ + leaq -1(%rdx), %rax + .p2align 4,,10 + ALIGN(4) +.L11: + movzbl (%rsi,%rax), %edx + movb %dl, (%rdi,%rax) + subq $1, %rax + jmp .L11 /* NOTE(review): unconditional branch with no exit test, so this loop never reaches L(return) and keeps decrementing %rax past index 0 - confirm whether a conditional branch was intended */ +L(between_9_16): /* copy the first and last 8 bytes; the two stores may overlap */ + movq (%rsi), %rax + movq %rax, (%rdi) + movq -8(%rsi,%rdx), %rax + movq %rax, -8(%rdi,%rdx) + jmp L(return) +.L16: /* start the byte loop at .L21 from index 0 */ + xorl %eax, %eax + jmp .L21 +L(between_5_8): /* copy the first and last 4 bytes; the two stores may overlap */ + movl (%rsi), %eax + movl %eax, (%rdi) + movl -4(%rsi,%rdx), %eax + movl %eax, -4(%rdi,%rdx) + jmp L(return) +END(__memcpy_sse2_unaligned) diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index b452f5304b..a1e5031376 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -33,13 +33,14 @@ ENTRY(__new_memcpy) jne 1f call __init_cpu_features 1: leaq __memcpy_sse2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) - jz 2f - leaq __memcpy_ssse3(%rip), %rax - testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) - jz 2f - leaq __memcpy_ssse3_back(%rip), %rax -2: ret + testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + jnz 2f /* Slow_BSF bit set: skip to the SSSE3 check at label 2 */ + leaq 
__memcpy_sse2_unaligned(%rip), %rax + ret /* Slow_BSF bit clear: return __memcpy_sse2_unaligned */ +2: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jz 3f /* SSSE3 bit clear: keep the __memcpy_sse2 address loaded at label 1 */ + leaq __memcpy_ssse3(%rip), %rax +3: ret /* NOTE(review): the added lines no longer reference __memcpy_ssse3_back or the Fast_Copy_Backward bit that the removed lines tested - confirm this is intended */ END(__new_memcpy) # undef ENTRY |