Diffstat (limited to 'sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 175 |
1 file changed, 175 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..efdfea238f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,175 @@
+/* memcpy with unaligned loads
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+
+ENTRY(__memcpy_sse2_unaligned)
+	movq	%rsi, %rax
+	leaq	(%rdx,%rdx), %rcx		/* %rcx = 2 * n.  */
+	subq	%rdi, %rax
+	subq	%rdx, %rax			/* %rax = src - dst - n.  */
+	cmpq	%rcx, %rax
+	jb	L(overlapping)			/* Unsigned %rax < 2 * n.  */
+	cmpq	$16, %rdx
+	jbe	L(less_16)
+	movdqu	(%rsi), %xmm8			/* First 16 bytes...  */
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	-16(%rsi,%rdx), %xmm8		/* ...and last 16 bytes.  */
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	.L31				/* n > 32: copy more.  */
+L(return):
+	movq	%rdi, %rax			/* memcpy returns dst.  */
+	ret
+	.p2align 4,,10
+	ALIGN(4)
+.L31:
+	movdqu	16(%rsi), %xmm8			/* Bytes 16..31.  */
+	cmpq	$64, %rdx
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	-32(%rsi,%rdx), %xmm8		/* Bytes n-32..n-17.  */
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	jbe	L(return)			/* Head and tail cover n <= 64.  */
+	movdqu	32(%rsi), %xmm8
+	cmpq	$128, %rdx
+	movdqu	%xmm8, 32(%rdi)
+	movdqu	-48(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -48(%rdi,%rdx)
+	movdqu	48(%rsi), %xmm8
+	movdqu	%xmm8, 48(%rdi)
+	movdqu	-64(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -64(%rdi,%rdx)
+	jbe	L(return)			/* Likewise for n <= 128.  */
+	leaq	64(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-64, %rdx			/* %rdx = end of aligned region.  */
+	andq	$-64, %rcx			/* %rcx = first 64-byte chunk of dst.  */
+	movq	%rcx, %rax
+	subq	%rdi, %rax
+	addq	%rax, %rsi			/* Advance src by the head size.  */
+	cmpq	%rdx, %rcx
+	je	L(return)
+	movq	%rsi, %r10
+	subq	%rcx, %r10			/* %r10 = src - dst.  */
+	leaq	16(%r10), %r9
+	leaq	32(%r10), %r8
+	leaq	48(%r10), %rax
+	.p2align 4,,10
+	ALIGN(4)
+L(loop):	/* 64 bytes per iteration: unaligned loads, aligned stores.  */
+	movdqu	(%rcx,%r10), %xmm8
+	movdqa	%xmm8, (%rcx)
+	movdqu	(%rcx,%r9), %xmm8
+	movdqa	%xmm8, 16(%rcx)
+	movdqu	(%rcx,%r8), %xmm8
+	movdqa	%xmm8, 32(%rcx)
+	movdqu	(%rcx,%rax), %xmm8
+	movdqa	%xmm8, 48(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	jmp	L(return)
+L(overlapping):
+	cmpq	%rsi, %rdi
+	jae	.L3				/* dst >= src: copy backwards.  */
+	testq	%rdx, %rdx
+	.p2align 4,,5
+	je	L(return)
+	movq	%rdx, %r9
+	leaq	16(%rsi), %rcx
+	leaq	16(%rdi), %r8
+	shrq	$4, %r9				/* %r9 = number of 16-byte chunks.  */
+	movq	%r9, %rax
+	salq	$4, %rax			/* %rax = bytes in whole chunks.  */
+	cmpq	%rcx, %rdi
+	setae	%cl				/* dst >= src + 16?  */
+	cmpq	%r8, %rsi
+	setae	%r8b				/* src >= dst + 16?  */
+	orl	%r8d, %ecx
+	cmpq	$15, %rdx
+	seta	%r8b				/* n > 15?  */
+	testb	%r8b, %cl
+	je	.L16				/* Too close or too short: bytewise.  */
+	testq	%rax, %rax
+	je	.L16
+	xorl	%ecx, %ecx
+	xorl	%r8d, %r8d
+.L7:	/* Forward copy, 16 bytes at a time.  */
+	movdqu	(%rsi,%rcx), %xmm8
+	addq	$1, %r8
+	movdqu	%xmm8, (%rdi,%rcx)
+	addq	$16, %rcx
+	cmpq	%r8, %r9
+	ja	.L7
+	cmpq	%rax, %rdx
+	je	L(return)
+.L21:	/* Byte tail behind the chunks.  */
+	movzbl	(%rsi,%rax), %ecx
+	movb	%cl, (%rdi,%rax)
+	addq	$1, %rax
+	cmpq	%rax, %rdx
+	ja	.L21
+	jmp	L(return)
+L(less_16):
+	testb	$24, %dl
+	jne	L(between_9_16)			/* 8 <= n <= 16.  */
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(between_5_8)			/* 4 <= n <= 7.  */
+	testq	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %eax			/* 1 <= n <= 3.  */
+	testb	$2, %dl
+	movb	%al, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %eax
+	movw	%ax, -2(%rdi,%rdx)
+	jmp	L(return)
+.L3:
+	leaq	-1(%rdx), %rax
+	.p2align 4,,10
+	ALIGN(4)
+.L11:	/* Backward byte copy; as committed it has no exit test (see note below).  */
+	movzbl	(%rsi,%rax), %edx
+	movb	%dl, (%rdi,%rax)
+	subq	$1, %rax
+	jmp	.L11
+L(between_9_16):
+	movq	(%rsi), %rax			/* First and last 8 bytes; they may overlap.  */
+	movq	%rax, (%rdi)
+	movq	-8(%rsi,%rdx), %rax
+	movq	%rax, -8(%rdi,%rdx)
+	jmp	L(return)
+.L16:
+	xorl	%eax, %eax			/* Bytewise from offset 0.  */
+	jmp	.L21
+L(between_5_8):
+	movl	(%rsi), %eax			/* First and last 4 bytes.  */
+	movl	%eax, (%rdi)
+	movl	-4(%rsi,%rdx), %eax
+	movl	%eax, -4(%rdi,%rdx)
+	jmp	L(return)
+END(__memcpy_sse2_unaligned)
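One quirk worth flagging: the backward copy at .L3/.L11 ends in an unconditional jmp .L11, so once entered the loop never terminates; %rax is decremented past zero and the stores keep going. Reaching .L3 requires the entry test to pass with %rdi >= %rsi, which needs (dst - src) + 3*n to exceed 2^64, so the path is effectively dead for realistic sizes, but as committed it reads as an infinite loop. A plausible repair (untested here) would be to make the branch conditional, e.g. jnb .L11 after the subq, exiting on the borrow when the index wraps below zero, followed by a jmp L(return).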
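For orientation, the routine combines three ideas: a cheap entry test that reroutes nearby buffers to a cautious path, overlapping head/tail copies that keep small and medium sizes branch-light, and a main loop that keeps stores aligned while letting loads be unaligned. The C sketch below mirrors that control flow under stated simplifications: the function name memcpy_sse2_unaligned_sketch is invented, the small-size and close-buffer branches are collapsed into byte loops, and the 64-byte loop is shrunk to 16-byte steps. It illustrates the strategy; it is not the source this assembly was generated from.

/* Illustrative sketch of __memcpy_sse2_unaligned's strategy.
   Not glibc source.  Compile with -msse2.  */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

void *
memcpy_sse2_unaligned_sketch (void *dstv, const void *srcv, size_t n)
{
  unsigned char *dst = dstv;
  const unsigned char *src = srcv;

  /* Entry test, mirrored mechanically from the assembly:
     %rax = src - dst - n, compared unsigned against 2*n.  Truly
     overlapping arguments are undefined for memcpy; this branch only
     reroutes certain nearby cases to a cautious copy.  */
  if ((uintptr_t) src - (uintptr_t) dst - n < 2 * n)
    {
      /* L(overlapping): the assembly copies forward in 16-byte chunks
         plus a byte tail when dst < src, and bytewise backwards
         otherwise; plain byte loops stand in here.  */
      if ((uintptr_t) dst < (uintptr_t) src)
        for (size_t i = 0; i < n; i++)
          dst[i] = src[i];
      else
        for (size_t i = n; i > 0; i--)
          dst[i - 1] = src[i - 1];
      return dstv;
    }

  if (n <= 16)
    {
      /* L(less_16): the assembly branches on bits of n and uses
         overlapping 8-, 4-, 2- and 1-byte moves; a byte loop stands
         in here.  */
      for (size_t i = 0; i < n; i++)
        dst[i] = src[i];
      return dstv;
    }

  /* Copy the first and last 16 bytes up front; for n < 32 the two
     stores overlap in the middle, which is harmless.  */
  _mm_storeu_si128 ((__m128i *) dst,
                    _mm_loadu_si128 ((const __m128i *) src));
  _mm_storeu_si128 ((__m128i *) (dst + n - 16),
                    _mm_loadu_si128 ((const __m128i *) (src + n - 16)));

  /* L(loop), shrunk from 64- to 16-byte steps: round the store pointer
     up to an alignment boundary (the head store above already covered
     the skipped bytes), then stream unaligned loads into aligned
     stores until only the prewritten tail remains.  */
  unsigned char *d =
    (unsigned char *) (((uintptr_t) dst + 16) & ~(uintptr_t) 15);
  const unsigned char *s = src + (d - dst);
  unsigned char *dend = dst + n - 16;

  while (d < dend)
    {
      _mm_store_si128 ((__m128i *) d,
                       _mm_loadu_si128 ((const __m128i *) s));
      d += 16;
      s += 16;
    }
  return dstv;
}

The aligned-store choice is the central design point: a store that splits a cache line is typically more expensive than a split load, so the loop pins the store address to an aligned boundary and lets movdqu absorb the misalignment on the load side.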