Diffstat (limited to 'sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 175 |
1 file changed, 175 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..efdfea238f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,175 @@
+/* memcpy with unaligned loads
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+
+ENTRY(__memcpy_sse2_unaligned)
+	movq	%rsi, %rax
+	leaq	(%rdx,%rdx), %rcx		/* %rcx = 2 * n.  */
+	subq	%rdi, %rax
+	subq	%rdx, %rax			/* %rax = src - dst - n.  */
+	cmpq	%rcx, %rax
+	jb	L(overlapping)			/* Unsigned %rax < 2 * n.  */
+	cmpq	$16, %rdx
+	jbe	L(less_16)
+	movdqu	(%rsi), %xmm8			/* First 16 bytes...  */
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	-16(%rsi,%rdx), %xmm8		/* ...and last 16 bytes.  */
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	.L31				/* n > 32: copy more.  */
+L(return):
+	movq	%rdi, %rax			/* memcpy returns dst.  */
+	ret
+	.p2align 4,,10
+	ALIGN(4)
+.L31:
+	movdqu	16(%rsi), %xmm8			/* Bytes 16..31.  */
+	cmpq	$64, %rdx
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	-32(%rsi,%rdx), %xmm8		/* Bytes n-32..n-17.  */
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	jbe	L(return)			/* Head and tail cover n <= 64.  */
+	movdqu	32(%rsi), %xmm8
+	cmpq	$128, %rdx
+	movdqu	%xmm8, 32(%rdi)
+	movdqu	-48(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -48(%rdi,%rdx)
+	movdqu	48(%rsi), %xmm8
+	movdqu	%xmm8, 48(%rdi)
+	movdqu	-64(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -64(%rdi,%rdx)
+	jbe	L(return)			/* Likewise for n <= 128.  */
+	leaq	64(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-64, %rdx			/* %rdx = end of aligned region.  */
+	andq	$-64, %rcx			/* %rcx = first 64-byte chunk of dst.  */
+	movq	%rcx, %rax
+	subq	%rdi, %rax
+	addq	%rax, %rsi			/* Advance src by the head size.  */
+	cmpq	%rdx, %rcx
+	je	L(return)
+	movq	%rsi, %r10
+	subq	%rcx, %r10			/* %r10 = src - dst.  */
+	leaq	16(%r10), %r9
+	leaq	32(%r10), %r8
+	leaq	48(%r10), %rax
+	.p2align 4,,10
+	ALIGN(4)
+L(loop):	/* 64 bytes per iteration: unaligned loads, aligned stores.  */
+	movdqu	(%rcx,%r10), %xmm8
+	movdqa	%xmm8, (%rcx)
+	movdqu	(%rcx,%r9), %xmm8
+	movdqa	%xmm8, 16(%rcx)
+	movdqu	(%rcx,%r8), %xmm8
+	movdqa	%xmm8, 32(%rcx)
+	movdqu	(%rcx,%rax), %xmm8
+	movdqa	%xmm8, 48(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	jmp	L(return)
+L(overlapping):
+	cmpq	%rsi, %rdi
+	jae	.L3				/* dst >= src: copy backwards.  */
+	testq	%rdx, %rdx
+	.p2align 4,,5
+	je	L(return)
+	movq	%rdx, %r9
+	leaq	16(%rsi), %rcx
+	leaq	16(%rdi), %r8
+	shrq	$4, %r9				/* %r9 = number of 16-byte chunks.  */
+	movq	%r9, %rax
+	salq	$4, %rax			/* %rax = bytes in whole chunks.  */
+	cmpq	%rcx, %rdi
+	setae	%cl				/* dst >= src + 16?  */
+	cmpq	%r8, %rsi
+	setae	%r8b				/* src >= dst + 16?  */
+	orl	%r8d, %ecx
+	cmpq	$15, %rdx
+	seta	%r8b				/* n > 15?  */
+	testb	%r8b, %cl
+	je	.L16				/* Too close or too short: bytewise.  */
+	testq	%rax, %rax
+	je	.L16
+	xorl	%ecx, %ecx
+	xorl	%r8d, %r8d
+.L7:	/* Forward copy, 16 bytes at a time.  */
+	movdqu	(%rsi,%rcx), %xmm8
+	addq	$1, %r8
+	movdqu	%xmm8, (%rdi,%rcx)
+	addq	$16, %rcx
+	cmpq	%r8, %r9
+	ja	.L7
+	cmpq	%rax, %rdx
+	je	L(return)
+.L21:	/* Byte tail behind the chunks.  */
+	movzbl	(%rsi,%rax), %ecx
+	movb	%cl, (%rdi,%rax)
+	addq	$1, %rax
+	cmpq	%rax, %rdx
+	ja	.L21
+	jmp	L(return)
+L(less_16):
+	testb	$24, %dl
+	jne	L(between_9_16)			/* 8 <= n <= 16.  */
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(between_5_8)			/* 4 <= n <= 7.  */
+	testq	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %eax			/* 1 <= n <= 3.  */
+	testb	$2, %dl
+	movb	%al, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %eax
+	movw	%ax, -2(%rdi,%rdx)
+	jmp	L(return)
+.L3:
+	leaq	-1(%rdx), %rax
+	.p2align 4,,10
+	ALIGN(4)
+.L11:	/* Backward byte copy; as committed it has no exit test (see note below).  */
+	movzbl	(%rsi,%rax), %edx
+	movb	%dl, (%rdi,%rax)
+	subq	$1, %rax
+	jmp	.L11
+L(between_9_16):
+	movq	(%rsi), %rax			/* First and last 8 bytes; they may overlap.  */
+	movq	%rax, (%rdi)
+	movq	-8(%rsi,%rdx), %rax
+	movq	%rax, -8(%rdi,%rdx)
+	jmp	L(return)
+.L16:
+	xorl	%eax, %eax			/* Bytewise from offset 0.  */
+	jmp	.L21
+L(between_5_8):
+	movl	(%rsi), %eax			/* First and last 4 bytes.  */
+	movl	%eax, (%rdi)
+	movl	-4(%rsi,%rdx), %eax
+	movl	%eax, -4(%rdi,%rdx)
+	jmp	L(return)
+END(__memcpy_sse2_unaligned)
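One quirk worth flagging: the backward copy at .L3/.L11 ends in an unconditional jmp .L11, so once entered the loop never terminates; %rax is decremented past zero and the stores keep going. Reaching .L3 requires the entry test to pass with %rdi >= %rsi, which needs (dst - src) + 3*n to exceed 2^64, so the path is effectively dead for realistic sizes, but as committed it reads as an infinite loop. A plausible repair (untested here) would be to make the branch conditional, e.g. jnb .L11 after the subq, exiting on the borrow when the index wraps below zero, followed by a jmp L(return).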
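For orientation, the routine combines three ideas: a cheap entry test that reroutes nearby buffers to a cautious path, overlapping head/tail copies that keep small and medium sizes branch-light, and a main loop that keeps stores aligned while letting loads be unaligned. The C sketch below mirrors that control flow under stated simplifications: the function name memcpy_sse2_unaligned_sketch is invented, the small-size and close-buffer branches are collapsed into byte loops, and the 64-byte loop is shrunk to 16-byte steps. It illustrates the strategy; it is not the source this assembly was generated from.

/* Illustrative sketch of __memcpy_sse2_unaligned's strategy.
   Not glibc source.  Compile with -msse2.  */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

void *
memcpy_sse2_unaligned_sketch (void *dstv, const void *srcv, size_t n)
{
  unsigned char *dst = dstv;
  const unsigned char *src = srcv;

  /* Entry test, mirrored mechanically from the assembly:
     %rax = src - dst - n, compared unsigned against 2*n.  Truly
     overlapping arguments are undefined for memcpy; this branch only
     reroutes certain nearby cases to a cautious copy.  */
  if ((uintptr_t) src - (uintptr_t) dst - n < 2 * n)
    {
      /* L(overlapping): the assembly copies forward in 16-byte chunks
         plus a byte tail when dst < src, and bytewise backwards
         otherwise; plain byte loops stand in here.  */
      if ((uintptr_t) dst < (uintptr_t) src)
        for (size_t i = 0; i < n; i++)
          dst[i] = src[i];
      else
        for (size_t i = n; i > 0; i--)
          dst[i - 1] = src[i - 1];
      return dstv;
    }

  if (n <= 16)
    {
      /* L(less_16): the assembly branches on bits of n and uses
         overlapping 8-, 4-, 2- and 1-byte moves; a byte loop stands
         in here.  */
      for (size_t i = 0; i < n; i++)
        dst[i] = src[i];
      return dstv;
    }

  /* Copy the first and last 16 bytes up front; for n < 32 the two
     stores overlap in the middle, which is harmless.  */
  _mm_storeu_si128 ((__m128i *) dst,
                    _mm_loadu_si128 ((const __m128i *) src));
  _mm_storeu_si128 ((__m128i *) (dst + n - 16),
                    _mm_loadu_si128 ((const __m128i *) (src + n - 16)));

  /* L(loop), shrunk from 64- to 16-byte steps: round the store pointer
     up to an alignment boundary (the head store above already covered
     the skipped bytes), then stream unaligned loads into aligned
     stores until only the prewritten tail remains.  */
  unsigned char *d =
    (unsigned char *) (((uintptr_t) dst + 16) & ~(uintptr_t) 15);
  const unsigned char *s = src + (d - dst);
  unsigned char *dend = dst + n - 16;

  while (d < dend)
    {
      _mm_store_si128 ((__m128i *) d,
                       _mm_loadu_si128 ((const __m128i *) s));
      d += 16;
      s += 16;
    }
  return dstv;
}

The aligned-store choice is the central design point: a store that splits a cache line is typically more expensive than a split load, so the loop pins the store address to an aligned boundary and lets movdqu absorb the misalignment on the load side.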