Faster memcpy on x64.

We add new memcpy version that uses unaligned loads which are fast on modern processors. This allows second improvement which is avoiding computed jump which is relatively expensive operation. Tests available here: http://kam.mff.cuni.cz/~ondra/memcpy_profile_result27_04_13.tar.bz2
author: Ondrej Bilka <neleai@seznam.cz> 2013-05-20 08:20:00 +0200
committer: Ondrej Bilka <neleai@seznam.cz> 2013-05-20 08:24:41 +0200
commit: 2d48b41c8fa610067c4d664ac2339ae6ca43e78c (patch)
tree: 4d1ca07a1228ba16c12a67ddb08595770f397da1 /sysdeps/x86_64/multiarch
parent: 3e694268750d51acc6a68b0ee7ded25a52902c20 (diff)
download: glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.tar.gz
glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.tar.xz
glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.zip
4 files changed, 185 insertions, 8 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 86787ee6ea..203d16eed3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@ endif
 ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+		   strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 05315fdd7a..28d35793c5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -227,6 +227,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..efdfea238f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,175 @@
+/* memcpy with unaliged loads
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+
+ENTRY(__memcpy_sse2_unaligned)
+	movq	%rsi, %rax
+	leaq	(%rdx,%rdx), %rcx
+	subq	%rdi, %rax
+	subq	%rdx, %rax
+	cmpq	%rcx, %rax
+	jb	L(overlapping)
+	cmpq	$16, %rdx
+	jbe	L(less_16)
+	movdqu	(%rsi), %xmm8
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	-16(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	.L31
+L(return):
+	movq	%rdi, %rax
+	ret
+	.p2align 4,,10
+	ALIGN(4)
+.L31:
+	movdqu	16(%rsi), %xmm8
+	cmpq	$64, %rdx
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	-32(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	jbe	L(return)
+	movdqu	32(%rsi), %xmm8
+	cmpq	$128, %rdx
+	movdqu	%xmm8, 32(%rdi)
+	movdqu	-48(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -48(%rdi,%rdx)
+	movdqu	48(%rsi), %xmm8
+	movdqu	%xmm8, 48(%rdi)
+	movdqu	-64(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -64(%rdi,%rdx)
+	jbe	L(return)
+	leaq	64(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-64, %rdx
+	andq	$-64, %rcx
+	movq	%rcx, %rax
+	subq	%rdi, %rax
+	addq	%rax, %rsi
+	cmpq	%rdx, %rcx
+	je	L(return)
+	movq	%rsi, %r10
+	subq	%rcx, %r10
+	leaq	16(%r10), %r9
+	leaq	32(%r10), %r8
+	leaq	48(%r10), %rax
+	.p2align 4,,10
+	ALIGN(4)
+L(loop):
+	movdqu	(%rcx,%r10), %xmm8
+	movdqa	%xmm8, (%rcx)
+	movdqu	(%rcx,%r9), %xmm8
+	movdqa	%xmm8, 16(%rcx)
+	movdqu	(%rcx,%r8), %xmm8
+	movdqa	%xmm8, 32(%rcx)
+	movdqu	(%rcx,%rax), %xmm8
+	movdqa	%xmm8, 48(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	jmp	L(return)
+L(overlapping):
+	cmpq	%rsi, %rdi
+	jae	.L3
+	testq	%rdx, %rdx
+	.p2align 4,,5
+	je	L(return)
+	movq	%rdx, %r9
+	leaq	16(%rsi), %rcx
+	leaq	16(%rdi), %r8
+	shrq	$4, %r9
+	movq	%r9, %rax
+	salq	$4, %rax
+	cmpq	%rcx, %rdi
+	setae	%cl
+	cmpq	%r8, %rsi
+	setae	%r8b
+	orl	%r8d, %ecx
+	cmpq	$15, %rdx
+	seta	%r8b
+	testb	%r8b, %cl
+	je	.L16
+	testq	%rax, %rax
+	je	.L16
+	xorl	%ecx, %ecx
+	xorl	%r8d, %r8d
+.L7:
+	movdqu	(%rsi,%rcx), %xmm8
+	addq	$1, %r8
+	movdqu	%xmm8, (%rdi,%rcx)
+	addq	$16, %rcx
+	cmpq	%r8, %r9
+	ja	.L7
+	cmpq	%rax, %rdx
+	je	L(return)
+.L21:
+	movzbl	(%rsi,%rax), %ecx
+	movb	%cl, (%rdi,%rax)
+	addq	$1, %rax
+	cmpq	%rax, %rdx
+	ja	.L21
+	jmp	L(return)
+L(less_16):
+	testb	$24, %dl
+	jne	L(between_9_16)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(between_5_8)
+	testq	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %eax
+	testb	$2, %dl
+	movb	%al, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %eax
+	movw	%ax, -2(%rdi,%rdx)
+	jmp	L(return)
+.L3:
+	leaq	-1(%rdx), %rax
+	.p2align 4,,10
+	ALIGN(4)
+.L11:
+	movzbl	(%rsi,%rax), %edx
+	movb	%dl, (%rdi,%rax)
+	subq	$1, %rax
+	jmp	.L11
+L(between_9_16):
+	movq	(%rsi), %rax
+	movq	%rax, (%rdi)
+	movq	-8(%rsi,%rdx), %rax
+	movq	%rax, -8(%rdi,%rdx)
+	jmp	L(return)
+.L16:
+	xorl	%eax, %eax
+	jmp	.L21
+L(between_5_8):
+	movl	(%rsi), %eax
+	movl	%eax, (%rdi)
+	movl	-4(%rsi,%rdx), %eax
+	movl	%eax, -4(%rdi,%rdx)
+	jmp	L(return)
+END(__memcpy_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index b452f5304b..a1e5031376 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,13 +33,14 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-	jz	2f
-	leaq	__memcpy_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
-	jz	2f
-	leaq	__memcpy_ssse3_back(%rip), %rax
-2:	ret
+	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	jnz	2f
+	leaq	__memcpy_sse2_unaligned(%rip), %rax
+	ret
+2:	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz 3f
+	leaq    __memcpy_ssse3(%rip), %rax
+3:	ret
 END(__new_memcpy)
 
 # undef ENTRY
author	Ondrej Bilka <neleai@seznam.cz>	2013-05-20 08:20:00 +0200
committer	Ondrej Bilka <neleai@seznam.cz>	2013-05-20 08:24:41 +0200
commit	2d48b41c8fa610067c4d664ac2339ae6ca43e78c (patch)
tree	4d1ca07a1228ba16c12a67ddb08595770f397da1 /sysdeps/x86_64/multiarch
parent	3e694268750d51acc6a68b0ee7ded25a52902c20 (diff)
download	glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.tar.gz glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.tar.xz glibc-2d48b41c8fa610067c4d664ac2339ae6ca43e78c.zip