about summary refs log tree commit diff
path: root/sysdeps
diff options
context:
space:
mode:
author: Ling Ma <ling.ml@alibaba-inc.com> 2014-07-14 00:02:52 -0400
committer: H.J. Lu <hjl.tools@gmail.com> 2014-07-30 08:02:35 -0700
commit: 05f3633da4f9df870d04dd77336e793746e57ed4 (patch)
tree: bd980df4702ffd9ebfa9cb4dac8823a659f0c430 /sysdeps
parent: a53fbd8e6cd2f69bdfa3431d616a5f332aea6664 (diff)
download: glibc-05f3633da4f9df870d04dd77336e793746e57ed4.tar.gz
glibc-05f3633da4f9df870d04dd77336e793746e57ed4.tar.xz
glibc-05f3633da4f9df870d04dd77336e793746e57ed4.zip
Improve 64bit memcpy performance for Haswell CPU with AVX instruction
In this patch we take advantage of Haswell (HSW) memory bandwidth,
reduce branch mispredictions by avoiding branch instructions, and
force the destination to be aligned for the AVX store instructions.

The CPU2006 403.gcc benchmark indicates that this patch improves
performance by 2% to 10%.
Diffstat (limited to 'sysdeps')
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile | 1
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 376
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy.S | 4
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy_chk.S | 3
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 22
-rw-r--r-- sysdeps/x86_64/multiarch/memmove.c | 12
-rw-r--r-- sysdeps/x86_64/multiarch/memmove_chk.c | 6
-rw-r--r-- sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 22
-rw-r--r-- sysdeps/x86_64/multiarch/mempcpy.S | 3
-rw-r--r-- sysdeps/x86_64/multiarch/mempcpy_chk.S | 3
11 files changed, 458 insertions, 6 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3bb9702b95..d7002a9df3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
 		   memmove-ssse3-back strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e93e598db..78e9b20079 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
   IFUNC_IMPL (i, name, __memmove_chk,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+			      __memmove_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
 			      __memmove_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+			      __memmove_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
 			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -214,6 +218,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
 			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -223,6 +229,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+			      __memcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -231,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
 			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -240,6 +250,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+			      __mempcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644
index 0000000000..3cac1e33cd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,376 @@
+/* memcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY	/* Files that define MEMCPY before including this one keep their own names.  */
+# define MEMCPY	__memcpy_avx_unaligned	/* Default symbol for the plain entry point.  */
+# define MEMCPY_CHK	__memcpy_chk_avx_unaligned	/* Default symbol for the checked entry point.  */
+#endif
+
+	.section .text.avx,"ax",@progbits	/* Everything below is emitted into the .text.avx section.  */
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)	/* Checked entry point: validates %rcx against the byte count in %rdx.  */
+	cmpq	%rdx, %rcx	/* NOTE(review): %rcx is presumably the destination object size -- confirm against the *_chk prototype.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* Unsigned %rcx < %rdx: jump to __chk_fail.  */
+END (MEMCPY_CHK)	/* No ret above: when the check passes, execution continues into the code that follows.  */
+#endif
+
+ENTRY (MEMCPY)	/* Copy %rdx bytes from (%rsi) to (%rdi); result is returned in %rax.  */
+	mov	%rdi, %rax	/* Return value: the destination pointer.  */
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax	/* mempcpy build: return destination + count instead.  */
+#endif
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	cmp	$16, %dl	/* Count is below 256 from here on, so %dl holds all of it.  */
+	jb	L(less_16bytes)
+	cmp	$128, %dl
+	jb	L(less_128bytes)
+	vmovdqu (%rsi), %xmm0	/* 128..255 bytes: load the first 128 and the last 128 source bytes.  */
+	lea	(%rsi, %rdx), %rcx	/* %rcx = one past the end of the source.  */
+	vmovdqu 0x10(%rsi), %xmm1
+	vmovdqu 0x20(%rsi), %xmm2
+	vmovdqu 0x30(%rsi), %xmm3
+	vmovdqu 0x40(%rsi), %xmm4
+	vmovdqu 0x50(%rsi), %xmm5
+	vmovdqu 0x60(%rsi), %xmm6
+	vmovdqu 0x70(%rsi), %xmm7
+	vmovdqu -0x80(%rcx), %xmm8
+	vmovdqu -0x70(%rcx), %xmm9
+	vmovdqu -0x60(%rcx), %xmm10
+	vmovdqu -0x50(%rcx), %xmm11
+	vmovdqu -0x40(%rcx), %xmm12
+	vmovdqu -0x30(%rcx), %xmm13
+	vmovdqu -0x20(%rcx), %xmm14
+	vmovdqu -0x10(%rcx), %xmm15
+	lea	(%rdi, %rdx), %rdx	/* %rdx = one past the end of the destination.  */
+	vmovdqu %xmm0, (%rdi)	/* Every load above precedes this first store.  */
+	vmovdqu %xmm1, 0x10(%rdi)
+	vmovdqu %xmm2, 0x20(%rdi)
+	vmovdqu %xmm3, 0x30(%rdi)
+	vmovdqu %xmm4, 0x40(%rdi)
+	vmovdqu %xmm5, 0x50(%rdi)
+	vmovdqu %xmm6, 0x60(%rdi)
+	vmovdqu %xmm7, 0x70(%rdi)
+	vmovdqu %xmm8, -0x80(%rdx)	/* Tail stores may overlap the head stores when count < 256.  */
+	vmovdqu %xmm9, -0x70(%rdx)
+	vmovdqu %xmm10, -0x60(%rdx)
+	vmovdqu %xmm11, -0x50(%rdx)
+	vmovdqu %xmm12, -0x40(%rdx)
+	vmovdqu %xmm13, -0x30(%rdx)
+	vmovdqu %xmm14, -0x20(%rdx)
+	vmovdqu %xmm15, -0x10(%rdx)
+	ret
+	.p2align 4
+L(less_128bytes):	/* 16..127 bytes reach here.  */
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovdqu (%rsi), %xmm0	/* 64..127 bytes: first 64 plus last 64 source bytes.  */
+	lea	(%rsi, %rdx), %rcx	/* %rcx = one past the end of the source.  */
+	vmovdqu 0x10(%rsi), %xmm1
+	vmovdqu 0x20(%rsi), %xmm2
+	lea	(%rdi, %rdx), %rdx	/* %rdx = one past the end of the destination.  */
+	vmovdqu 0x30(%rsi), %xmm3
+	vmovdqu -0x40(%rcx), %xmm4
+	vmovdqu -0x30(%rcx), %xmm5
+	vmovdqu -0x20(%rcx), %xmm6
+	vmovdqu -0x10(%rcx), %xmm7
+	vmovdqu %xmm0, (%rdi)	/* All loads are complete before the first store.  */
+	vmovdqu %xmm1, 0x10(%rdi)
+	vmovdqu %xmm2, 0x20(%rdi)
+	vmovdqu %xmm3, 0x30(%rdi)
+	vmovdqu %xmm4, -0x40(%rdx)
+	vmovdqu %xmm5, -0x30(%rdx)
+	vmovdqu %xmm6, -0x20(%rdx)
+	vmovdqu %xmm7, -0x10(%rdx)
+	ret
+
+	.p2align 4
+L(less_64bytes):	/* 16..63 bytes reach here.  */
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu (%rsi), %xmm0	/* 32..63 bytes: first 32 plus last 32 source bytes.  */
+	vmovdqu 0x10(%rsi), %xmm1
+	vmovdqu -0x20(%rsi, %rdx), %xmm6
+	vmovdqu -0x10(%rsi, %rdx), %xmm7
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, 0x10(%rdi)
+	vmovdqu %xmm6, -0x20(%rdi, %rdx)
+	vmovdqu %xmm7, -0x10(%rdi, %rdx)
+	ret
+
+	.p2align 4
+L(less_32bytes):	/* 16..31 bytes: counts below 16 were dispatched at the entry.  */
+	vmovdqu (%rsi), %xmm0	/* First 16 bytes.  */
+	vmovdqu -0x10(%rsi, %rdx), %xmm7	/* Last 16 bytes; overlaps the first load when count < 32.  */
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm7, -0x10(%rdi, %rdx)
+	ret
+
+	.p2align 4
+L(less_16bytes):	/* 0..15 bytes.  */
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	movq -0x08(%rsi, %rdx),	%rcx	/* 8..15 bytes: last 8 bytes.  */
+	movq (%rsi),	%rsi	/* First 8 bytes; %rsi is no longer needed as a pointer.  */
+	movq %rsi, (%rdi)
+	movq %rcx, -0x08(%rdi, %rdx)
+	ret
+
+	.p2align 4
+L(less_8bytes):	/* 0..7 bytes.  */
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov -0x04(%rsi, %rdx), %ecx	/* 4..7 bytes: last 4 bytes.  */
+	mov (%rsi),	%esi	/* First 4 bytes.  */
+	mov %esi, (%rdi)
+	mov %ecx, -0x04(%rdi, %rdx)
+	ret
+
+L(less_4bytes):	/* 0..3 bytes.  */
+	cmp	$1, %dl
+	jbe	L(less_2bytes)	/* Count is 0 or 1.  */
+	mov -0x02(%rsi, %rdx),	%cx	/* 2..3 bytes: last 2 bytes.  */
+	mov (%rsi),	%si	/* First 2 bytes.  */
+	mov %si, (%rdi)
+	mov %cx, -0x02(%rdi, %rdx)
+	ret
+
+L(less_2bytes):
+	jb	L(less_0bytes)	/* Flags are still those of "cmp $1, %dl": below means count == 0.  */
+	mov	(%rsi), %cl	/* Exactly one byte.  */
+	mov	%cl,	(%rdi)
+L(less_0bytes):
+	ret
+
+	.p2align 4
+L(256bytesormore):
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %rcx
+	sub	%rsi, %rcx	/* %rcx = destination - source.  */
+	cmp	%rdx, %rcx
+	jc	L(copy_backward)	/* Unsigned (dst - src) < count: dst starts inside the source range.  */
+#endif
+	cmp	$2048, %rdx
+	jae	L(gobble_data_movsb)	/* 2048 bytes or more: handled separately below.  */
+	mov	%rax, %r8	/* Save the return value; %rax is reused as the 0x80 stride.  */
+	lea	(%rsi, %rdx), %rcx	/* %rcx = one past the end of the source.  */
+	mov	%rdi, %r10	/* %r10 = original destination, for the head store after the loop.  */
+	vmovdqu -0x80(%rcx), %xmm5	/* Preload the last 128 source bytes into %xmm5..%xmm12.  */
+	vmovdqu -0x70(%rcx), %xmm6
+	mov	$0x80, %rax
+	and	$-32, %rdi
+	add	$32, %rdi	/* %rdi = next 32-byte boundary strictly above the original destination.  */
+	vmovdqu -0x60(%rcx), %xmm7
+	vmovdqu -0x50(%rcx), %xmm8
+	mov	%rdi, %r11
+	sub	%r10, %r11	/* %r11 = bytes skipped at the head (1..32).  */
+	vmovdqu -0x40(%rcx), %xmm9
+	vmovdqu -0x30(%rcx), %xmm10
+	sub	%r11, %rdx	/* Count no longer includes the skipped head.  */
+	vmovdqu -0x20(%rcx), %xmm11
+	vmovdqu -0x10(%rcx), %xmm12
+	vmovdqu	(%rsi), %ymm4	/* First 32 source bytes; they cover the skipped head.  */
+	add	%r11, %rsi	/* Advance the source by the same amount as the destination.  */
+	sub	%eax, %edx	/* Bias the count by one stride; it is below 2048 here, so 32 bits suffice.  */
+L(goble_128_loop):	/* 128 bytes per iteration: unaligned loads, 32-byte-aligned stores.  */
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 0x20(%rsi), %ymm1
+	vmovdqu 0x40(%rsi), %ymm2
+	vmovdqu 0x60(%rsi), %ymm3
+	add	%rax, %rsi
+	vmovdqa %ymm0, (%rdi)
+	vmovdqa %ymm1, 0x20(%rdi)
+	vmovdqa %ymm2, 0x40(%rdi)
+	vmovdqa %ymm3, 0x60(%rdi)
+	add	%rax, %rdi
+	sub	%eax, %edx
+	jae	L(goble_128_loop)	/* Repeat while the subtraction did not borrow.  */
+	add	%eax, %edx	/* Remove the bias: %rdx = bytes left over (below 128).  */
+	add	%rdi, %rdx	/* %rdx = one past the end of the destination.  */
+	vmovdqu	%ymm4, (%r10)	/* Head: first 32 bytes at the original destination.  */
+	vzeroupper
+	vmovdqu %xmm5, -0x80(%rdx)	/* Tail: last 128 bytes, which also cover the loop's leftover.  */
+	vmovdqu %xmm6, -0x70(%rdx)
+	vmovdqu %xmm7, -0x60(%rdx)
+	vmovdqu %xmm8, -0x50(%rdx)
+	vmovdqu %xmm9, -0x40(%rdx)
+	vmovdqu %xmm10, -0x30(%rdx)
+	vmovdqu %xmm11, -0x20(%rdx)
+	vmovdqu %xmm12, -0x10(%rdx)
+	mov	%r8, %rax	/* Restore the return value.  */
+	ret
+
+	.p2align 4
+L(gobble_data_movsb):	/* Count is 2048 or more.  */
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl	$3, %rcx	/* Threshold = 8 times the shared-cache-size-half value.  */
+	cmp	%rcx, %rdx
+	jae	L(gobble_big_data_fwd)	/* At or above the threshold: use the non-temporal store loop.  */
+	mov	%rdx, %rcx	/* %rcx = byte count for rep movsb.  */
+	mov	%rdx, %rcx	/* NOTE(review): exact duplicate of the previous instruction; it has no additional effect.  */
+	rep	movsb	/* %rax was set at the entry and is not modified on this path.  */
+	ret
+
+	.p2align 4
+L(gobble_big_data_fwd):
+	lea	(%rsi, %rdx), %rcx	/* %rcx = one past the end of the source.  */
+	vmovdqu	(%rsi), %ymm4	/* First 32 source bytes, stored after the loop.  */
+	vmovdqu -0x80(%rsi,%rdx), %xmm5	/* Last 128 source bytes go to %xmm5..%xmm12.  */
+	vmovdqu -0x70(%rcx), %xmm6
+	vmovdqu -0x60(%rcx), %xmm7
+	vmovdqu -0x50(%rcx), %xmm8
+	vmovdqu -0x40(%rcx), %xmm9
+	vmovdqu -0x30(%rcx), %xmm10
+	vmovdqu -0x20(%rcx), %xmm11
+	vmovdqu -0x10(%rcx), %xmm12
+	mov	%rdi, %r8	/* %r8 = original destination.  */
+	and	$-32, %rdi
+	add	$32, %rdi	/* %rdi = next 32-byte boundary strictly above the original destination.  */
+	mov	%rdi, %r10
+	sub	%r8, %r10	/* %r10 = bytes skipped at the head (1..32).  */
+	sub	%r10, %rdx
+	add	%r10, %rsi
+	lea	(%rdi, %rdx), %rcx	/* %rcx = one past the end of the destination.  */
+	add	$-0x80, %rdx	/* Bias the count by one 128-byte stride.  */
+L(gobble_mem_fwd_loop):	/* 128 bytes per iteration with non-temporal, 32-byte-aligned stores.  */
+	prefetchnta 0x1c0(%rsi)
+	prefetchnta 0x280(%rsi)
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	0x20(%rsi), %ymm1
+	vmovdqu	0x40(%rsi), %ymm2
+	vmovdqu	0x60(%rsi), %ymm3
+	sub	$-0x80, %rsi	/* Advances %rsi by 128.  */
+	vmovntdq	%ymm0, (%rdi)
+	vmovntdq	%ymm1, 0x20(%rdi)
+	vmovntdq	%ymm2, 0x40(%rdi)
+	vmovntdq	%ymm3, 0x60(%rdi)
+	sub	$-0x80, %rdi	/* Advances %rdi by 128.  */
+	add	$-0x80, %rdx
+	jb	L(gobble_mem_fwd_loop)	/* The add carries exactly when %rdx was at least 0x80 before it.  */
+	sfence	/* Store fence after the non-temporal stores.  */
+	vmovdqu	%ymm4, (%r8)	/* Head: first 32 bytes at the original destination.  */
+	vzeroupper
+	vmovdqu %xmm5, -0x80(%rcx)	/* Tail: last 128 bytes of the destination.  */
+	vmovdqu %xmm6, -0x70(%rcx)
+	vmovdqu %xmm7, -0x60(%rcx)
+	vmovdqu %xmm8, -0x50(%rcx)
+	vmovdqu %xmm9, -0x40(%rcx)
+	vmovdqu %xmm10, -0x30(%rcx)
+	vmovdqu %xmm11, -0x20(%rcx)
+	vmovdqu %xmm12, -0x10(%rcx)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(copy_backward):	/* Reached only from L(256bytesormore), so count is at least 256.  */
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl	$3, %rcx	/* Threshold = 8 times the shared-cache-size-half value.  */
+	vmovdqu (%rsi), %xmm5	/* Preload the first 128 source bytes into %xmm5..%xmm12.  */
+	vmovdqu 0x10(%rsi), %xmm6
+	add	%rdx, %rdi	/* %rdi = one past the end of the destination.  */
+	vmovdqu 0x20(%rsi), %xmm7
+	vmovdqu 0x30(%rsi), %xmm8
+	lea	-0x20(%rdi), %r10	/* %r10 = address of the last 32 destination bytes.  */
+	mov %rdi, %r11
+	vmovdqu 0x40(%rsi), %xmm9
+	vmovdqu 0x50(%rsi), %xmm10
+	and	$0x1f, %r11	/* %r11 = low 5 bits of the destination end (0..31).  */
+	vmovdqu 0x60(%rsi), %xmm11
+	vmovdqu 0x70(%rsi), %xmm12
+	xor	%r11, %rdi	/* Clears those low bits: %rdi rounded down to a 32-byte boundary.  */
+	add	%rdx, %rsi	/* %rsi = one past the end of the source.  */
+	vmovdqu	-0x20(%rsi), %ymm4	/* Last 32 source bytes, stored after the loop.  */
+	sub	%r11, %rsi	/* Move the source end back by the same amount as the destination end.  */
+	sub	%r11, %rdx
+	cmp	%rcx, %rdx
+	ja	L(gobble_big_data_bwd)	/* Above the threshold: use non-temporal stores.  */
+	add	$-0x80, %rdx	/* Bias the count by one 128-byte stride.  */
+L(gobble_mem_bwd_llc):	/* 128 bytes per iteration, moving downward, aligned stores.  */
+	vmovdqu	-0x20(%rsi), %ymm0
+	vmovdqu	-0x40(%rsi), %ymm1
+	vmovdqu	-0x60(%rsi), %ymm2
+	vmovdqu	-0x80(%rsi), %ymm3
+	lea	-0x80(%rsi), %rsi
+	vmovdqa	%ymm0, -0x20(%rdi)
+	vmovdqa	%ymm1, -0x40(%rdi)
+	vmovdqa	%ymm2, -0x60(%rdi)
+	vmovdqa	%ymm3, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	add	$-0x80, %rdx
+	jb	L(gobble_mem_bwd_llc)	/* The add carries exactly when %rdx was at least 0x80 before it.  */
+	vmovdqu	%ymm4, (%r10)	/* Last 32 bytes of the destination.  */
+	vzeroupper
+	vmovdqu %xmm5, (%rax)	/* First 128 bytes; assumes %rax is the original destination (USE_AS_MEMPCPY not defined) -- TODO confirm.  */
+	vmovdqu %xmm6, 0x10(%rax)
+	vmovdqu %xmm7, 0x20(%rax)
+	vmovdqu %xmm8, 0x30(%rax)
+	vmovdqu %xmm9, 0x40(%rax)
+	vmovdqu %xmm10, 0x50(%rax)
+	vmovdqu %xmm11, 0x60(%rax)
+	vmovdqu %xmm12, 0x70(%rax)
+	ret
+
+	.p2align 4
+L(gobble_big_data_bwd):
+	add	$-0x80, %rdx	/* Bias the count by one 128-byte stride.  */
+L(gobble_mem_bwd_loop):	/* Same downward loop as above, but with non-temporal stores.  */
+	prefetchnta -0x1c0(%rsi)
+	prefetchnta -0x280(%rsi)
+	vmovdqu	-0x20(%rsi), %ymm0
+	vmovdqu	-0x40(%rsi), %ymm1
+	vmovdqu	-0x60(%rsi), %ymm2
+	vmovdqu	-0x80(%rsi), %ymm3
+	lea	-0x80(%rsi), %rsi
+	vmovntdq	%ymm0, -0x20(%rdi)
+	vmovntdq	%ymm1, -0x40(%rdi)
+	vmovntdq	%ymm2, -0x60(%rdi)
+	vmovntdq	%ymm3, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	add	$-0x80, %rdx
+	jb	L(gobble_mem_bwd_loop)	/* The add carries exactly when %rdx was at least 0x80 before it.  */
+	sfence	/* Store fence after the non-temporal stores.  */
+	vmovdqu	%ymm4, (%r10)	/* Last 32 bytes of the destination.  */
+	vzeroupper
+	vmovdqu %xmm5, (%rax)	/* First 128 bytes; assumes %rax is the original destination (USE_AS_MEMPCPY not defined) -- TODO confirm.  */
+	vmovdqu %xmm6, 0x10(%rax)
+	vmovdqu %xmm7, 0x20(%rax)
+	vmovdqu %xmm8, 0x30(%rax)
+	vmovdqu %xmm9, 0x40(%rax)
+	vmovdqu %xmm10, 0x50(%rax)
+	vmovdqu %xmm11, 0x60(%rax)
+	vmovdqu %xmm12, 0x70(%rax)
+	ret
+#endif
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 40ae926386..e666695407 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
+1:	leaq	__memcpy_avx_unaligned(%rip), %rax
+	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	jz 1f
+	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
 	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
 	jnz	2f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 3c0270fd23..076b19a9ea 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
+	testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	jz  2f
+	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
 END(__memcpy_chk)
 # else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644
index 0000000000..faed9fb4c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* memmove with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMMOVE	/* Enables the overlap test and the L(copy_backward) code in the included file.  */
+#define MEMCPY		__memmove_avx_unaligned	/* Symbol used for the plain entry point.  */
+#define MEMCPY_CHK	__memmove_chk_avx_unaligned	/* Symbol used for the checked entry point.  */
+#include "memcpy-avx-unaligned.S"	/* The implementation itself, assembled under the names above.  */
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index ba86e7bbb1..0c9af7e4df 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,8 @@
 extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+
 #endif
 
 #include "string/memmove.c"
@@ -47,10 +49,12 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_SSSE3
-	    ? (HAS_FAST_COPY_BACKWARD
-	       ? __memmove_ssse3_back : __memmove_ssse3)
-	    : __memmove_sse2)
+	    HAS_AVX
+	    ? __memmove_avx_unaligned
+	    : (HAS_SSSE3
+	       ? (HAS_FAST_COPY_BACKWARD
+	          ? __memmove_ssse3_back : __memmove_ssse3)
+	       : __memmove_sse2));
 
 strong_alias (__libc_memmove, memmove)
 
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index cb1acb6598..44344f2820 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -25,11 +25,13 @@
 extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_SSSE3
+	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    (HAS_SSSE3
 	    ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-	    : __memmove_chk_sse2);
+	    : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644
index 0000000000..438bda386d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,22 @@
+/* mempcpy with AVX
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMPCPY	/* Makes the included routine return destination + count in %rax.  */
+#define MEMCPY		__mempcpy_avx_unaligned	/* Symbol used for the plain entry point.  */
+#define MEMCPY_CHK	__mempcpy_chk_avx_unaligned	/* Symbol used for the checked entry point.  */
+#include "memcpy-avx-unaligned.S"	/* The implementation itself, assembled under the names above.  */
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b9f04c2ec4..7589d8c1ec 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,6 +37,9 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
+	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	jz	2f
+	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
 END(__mempcpy)
 
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index c28473a669..88e0b74e83 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
+	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	jz	2f
+	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
 END(__mempcpy_chk)
 # else