about summary refs log tree commit diff
diff options
context:
space:
mode:
author H.J. Lu <hjl.tools@gmail.com> 2015-01-30 06:50:20 -0800
committer H.J. Lu <hjl.tools@gmail.com> 2015-01-30 15:37:58 -0800
commit 5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97 (patch)
tree 8eabf127206283d2421bc40b6bc44e123e346598
parent b658fdd82b4524cf6a39881d092caa23f63d93ac (diff)
download glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.tar.gz
glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.tar.xz
glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.zip
Use AVX unaligned memcpy only if AVX2 is available
memcpy with unaligned 256-bit AVX register loads/stores is slow on older
processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

	[BZ #17801]
	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
	New.
	(index_AVX_Fast_Unaligned_Load): Likewise.
	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
-rw-r--r-- ChangeLog 18
-rw-r--r-- NEWS 4
-rw-r--r-- sysdeps/x86_64/multiarch/init-arch.c 9
-rw-r--r-- sysdeps/x86_64/multiarch/init-arch.h 4
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy.S 2
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy_chk.S 2
-rw-r--r-- sysdeps/x86_64/multiarch/memmove.c 2
-rw-r--r-- sysdeps/x86_64/multiarch/memmove_chk.c 2
-rw-r--r-- sysdeps/x86_64/multiarch/mempcpy.S 2
-rw-r--r-- sysdeps/x86_64/multiarch/mempcpy_chk.S 2
10 files changed, 37 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index 26f7f3f3b1..a696e396b2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #17801]
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+	New.
+	(index_AVX_Fast_Unaligned_Load): Likewise.
+	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
 	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
diff --git a/NEWS b/NEWS
index 8e2729bddd..c91b9fc58a 100644
--- a/NEWS
+++ b/NEWS
@@ -17,8 +17,8 @@ Version 2.21
   17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
   17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
   17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
-  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
-  17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+  17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
 
 * A new semaphore algorithm has been implemented in generic C code for all
   machines. Previous custom assembly implementations of semaphore were
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 9299360612..7dec21884d 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
 	  /* Determine if AVX is usable.  */
 	  if (CPUID_AVX)
 	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-	  /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+	     AVX registers are faster on processors with AVX2.  */
 	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+	    __cpu_features.feature[index_AVX2_Usable]
+	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
 	  /* Determine if FMA is usable.  */
 	  if (CPUID_FMA)
 	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 55f1c5b34c..e6b5ba5530 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable			(1 << 8)
 #define bit_Slow_SSE4_2			(1 << 9)
 #define bit_AVX2_Usable			(1 << 10)
+#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable		FEATURE_INDEX_1
 # define index_Slow_SSE4_2		FEATURE_INDEX_1
 # define index_AVX2_Usable		FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 992e40db81..4e18cd3070 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz 1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 5e9cf004b0..1e756ea0c2 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz  2f
 	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index d93bfd05c0..dd153a3eaa 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX
+	    HAS_AVX_FAST_UNALIGNED_LOAD
 	    ? __memmove_avx_unaligned
 	    : (HAS_SSSE3
 	       ? (HAS_FAST_COPY_BACKWARD
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 743ca2a460..8b12d002dc 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
 	    (HAS_SSSE3
 	    ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index cdf1dab62b..2eaacdf049 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index b7f9e89ea2..17b84701b0 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret