about summary refs log tree commit diff
diff options
context:
space:
mode:
author: H.J. Lu <hjl.tools@gmail.com> 2017-06-02 17:32:21 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2017-06-02 17:32:37 -0700
commit: 808fd9e6fe23f96078d4e297de1131117d3898bb (patch)
tree: 76336f1f7937cfd45db787e7b659a77da655ba26
parent: 3e6def237a5681387b27ac55298b3ab25a054dbf (diff)
download: glibc-808fd9e6fe23f96078d4e297de1131117d3898bb.tar.gz
glibc-808fd9e6fe23f96078d4e297de1131117d3898bb.tar.xz
glibc-808fd9e6fe23f96078d4e297de1131117d3898bb.zip
x86: Update __x86_shared_non_temporal_threshold
__x86_shared_non_temporal_threshold was set to 6 times the per-core
shared cache size, based on the large memcpy micro benchmark in glibc
on an 8-core processor.  For a processor with more than 8 cores, the
threshold is too low.  Set __x86_shared_non_temporal_threshold to
3/4 of the total shared cache size so that it is unchanged on 8-core
processors (8 * 3/4 = 6).  On processors with fewer than 8 cores, the
threshold is lower.

	* sysdeps/x86/cacheinfo.c (__x86_shared_non_temporal_threshold):
	Set to the 3/4 of the total shared cache size.
-rw-r--r--  ChangeLog  5
-rw-r--r--  sysdeps/x86/cacheinfo.c  6
2 files changed, 9 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 3ac07f0517..a97f09d3da 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-06-02  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86/cacheinfo.c (__x86_shared_non_temporal_threshold):
+	Set to the 3/4 of the total shared cache size.
+
 2017-06-02  Rical Jasan  <ricaljasan@pacific.net>
 
 	* manual/errno.texi: Remove redundant error strings.
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 12ffeef5b5..f66f2b86e0 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -767,8 +767,10 @@ intel_bug_no_cache_info:
 
   /* The large memcpy micro benchmark in glibc shows that 6 times of
      shared cache size is the approximate value above which non-temporal
-     store becomes faster.  */
-  __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+     store becomes faster on a 8-core processor.  This is the 3/4 of the
+     total shared cache size.  */
+  __x86_shared_non_temporal_threshold
+    = __x86_shared_cache_size * threads * 3 / 4;
 }
 
 #endif