about summary refs log tree commit diff
path: root/sysdeps/x86/dl-cacheinfo.h
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86/dl-cacheinfo.h')
-rw-r--r--sysdeps/x86/dl-cacheinfo.h109
1 files changed, 96 insertions, 13 deletions
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a1c03b8903..e9579505a3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -567,6 +567,48 @@ handle_zhaoxin (int name)
   return 0;
 }
 
+static long int __attribute__ ((noinline))
+handle_hygon (int name)
+{
+  unsigned int eax;
+  unsigned int ebx;
+  unsigned int ecx;
+  unsigned int edx;
+  unsigned int count = 0x1;
+
+  if (name >= _SC_LEVEL3_CACHE_SIZE)
+    count = 0x3;
+  else if (name >= _SC_LEVEL2_CACHE_SIZE)
+    count = 0x2;
+  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+    count = 0x0;
+
+  /* Use __cpuid__ '0x8000_001D' to compute cache details.  */
+  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+  switch (name)
+    {
+    case _SC_LEVEL1_ICACHE_ASSOC:
+    case _SC_LEVEL1_DCACHE_ASSOC:
+    case _SC_LEVEL2_CACHE_ASSOC:
+    case _SC_LEVEL3_CACHE_ASSOC:
+      return ((ebx >> 22) & 0x3ff) + 1;
+    case _SC_LEVEL1_ICACHE_LINESIZE:
+    case _SC_LEVEL1_DCACHE_LINESIZE:
+    case _SC_LEVEL2_CACHE_LINESIZE:
+    case _SC_LEVEL3_CACHE_LINESIZE:
+      return (ebx & 0xfff) + 1;
+    case _SC_LEVEL1_ICACHE_SIZE:
+    case _SC_LEVEL1_DCACHE_SIZE:
+    case _SC_LEVEL2_CACHE_SIZE:
+    case _SC_LEVEL3_CACHE_SIZE:
+      return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+    default:
+      __builtin_unreachable ();
+    }
+  return -1;
+}
+
 static void
 get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
                 long int core)
@@ -889,6 +931,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 
       shared_per_thread = shared;
     }
+  else if (cpu_features->basic.kind == arch_kind_hygon)
+    {
+      data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
+      shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
+      shared_per_thread = shared;
+
+      level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
+      level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
+      level1_dcache_size = data;
+      level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
+      level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
+      level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);;
+      level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
+      level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
+      level3_cache_size = shared;
+      level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
+      level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
+    }
 
   cpu_features->level1_icache_size = level1_icache_size;
   cpu_features->level1_icache_linesize = level1_icache_linesize;
@@ -988,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  /* Non-temporal stores are more performant on Intel and AMD hardware above
-     non_temporal_threshold. Enable this for both Intel and AMD hardware. */
-  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
-      && (cpu_features->basic.kind == arch_kind_intel
-	  || cpu_features->basic.kind == arch_kind_amd))
-    memset_non_temporal_threshold = non_temporal_threshold;
-
   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
      cases slower than the vectorized path (and for some alignments,
      it is really slow, check BZ #30994).  */
@@ -1017,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size != 0)
     shared = tunable_size;
 
+  /* Non-temporal stores are more performant on some hardware above
+     non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
+     Intel, AMD and Hygon hardware. */
+  unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+    memset_non_temporal_threshold = non_temporal_threshold;
+
   tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
   if (tunable_size > minimum_non_temporal_threshold
       && tunable_size <= maximum_non_temporal_threshold)
@@ -1042,18 +1101,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
        slightly better than ERMS.  */
     rep_stosb_threshold = SIZE_MAX;
 
+  /*
+     For memset, the non-temporal implementation is only accessed through the
+     stosb code. ie:
+     ```
+     if (size >= rep_stosb_thresh)
+     {
+    	if (size >= non_temporal_thresh)
+     {
+     do_non_temporal ();
+     }
+    	do_stosb ();
+     }
+     do_normal_vec_loop ();
+     ```
+     So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+     to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
+    `rep stosb` will never be used.
+   */
+  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+			   memset_non_temporal_threshold,
+			   minimum_non_temporal_threshold, SIZE_MAX);
+  /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the
+     final value of `x86_memset_non_temporal_threshold`. In some cases this can
+     be a matter of correctness.  */
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB))
+    rep_stosb_threshold
+	= TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+			   SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
 			   minimum_non_temporal_threshold,
 			   maximum_non_temporal_threshold);
-  TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
-			   memset_non_temporal_threshold,
-			   minimum_non_temporal_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 			   minimum_rep_movsb_threshold, SIZE_MAX);
-  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
-			   SIZE_MAX);
 
   unsigned long int rep_movsb_stop_threshold;
   /* Setting the upper bound of ERMS to the computed value of