1 files changed, 18 insertions, 20 deletions
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d5101615e3..f34d12846c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   long int data = -1;
   long int shared = -1;
   long int shared_per_thread = -1;
-  long int core = -1;
   unsigned int threads = 0;
   unsigned long int level1_icache_size = -1;
   unsigned long int level1_icache_linesize = -1;
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (cpu_features->basic.kind == arch_kind_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
-      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
       shared_per_thread = shared;
 
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
       level1_dcache_linesize
 	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
-      level2_cache_size = core;
+      level2_cache_size
+	= handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
       level2_cache_assoc
 	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
       level2_cache_linesize
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level4_cache_size
 	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+			     level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
     {
       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
       shared_per_thread = shared;
 
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
       level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
 
-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+      get_common_cache_info (&shared, &shared_per_thread, &threads,
+			     level2_cache_size);
     }
   else if (cpu_features->basic.kind == arch_kind_amd)
     {
       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
-      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level1_dcache_size = data;
       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
-      level2_cache_size = core;
+      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
       level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
       level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
       level3_cache_size = shared;
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       if (shared <= 0)
         {
            /* No shared L3 cache.  All we have is the L2 cache.  */
-           shared = core;
+           shared = level2_cache_size;
         }
       else if (cpu_features->basic.family < 0x17)
         {
            /* Account for exclusive L2 and L3 caches.  */
-           shared += core;
+           shared += level2_cache_size;
         }
 
       shared_per_thread = shared;
@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
+   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+      cases slower than the vectorized path (and for some alignments,
+      it is really slow, check BZ #30994).  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_threshold = non_temporal_threshold;
+
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
 
@@ -1028,16 +1033,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 			   SIZE_MAX);
 
   unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
   /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
+     non-temporal threshold for all architectures.  */
+  rep_movsb_stop_threshold = non_temporal_threshold;
 
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;