diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2023-06-07 13:18:03 -0500 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2023-06-12 11:33:39 -0500 |
commit | 180897c161a171d8ef0faee1c6c9fd6b57d8b13b (patch) | |
tree | 89e71e02a6e1edc57bb13f311228816dcbc92bd6 /sysdeps/x86/cpu-features.c | |
parent | f193ea20eddc6cef84cba54cf1a647204ee6a86b (diff) | |
download | glibc-180897c161a171d8ef0faee1c6c9fd6b57d8b13b.tar.gz glibc-180897c161a171d8ef0faee1c6c9fd6b57d8b13b.tar.xz glibc-180897c161a171d8ef0faee1c6c9fd6b57d8b13b.zip |
x86: Make the divisor in setting `non_temporal_threshold` cpu specific
Different systems prefer a different divisors. From benchmarks[1] so far the following divisors have been found: ICX : 2 SKX : 2 BWD : 8 For Intel, we are generalizing that BWD and older prefers 8 as a divisor, and SKL and newer prefers 2. This number can be further tuned as benchmarks are run. [1]: https://github.com/goldsteinn/memcpy-nt-benchmarks Reviewed-by: DJ Delorie <dj@redhat.com>
Diffstat (limited to 'sysdeps/x86/cpu-features.c')
-rw-r--r-- | sysdeps/x86/cpu-features.c | 31 |
1 files changed, 22 insertions, 9 deletions
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index d52a718e92..525828f59c 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features) unsigned int stepping = 0; enum cpu_features_kind kind; + cpu_features->cachesize_non_temporal_divisor = 4; #if !HAS_CPUID if (__get_cpuid_max (0, 0) == 0) { @@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features) /* Bigcore/Default Tuning. */ default: + default_tuning: /* Unknown family 0x06 processors. Assuming this is one of Core i3/i5/i7 processors if AVX is available. */ if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) break; - /* Fall through. */ - case INTEL_BIGCORE_NEHALEM: - case INTEL_BIGCORE_WESTMERE: + + enable_modern_features: /* Rep string instructions, unaligned load, unaligned copy, and pminub are fast on Intel Core i3, i5 and i7. */ cpu_features->preferred[index_arch_Fast_Rep_String] @@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features) | bit_arch_Prefer_PMINUB_for_stringop); break; - /* - Default tuned Bigcore microarch. + case INTEL_BIGCORE_NEHALEM: + case INTEL_BIGCORE_WESTMERE: + /* Older CPUs prefer non-temporal stores at lower threshold. */ + cpu_features->cachesize_non_temporal_divisor = 8; + goto enable_modern_features; + + /* Older Bigcore microarch (smaller non-temporal store + threshold). */ case INTEL_BIGCORE_SANDYBRIDGE: case INTEL_BIGCORE_IVYBRIDGE: case INTEL_BIGCORE_HASWELL: case INTEL_BIGCORE_BROADWELL: + cpu_features->cachesize_non_temporal_divisor = 8; + goto default_tuning; + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ case INTEL_BIGCORE_SKYLAKE: case INTEL_BIGCORE_KABYLAKE: case INTEL_BIGCORE_COMETLAKE: @@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features) case INTEL_BIGCORE_SAPPHIRERAPIDS: case INTEL_BIGCORE_EMERALDRAPIDS: case INTEL_BIGCORE_GRANITERAPIDS: - */ + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; - /* - Default tuned Mixed (bigcore + atom SOC). + /* Default tuned Mixed (bigcore + atom SOC). */ case INTEL_MIXED_LAKEFIELD: case INTEL_MIXED_ALDERLAKE: - */ + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; } /* Disable TSX on some processors to avoid TSX on kernels that |