diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2024-05-24 12:38:51 -0500 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2024-05-30 12:36:09 -0500 |
commit | 46b5e98ef6f1b9f4b53851f152ecb8209064b26c (patch) | |
tree | f3008f5bbe51de418ade2aeb13363408c7ba2536 /sysdeps/x86 | |
parent | 5bf0ab80573d66e4ae5d94b094659094336da90f (diff) | |
download | glibc-46b5e98ef6f1b9f4b53851f152ecb8209064b26c.tar.gz glibc-46b5e98ef6f1b9f4b53851f152ecb8209064b26c.tar.xz glibc-46b5e98ef6f1b9f4b53851f152ecb8209064b26c.zip |
x86: Add separate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always the same. This includes both the exact value and whether non-temporal stores are profitable at all for a given arch. This patch adds the `x86_memset_non_temporal_threshold` tunable. Currently we disable non-temporal stores for non-Intel vendors, as the only benchmarks showing their benefit have been on Intel hardware. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86')
-rw-r--r-- | sysdeps/x86/cacheinfo.h | 8 | ||||
-rw-r--r-- | sysdeps/x86/dl-cacheinfo.h | 16 | ||||
-rw-r--r-- | sysdeps/x86/dl-diagnostics-cpu.c | 2 | ||||
-rw-r--r-- | sysdeps/x86/dl-tunables.list | 3 | ||||
-rw-r--r-- | sysdeps/x86/include/cpu-features.h | 4 |
5 files changed, 31 insertions, 2 deletions
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h index ab73556772..83491607c7 100644 --- a/sysdeps/x86/cacheinfo.h +++ b/sysdeps/x86/cacheinfo.h @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Threshold to use non temporal store. */ +/* Threshold to use non temporal store in memmove. */ long int __x86_shared_non_temporal_threshold attribute_hidden; +/* Threshold to use non temporal store in memset. */ +long int __x86_memset_non_temporal_threshold attribute_hidden; + /* Threshold to use Enhanced REP MOVSB. */ long int __x86_rep_movsb_threshold attribute_hidden = 2048; @@ -77,6 +80,9 @@ init_cacheinfo (void) __x86_shared_non_temporal_threshold = cpu_features->non_temporal_threshold; + __x86_memset_non_temporal_threshold + = cpu_features->memset_non_temporal_threshold; + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 5a98f70364..d375a7cba6 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; + /* Non-temporal stores in memset have only been tested on Intel hardware. + Until we benchmark data on other x86 processor, disable non-temporal + stores in memset. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (cpu_features->basic.kind == arch_kind_intel) + memset_non_temporal_threshold = non_temporal_threshold; + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of cases slower than the vectorized path (and for some alignments, it is really slow, check BZ #30994). 
*/ @@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) && tunable_size <= maximum_non_temporal_threshold) non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + memset_non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS ( + x86_memset_non_temporal_threshold, memset_non_temporal_threshold, + minimum_non_temporal_threshold, maximum_non_temporal_threshold); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; cpu_features->rep_movsb_threshold = rep_movsb_threshold; cpu_features->rep_stosb_threshold = rep_stosb_threshold; cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index ceafde9481..49eeb5f70a 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -94,6 +94,8 @@ _dl_diagnostics_cpu (void) cpu_features->shared_cache_size); print_cpu_features_value ("non_temporal_threshold", cpu_features->non_temporal_threshold); + print_cpu_features_value 
("memset_non_temporal_threshold", + cpu_features->memset_non_temporal_threshold); print_cpu_features_value ("rep_movsb_threshold", cpu_features->rep_movsb_threshold); print_cpu_features_value ("rep_movsb_stop_threshold", diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index 7d82da0dec..a0a1299592 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -30,6 +30,9 @@ glibc { x86_non_temporal_threshold { type: SIZE_T } + x86_memset_non_temporal_threshold { + type: SIZE_T + } x86_rep_movsb_threshold { type: SIZE_T # Since there is overhead to set up REP MOVSB operation, REP diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index cd7bd27cf3..aaae44f0e1 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -944,8 +944,10 @@ struct cpu_features /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ unsigned long int shared_cache_size; - /* Threshold to use non temporal store. */ + /* Threshold to use non temporal store in memmove. */ unsigned long int non_temporal_threshold; + /* Threshold to use non temporal store in memset. */ + unsigned long int memset_non_temporal_threshold; /* Threshold to use "rep movsb". */ unsigned long int rep_movsb_threshold; /* Threshold to stop using "rep movsb". */ |