diff options
author | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2023-10-27 00:08:15 -0300 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2023-10-27 09:24:40 -0300 |
commit | 4c023248ca791eefec5a7553057c0af3b9035f6c (patch) | |
tree | 08449508c9fd02a85b5baf5796e45de9bcb69b39 | |
parent | 51da90c20d54c5098d9e1b01950a43bd2197813d (diff) | |
download | glibc-4c023248ca791eefec5a7553057c0af3b9035f6c.tar.gz glibc-4c023248ca791eefec5a7553057c0af3b9035f6c.tar.xz glibc-4c023248ca791eefec5a7553057c0af3b9035f6c.zip |
x86: Do not prefer ERMS for memset on Zen3+
The REP STOSB usage on memset does not show any performance gain on Zen3/Zen4 cores compared to the vectorized loops. Checked on x86_64-linux-gnu.
-rw-r--r-- | sysdeps/x86/dl-cacheinfo.h | 16 |
1 files changed, 11 insertions, 5 deletions
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 546ff0725a..f5fd7b2566 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -1010,11 +1010,17 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; - /* NB: The default value of the x86_rep_stosb_threshold tunable is the - same as the default value of __x86_rep_stosb_threshold and the - minimum value is fixed. */ - rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, - long int, NULL); + /* For AMD Zen3+ architecture, the performance of vectorized loop is + slight better than ERMS. */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_stosb_threshold = SIZE_MAX; + + if (TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) + /* NB: The default value of the x86_rep_stosb_threshold tunable is the + same as the default value of __x86_rep_stosb_threshold and the + minimum value is fixed. */ + rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, + long int, NULL); TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); |