diff options
author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2022-06-06 12:17:43 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2022-06-06 19:46:55 -0700 |
commit | 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c (patch) | |
tree | 7fd99341b565ce257383a603daedcc9e019e2230 /sysdeps/x86_64/multiarch/strstr.c | |
parent | 8521001731d6539382fa875f1cac9864c466ef27 (diff) | |
download | glibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.tar.gz glibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.tar.xz glibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.zip |
x86_64: Add strstr function with 512-bit EVEX
Adding a 512-bit EVEX version of strstr. The algorithm works as follows: (1) We spend a few cycles at the begining to peek into the needle. We locate an edge in the needle (first occurance of 2 consequent distinct characters) and also store the first 64-bytes into a zmm register. (2) We search for the edge in the haystack by looking into one cache line of the haystack at a time. This avoids having to read past a page boundary which can cause a seg fault. (3) If an edge is found in the haystack we first compare the first 64-bytes of the needle (already stored in a zmm register) before we proceed with a full string compare performed byte by byte. Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) Geometric mean of all benchmarks: new / old = 0.66 Difficult skiptable(0) : new / old = 0.02 Difficult skiptable(1) : new / old = 0.01 Difficult 2-way : new / old = 0.25 Difficult testing first 2 : new / old = 1.26 Difficult skiptable(0) : new / old = 0.05 Difficult skiptable(1) : new / old = 0.06 Difficult 2-way : new / old = 0.26 Difficult testing first 2 : new / old = 1.05 Difficult skiptable(0) : new / old = 0.42 Difficult skiptable(1) : new / old = 0.24 Difficult 2-way : new / old = 0.21 Difficult testing first 2 : new / old = 1.04 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/multiarch/strstr.c')
-rw-r--r-- | sysdeps/x86_64/multiarch/strstr.c | 24 |
1 files changed, 20 insertions, 4 deletions
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 95600a9de5..2fb8b169b6 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -35,16 +35,32 @@ extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; #include "init-arch.h" /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - ? __strstr_sse2_unaligned - : __strstr_sse2) +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return __strstr_avx512; + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return __strstr_sse2_unaligned; + + return __strstr_sse2; +} + +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); #undef strstr strong_alias (__libc_strstr, strstr) |