about summary refs log tree commit diff
path: root/sysdeps/x86_64/multiarch/strstr.c
diff options
context:
space:
mode:
authorRaghuveer Devulapalli <raghuveer.devulapalli@intel.com>2022-06-06 12:17:43 -0700
committerH.J. Lu <hjl.tools@gmail.com>2022-06-06 19:46:55 -0700
commit5082a287d5e9a1f9cb98b7c982a708a3684f1d5c (patch)
tree7fd99341b565ce257383a603daedcc9e019e2230 /sysdeps/x86_64/multiarch/strstr.c
parent8521001731d6539382fa875f1cac9864c466ef27 (diff)
downloadglibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.tar.gz
glibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.tar.xz
glibc-5082a287d5e9a1f9cb98b7c982a708a3684f1d5c.zip
x86_64: Add strstr function with 512-bit EVEX
Adding a 512-bit EVEX version of strstr. The algorithm works as follows:

(1) We spend a few cycles at the begining to peek into the needle. We
locate an edge in the needle (first occurance of 2 consequent distinct
characters) and also store the first 64-bytes into a zmm register.

(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.

(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.

Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)

Geometric mean of all benchmarks: new / old =  0.66

Difficult skiptable(0) : new / old =  0.02
Difficult skiptable(1) : new / old =  0.01
Difficult 2-way : new / old =  0.25
Difficult testing first 2 : new / old =  1.26
Difficult skiptable(0) : new / old =  0.05
Difficult skiptable(1) : new / old =  0.06
Difficult 2-way : new / old =  0.26
Difficult testing first 2 : new / old =  1.05
Difficult skiptable(0) : new / old =  0.42
Difficult skiptable(1) : new / old =  0.24
Difficult 2-way : new / old =  0.21
Difficult testing first 2 : new / old =  1.04
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/multiarch/strstr.c')
-rw-r--r--sysdeps/x86_64/multiarch/strstr.c24
1 files changed, 20 insertions, 4 deletions
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 95600a9de5..2fb8b169b6 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)
 
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)