diff options
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/isa-default-impl.h | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/memchr.S | 357 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-evex.h | 29 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 72 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memchr-avx2.S | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memchr-evex.S | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memchr-sse2.S | 363 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rawmemchr-avx2.S | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rawmemchr-evex.S | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rawmemchr-sse2.S | 198 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rtld-memchr.S | 18 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rtld-rawmemchr.S | 18 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemchr-avx2.S | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemchr-evex.S | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemchr-sse2.S | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/rawmemchr.S | 184 | ||||
-rw-r--r-- | sysdeps/x86_64/wmemchr.S | 28 |
17 files changed, 720 insertions, 604 deletions
diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h index 34634668e5..7d7832b1f5 100644 --- a/sysdeps/x86_64/isa-default-impl.h +++ b/sysdeps/x86_64/isa-default-impl.h @@ -46,4 +46,14 @@ # error "Unsupported ISA Level!" #endif +#if IS_IN(rtld) +# if !defined USE_MULTIARCH +# error "RTLD version should only exist in multiarch build" +# endif +#else +# if defined USE_MULTIARCH +# error "Multiarch build should not use ISA_DEFAULT_IMPL without RTLD" +# endif +#endif + #include ISA_DEFAULT_IMPL diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index a160fd9b00..20b43508c4 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -15,358 +15,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#define MEMCHR __memchr -#ifdef USE_AS_WMEMCHR -# define MEMCHR wmemchr -# define PCMPEQ pcmpeqd -# define CHAR_PER_VEC 4 -#else -# define MEMCHR memchr -# define PCMPEQ pcmpeqb -# define CHAR_PER_VEC 16 -#endif +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" -/* fast SSE2 version with using pmaxub and 64 byte loop */ +#include "isa-default-impl.h" - .text -ENTRY(MEMCHR) - movd %esi, %xmm1 - mov %edi, %ecx - -#ifdef __ILP32__ - /* Clear the upper 32 bits. 
*/ - movl %edx, %edx -#endif -#ifdef USE_AS_WMEMCHR - test %RDX_LP, %RDX_LP - jz L(return_null) -#else - punpcklbw %xmm1, %xmm1 - test %RDX_LP, %RDX_LP - jz L(return_null) - punpcklbw %xmm1, %xmm1 -#endif - - and $63, %ecx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %ecx - ja L(crosscache) - - movdqu (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches_1) - sub $CHAR_PER_VEC, %rdx - jbe L(return_null) - add $16, %rdi - and $15, %ecx - and $-16, %rdi -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %ecx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - PCMPEQ %xmm1, %xmm0 - /* Check if there is a match. */ - pmovmskb %xmm0, %eax - /* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) - /* Check which byte is a match. */ - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void - possible addition overflow. 
*/ - neg %rcx - add $16, %rcx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - sub %rcx, %rdx - jbe L(return_null) - add $16, %rdi - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - PCMPEQ %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - mov %rdi, %rcx - and $-64, %rdi - and $63, %ecx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - PCMPEQ %xmm1, %xmm0 - PCMPEQ %xmm1, %xmm2 - PCMPEQ %xmm1, %xmm3 - PCMPEQ %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - 
jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(exit_loop): - add $(CHAR_PER_VEC * 2), %edx - jle L(exit_loop_32) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32_1) - sub $CHAR_PER_VEC, %edx - jle L(return_null) - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches48_1) - xor %eax, %eax - ret - - .p2align 4 -L(exit_loop_32): - add $(CHAR_PER_VEC * 2), %edx - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches_1) - sub $CHAR_PER_VEC, %edx - jbe L(return_null) - - PCMPEQ 16(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches16_1) - xor %eax, %eax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - ret - - .p2align 4 -L(matches16_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 16(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches32_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 32(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches48_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - 
shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(return_null): - xor %eax, %eax - ret -END(MEMCHR) - -#ifndef USE_AS_WMEMCHR -strong_alias (memchr, __memchr) +weak_alias (__memchr, memchr) libc_hidden_builtin_def(memchr) -#endif diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h index b8f7a12ea2..856c6261f8 100644 --- a/sysdeps/x86_64/multiarch/ifunc-evex.h +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h @@ -19,24 +19,28 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); - - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + const struct cpu_features *cpu_features = __get_cpu_features (); + + /* NB: The X86_ISA_* feature check macros are evaluated at + compile time. 
*/ + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) { if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (evex_rtm); @@ -47,9 +51,12 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (avx2_rtm); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + if (Prefer_No_VZEROUPPER_X86_ISA_LEVEL <= MINIMUM_X86_ISA_LEVEL + || !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } + /* This is unreachable (compile time checked) if ISA level >= 3 + so no need for a robust fallback here. */ return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 883362f63d..bf52cf96d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -25,7 +25,8 @@ /* Fill ARRAY of MAX elements with IFUNC implementations for function NAME supported on target machine and return the number of valid - entries. */ + entries. Each set of implementations for a given function is sorted in + descending order by ISA level. */ size_t __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memchr.c. 
*/ IFUNC_IMPL (i, name, memchr, - IFUNC_IMPL_ADD (array, i, memchr, - CPU_FEATURE_USABLE (AVX2), - __memchr_avx2) - IFUNC_IMPL_ADD (array, i, memchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __memchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, + 1, + __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. */ IFUNC_IMPL (i, name, memcmp, @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/rawmemchr.c. 
*/ IFUNC_IMPL (i, name, rawmemchr, - IFUNC_IMPL_ADD (array, i, rawmemchr, - CPU_FEATURE_USABLE (AVX2), - __rawmemchr_avx2) - IFUNC_IMPL_ADD (array, i, rawmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __rawmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, + 1, + __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemchr.c. 
*/ IFUNC_IMPL (i, name, wmemchr, - IFUNC_IMPL_ADD (array, i, wmemchr, - CPU_FEATURE_USABLE (AVX2), - __wmemchr_avx2) - IFUNC_IMPL_ADD (array, i, wmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __wmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, + 1, + __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ IFUNC_IMPL (i, name, wmemcmp, diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index c5a256eb37..39be5f7083 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -16,9 +16,10 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#include <isa-level.h> +#include <sysdep.h> -# include <sysdep.h> +#if ISA_SHOULD_BUILD (3) # ifndef MEMCHR # define MEMCHR __memchr_avx2 diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index 0fd11b7632..0dd4f1dcce 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -16,9 +16,10 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#if IS_IN (libc) +#include <isa-level.h> +#include <sysdep.h> -# include <sysdep.h> +#if ISA_SHOULD_BUILD (4) # ifndef MEMCHR # define MEMCHR __memchr_evex diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S index 2c6fdd41d6..8c561cd687 100644 --- a/sysdeps/x86_64/multiarch/memchr-sse2.S +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S @@ -16,13 +16,360 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define memchr __memchr_sse2 +#include <isa-level.h> +#include <sysdep.h> -# undef strong_alias -# define strong_alias(memchr, __memchr) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(memchr) -#endif +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation + so we need this to build for ISA V2 builds. */ +#if ISA_SHOULD_BUILD (2) + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif +# ifdef USE_AS_WMEMCHR +# define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 +# else +# define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 +# endif + +/* Fast SSE2 version using pmaxub and a 64-byte loop. */ + + .text +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + movl %edx, %edx +# endif +# ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +# else + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 +# endif + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + PCMPEQ %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 -#include "../memchr.S" + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm3, 
%eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax +# ifdef 
USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S index 128f9ea637..d6bff28757 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_avx2 -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_avx2 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index ec942b77ba..dc1c450699 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_evex -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S index 3841c14c34..e2c2e20d85 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S @@ -16,14 +16,192 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) -# define __rawmemchr __rawmemchr_sse2 - -# undef weak_alias -# define weak_alias(__rawmemchr, rawmemchr) -# undef libc_hidden_def -# define libc_hidden_def(__rawmemchr) -#endif +#include <isa-level.h> +#include <sysdep.h> + +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation + so we need this to build for ISA V2 builds. 
*/ +#if ISA_SHOULD_BUILD (2) + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_sse2 +# endif + + .text +ENTRY (RAWMEMCHR) + movd %rsi, %xmm1 + mov %rdi, %rcx + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %rcx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) -#include "../rawmemchr.S" + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), 
%xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + +END (RAWMEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S new file mode 100644 index 0000000000..a14b192bed --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "../memchr.S" diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S new file mode 100644 index 0000000000..5d4110a052 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "../rawmemchr.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S index 282854f1a1..2bf93fd84b 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S @@ -1,4 +1,7 @@ -#define MEMCHR __wmemchr_avx2 -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_avx2 +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR #include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S index 06cd0f9f5a..5512d5cdc3 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S @@ -1,4 +1,7 @@ -#define MEMCHR __wmemchr_evex -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR #include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S index 70a965d552..b675a070d4 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S @@ -1,4 +1,7 @@ -#define USE_AS_WMEMCHR 1 -#define wmemchr __wmemchr_sse2 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_sse2 +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR -#include "../memchr.S" +#include "memchr-sse2.S" diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S index 4c1a3383b9..ba7e5202e6 100644 --- a/sysdeps/x86_64/rawmemchr.S +++ b/sysdeps/x86_64/rawmemchr.S @@ -17,185 +17,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> +#define RAWMEMCHR __rawmemchr - .text -ENTRY (__rawmemchr) - movd %rsi, %xmm1 - mov %rdi, %rcx +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - and $63, %rcx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %rcx - ja L(crosscache) - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches) - add $16, %rdi - and $-16, %rdi - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) -/* Check which byte is a match. */ - bsf %eax, %eax - - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - add $16, %rdi - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - 
- and $-64, %rdi - - .p2align 4 -L(align64_loop): - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - - pcmpeqb 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - -END (__rawmemchr) +#include "isa-default-impl.h" weak_alias (__rawmemchr, rawmemchr) -libc_hidden_builtin_def (__rawmemchr) +libc_hidden_def (__rawmemchr) diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S new file mode 100644 index 0000000000..30565b2067 --- /dev/null +++ b/sysdeps/x86_64/wmemchr.S @@ -0,0 +1,28 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define WMEMCHR __wmemchr + +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" + +#include "isa-default-impl.h" + +libc_hidden_def (__wmemchr) +weak_alias (__wmemchr, wmemchr) +libc_hidden_weak (wmemchr) |