diff options
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86/Makefile | 7 | ||||
-rw-r--r-- | sysdeps/x86/cpu-features.c | 38 | ||||
-rw-r--r-- | sysdeps/x86/cpu-features.h | 6 | ||||
-rw-r--r-- | sysdeps/x86/tst-get-cpu-features.c | 2 | ||||
-rw-r--r-- | sysdeps/x86/tst-strncmp-rtm.c | 43 | ||||
-rw-r--r-- | sysdeps/x86/tst-wcsncmp-rtm.c | 21 | ||||
-rw-r--r-- | sysdeps/x86_64/Makefile | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-evex.S | 581 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/tst-rsi-strlen.c | 81 | ||||
-rw-r--r-- | sysdeps/x86_64/tst-rsi-wcslen.c | 20 |
16 files changed, 537 insertions, 287 deletions
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index e821d95fa3..ced664fb93 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -33,7 +33,9 @@ tests += \ tst-strcpy-rtm \ tst-strlen-rtm \ tst-strncmp-rtm \ - tst-strrchr-rtm + tst-strrchr-rtm \ + tst-wcsncmp-rtm \ +# tests CFLAGS-tst-memchr-rtm.c += -mrtm CFLAGS-tst-memcmp-rtm.c += -mrtm @@ -43,8 +45,9 @@ CFLAGS-tst-memset-rtm.c += -mrtm CFLAGS-tst-strchr-rtm.c += -mrtm CFLAGS-tst-strcpy-rtm.c += -mrtm CFLAGS-tst-strlen-rtm.c += -mrtm -CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error CFLAGS-tst-strrchr-rtm.c += -mrtm +CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error endif ifneq ($(enable-cet),no) diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index eb554b6d14..484efe7a0f 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -71,7 +71,6 @@ update_usable (struct cpu_features *cpu_features) CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_6); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_7); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_9); - CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_11); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_12); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_13); CPU_FEATURE_UNSET (cpu_features, INDEX_7_EDX_17); @@ -318,6 +317,9 @@ update_usable (struct cpu_features *cpu_features) /* Determine if PKU is usable. */ if (CPU_FEATURES_CPU_P (cpu_features, OSPKE)) CPU_FEATURE_SET (cpu_features, PKU); + + if (CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) + CPU_FEATURE_UNSET (cpu_features, RTM); } static void @@ -516,11 +518,39 @@ init_cpu_features (struct cpu_features *cpu_features) break; } - /* Disable TSX on some Haswell processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which disables - broken feature by default). */ + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (model) { + case 0x55: + if (stepping <= 5) + goto disable_tsx; + break; + case 0x8e: + /* NB: Although the errata documents that for model == 0x8e, + only 0xb stepping or lower are impacted, the intention of + the errata was to disable TSX on all client processors on + all steppings. Include 0xc stepping which is an Intel + Core i7-8665U, a client mobile processor. */ + case 0x9e: + if (stepping > 0xc) + break; + /* Fall through. */ + case 0x4e: + case 0x5e: + { + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + } + break; case 0x3f: /* Xeon E7 v3 with stepping >= 4 has working TSX. */ if (stepping >= 4) diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index 72ea74d083..8995a15f09 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -295,7 +295,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) #define bit_cpu_INDEX_7_EDX_9 (1u << 9) #define bit_cpu_MD_CLEAR (1u << 10) -#define bit_cpu_INDEX_7_EDX_11 (1u << 11) +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) #define bit_cpu_INDEX_7_EDX_12 (1u << 12) #define bit_cpu_INDEX_7_EDX_13 (1u << 13) #define bit_cpu_SERIALIZE (1u << 14) @@ -508,7 +508,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7 #define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7 -#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7 +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 #define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7 @@ -721,7 +721,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define reg_AVX512_VP2INTERSECT edx #define reg_INDEX_7_EDX_9 edx #define reg_MD_CLEAR edx -#define reg_INDEX_7_EDX_11 edx +#define reg_RTM_ALWAYS_ABORT edx #define reg_INDEX_7_EDX_12 edx #define reg_INDEX_7_EDX_13 edx #define reg_SERIALIZE edx diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c index 080c58e70b..527de3b5d9 100644 --- a/sysdeps/x86/tst-get-cpu-features.c +++ b/sysdeps/x86/tst-get-cpu-features.c @@ -183,6 +183,7 @@ do_test (void) CHECK_CPU_FEATURE (FSRM); CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); CHECK_CPU_FEATURE (MD_CLEAR); + CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); CHECK_CPU_FEATURE (SERIALIZE); CHECK_CPU_FEATURE (HYBRID); CHECK_CPU_FEATURE (TSXLDTRK); @@ -336,6 +337,7 @@ do_test (void) CHECK_CPU_FEATURE_USABLE (FSRM); CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT); CHECK_CPU_FEATURE_USABLE (MD_CLEAR); + CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT); CHECK_CPU_FEATURE_USABLE (SERIALIZE); CHECK_CPU_FEATURE_USABLE (HYBRID); CHECK_CPU_FEATURE_USABLE (TSXLDTRK); diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c index 236ad951b5..aef9866cf2 100644 --- a/sysdeps/x86/tst-strncmp-rtm.c +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -16,20 +16,35 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <stdint.h> #include <tst-string-rtm.h> +#ifdef WIDE +# define CHAR wchar_t +# define MEMSET wmemset +# define STRNCMP wcsncmp +# define TEST_NAME "wcsncmp" +#else /* !WIDE */ +# define CHAR char +# define MEMSET memset +# define STRNCMP strncmp +# define TEST_NAME "strncmp" +#endif /* !WIDE */ + + + #define LOOP 3000 #define STRING_SIZE 1024 -char string1[STRING_SIZE]; -char string2[STRING_SIZE]; +CHAR string1[STRING_SIZE]; +CHAR string2[STRING_SIZE]; __attribute__ ((noinline, noclone)) static int prepare (void) { - memset (string1, 'a', STRING_SIZE - 1); - memset (string2, 'a', STRING_SIZE - 1); - if (strncmp (string1, string2, STRING_SIZE) == 0) + MEMSET (string1, 'a', STRING_SIZE - 1); + MEMSET (string2, 'a', STRING_SIZE - 1); + if (STRNCMP (string1, string2, STRING_SIZE) == 0) return EXIT_SUCCESS; else return EXIT_FAILURE; @@ -39,7 +54,17 @@ __attribute__ ((noinline, noclone)) static int function (void) { - if (strncmp (string1, string2, STRING_SIZE) == 0) + if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +__attribute__ ((noinline, noclone)) +static int +function_overflow (void) +{ + if (STRNCMP (string1, string2, SIZE_MAX) == 0) return 0; else return 1; @@ -48,5 +73,9 @@ function (void) static int do_test (void) { - return do_test_1 ("strncmp", LOOP, prepare, function); + int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; } diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c new file mode 100644 index 0000000000..bad3b86378 --- /dev/null +++ b/sysdeps/x86/tst-wcsncmp-rtm.c @@ -0,0 +1,21 @@ +/* Test case for wcsncmp inside a transactionally executing RTM region. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define WIDE 1 +#include <wchar.h> +#include "tst-strncmp-rtm.c" diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 42b97c5cc7..020044da80 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -20,6 +20,8 @@ endif ifeq ($(subdir),string) sysdep_routines += strcasecmp_l-nonascii strncase_l-nonascii gen-as-const-headers += locale-defines.sym +tests += \ + tst-rsi-strlen endif ifeq ($(subdir),elf) @@ -150,6 +152,11 @@ ifeq ($(subdir),csu) gen-as-const-headers += tlsdesc.sym rtld-offsets.sym endif +ifeq ($(subdir),wcsmbs) +tests += \ + tst-rsi-wcslen +endif + $(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os $(make-target-directory) rm -f $@ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 11e9e25820..920e64241e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -295,7 +295,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strlen, (CPU_FEATURE_USABLE (AVX512VL) - && CPU_FEATURE_USABLE (AVX512BW)), + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), __strlen_evex) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) @@ -312,7 +313,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strnlen, (CPU_FEATURE_USABLE (AVX512VL) - && CPU_FEATURE_USABLE (AVX512BW)), + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), __strnlen_evex) IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) @@ -655,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wcslen_evex) - IFUNC_IMPL_ADD (array, i, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcslen, CPU_FEATURE_USABLE (SSE4_1), - __wcsnlen_sse4_1) + __wcslen_sse4_1) IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index 759e5b64c2..c2714c4478 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -95,7 +95,7 @@ ENTRY (STRCMP) length to bound a valid memory region. In these cases just use 'wcscmp'. */ shrq $56, %rcx - jnz __wcscmp_avx2 + jnz OVERFLOW_STRCMP # endif /* Convert units: from wide to byte char. */ shl $2, %RDX_LP diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 0583819078..4bf6874b82 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -29,11 +29,13 @@ # ifdef USE_AS_WCSLEN # define VPCMP vpcmpd # define VPMINU vpminud -# define SHIFT_REG r9d +# define SHIFT_REG ecx +# define CHAR_SIZE 4 # else # define VPCMP vpcmpb # define VPMINU vpminub -# define SHIFT_REG ecx +# define SHIFT_REG edx +# define CHAR_SIZE 1 # endif # define XMMZERO xmm16 @@ -46,132 +48,165 @@ # define YMM6 ymm22 # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) -# ifdef USE_AS_WCSLEN - shl $2, %RSI_LP -# elif defined __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx - movq %rdi, %rdx + movl %edi, %eax vpxorq %XMMZERO, %XMMZERO, %XMMZERO - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null byte. */ VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + ret # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ + btsq %rsi, %rax + tzcntl %eax, %eax + ret # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - -# ifdef USE_AS_WCSLEN - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG +L(first_vec_x1): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal CHAR_PER_VEC(%rdi, %rax), %eax # endif - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax + ret - /* Remove the leading bytes. */ - sarxl %SHIFT_REG, %eax, %eax - testl %eax, %eax - jz L(aligned_more) + .p2align 4 +L(first_vec_x2): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) -# endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax # endif ret .p2align 4 -L(aligned_more): +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax # endif + ret - addq $VEC_SIZE, %rdi - + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax # endif + ret -L(more_4x_vec): + .p2align 5 +L(aligned_more): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-(VEC_SIZE), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - +# ifdef USE_AS_STRNLEN + /* + CHAR_SIZE because it simplies the logic in + last_4x_vec_or_less. */ + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif +# endif + /* Load first VEC regardless. */ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. */ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x1) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax + test %eax, %eax jnz L(first_vec_x2) VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 @@ -179,258 +214,276 @@ L(more_4x_vec): testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + addq $VEC_SIZE, %rdi # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif + /* Readjust length. */ addq %rcx, %rsi # endif + /* Align data to VEC_SIZE * 4. */ + andq $-(VEC_SIZE * 4), %rdi + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - VMOVA (%rdi), %YMM1 - VMOVA VEC_SIZE(%rdi), %YMM2 - VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 - VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 - - VPMINU %YMM1, %YMM2, %YMM5 - VPMINU %YMM3, %YMM4, %YMM6 + /* Load first VEC regardless. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. + */ + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + + VPCMP $0, %YMM2, %YMMZERO, %k0 + VPCMP $0, %YMM4, %YMMZERO, %k1 + subq $-(VEC_SIZE * 4), %rdi + kortestd %k0, %k1 + jz L(loop_4x_vec) + + /* Check if end was in first half. */ + kmovd %k0, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + shrq $2, %rdi +# endif + testl %eax, %eax + jz L(second_vec_return) - VPMINU %YMM5, %YMM6, %YMM5 - VPCMP $0, %YMM5, %YMMZERO, %k0 - ktestd %k0, %k0 - jnz L(4x_vec_end) + VPCMP $0, %YMM1, %YMMZERO, %k2 + kmovd %k2, %edx + /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ +# ifdef USE_AS_WCSLEN + sall $CHAR_PER_VEC, %eax + orl %edx, %eax + tzcntl %eax, %eax +# else + salq $CHAR_PER_VEC, %rax + orq %rdx, %rax + tzcntq %rax, %rax +# endif + addq %rdi, %rax + ret - addq $(VEC_SIZE * 4), %rdi -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else - subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) +# ifdef USE_AS_STRNLEN +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, %YMM1, %YMMZERO, %k0 + addq $(VEC_SIZE * 3), %rdi L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) - - VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(CHAR_PER_VEC * 2), %esi + jnz L(last_4x_vec) + + /* length may have been negative or positive by an offset of + CHAR_PER_VEC * 4 depending on where this was called from. This + fixes that. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x1_check) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x1) + /* Check the end of data. */ + subl $CHAR_PER_VEC, %esi + jb L(max) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x3_check) + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret +L(max): movq %r8, %rax + ret +# endif + + /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) + in the 4x VEC loop can use 2 byte encoding. */ + .p2align 4 +L(second_vec_return): + VPCMP $0, %YMM3, %YMMZERO, %k0 + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ +# ifdef USE_AS_WCSLEN + kunpckbw %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax +# else + kunpckdq %k0, %k1, %k0 + kmovq %k0, %rax + tzcntq %rax, %rax +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + +# ifdef USE_AS_STRNLEN +L(last_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) - VPCMP $0, (%rdi), %YMMZERO, %k0 + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %esi - jle L(max) + jnz L(last_vec_x2) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + /* Normalize length. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret + jnz L(last_vec_x3) - .p2align 4 -L(first_vec_x0_check): + /* Check the end of data. */ + subl $(CHAR_PER_VEC * 3), %esi + jb L(max) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x1_check): +L(last_vec_x1): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x2_check): +L(last_vec_x2): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x3_check): +L(last_vec_x3): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + subl $(CHAR_PER_VEC * 2), %esi /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax ret - - .p2align 4 -L(max): +L(max_end): movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret - - .p2align 4 -L(zero): - xorl %eax, %eax ret # endif + /* Cold case for crossing page with first load. */ .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq %rdi, %rax - subq %rdx, %rax +L(cross_page_boundary): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + /* Remove the leading bytes. */ # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ + movl %edx, %ecx + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx # endif - ret - - .p2align 4 -L(first_vec_x1): + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax +# ifndef USE_AS_STRNLEN + jz L(cross_page_continue) tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif ret - - .p2align 4 -L(first_vec_x2): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif +# else + jnz L(cross_page_less_vec) +# ifndef USE_AS_WCSLEN + movl %edx, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif + movl $CHAR_PER_VEC, %eax + subl %ecx, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + ja L(cross_page_continue) + movl %esi, %eax ret - - .p2align 4 -L(4x_vec_end): - VPCMP $0, %YMM1, %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMP $0, %YMM2, %YMMZERO, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMP $0, %YMM3, %YMMZERO, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(first_vec_x2) - VPCMP $0, %YMM4, %YMMZERO, %k3 - kmovd %k3, %eax -L(first_vec_x3): +L(cross_page_less_vec): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif + /* Select min of length and position of first null. */ + cmpq %rax, %rsi + cmovb %esi, %eax ret +# endif END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S index 37d1224bb9..68bad365ba 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2_rtm #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S index 1678bcc235..f138e9f1fd 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2 #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2 #include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S index 4e88c70cc6..f467582cbe 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2_rtm #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S index 4fa1de4d3f..e9ede522b8 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2 #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2 #include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/tst-rsi-strlen.c b/sysdeps/x86_64/tst-rsi-strlen.c new file mode 100644 index 0000000000..a80c4f85c2 --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-strlen.c @@ -0,0 +1,81 @@ +/* Test strlen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifdef WIDE +# define TEST_NAME "wcslen" +#else +# define TEST_NAME "strlen" +#endif /* WIDE */ + +#define TEST_MAIN +#include <string/test-string.h> + +#ifdef WIDE +# include <wchar.h> +# define STRLEN wcslen +# define CHAR wchar_t +#else +# define STRLEN strlen +# define CHAR char +#endif /* WIDE */ + +IMPL (STRLEN, 1) + +typedef size_t (*proto_t) (const CHAR *); + +typedef struct +{ + void (*fn) (void); +} parameter_t; + +size_t +__attribute__ ((weak, noinline, noclone)) +do_strlen (parameter_t *a, int zero, const CHAR *str) +{ + return CALL (a, str); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR) - 1; + CHAR *buf = (CHAR *) buf2; + buf[size] = 0; + + parameter_t a; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + a.fn = impl->fn; + /* NB: Pass 0 in RSI. */ + size_t res = do_strlen (&a, 0, buf); + if (res != size) + { + error (0, 0, "Wrong result in function %s: %zu != %zu", + impl->name, res, size); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include <support/test-driver.c> diff --git a/sysdeps/x86_64/tst-rsi-wcslen.c b/sysdeps/x86_64/tst-rsi-wcslen.c new file mode 100644 index 0000000000..f45a7dfb51 --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-wcslen.c @@ -0,0 +1,20 @@ +/* Test wcslen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define WIDE 1 +#include "tst-rsi-strlen.c" |