Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--  sysdeps/x86_64/dl-machine.h                  |  27
-rw-r--r--  sysdeps/x86_64/dl-tls.c                      |   5
-rw-r--r--  sysdeps/x86_64/fpu/libm-test-ulps            |  96
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-memset.h      |  18
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3.S     |  14
-rw-r--r--  sysdeps/x86_64/multiarch/strchr-evex-base.S  |   8
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-evex.S       |  26
-rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462
-rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex.S      | 428
-rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex512.S   | 259
-rw-r--r--  sysdeps/x86_64/nptl/tcb-offsets.sym          |   3
-rw-r--r--  sysdeps/x86_64/x32/Makefile                  |   6
-rw-r--r--  sysdeps/x86_64/x32/dl-machine.h              |  76
13 files changed, 582 insertions, 846 deletions
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index a6de3793e4..4f12955875 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -139,37 +139,37 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], .globl _start\n\ .globl _dl_start_user\n\ _start:\n\ - movq %rsp, %rdi\n\ + mov %" RSP_LP ", %" RDI_LP "\n\ call _dl_start\n\ _dl_start_user:\n\ # Save the user entry point address in %r12.\n\ - movq %rax, %r12\n\ + mov %" RAX_LP ", %" R12_LP "\n\ # Save %rsp value in %r13.\n\ - movq %rsp, %r13\n\ + mov %" RSP_LP ", % " R13_LP "\n\ "\ RTLD_START_ENABLE_X86_FEATURES \ "\ # Read the original argument count.\n\ - movq (%rsp), %rdx\n\ + mov (%rsp), %" RDX_LP "\n\ # Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\ # argc -> rsi\n\ - movq %rdx, %rsi\n\ + mov %" RDX_LP ", %" RSI_LP "\n\ # And align stack for the _dl_init call. \n\ - andq $-16, %rsp\n\ + and $-16, %" RSP_LP "\n\ # _dl_loaded -> rdi\n\ - movq _rtld_local(%rip), %rdi\n\ + mov _rtld_local(%rip), %" RDI_LP "\n\ # env -> rcx\n\ - leaq 16(%r13,%rdx,8), %rcx\n\ + lea 2*" LP_SIZE "(%r13,%rdx," LP_SIZE "), %" RCX_LP "\n\ # argv -> rdx\n\ - leaq 8(%r13), %rdx\n\ + lea " LP_SIZE "(%r13), %" RDX_LP "\n\ # Clear %rbp to mark outermost frame obviously even for constructors.\n\ xorl %ebp, %ebp\n\ # Call the function to run the initializers.\n\ call _dl_init\n\ # Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\ - leaq _dl_fini(%rip), %rdx\n\ + lea _dl_fini(%rip), %" RDX_LP "\n\ # And make sure %rsp points to argc stored on the stack.\n\ - movq %r13, %rsp\n\ + mov %" R13_LP ", %" RSP_LP "\n\ # Jump to the user's entry point.\n\ jmp *%r12\n\ .previous\n\ @@ -234,8 +234,13 @@ elf_machine_plt_value (struct link_map *map, const ElfW(Rela) *reloc, /* Names of the architecture-specific auditing callback functions. */ +#ifdef __LP64__ #define ARCH_LA_PLTENTER x86_64_gnu_pltenter #define ARCH_LA_PLTEXIT x86_64_gnu_pltexit +#else +#define ARCH_LA_PLTENTER x32_gnu_pltenter +#define ARCH_LA_PLTEXIT x32_gnu_pltexit +#endif #endif /* !dl_machine_h */ diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c index 869023bbba..b3c1e4fcd7 100644 --- a/sysdeps/x86_64/dl-tls.c +++ b/sysdeps/x86_64/dl-tls.c @@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS) dtv_t *dtv = THREAD_DTV (); size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); - if (__glibc_unlikely (dtv[0].counter != gen)) + if (__glibc_unlikely (dtv[0].counter != gen) + /* See comment in __tls_get_addr in elf/dl-tls.c. 
*/ + && !(_dl_tls_allocate_active () + && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)) return update_get_addr (GET_ADDR_PARAM, gen); return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index 3592dfae1d..c2e36dcbdf 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -1430,28 +1430,28 @@ Function: "exp10_vlen8_avx2": float: 1 Function: "exp10m1": -double: 2 -float: 1 -float128: 1 -ldouble: 1 +double: 4 +float: 2 +float128: 3 +ldouble: 4 Function: "exp10m1_downward": -double: 1 -float: 1 -float128: 3 -ldouble: 2 +double: 3 +float: 3 +float128: 6 +ldouble: 6 Function: "exp10m1_towardzero": -double: 1 -float: 1 -float128: 3 -ldouble: 2 +double: 2 +float: 3 +float128: 6 +ldouble: 5 Function: "exp10m1_upward": -double: 3 -float: 1 -float128: 3 -ldouble: 2 +double: 5 +float: 4 +float128: 6 +ldouble: 6 Function: "exp2": double: 1 @@ -1498,28 +1498,28 @@ Function: "exp2_vlen8_avx2": float: 1 Function: "exp2m1": -double: 1 -float: 1 -float128: 1 -ldouble: 1 - -Function: "exp2m1_downward": double: 2 -float: 1 +float: 2 float128: 2 ldouble: 3 +Function: "exp2m1_downward": +double: 3 +float: 3 +float128: 3 +ldouble: 6 + Function: "exp2m1_towardzero": -double: 2 -float: 1 -float128: 2 -ldouble: 3 +double: 3 +float: 2 +float128: 4 +ldouble: 5 Function: "exp2m1_upward": -double: 1 -float: 1 -float128: 2 -ldouble: 3 +double: 3 +float: 3 +float128: 5 +ldouble: 6 Function: "exp_downward": double: 1 @@ -1808,28 +1808,28 @@ Function: "log10_vlen8_avx2": float: 1 Function: "log10p1": -double: 1 -float: 1 +double: 2 +float: 2 float128: 3 -ldouble: 2 +ldouble: 4 Function: "log10p1_downward": double: 2 -float: 1 -float128: 2 -ldouble: 4 +float: 3 +float128: 4 +ldouble: 8 Function: "log10p1_towardzero": -double: 2 +double: 3 float: 2 -float128: 2 -ldouble: 4 +float128: 3 +ldouble: 8 Function: "log10p1_upward": double: 2 -float: 1 -float128: 3 -ldouble: 3 +float: 3 +float128: 4 +ldouble: 6 Function: "log1p": double: 1 @@ -1920,10 +1920,10 @@ Function: "log2_vlen8_avx2": float: 1 Function: "log2p1": -double: 1 -float: 1 +double: 2 +float: 2 float128: 3 -ldouble: 2 +ldouble: 4 Function: "log2p1_downward": double: 2 @@ -1938,9 +1938,9 @@ float128: 2 ldouble: 4 Function: "log2p1_upward": -double: 1 +double: 2 float: 2 -float128: 2 +float128: 3 ldouble: 5 Function: "log_downward": diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index 7a637ef7ca..8dc3d7ab5a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; +static inline int +prefer_erms_nt_impl (const struct cpu_features *cpu_features) +{ + return CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset); +} + static inline void * IFUNC_SELECTOR (void) { @@ -61,7 +68,7 @@ IFUNC_SELECTOR (void) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (prefer_erms_nt_impl (cpu_features)) return OPTIMIZE (avx512_unaligned_erms); return OPTIMIZE (avx512_unaligned); @@ -76,7 +83,7 @@ IFUNC_SELECTOR (void) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) { - if (CPU_FEATURE_USABLE_P 
(cpu_features, ERMS)) + if (prefer_erms_nt_impl (cpu_features)) return OPTIMIZE (evex_unaligned_erms); return OPTIMIZE (evex_unaligned); @@ -84,7 +91,7 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (prefer_erms_nt_impl (cpu_features)) return OPTIMIZE (avx2_unaligned_erms_rtm); return OPTIMIZE (avx2_unaligned_rtm); @@ -93,14 +100,15 @@ IFUNC_SELECTOR (void) if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (prefer_erms_nt_impl (cpu_features)) return OPTIMIZE (avx2_unaligned_erms); return OPTIMIZE (avx2_unaligned); } } - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) + || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) return OPTIMIZE (sse2_unaligned_erms); return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S index 048d015712..01008fd981 100644 --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S +++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -151,13 +151,10 @@ L(more_2x_vec): loop. */ movups %xmm0, (%rdi) -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP -# else - cmp __x86_shared_cache_size_half(%rip), %rdx -# endif + cmp __x86_shared_non_temporal_threshold(%rip), %rdx ja L(large_memcpy) +L(loop_fwd): leaq -64(%rdi, %rdx), %r8 andq $-16, %rdi movl $48, %edx @@ -199,6 +196,13 @@ L(large_memcpy): movups -64(%r9, %rdx), %xmm10 movups -80(%r9, %rdx), %xmm11 + /* Check if src and dst overlap. If they do use cacheable + writes to potentially gain positive interference between + the loads during the memmove. */ + subq %rdi, %r9 + cmpq %rdx, %r9 + jb L(loop_fwd) + sall $5, %ecx leal (%rcx, %rcx, 2), %r8d leaq -96(%rdi, %rdx), %rcx diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S index 04e2c0e79e..3a0b7c9d64 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex-base.S +++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S @@ -124,13 +124,13 @@ L(page_cross): VPCMPNE %VMM(1), %VMM(0), %k1 VPTEST %VMM(1), %VMM(1), %k0{%k1} KMOV %k0, %VRAX -# ifdef USE_AS_WCSCHR + sar %cl, %VRAX +#ifdef USE_AS_WCSCHR sub $VEC_MATCH_MASK, %VRAX -# else +#else inc %VRAX -# endif +#endif /* Ignore number of character for alignment adjustment. */ - shr %cl, %VRAX jz L(align_more) bsf %VRAX, %VRAX diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S index 06730ab2a1..cea034f394 100644 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -209,7 +209,9 @@ returned. */ .section SECTION(.text), "ax", @progbits - .align 16 + /* Align 64 bytes here. This is to get the L(loop) block ideally + aligned for the DSB. */ + .align 64 .type STRCMP, @function .globl STRCMP # ifdef USE_AS_STRCASECMP_L @@ -509,9 +511,7 @@ L(ret4): ret # endif - /* 32 byte align here ensures the main loop is ideally aligned - for DSB. */ - .p2align 5 + .p2align 4,, 4 L(more_3x_vec): /* Safe to compare 4x vectors. */ VMOVU (VEC_SIZE)(%rdi), %VMM(0) @@ -1426,10 +1426,9 @@ L(less_32_till_page): L(ret_zero_page_cross_slow_case0): xorl %eax, %eax ret -# endif - - +# else .p2align 4,, 10 +# endif L(less_16_till_page): cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax ja L(less_8_till_page) @@ -1482,8 +1481,12 @@ L(less_16_till_page): # endif jmp L(prepare_loop_aligned) - - +# ifndef USE_AS_STRNCMP + /* Fits in aligning bytes. 
*/ +L(ret_zero_4_loop): + xorl %eax, %eax + ret +# endif .p2align 4,, 10 L(less_8_till_page): @@ -1554,6 +1557,7 @@ L(ret_less_8_wcs): # ifdef USE_AS_STRNCMP .p2align 4,, 2 +L(ret_zero_4_loop): L(ret_zero_page_cross_slow_case1): xorl %eax, %eax ret @@ -1586,10 +1590,6 @@ L(less_4_loop): subq $-(CHAR_PER_VEC * 4), %rdx # endif jmp L(prepare_loop_aligned) - -L(ret_zero_4_loop): - xorl %eax, %eax - ret L(ret_less_4_loop): xorl %r8d, %eax subl %r8d, %eax diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S new file mode 100644 index 0000000000..1c2cfdfe06 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S @@ -0,0 +1,462 @@ +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions. + Copyright (C) 2022-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +#ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPTESTN vptestnmd +# define VPMINU vpminud +# define CHAR_SIZE 4 +#else +# define VPCMPEQ vpcmpeqb +# define VPTESTN vptestnmb +# define VPMINU vpminub +# define CHAR_SIZE 1 +#endif + +#define XZERO VMM_128(0) +#define VZERO VMM(0) +#define PAGE_SIZE 4096 +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +#if CHAR_PER_VEC == 32 +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) +#else +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) +#endif + +#ifdef USE_AS_WCSLEN +/* For wide-character, we care more about limitting code size + than optimally aligning targets, so just cap nop padding + reasonably low. */ +# define P2ALIGN(...) .p2align 4,, 6 +# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__) +#else +# define P2ALIGN(x) .p2align x +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y +#endif + + .section SECTION(.text), "ax", @progbits + /* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN(STRNLEN, 6) + /* rdi is pointer to array, rsi is the upper limit. */ + + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +#endif + + vpxorq %XZERO, %XZERO, %XZERO + + /* Check that we won't cross a page boundary with our first load. */ + movl %edi, %eax + shll $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(crosses_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMPEQ (%rdi), %VZERO, %k0 + KMOV %k0, %VRCX + + /* If src (rcx) is zero, bsf does not change the result. NB: + Must use 64-bit bsf here so that upper bits of len are not + cleared. */ + movq %rsi, %rax + bsfq %rcx, %rax + + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null + CHAR) and rsi must be > CHAR_PER_VEC. 
*/ + cmpq $CHAR_PER_VEC, %rax + ja L(more_1x_vec) + + /* Check if first match in bounds. */ + cmpq %rax, %rsi + cmovb %esi, %eax + ret + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 2) +L(zero): +L(max_0): + movl %esi, %eax + ret +#endif + + P2ALIGN_CLAMPED(4, 10) +L(more_1x_vec): +L(cross_page_continue): + /* After this calculation, rax stores the number of elements + left to be processed The complexity comes from the fact some + elements get read twice due to alignment and we need to be + sure we don't count them twice (else, it would just be rsi - + CHAR_PER_VEC). */ + +#ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + movq %rdi, %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax +#else + /* Calculate ptr + N - VEC_SIZE, then mask off the low bits, + then subtract ptr to get the new aligned limit value. */ + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax +#endif + + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 + + /* Checking here is faster for 256-bit but not 512-bit */ +#if VEC_SIZE == 0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + cmpq $(CHAR_PER_VEC * 2), %rax + ja L(more_2x_vec) + +L(last_2x_vec_or_less): + + /* Checking here is faster for 512-bit but not 256-bit */ +#if VEC_SIZE != 0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + /* Check for the end of data. */ + SUB_SHORT (CHAR_PER_VEC, rax) + jbe L(max_0) + + /* Check the final remaining vector. */ + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX +#if VEC_SIZE == 32 + jz L(max_0) +#else + jnz L(last_vec_check) + P2ALIGN_CLAMPED(4, 2) +L(zero): +L(max_0): + movl %esi, %eax + ret + +#endif + P2ALIGN_CLAMPED(4, 4) +L(last_vec_check): + bsf %VRDX, %VRDX + sub %eax, %edx + lea (%rsi, %rdx), %eax + cmovae %esi, %eax + ret + + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 8) +#endif +L(last_4x_vec_or_less): + addl $(CHAR_PER_VEC * -4), %eax + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 + +#if VEC_SIZE == 64 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + subq $(VEC_SIZE * -4), %rdi + cmpl $(CHAR_PER_VEC * 2), %eax + jbe L(last_2x_vec_or_less) + + P2ALIGN_CLAMPED(4, 6) +L(more_2x_vec): + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ + + /* Already checked in 256-bit case */ +#if VEC_SIZE != 0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(first_vec_x1) +#endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(first_vec_x2) + + cmpq $(CHAR_PER_VEC * 4), %rax + ja L(more_4x_vec) + + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + addl $(CHAR_PER_VEC * -2), %eax + test %VRDX, %VRDX + jnz L(last_vec_check) + + subb $(CHAR_PER_VEC), %al + jbe L(max_1) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(last_vec_check) +L(max_1): + movl %esi, %eax + ret + + + P2ALIGN_CLAMPED(4, 14) +L(first_vec_x2): +#if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. 
*/ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +#else + addl $CHAR_PER_VEC, %esi +#endif +L(first_vec_x1): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax + ret + +#if VEC_SIZE == 64 + P2ALIGN_CLAMPED(4, 6) +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret +#endif + + P2ALIGN_CLAMPED(6, 20) +L(more_4x_vec): + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x4) + + /* Check if at last VEC_SIZE * 4 length before aligning for the + loop. */ + cmpq $(CHAR_PER_VEC * 8), %rax + jbe L(last_4x_vec_or_less) + + + /* Compute number of words checked after aligning. */ +#ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + leaq (VEC_SIZE * -3)(%rdi), %rdx +#else + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax +#endif + + subq $(VEC_SIZE * -1), %rdi + + /* Align data to VEC_SIZE * 4. */ +#if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. */ + xorb %dil, %dil +#else + andq $-(VEC_SIZE * 4), %rdi +#endif + +#ifdef USE_AS_WCSLEN + subq %rdi, %rdx + sarq $2, %rdx + addq %rdx, %rax +#else + subq %rdi, %rax +#endif + + // mov %rdi, %rdx + + P2ALIGN(6) +L(loop): + /* VPMINU and VPCMP combination provide better performance as + compared to alternative combinations. */ + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k1 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k0, %k1 + + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rax + ja L(loop) + mov %rsi, %rax + ret + + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 6) +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret +#endif + + + P2ALIGN_CLAMPED(4, 11) +L(loopend): + /* We found a null terminator in one of the 4 vectors. */ + + /* Check the first vector. */ + movq %rax, %r8 + VPTESTN %VMM(1), %VMM(1), %k2 + KMOV %k2, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* Check the second vector. */ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + KMOV %k0, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* Check the third vector. */ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + VPTESTN %VMM(3), %VMM(3), %k2 + KMOV %k2, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* It is in the fourth vector. 
*/ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + KMOV %k1, %VRCX + bsf %rcx, %r8 + + P2ALIGN_CLAMPED(4, 3) +L(end_vec): + /* Get the number that has been processed. */ + movq %rsi, %rcx + subq %rax, %rcx + + /* Add that to the offset we found the null terminator at. */ + leaq (%r8, %rcx), %rax + + /* Take the min of that and the limit. */ + cmpq %rsi, %rax + cmovnb %rsi, %rax + ret + + P2ALIGN_CLAMPED(4, 11) +L(crosses_page_boundary): + /* Align data backwards to VEC_SIZE. */ + shrl $20, %eax + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + VPCMPEQ (%rcx), %VZERO, %k0 + + KMOV %k0, %VRCX +#ifdef USE_AS_WCSLEN + shrl $2, %eax + andl $(CHAR_PER_VEC - 1), %eax +#endif + /* By this point rax contains number of bytes we need to skip. */ + shrx %VRAX, %VRCX, %VRCX + + /* Calculates CHAR_PER_VEC - eax and stores in eax. */ + negl %eax + andl $(CHAR_PER_VEC - 1), %eax + + movq %rsi, %rdx + bsf %VRCX, %VRDX + cmpq %rax, %rdx + ja L(cross_page_continue) + + /* The vector had a null terminator or we are at the limit. */ + movl %edx, %eax + cmpq %rdx, %rsi + cmovb %esi, %eax + ret + +END(STRNLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S index 91b16830eb..c41288906c 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -1,423 +1,7 @@ -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. - Copyright (C) 2022-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <isa-level.h> -#include <sysdep.h> - -#if ISA_SHOULD_BUILD (4) - -# ifndef VEC_SIZE -# include "x86-evex256-vecs.h" -# endif - - -# ifndef STRNLEN -# define STRNLEN __strnlen_evex -# endif - -# ifdef USE_AS_WCSLEN -# define VPCMPEQ vpcmpeqd -# define VPCMPNEQ vpcmpneqd -# define VPTESTN vptestnmd -# define VPTEST vptestmd -# define VPMINU vpminud -# define CHAR_SIZE 4 - -# else -# define VPCMPEQ vpcmpeqb -# define VPCMPNEQ vpcmpneqb -# define VPTESTN vptestnmb -# define VPTEST vptestmb -# define VPMINU vpminub -# define CHAR_SIZE 1 - -# define REG_WIDTH VEC_SIZE -# endif - -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - -# include "reg-macros.h" - -# if CHAR_PER_VEC == 32 -# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) -# else -# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) -# endif - - - -# if CHAR_PER_VEC == 64 -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) -# else -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) -# endif - - -# define XZERO VMM_128(0) -# define VZERO VMM(0) -# define PAGE_SIZE 4096 - - .section SECTION(.text), "ax", @progbits -ENTRY_P2ALIGN (STRNLEN, 6) - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(zero) -# ifdef __ILP32__ - /* Clear the upper 32 bits. 
*/ - movl %esi, %esi -# endif - - movl %edi, %eax - vpxorq %XZERO, %XZERO, %XZERO - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page_boundary) - - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a - null byte. */ - VPCMPEQ (%rdi), %VZERO, %k0 - - KMOV %k0, %VRCX - movq %rsi, %rax - - /* If src (rcx) is zero, bsf does not change the result. NB: - Must use 64-bit bsf here so that upper bits of len are not - cleared. */ - bsfq %rcx, %rax - /* If rax > CHAR_PER_VEC then rcx must have been zero (no null - CHAR) and rsi must be > CHAR_PER_VEC. */ - cmpq $CHAR_PER_VEC, %rax - ja L(more_1x_vec) - /* Check if first match in bounds. */ - cmpq %rax, %rsi - cmovb %esi, %eax - ret - - -# if CHAR_PER_VEC != 32 - .p2align 4,, 2 -L(zero): -L(max_0): - movl %esi, %eax - ret -# endif - - /* Aligned more for strnlen compares remaining length vs 2 * - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before - going to the loop. */ - .p2align 4,, 10 -L(more_1x_vec): -L(cross_page_continue): - /* Compute number of words checked after aligning. */ -# ifdef USE_AS_WCSLEN - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can - overflow. */ - movq %rdi, %rax - andq $(VEC_SIZE * -1), %rdi - subq %rdi, %rax - sarq $2, %rax - leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax -# else - leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax - andq $(VEC_SIZE * -1), %rdi - subq %rdi, %rax -# endif - - - VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 - - cmpq $(CHAR_PER_VEC * 2), %rax - ja L(more_2x_vec) - -L(last_2x_vec_or_less): - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_check) - - /* Check the end of data. */ - SUB_SHORT (CHAR_PER_VEC, rax) - jbe L(max_0) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jz L(max_0) - /* Best place for LAST_VEC_CHECK if ZMM. */ - .p2align 4,, 8 -L(last_vec_check): - bsf %VRDX, %VRDX - sub %eax, %edx - lea (%rsi, %rdx), %eax - cmovae %esi, %eax - ret - -# if CHAR_PER_VEC == 32 - .p2align 4,, 2 -L(zero): -L(max_0): - movl %esi, %eax - ret -# endif - - .p2align 4,, 8 -L(last_4x_vec_or_less): - addl $(CHAR_PER_VEC * -4), %eax - VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 - subq $(VEC_SIZE * -4), %rdi - cmpl $(CHAR_PER_VEC * 2), %eax - jbe L(last_2x_vec_or_less) - - .p2align 4,, 6 -L(more_2x_vec): - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without - rechecking bounds. */ - - KMOV %k0, %VRDX - - test %VRDX, %VRDX - jnz L(first_vec_x1) - - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x2) - - cmpq $(CHAR_PER_VEC * 4), %rax - ja L(more_4x_vec) - - - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - addl $(CHAR_PER_VEC * -2), %eax - test %VRDX, %VRDX - jnz L(last_vec_check) - - subl $(CHAR_PER_VEC), %eax - jbe L(max_1) - - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - - test %VRDX, %VRDX - jnz L(last_vec_check) -L(max_1): - movl %esi, %eax - ret - - .p2align 4,, 3 -L(first_vec_x2): -# if VEC_SIZE == 64 - /* If VEC_SIZE == 64 we can fit logic for full return label in - spare bytes before next cache line. */ - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax - ret - .p2align 4,, 6 -# else - addl $CHAR_PER_VEC, %esi -# endif -L(first_vec_x1): - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax - ret - - - .p2align 4,, 6 -L(first_vec_x4): -# if VEC_SIZE == 64 - /* If VEC_SIZE == 64 we can fit logic for full return label in - spare bytes before next cache line. 
*/ - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax - ret - .p2align 4,, 6 -# else - addl $CHAR_PER_VEC, %esi -# endif -L(first_vec_x3): - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax - ret - - .p2align 4,, 5 -L(more_4x_vec): - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x3) - - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x4) - - /* Check if at last VEC_SIZE * 4 length before aligning for the - loop. */ - cmpq $(CHAR_PER_VEC * 8), %rax - jbe L(last_4x_vec_or_less) - - - /* Compute number of words checked after aligning. */ -# ifdef USE_AS_WCSLEN - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can - overflow. */ - leaq (VEC_SIZE * -3)(%rdi), %rdx -# else - leaq (VEC_SIZE * -3)(%rdi, %rax), %rax -# endif - - subq $(VEC_SIZE * -1), %rdi - - /* Align data to VEC_SIZE * 4. */ -# if VEC_SIZE == 64 - /* Saves code size. No evex512 processor has partial register - stalls. If that change this can be replaced with `andq - $-(VEC_SIZE * 4), %rdi`. */ - xorb %dil, %dil -# else - andq $-(VEC_SIZE * 4), %rdi -# endif - -# ifdef USE_AS_WCSLEN - subq %rdi, %rdx - sarq $2, %rdx - addq %rdx, %rax -# else - subq %rdi, %rax -# endif - /* Compare 4 * VEC at a time forward. */ - .p2align 4,, 11 -L(loop_4x_vec): - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) - VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k2 - subq $-(VEC_SIZE * 4), %rdi - /* Break if at end of length. */ - subq $(CHAR_PER_VEC * 4), %rax - jbe L(loop_len_end) - - - KORTEST %k0, %k2 - jz L(loop_4x_vec) - - -L(loop_last_4x_vec): - movq %rsi, %rcx - subq %rax, %rsi - VPTESTN %VMM(1), %VMM(1), %k1 - KMOV %k1, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x0) - - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x1) - - VPTESTN %VMM(3), %VMM(3), %k0 - - /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for - returning last 2x VEC. For VEC_SIZE == 64 we test each VEC - individually, for VEC_SIZE == 32 we combine them in a single - 64-bit GPR. */ -# if CHAR_PER_VEC == 64 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x2) - KMOV %k2, %VRDX -# else - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. - */ - kmovd %k2, %edx - kmovd %k0, %eax - salq $CHAR_PER_VEC, %rdx - orq %rax, %rdx -# endif - - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. - */ - bsfq %rdx, %rdx - leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret - - /* Handle last 4x VEC after loop. All VECs have been loaded. */ - .p2align 4,, 4 -L(loop_len_end): - KORTEST %k0, %k2 - jnz L(loop_last_4x_vec) - movq %rsi, %rax - ret - - -# if CHAR_PER_VEC == 64 - /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 - need return label for it. */ - .p2align 4,, 8 -L(last_vec_x2): - bsf %VRDX, %VRDX - leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret -# endif - - - .p2align 4,, 10 -L(last_vec_x1): - addq $CHAR_PER_VEC, %rsi -L(last_vec_x0): - bsf %VRDX, %VRDX - leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret - - - .p2align 4,, 8 -L(cross_page_boundary): - /* Align data to VEC_SIZE. 
*/ - movq %rdi, %rcx - andq $-VEC_SIZE, %rcx - VPCMPEQ (%rcx), %VZERO, %k0 - - KMOV %k0, %VRCX -# ifdef USE_AS_WCSLEN - shrl $2, %eax - andl $(CHAR_PER_VEC - 1), %eax -# endif - shrx %VRAX, %VRCX, %VRCX - - negl %eax - andl $(CHAR_PER_VEC - 1), %eax - movq %rsi, %rdx - bsf %VRCX, %VRDX - cmpq %rax, %rdx - ja L(cross_page_continue) - movl %edx, %eax - cmpq %rdx, %rsi - cmovb %esi, %eax - ret -END (STRNLEN) +#ifndef STRNLEN +#define STRNLEN __strnlen_evex #endif + +#include "x86-evex256-vecs.h" +#include "reg-macros.h" +#include "strnlen-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S index f8e55883bb..8ef54078f8 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S @@ -1,264 +1,7 @@ -/* Placeholder function, not used by any processor at the moment. - Copyright (C) 2022-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - #ifndef STRNLEN #define STRNLEN __strnlen_evex512 #endif #include "x86-evex512-vecs.h" #include "reg-macros.h" - -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (4) - -# include <sysdep.h> - -# ifdef USE_AS_WCSLEN -# define VPCMPEQ vpcmpeqd -# define VPTESTN vptestnmd -# define VPMINU vpminud -# define CHAR_SIZE 4 -# else -# define VPCMPEQ vpcmpeqb -# define VPTESTN vptestnmb -# define VPMINU vpminub -# define CHAR_SIZE 1 -# endif - -# define PAGE_SIZE 4096 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section SECTION(.text),"ax",@progbits -/* Aligning entry point to 64 byte, provides better performance for - one vector length string. */ -ENTRY_P2ALIGN (STRNLEN, 6) - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(ret_max) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif - - movl %edi, %eax - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) - sall $20, %eax - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax - ja L(page_cross) - - /* Compare [w]char for null, mask bit will be set for match. */ - VPCMPEQ (%rdi), %VMM(0), %k0 - KMOV %k0, %VRCX - /* Store max length in rax. */ - mov %rsi, %rax - /* If rcx is 0, rax will have max length. We can not use VRCX - and VRAX here for evex256 because, upper 32 bits may be - undefined for ecx and eax. */ - bsfq %rcx, %rax - cmp $CHAR_PER_VEC, %rax - ja L(align_more) - cmpq %rax, %rsi - cmovb %esi, %eax - ret - - /* At this point vector max length reached. */ - .p2align 4,,3 -L(ret_max): - movq %rsi, %rax - ret - -L(align_more): - mov %rdi, %rax - /* Align rax to VEC_SIZE. */ - andq $-VEC_SIZE, %rax - movq %rdi, %rdx - subq %rax, %rdx -# ifdef USE_AS_WCSLEN - shr $2, %VRDX -# endif - /* At this point rdx contains [w]chars already compared. */ - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx - /* At this point rdx contains number of w[char] needs to go. 
- Now onwards rdx will keep decrementing with each compare. */ - - /* Loop unroll 4 times for 4 vector loop. */ - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - subq $-VEC_SIZE, %rax - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x2) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x4) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - /* Save pointer before 4 x VEC_SIZE alignment. */ - movq %rax, %rcx - - /* Align address to VEC_SIZE * 4 for loop. */ - andq $-(VEC_SIZE * 4), %rax - - subq %rax, %rcx -# ifdef USE_AS_WCSLEN - shr $2, %VRCX -# endif - /* rcx contains number of [w]char will be recompared due to - alignment fixes. rdx must be incremented by rcx to offset - alignment adjustment. */ - addq %rcx, %rdx - /* Need jump as we don't want to add/subtract rdx for first - iteration of 4 x VEC_SIZE aligned loop. */ - - .p2align 4,,11 -L(loop): - /* VPMINU and VPCMP combination provide better performance as - compared to alternative combinations. */ - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) - - VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k1 - - subq $-(VEC_SIZE * 4), %rax - KORTEST %k0, %k1 - - jnz L(loopend) - subq $(CHAR_PER_VEC * 4), %rdx - ja L(loop) - mov %rsi, %rax - ret - -L(loopend): - - VPTESTN %VMM(1), %VMM(1), %k2 - KMOV %k2, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) - - KMOV %k0, %VRCX - /* At this point, if k0 is non zero, null char must be in the - second vector. */ - test %VRCX, %VRCX - jnz L(ret_vec_x2) - - VPTESTN %VMM(3), %VMM(3), %k3 - KMOV %k3, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) - /* At this point null [w]char must be in the fourth vector so no - need to check. */ - KMOV %k1, %VRCX - - /* Fourth, third, second vector terminating are pretty much - same, implemented this way to avoid branching and reuse code - from pre loop exit condition. */ -L(ret_vec_x4): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 3), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax -# endif - - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(ret_vec_x3): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 2), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax -# endif - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(ret_vec_x2): - subq $-VEC_SIZE, %rax -L(ret_vec_x1): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - addq %rcx, %rax - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(page_cross): - mov %rdi, %rax - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx -# ifdef USE_AS_WCSLEN - sarl $2, %ecx -# endif - /* ecx contains number of w[char] to be skipped as a result - of address alignment. */ - andq $-VEC_SIZE, %rax - VPCMPEQ (%rax), %VMM(0), %k0 - KMOV %k0, %VRDX - /* Ignore number of character for alignment adjustment. 
*/ - shr %cl, %VRDX - jnz L(page_cross_end) - movl $CHAR_PER_VEC, %eax - sub %ecx, %eax - cmp %rax, %rsi - ja L(align_more) - -L(page_cross_end): - bsf %VRDX, %VRAX - cmpq %rsi, %rax - cmovnb %esi, %eax - ret - -END (STRNLEN) -#endif +#include "strnlen-evex-base.S" \ No newline at end of file diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym index 2bbd563a6c..988a4b8593 100644 --- a/sysdeps/x86_64/nptl/tcb-offsets.sym +++ b/sysdeps/x86_64/nptl/tcb-offsets.sym @@ -13,6 +13,3 @@ MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads) POINTER_GUARD offsetof (tcbhead_t, pointer_guard) FEATURE_1_OFFSET offsetof (tcbhead_t, feature_1) SSP_BASE_OFFSET offsetof (tcbhead_t, ssp_base) - --- Not strictly offsets, but these values are also used in the TCB. -TCB_CANCELED_BITMASK CANCELED_BITMASK diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index a015789a4f..a9b20b798f 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -1,3 +1,9 @@ +ifeq ($(subdir),elf) +# Xfail tst-platform-1 on x32 since kernel passes i686 in AT_PLATFORM. +# See https://sourceware.org/bugzilla/show_bug.cgi?id=22363 +test-xfail-tst-platform-1 = yes +endif + ifeq ($(subdir),math) # Since x32 returns 32-bit long int and 64-bit long long int in the # same 64-bit register, we make the 32b-bit lround an alias of the diff --git a/sysdeps/x86_64/x32/dl-machine.h b/sysdeps/x86_64/x32/dl-machine.h deleted file mode 100644 index c35cee9261..0000000000 --- a/sysdeps/x86_64/x32/dl-machine.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Machine-dependent ELF dynamic relocation inline functions. x32 version. - Copyright (C) 2012-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -/* Must allow <sysdeps/x86_64/dl-machine.h> to be included more than once. - See #ifdef RESOLVE_MAP in sysdeps/x86_64/dl-machine.h. */ -#include <sysdeps/x86_64/dl-machine.h> - -#ifndef _X32_DL_MACHINE_H -#define _X32_DL_MACHINE_H - -#undef ARCH_LA_PLTENTER -#undef ARCH_LA_PLTEXIT -#undef RTLD_START - -/* Names of the architecture-specific auditing callback functions. */ -#define ARCH_LA_PLTENTER x32_gnu_pltenter -#define ARCH_LA_PLTEXIT x32_gnu_pltexit - -/* Initial entry point code for the dynamic linker. - The C function `_dl_start' is the real entry point; - its return value is the user program's entry point. 
*/
-#define RTLD_START asm ("\n\
-.text\n\
-	.p2align 4\n\
-.globl _start\n\
-.globl _dl_start_user\n\
-_start:\n\
-	movl %esp, %edi\n\
-	call _dl_start\n\
-_dl_start_user:\n\
-	# Save the user entry point address in %r12.\n\
-	movl %eax, %r12d\n\
-	# Read the original argument count.\n\
-	movl (%rsp), %edx\n\
-	# Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\
-	# argc -> rsi\n\
-	movl %edx, %esi\n\
-	# Save %rsp value in %r13.\n\
-	movl %esp, %r13d\n\
-	# And align stack for the _dl_init call.\n\
-	and $-16, %esp\n\
-	# _dl_loaded -> rdi\n\
-	movl _rtld_local(%rip), %edi\n\
-	# env -> rcx\n\
-	lea 8(%r13,%rdx,4), %ecx\n\
-	# argv -> rdx\n\
-	lea 4(%r13), %edx\n\
-	# Clear %rbp to mark outermost frame obviously even for constructors.\n\
-	xorl %ebp, %ebp\n\
-	# Call the function to run the initializers.\n\
-	call _dl_init\n\
-	# Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\
-	lea _dl_fini(%rip), %edx\n\
-	# And make sure %rsp points to argc stored on the stack.\n\
-	movl %r13d, %esp\n\
-	# Jump to the user's entry point.\n\
-	jmp *%r12\n\
-.previous\n\
-");
-
-#endif /* !_X32_DL_MACHINE_H */
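
The overlap test added in the memmove-ssse3.S large-copy path (subq %rdi, %r9; cmpq %rdx, %r9; jb L(loop_fwd)) relies on unsigned wraparound: on this forward-copy path the regions overlap exactly when the unsigned distance src - dst is below the copy length, and in that case the cacheable loop is taken so the stores can feed the overlapping loads, as the added comment explains. A minimal standalone sketch of that test, assuming %r9 holds src and %rdi holds dst at that point (function name is illustrative, not glibc's):

#include <stddef.h>
#include <stdint.h>

/* Nonzero if the forward large-copy path should use regular
   (cacheable) stores instead of non-temporal ones because the
   source and destination regions overlap.  The unsigned
   subtraction also makes the test come out false when src sits
   more than len bytes before dst, matching the jb in the
   assembly.  */
static int
regions_overlap_fwd (const void *dst, const void *src, size_t len)
{
  return (uintptr_t) src - (uintptr_t) dst < len;
}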
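
The entry of the new strnlen-evex-base.S checks for a page cross with shll $20, %eax; cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax: shifting the address left by 20 keeps only the 12-bit page offset, now in the top bits of a 32-bit register, so a single unsigned compare decides whether a full-vector load would run into the next page. An equivalent C rendering of that test (a sketch; VEC_SIZE 64 assumes the evex512 build, 32 for evex256):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64

/* Nonzero if a VEC_SIZE-byte load at ptr would cross a page
   boundary.  Equivalent to (ptr & (PAGE_SIZE - 1)) > PAGE_SIZE -
   VEC_SIZE, but written the way the assembly does it: shift the
   page offset into the high bits and compare there.  */
static int
load_crosses_page (const void *ptr)
{
  uint32_t lo = (uint32_t) (uintptr_t) ptr;
  return (lo << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}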
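
The first-vector fast path in the strnlen rewrite leans on the bsf behavior called out in its comments: with an all-zero source, bsf leaves the destination unchanged, so preloading the length limit into the destination collapses "index of first null, else limit" into movq %rsi, %rax; bsfq %rcx, %rax (64-bit bsf so the high bits of the limit are not clobbered). A C sketch of the resulting semantics (hypothetical helper, not glibc code):

#include <stdint.h>

/* mask has bit i set iff char i of the loaded vector is the null
   terminator; limit is the strnlen bound.  Mirrors
       movq %rsi, %rax
       bsfq %rcx, %rax
   where bsf with a zero source leaves %rax holding the limit.  */
static uint64_t
first_null_or_limit (uint64_t mask, uint64_t limit)
{
  return mask ? (uint64_t) __builtin_ctzll (mask) : limit;
}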