Diffstat (limited to 'sysdeps/x86_64')
 -rw-r--r--  sysdeps/x86_64/dl-machine.h                  |  27
 -rw-r--r--  sysdeps/x86_64/dl-tls.c                      |   5
 -rw-r--r--  sysdeps/x86_64/fpu/libm-test-ulps            |  96
 -rw-r--r--  sysdeps/x86_64/multiarch/ifunc-memset.h      |  18
 -rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3.S     |  14
 -rw-r--r--  sysdeps/x86_64/multiarch/strchr-evex-base.S  |   8
 -rw-r--r--  sysdeps/x86_64/multiarch/strcmp-evex.S       |  26
 -rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462
 -rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex.S      | 428
 -rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex512.S   | 259
 -rw-r--r--  sysdeps/x86_64/nptl/tcb-offsets.sym          |   3
 -rw-r--r--  sysdeps/x86_64/x32/Makefile                  |   6
 -rw-r--r--  sysdeps/x86_64/x32/dl-machine.h              |  76
13 files changed, 582 insertions(+), 846 deletions(-)
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index a6de3793e4..4f12955875 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -139,37 +139,37 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 .globl _start\n\
 .globl _dl_start_user\n\
 _start:\n\
-	movq %rsp, %rdi\n\
+	mov %" RSP_LP ", %" RDI_LP "\n\
 	call _dl_start\n\
 _dl_start_user:\n\
 	# Save the user entry point address in %r12.\n\
-	movq %rax, %r12\n\
+	mov %" RAX_LP ", %" R12_LP "\n\
 	# Save %rsp value in %r13.\n\
-	movq %rsp, %r13\n\
+	mov %" RSP_LP ", %" R13_LP "\n\
 "\
 	RTLD_START_ENABLE_X86_FEATURES \
 "\
 	# Read the original argument count.\n\
-	movq (%rsp), %rdx\n\
+	mov (%rsp), %" RDX_LP "\n\
 	# Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\
 	# argc -> rsi\n\
-	movq %rdx, %rsi\n\
+	mov %" RDX_LP ", %" RSI_LP "\n\
 	# And align stack for the _dl_init call. \n\
-	andq $-16, %rsp\n\
+	and $-16, %" RSP_LP "\n\
 	# _dl_loaded -> rdi\n\
-	movq _rtld_local(%rip), %rdi\n\
+	mov _rtld_local(%rip), %" RDI_LP "\n\
 	# env -> rcx\n\
-	leaq 16(%r13,%rdx,8), %rcx\n\
+	lea 2*" LP_SIZE "(%r13,%rdx," LP_SIZE "), %" RCX_LP "\n\
 	# argv -> rdx\n\
-	leaq 8(%r13), %rdx\n\
+	lea " LP_SIZE "(%r13), %" RDX_LP "\n\
 	# Clear %rbp to mark outermost frame obviously even for constructors.\n\
 	xorl %ebp, %ebp\n\
 	# Call the function to run the initializers.\n\
 	call _dl_init\n\
 	# Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\
-	leaq _dl_fini(%rip), %rdx\n\
+	lea _dl_fini(%rip), %" RDX_LP "\n\
 	# And make sure %rsp points to argc stored on the stack.\n\
-	movq %r13, %rsp\n\
+	mov %" R13_LP ", %" RSP_LP "\n\
 	# Jump to the user's entry point.\n\
 	jmp *%r12\n\
 .previous\n\
@@ -234,8 +234,13 @@ elf_machine_plt_value (struct link_map *map, const ElfW(Rela) *reloc,
 
 
 /* Names of the architecture-specific auditing callback functions.  */
+#ifdef __LP64__
 #define ARCH_LA_PLTENTER x86_64_gnu_pltenter
 #define ARCH_LA_PLTEXIT x86_64_gnu_pltexit
+#else
+#define ARCH_LA_PLTENTER x32_gnu_pltenter
+#define ARCH_LA_PLTEXIT x32_gnu_pltexit
+#endif
 
 #endif /* !dl_machine_h */
 
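The RTLD_START hunk above swaps the hard-coded 64-bit movq/leaq forms for the LP register and size macros, so one copy of the startup stub now serves both LP64 and x32 (which is what allows the x32-only dl-machine.h further down to be deleted). The env address, for instance, becomes r13 + (argc + 2) * pointer-size instead of a fixed 16(%r13,%rdx,8). A rough sketch of how those macros expand, abbreviated and from memory (the real definitions live in sysdeps/x86_64/sysdep.h and cover every register):

    /* Sketch only: the string forms used inside inline asm such as RTLD_START.  */
    #ifdef __LP64__
    # define LP_SIZE "8"        /* pointers and long are 8 bytes */
    # define RSP_LP  "rsp"
    # define RDX_LP  "rdx"
    #else  /* x32: __ILP32__ on x86-64, 32-bit pointers in 64-bit mode.  */
    # define LP_SIZE "4"
    # define RSP_LP  "esp"
    # define RDX_LP  "edx"
    #endif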
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 869023bbba..b3c1e4fcd7 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
   dtv_t *dtv = THREAD_DTV ();
 
   size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
-  if (__glibc_unlikely (dtv[0].counter != gen))
+  if (__glibc_unlikely (dtv[0].counter != gen)
+      /* See comment in __tls_get_addr in elf/dl-tls.c.  */
+      && !(_dl_tls_allocate_active ()
+           && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
     return update_get_addr (GET_ADDR_PARAM, gen);
 
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
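The extra condition mirrors the fast path of the generic __tls_get_addr: a stale DTV generation no longer forces the slow update when TLS allocation is in progress and the requested module is below the initial module-ID limit. A minimal restatement of the predicate, with the globals turned into parameters so the operator precedence is easy to read (illustrative only; names follow the hunk):

    #include <stdbool.h>
    #include <stddef.h>

    static bool
    must_call_update_get_addr (size_t dtv_gen, size_t global_gen,
                               bool tls_allocate_active, size_t module_id,
                               size_t initial_modid_limit)
    {
      return dtv_gen != global_gen
             /* Modules below the initial limit stay valid while allocation
                is in progress; see __tls_get_addr in elf/dl-tls.c.  */
             && !(tls_allocate_active && module_id < initial_modid_limit);
    }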
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 3592dfae1d..c2e36dcbdf 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -1430,28 +1430,28 @@ Function: "exp10_vlen8_avx2":
 float: 1
 
 Function: "exp10m1":
-double: 2
-float: 1
-float128: 1
-ldouble: 1
+double: 4
+float: 2
+float128: 3
+ldouble: 4
 
 Function: "exp10m1_downward":
-double: 1
-float: 1
-float128: 3
-ldouble: 2
+double: 3
+float: 3
+float128: 6
+ldouble: 6
 
 Function: "exp10m1_towardzero":
-double: 1
-float: 1
-float128: 3
-ldouble: 2
+double: 2
+float: 3
+float128: 6
+ldouble: 5
 
 Function: "exp10m1_upward":
-double: 3
-float: 1
-float128: 3
-ldouble: 2
+double: 5
+float: 4
+float128: 6
+ldouble: 6
 
 Function: "exp2":
 double: 1
@@ -1498,28 +1498,28 @@ Function: "exp2_vlen8_avx2":
 float: 1
 
 Function: "exp2m1":
-double: 1
-float: 1
-float128: 1
-ldouble: 1
-
-Function: "exp2m1_downward":
 double: 2
-float: 1
+float: 2
 float128: 2
 ldouble: 3
 
+Function: "exp2m1_downward":
+double: 3
+float: 3
+float128: 3
+ldouble: 6
+
 Function: "exp2m1_towardzero":
-double: 2
-float: 1
-float128: 2
-ldouble: 3
+double: 3
+float: 2
+float128: 4
+ldouble: 5
 
 Function: "exp2m1_upward":
-double: 1
-float: 1
-float128: 2
-ldouble: 3
+double: 3
+float: 3
+float128: 5
+ldouble: 6
 
 Function: "exp_downward":
 double: 1
@@ -1808,28 +1808,28 @@ Function: "log10_vlen8_avx2":
 float: 1
 
 Function: "log10p1":
-double: 1
-float: 1
+double: 2
+float: 2
 float128: 3
-ldouble: 2
+ldouble: 4
 
 Function: "log10p1_downward":
 double: 2
-float: 1
-float128: 2
-ldouble: 4
+float: 3
+float128: 4
+ldouble: 8
 
 Function: "log10p1_towardzero":
-double: 2
+double: 3
 float: 2
-float128: 2
-ldouble: 4
+float128: 3
+ldouble: 8
 
 Function: "log10p1_upward":
 double: 2
-float: 1
-float128: 3
-ldouble: 3
+float: 3
+float128: 4
+ldouble: 6
 
 Function: "log1p":
 double: 1
@@ -1920,10 +1920,10 @@ Function: "log2_vlen8_avx2":
 float: 1
 
 Function: "log2p1":
-double: 1
-float: 1
+double: 2
+float: 2
 float128: 3
-ldouble: 2
+ldouble: 4
 
 Function: "log2p1_downward":
 double: 2
@@ -1938,9 +1938,9 @@ float128: 2
 ldouble: 4
 
 Function: "log2p1_upward":
-double: 1
+double: 2
 float: 2
-float128: 2
+float128: 3
 ldouble: 5
 
 Function: "log_downward":
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 7a637ef7ca..8dc3d7ab5a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
   attribute_hidden;
 
+static inline int
+prefer_erms_nt_impl (const struct cpu_features *cpu_features)
+{
+  return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+	 || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
+}
+
 static inline void *
 IFUNC_SELECTOR (void)
 {
@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (prefer_erms_nt_impl (cpu_features))
 	    return OPTIMIZE (avx512_unaligned_erms);
 
 	  return OPTIMIZE (avx512_unaligned);
@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (prefer_erms_nt_impl (cpu_features))
 	    return OPTIMIZE (evex_unaligned_erms);
 
 	  return OPTIMIZE (evex_unaligned);
@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void)
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (prefer_erms_nt_impl (cpu_features))
 	    return OPTIMIZE (avx2_unaligned_erms_rtm);
 
 	  return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
 				       Prefer_No_VZEROUPPER, !))
 	{
-	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	  if (prefer_erms_nt_impl (cpu_features))
 	    return OPTIMIZE (avx2_unaligned_erms);
 
 	  return OPTIMIZE (avx2_unaligned);
 	}
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+      || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
     return OPTIMIZE (sse2_unaligned_erms);
 
   return OPTIMIZE (sse2_unaligned);
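Every tier of the selector now funnels through the same decision: take the *_unaligned_erms variant unless ERMS is unusable and the CPU is flagged to avoid non-temporal memset, in which case the plain *_unaligned variant is used. A self-contained sketch of that rule with the feature checks reduced to booleans (the real code queries cpu_features via CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P):

    #include <stdbool.h>

    static bool
    pick_erms_variant (bool erms_usable, bool avoid_nt_memset)
    {
      /* The plain *_unaligned variant is chosen only when ERMS is
         unusable AND non-temporal memset should be avoided.  */
      return erms_usable || !avoid_nt_memset;
    }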
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 048d015712..01008fd981 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -151,13 +151,10 @@ L(more_2x_vec):
 	   loop.  */
 	movups	%xmm0, (%rdi)
 
-# ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
-# else
-	cmp	__x86_shared_cache_size_half(%rip), %rdx
-# endif
+	cmp	__x86_shared_non_temporal_threshold(%rip), %rdx
 	ja	L(large_memcpy)
 
+L(loop_fwd):
 	leaq	-64(%rdi, %rdx), %r8
 	andq	$-16, %rdi
 	movl	$48, %edx
@@ -199,6 +196,13 @@ L(large_memcpy):
 	movups	-64(%r9, %rdx), %xmm10
 	movups	-80(%r9, %rdx), %xmm11
 
+	/* Check if src and dst overlap.  If they do, use cacheable
+	   writes to potentially gain positive interference between
+	   the loads during the memmove.  */
+	subq	%rdi, %r9
+	cmpq	%rdx, %r9
+	jb	L(loop_fwd)
+
 	sall	$5, %ecx
 	leal	(%rcx, %rcx, 2), %r8d
 	leaq	-96(%rdi, %rdx), %rcx
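Two things change in the large-copy path: the cutoff now comes from __x86_shared_non_temporal_threshold rather than half the shared cache size, and an overlap test sends overlapping forward copies back to the cacheable L(loop_fwd) instead of the non-temporal path. A C model of the added subq/cmpq/jb test (illustrative; the real code reuses registers already set up):

    #include <stddef.h>
    #include <stdint.h>

    static int
    use_cacheable_forward_loop (uintptr_t dst, uintptr_t src, size_t len)
    {
      /* r9 = src - dst after the subq.  An unsigned distance smaller than
         len means the forward-copy ranges overlap, so cacheable stores are
         kept and can constructively feed the later loads.  */
      return (size_t) (src - dst) < len;
    }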
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
index 04e2c0e79e..3a0b7c9d64 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -124,13 +124,13 @@ L(page_cross):
 	VPCMPNE	%VMM(1), %VMM(0), %k1
 	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
 	KMOV	%k0, %VRAX
-# ifdef USE_AS_WCSCHR
+	sar	%cl, %VRAX
+#ifdef USE_AS_WCSCHR
 	sub	$VEC_MATCH_MASK, %VRAX
-# else
+#else
 	inc	%VRAX
-# endif
+#endif
 	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRAX
 	jz	L(align_more)
 
 	bsf	%VRAX, %VRAX
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 06730ab2a1..cea034f394 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -209,7 +209,9 @@
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-	.align	16
+	/* Align 64 bytes here. This is to get the L(loop) block ideally
+	   aligned for the DSB.  */
+	.align	64
 	.type	STRCMP, @function
 	.globl	STRCMP
 # ifdef USE_AS_STRCASECMP_L
@@ -509,9 +511,7 @@ L(ret4):
 	ret
 # endif
 
-	/* 32 byte align here ensures the main loop is ideally aligned
-	   for DSB.  */
-	.p2align 5
+	.p2align 4,, 4
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
@@ -1426,10 +1426,9 @@ L(less_32_till_page):
 L(ret_zero_page_cross_slow_case0):
 	xorl	%eax, %eax
 	ret
-# endif
-
-
+# else
 	.p2align 4,, 10
+# endif
 L(less_16_till_page):
 	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
@@ -1482,8 +1481,12 @@ L(less_16_till_page):
 # endif
 	jmp	L(prepare_loop_aligned)
 
-
-
+# ifndef USE_AS_STRNCMP
+	/* Fits in aligning bytes.  */
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+# endif
 
 	.p2align 4,, 10
 L(less_8_till_page):
@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs):
 
 #  ifdef USE_AS_STRNCMP
 	.p2align 4,, 2
+L(ret_zero_4_loop):
 L(ret_zero_page_cross_slow_case1):
 	xorl	%eax, %eax
 	ret
@@ -1586,10 +1590,6 @@ L(less_4_loop):
 	subq	$-(CHAR_PER_VEC * 4), %rdx
 #  endif
 	jmp	L(prepare_loop_aligned)
-
-L(ret_zero_4_loop):
-	xorl	%eax, %eax
-	ret
 L(ret_less_4_loop):
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
new file mode 100644
index 0000000000..1c2cfdfe06
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
@@ -0,0 +1,462 @@
+/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2022-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+#ifdef USE_AS_WCSLEN
+# define VPCMPEQ	vpcmpeqd
+# define VPTESTN	vptestnmd
+# define VPMINU	vpminud
+# define CHAR_SIZE	4
+#else
+# define VPCMPEQ	vpcmpeqb
+# define VPTESTN	vptestnmb
+# define VPMINU	vpminub
+# define CHAR_SIZE	1
+#endif
+
+#define XZERO	VMM_128(0)
+#define VZERO	VMM(0)
+#define PAGE_SIZE	4096
+#define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+#if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+#else
+# define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+#endif
+
+#ifdef USE_AS_WCSLEN
+/* For wide-character, we care more about limiting code size
+   than optimally aligning targets, so just cap nop padding
+   reasonably low.  */
+# define P2ALIGN(...)	.p2align 4,, 6
+# define P2ALIGN_CLAMPED(...)	P2ALIGN(__VA_ARGS__)
+#else
+# define P2ALIGN(x)	.p2align x
+# define P2ALIGN_CLAMPED(x, y)	.p2align x,, y
+#endif
+
+	.section SECTION(.text), "ax", @progbits
+	/* Aligning the entry point to 64 bytes provides better performance
+	   for one-vector-length strings.  */
+ENTRY_P2ALIGN(STRNLEN, 6)
+	/* rdi is pointer to array, rsi is the upper limit.  */
+
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#endif
+
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Check that we won't cross a page boundary with our first load.  */
+	movl	%edi, %eax
+	shll	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(crosses_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRCX
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	movq	%rsi, %rax
+	bsfq	%rcx, %rax
+
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+#endif
+
+	P2ALIGN_CLAMPED(4, 10)
+L(more_1x_vec):
+L(cross_page_continue):
+	/* After this calculation, rax stores the number of elements
+	   left to be processed The complexity comes from the fact some
+	   left to be processed.  The complexity comes from the fact that some
+	   sure we don't count them twice (else, it would just be rsi -
+	   CHAR_PER_VEC).  */
+
+#ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+#else
+	/* Calculate ptr + N - VEC_SIZE, then mask off the low bits,
+	   then subtract ptr to get the new aligned limit value.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+#endif
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	/* Checking here is faster for 256-bit but not 512-bit */
+#if VEC_SIZE == 0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+
+	/* Checking here is faster for 512-bit but not 256-bit */
+#if VEC_SIZE != 0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	/* Check for the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+
+	/* Check the final remaining vector.  */
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+#if VEC_SIZE == 32
+	jz	L(max_0)
+#else
+	jnz	L(last_vec_check)
+	P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+
+#endif
+	P2ALIGN_CLAMPED(4, 4)
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 8)
+#endif
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+
+#if VEC_SIZE == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	P2ALIGN_CLAMPED(4, 6)
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+
+	/* Already checked in 256-bit case */
+#if VEC_SIZE != 0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+#endif
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subb	$(CHAR_PER_VEC), %al
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+
+	P2ALIGN_CLAMPED(4, 14)
+L(first_vec_x2):
+#if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+#else
+	addl	$CHAR_PER_VEC, %esi
+#endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+#if VEC_SIZE == 64
+	P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+#endif
+
+	P2ALIGN_CLAMPED(6, 20)
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+#ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+#else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+#endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+#if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that change this can be replaced with `andq
+	   stalls.  If that changes, this can be replaced with `andq
+	xorb	%dil, %dil
+#else
+	andq	$-(VEC_SIZE * 4), %rdi
+#endif
+
+#ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+#else
+	subq	%rdi, %rax
+#endif
+
+	// mov     %rdi, %rdx
+
+	P2ALIGN(6)
+L(loop):
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k0, %k1
+
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+#endif
+
+
+	P2ALIGN_CLAMPED(4, 11)
+L(loopend):
+	/* We found a null terminator in one of the 4 vectors.  */
+
+	/* Check the first vector.  */
+	movq	%rax, %r8
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* Check the second vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	KMOV	%k0, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* Check the third vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	KMOV	%k2, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* It is in the fourth vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	KMOV	%k1, %VRCX
+	bsf	%rcx, %r8
+
+	P2ALIGN_CLAMPED(4, 3)
+L(end_vec):
+	/* Get the number that has been processed.  */
+	movq	%rsi, %rcx
+	subq	%rax, %rcx
+
+	/* Add that to the offset we found the null terminator at.  */
+	leaq	(%r8, %rcx), %rax
+
+	/* Take the min of that and the limit.  */
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+	P2ALIGN_CLAMPED(4, 11)
+L(crosses_page_boundary):
+	/* Align data backwards to VEC_SIZE.  */
+	shrl	$20, %eax
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+#endif
+	/* By this point rax contains number of bytes we need to skip.  */
+	shrx	%VRAX, %VRCX, %VRCX
+
+	/* Calculates CHAR_PER_VEC - eax and stores in eax.  */
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+
+	/* The vector had a null terminator or we are at the limit.  */
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+
+END(STRNLEN)
+#endif
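The entry fast path above leans on bsf leaving its destination untouched when the source is zero: the limit is preloaded into the result register, so a vector with no null byte falls straight through to the limit check, and the final cmp/cmovb clamps an in-vector hit. A C model of just that fast path (illustrative; the mask is one bit per character position of the first vector):

    #include <stddef.h>
    #include <stdint.h>

    static size_t
    first_vec_result (uint64_t nul_mask, size_t maxlen)
    {
      size_t pos = maxlen;                          /* movq %rsi, %rax */
      if (nul_mask != 0)                            /* bsf keeps %rax when the mask is 0 */
        pos = (size_t) __builtin_ctzll (nul_mask);  /* bsfq %rcx, %rax */
      return pos < maxlen ? pos : maxlen;           /* cmpq %rax, %rsi; cmovb */
    }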
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 91b16830eb..c41288906c 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,423 +1,7 @@
-/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-#include <sysdep.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-
-# ifndef STRNLEN
-#  define STRNLEN	__strnlen_evex
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPCMPNEQ	vpcmpneqd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPCMPNEQ	vpcmpneqb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-
-#  define REG_WIDTH	VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 32
-#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
-# else
-#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
-# endif
-
-
-
-# if CHAR_PER_VEC == 64
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-# else
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-# endif
-
-
-# define XZERO	VMM_128(0)
-# define VZERO	VMM(0)
-# define PAGE_SIZE	4096
-
-	.section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRNLEN, 6)
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-# endif
-
-	movl	%edi, %eax
-	vpxorq	%XZERO, %XZERO, %XZERO
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-	   null byte.  */
-	VPCMPEQ	(%rdi), %VZERO, %k0
-
-	KMOV	%k0, %VRCX
-	movq	%rsi, %rax
-
-	/* If src (rcx) is zero, bsf does not change the result.  NB:
-	   Must use 64-bit bsf here so that upper bits of len are not
-	   cleared.  */
-	bsfq	%rcx, %rax
-	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
-	   CHAR) and rsi must be > CHAR_PER_VEC.  */
-	cmpq	$CHAR_PER_VEC, %rax
-	ja	L(more_1x_vec)
-	/* Check if first match in bounds.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
-
-
-# if CHAR_PER_VEC != 32
-	.p2align 4,, 2
-L(zero):
-L(max_0):
-	movl	%esi, %eax
-	ret
-# endif
-
-	/* Aligned more for strnlen compares remaining length vs 2 *
-	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-	   going to the loop.  */
-	.p2align 4,, 10
-L(more_1x_vec):
-L(cross_page_continue):
-	/* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-	   overflow.  */
-	movq	%rdi, %rax
-	andq	$(VEC_SIZE * -1), %rdi
-	subq	%rdi, %rax
-	sarq	$2, %rax
-	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
-# else
-	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
-	andq	$(VEC_SIZE * -1), %rdi
-	subq	%rdi, %rax
-# endif
-
-
-	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
-
-	cmpq	$(CHAR_PER_VEC * 2), %rax
-	ja	L(more_2x_vec)
-
-L(last_2x_vec_or_less):
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-
-	/* Check the end of data.  */
-	SUB_SHORT (CHAR_PER_VEC, rax)
-	jbe	L(max_0)
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(max_0)
-	/* Best place for LAST_VEC_CHECK if ZMM.  */
-	.p2align 4,, 8
-L(last_vec_check):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %edx
-	lea	(%rsi, %rdx), %eax
-	cmovae	%esi, %eax
-	ret
-
-# if CHAR_PER_VEC == 32
-	.p2align 4,, 2
-L(zero):
-L(max_0):
-	movl	%esi, %eax
-	ret
-# endif
-
-	.p2align 4,, 8
-L(last_4x_vec_or_less):
-	addl	$(CHAR_PER_VEC * -4), %eax
-	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
-	subq	$(VEC_SIZE * -4), %rdi
-	cmpl	$(CHAR_PER_VEC * 2), %eax
-	jbe	L(last_2x_vec_or_less)
-
-	.p2align 4,, 6
-L(more_2x_vec):
-	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-	   rechecking bounds.  */
-
-	KMOV	%k0, %VRDX
-
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x1)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x2)
-
-	cmpq	$(CHAR_PER_VEC * 4), %rax
-	ja	L(more_4x_vec)
-
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	addl	$(CHAR_PER_VEC * -2), %eax
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-
-	subl	$(CHAR_PER_VEC), %eax
-	jbe	L(max_1)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-L(max_1):
-	movl	%esi, %eax
-	ret
-
-	.p2align 4,, 3
-L(first_vec_x2):
-# if VEC_SIZE == 64
-	/* If VEC_SIZE == 64 we can fit logic for full return label in
-	   spare bytes before next cache line.  */
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
-	ret
-	.p2align 4,, 6
-# else
-	addl	$CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x1):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
-	ret
-
-
-	.p2align 4,, 6
-L(first_vec_x4):
-# if VEC_SIZE == 64
-	/* If VEC_SIZE == 64 we can fit logic for full return label in
-	   spare bytes before next cache line.  */
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
-	ret
-	.p2align 4,, 6
-# else
-	addl	$CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x3):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
-	ret
-
-	.p2align 4,, 5
-L(more_4x_vec):
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x3)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x4)
-
-	/* Check if at last VEC_SIZE * 4 length before aligning for the
-	   loop.  */
-	cmpq	$(CHAR_PER_VEC * 8), %rax
-	jbe	L(last_4x_vec_or_less)
-
-
-	/* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-	   overflow.  */
-	leaq	(VEC_SIZE * -3)(%rdi), %rdx
-# else
-	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
-# endif
-
-	subq	$(VEC_SIZE * -1), %rdi
-
-	/* Align data to VEC_SIZE * 4.  */
-# if VEC_SIZE == 64
-	/* Saves code size.  No evex512 processor has partial register
-	   stalls.  If that change this can be replaced with `andq
-	   $-(VEC_SIZE * 4), %rdi`.  */
-	xorb	%dil, %dil
-# else
-	andq	$-(VEC_SIZE * 4), %rdi
-# endif
-
-# ifdef USE_AS_WCSLEN
-	subq	%rdi, %rdx
-	sarq	$2, %rdx
-	addq	%rdx, %rax
-# else
-	subq	%rdi, %rax
-# endif
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4,, 11
-L(loop_4x_vec):
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k2
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rax
-	jbe	L(loop_len_end)
-
-
-	KORTEST %k0, %k2
-	jz	L(loop_4x_vec)
-
-
-L(loop_last_4x_vec):
-	movq	%rsi, %rcx
-	subq	%rax, %rsi
-	VPTESTN	%VMM(1), %VMM(1), %k1
-	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x0)
-
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x1)
-
-	VPTESTN	%VMM(3), %VMM(3), %k0
-
-	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
-	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
-	   individually, for VEC_SIZE == 32 we combine them in a single
-	   64-bit GPR.  */
-# if CHAR_PER_VEC == 64
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x2)
-	KMOV	%k2, %VRDX
-# else
-	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-	 */
-	kmovd	%k2, %edx
-	kmovd	%k0, %eax
-	salq	$CHAR_PER_VEC, %rdx
-	orq	%rax, %rdx
-# endif
-
-	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-	 */
-	bsfq	%rdx, %rdx
-	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-
-	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
-	.p2align 4,, 4
-L(loop_len_end):
-	KORTEST %k0, %k2
-	jnz	L(loop_last_4x_vec)
-	movq	%rsi, %rax
-	ret
-
-
-# if CHAR_PER_VEC == 64
-	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64
-	   need return label for it.  */
-	.p2align 4,, 8
-L(last_vec_x2):
-	bsf	%VRDX, %VRDX
-	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-# endif
-
-
-	.p2align 4,, 10
-L(last_vec_x1):
-	addq	$CHAR_PER_VEC, %rsi
-L(last_vec_x0):
-	bsf	%VRDX, %VRDX
-	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-
-
-	.p2align 4,, 8
-L(cross_page_boundary):
-	/* Align data to VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andq	$-VEC_SIZE, %rcx
-	VPCMPEQ	(%rcx), %VZERO, %k0
-
-	KMOV	%k0, %VRCX
-# ifdef USE_AS_WCSLEN
-	shrl	$2, %eax
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	shrx	%VRAX, %VRCX, %VRCX
-
-	negl	%eax
-	andl	$(CHAR_PER_VEC - 1), %eax
-	movq	%rsi, %rdx
-	bsf	%VRCX, %VRDX
-	cmpq	%rax, %rdx
-	ja	L(cross_page_continue)
-	movl	%edx, %eax
-	cmpq	%rdx, %rsi
-	cmovb	%esi, %eax
-	ret
-END (STRNLEN)
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex
 #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strnlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index f8e55883bb..8ef54078f8 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,264 +1,7 @@
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
 #ifndef STRNLEN
 #define STRNLEN __strnlen_evex512
 #endif
 
 #include "x86-evex512-vecs.h"
 #include "reg-macros.h"
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPTESTN	vptestnmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPTESTN	vptestnmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-# endif
-
-# define PAGE_SIZE	4096
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRNLEN, 6)
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(ret_max)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-
-	movl	%edi, %eax
-	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall	$20, %eax
-	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja	L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ	(%rdi), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	/* Store max length in rax.  */
-	mov	%rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq	%rcx, %rax
-	cmp	$CHAR_PER_VEC, %rax
-	ja	L(align_more)
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
-
-	/* At this point vector max length reached.  */
-	.p2align 4,,3
-L(ret_max):
-	movq	%rsi, %rax
-	ret
-
-L(align_more):
-	mov	%rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rax
-	movq	%rdi, %rdx
-	subq	%rax, %rdx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRDX
-#  endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	subq	$-VEC_SIZE, %rax
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x4)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq	%rax, %rcx
-
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq	$-(VEC_SIZE * 4), %rax
-
-	subq	%rax, %rcx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRCX
-#  endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq	%rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
-
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k1
-
-	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
-
-	jnz	L(loopend)
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop)
-	mov	%rsi, %rax
-	ret
-
-L(loopend):
-
-	VPTESTN	%VMM(1), %VMM(1), %k2
-	KMOV	%k2, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	KMOV	%k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	VPTESTN	%VMM(3), %VMM(3), %k3
-	KMOV	%k3, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV	%k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 3), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(ret_vec_x3):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 2), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(ret_vec_x2):
-	subq	$-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
-	addq	%rcx, %rax
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(page_cross):
-	mov	%rdi, %rax
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-# ifdef USE_AS_WCSLEN
-	sarl	$2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq	$-VEC_SIZE, %rax
-	VPCMPEQ	(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRDX
-	jnz	L(page_cross_end)
-	movl    $CHAR_PER_VEC, %eax
-	sub     %ecx, %eax
-	cmp	%rax, %rsi
-	ja	L(align_more)
-
-L(page_cross_end):
-	bsf	%VRDX, %VRAX
-	cmpq	%rsi, %rax
-	cmovnb	%esi, %eax
-	ret
-
-END (STRNLEN)
-#endif
+#include "strnlen-evex-base.S"
\ No newline at end of file
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 2bbd563a6c..988a4b8593 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -13,6 +13,3 @@ MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
 POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
 FEATURE_1_OFFSET	offsetof (tcbhead_t, feature_1)
 SSP_BASE_OFFSET		offsetof (tcbhead_t, ssp_base)
-
--- Not strictly offsets, but these values are also used in the TCB.
-TCB_CANCELED_BITMASK	 CANCELED_BITMASK
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index a015789a4f..a9b20b798f 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -1,3 +1,9 @@
+ifeq ($(subdir),elf)
+# Xfail tst-platform-1 on x32 since kernel passes i686 in AT_PLATFORM.
+# See https://sourceware.org/bugzilla/show_bug.cgi?id=22363
+test-xfail-tst-platform-1 = yes
+endif
+
 ifeq ($(subdir),math)
 # Since x32 returns 32-bit long int and 64-bit long long int in the
 # same 64-bit register, we make the 32-bit lround an alias of the
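The xfail added above exists because the x32 kernel reports the 32-bit platform string in AT_PLATFORM. A quick way to see what a process actually receives, using the standard getauxval interface (this program is not part of the change):

    #include <stdio.h>
    #include <sys/auxv.h>

    int
    main (void)
    {
      /* On x32 this prints "i686" rather than "x86_64".  */
      const char *platform = (const char *) getauxval (AT_PLATFORM);
      printf ("AT_PLATFORM = %s\n", platform != NULL ? platform : "(unset)");
      return 0;
    }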
diff --git a/sysdeps/x86_64/x32/dl-machine.h b/sysdeps/x86_64/x32/dl-machine.h
deleted file mode 100644
index c35cee9261..0000000000
--- a/sysdeps/x86_64/x32/dl-machine.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Machine-dependent ELF dynamic relocation inline functions.  x32 version.
-   Copyright (C) 2012-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Must allow <sysdeps/x86_64/dl-machine.h> to be included more than once.
-   See #ifdef RESOLVE_MAP in sysdeps/x86_64/dl-machine.h.  */
-#include <sysdeps/x86_64/dl-machine.h>
-
-#ifndef _X32_DL_MACHINE_H
-#define _X32_DL_MACHINE_H
-
-#undef ARCH_LA_PLTENTER
-#undef ARCH_LA_PLTEXIT
-#undef RTLD_START
-
-/* Names of the architecture-specific auditing callback functions.  */
-#define ARCH_LA_PLTENTER x32_gnu_pltenter
-#define ARCH_LA_PLTEXIT x32_gnu_pltexit
-
-/* Initial entry point code for the dynamic linker.
-   The C function `_dl_start' is the real entry point;
-   its return value is the user program's entry point.  */
-#define RTLD_START asm ("\n\
-.text\n\
-	.p2align 4\n\
-.globl _start\n\
-.globl _dl_start_user\n\
-_start:\n\
-	movl %esp, %edi\n\
-	call _dl_start\n\
-_dl_start_user:\n\
-	# Save the user entry point address in %r12.\n\
-	movl %eax, %r12d\n\
-	# Read the original argument count.\n\
-	movl (%rsp), %edx\n\
-	# Call _dl_init (struct link_map *main_map, int argc, char **argv, char **env)\n\
-	# argc -> rsi\n\
-	movl %edx, %esi\n\
-	# Save %rsp value in %r13.\n\
-	movl %esp, %r13d\n\
-	# And align stack for the _dl_init call.\n\
-	and $-16, %esp\n\
-	# _dl_loaded -> rdi\n\
-	movl _rtld_local(%rip), %edi\n\
-	# env -> rcx\n\
-	lea 8(%r13,%rdx,4), %ecx\n\
-	# argv -> rdx\n\
-	lea 4(%r13), %edx\n\
-	# Clear %rbp to mark outermost frame obviously even for constructors.\n\
-	xorl %ebp, %ebp\n\
-	# Call the function to run the initializers.\n\
-	call _dl_init\n\
-	# Pass our finalizer function to the user in %rdx, as per ELF ABI.\n\
-	lea _dl_fini(%rip), %edx\n\
-	# And make sure %rsp points to argc stored on the stack.\n\
-	movl %r13d, %esp\n\
-	# Jump to the user's entry point.\n\
-	jmp *%r12\n\
-.previous\n\
-");
-
-#endif /* !_X32_DL_MACHINE_H */