32 files changed, 1221 insertions, 109 deletions
diff --git a/INSTALL b/INSTALL
index 208ec98d4b..4bd3d53676 100644
--- a/INSTALL
+++ b/INSTALL
@@ -224,6 +224,23 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
      By default for x86_64, the GNU C Library is built with the vector
      math library.  Use this option to disable the vector math library.
 
+'--disable-static-c++-tests'
+     By default, if the C++ toolchain lacks support for static linking,
+     configure fails to find the C++ header files and the glibc build
+     fails.  '--disable-static-c++-link-check' allows the glibc build to
+     finish, but static C++ tests will fail if the C++ toolchain doesn't
+     have the necessary static C++ libraries.  Use this option to skip
+     the static C++ tests.  This option implies
+     '--disable-static-c++-link-check'.
+
+'--disable-static-c++-link-check'
+     By default, if the C++ toolchain lacks support for static linking,
+     configure fails to find the C++ header files and the glibc build
+     fails.  Use this option to disable the static C++ link check so
+     that the C++ header files can be located.  The newly built libc.a
+     can be used to create static C++ tests if the C++ toolchain has the
+     necessary static C++ libraries.
+
 '--disable-scv'
      Disable using 'scv' instruction for syscalls.  All syscalls will
      use 'sc' instead, even if the kernel supports 'scv'.  PowerPC only.
diff --git a/catgets/Makefile b/catgets/Makefile
index 24b4560d5f..40c65eac95 100644
--- a/catgets/Makefile
+++ b/catgets/Makefile
@@ -43,8 +43,12 @@ tests-special += \
   $(objpfx)test-gencat.out \
   $(objpfx)test1.cat \
   $(objpfx)test2.cat \
-  $(objpfx)tst-catgets-mem.out
   # tests-special
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+tests-special += $(objpfx)tst-catgets-mem.out
+endif
+endif
 endif
 
 gencat-modules	= xmalloc
@@ -68,9 +72,17 @@ generated += \
   test1.h \
   test2.cat \
   test2.h \
+  # generated
+ifeq ($(run-built-tests),yes)
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+generated += \
   tst-catgets-mem.out \
   tst-catgets.mtrace \
   # generated
+endif
+endif
+endif
 
 generated-dirs += \
   de \
diff --git a/configure b/configure
index 1df2f2e6d1..1bae55b45b 100755
--- a/configure
+++ b/configure
@@ -771,6 +771,8 @@ ac_user_opts='
 enable_option_checking
 with_pkgversion
 with_bugurl
+enable_static_c___tests
+enable_static_c___link_check
 with_gd
 with_gd_include
 with_gd_lib
@@ -1440,6 +1442,10 @@ Optional Features:
   --disable-option-checking  ignore unrecognized --enable/--with options
   --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
   --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --disable-static-c++-tests
+                          disable static C++ tests[default=no]
+  --disable-static-c++-link-check
+                          disable static C++ link check [default=no]
   --disable-sanity-checks really do not use threads (should not be used except
                           in special situations) [default=yes]
   --enable-shared         build shared library [default=yes if GNU ld]
@@ -3855,6 +3861,29 @@ if test -z "$CPP"; then
 fi
 
 
+# This will get text that should go into config.make.
+config_vars=
+
+# Check whether --enable-static-c++-tests was given.
+if test ${enable_static_c___tests+y}
+then :
+  enableval=$enable_static_c___tests; static_cxx_tests=$enableval
+else $as_nop
+  static_cxx_tests=yes
+fi
+
+config_vars="$config_vars
+static-cxx-tests = $static_cxx_tests"
+
+# Check whether --enable-static-c++-link-check was given.
+if test ${enable_static_c___link_check+y}
+then :
+  enableval=$enable_static_c___link_check; static_cxx_link_check=$enableval
+else $as_nop
+  static_cxx_link_check=yes
+fi
+
+
 # We need the C++ compiler only for testing.
 
 
@@ -4279,10 +4308,11 @@ esac
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-# Static case.
-old_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS -static"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+if test $static_cxx_link_check$static_cxx_tests = yesyes; then
+  # Static case.
+  old_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS -static"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 #include <iostream>
@@ -4304,7 +4334,8 @@ esac
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.beam \
     conftest$ac_exeext conftest.$ac_ext
-LDFLAGS="$old_LDFLAGS"
+  LDFLAGS="$old_LDFLAGS"
+fi
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -4324,9 +4355,6 @@ if test "`cd $srcdir; pwd -P`" = "`pwd -P`"; then
   as_fn_error $? "you must configure in a separate build directory" "$LINENO" 5
 fi
 
-# This will get text that should go into config.make.
-config_vars=
-
 # Check for a --with-gd argument and set libgd-LDFLAGS in config.make.
 
 # Check whether --with-gd was given.
diff --git a/configure.ac b/configure.ac
index bdc385d03c..e48957f318 100644
--- a/configure.ac
+++ b/configure.ac
@@ -52,6 +52,22 @@ fi
 AC_SUBST(cross_compiling)
 AC_PROG_CPP
 
+# This will get text that should go into config.make.
+config_vars=
+
+AC_ARG_ENABLE([static-c++-tests],
+	      AS_HELP_STRING([--disable-static-c++-tests],
+			     [disable static C++ tests@<:@default=no@:>@]),
+	      [static_cxx_tests=$enableval],
+	      [static_cxx_tests=yes])
+LIBC_CONFIG_VAR([static-cxx-tests], [$static_cxx_tests])
+
+AC_ARG_ENABLE([static-c++-link-check],
+	      AS_HELP_STRING([--disable-static-c++-link-check],
+			     [disable static C++ link check @<:@default=no@:>@]),
+	      [static_cxx_link_check=$enableval],
+	      [static_cxx_link_check=yes])
+
 # We need the C++ compiler only for testing.
 AC_PROG_CXX
 # It's useless to us if it can't link programs (e.g. missing -lstdc++).
@@ -61,10 +77,11 @@ AC_LANG_PUSH([C++])
 AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])],
 	       [libc_cv_cxx_link_ok=yes],
 	       [libc_cv_cxx_link_ok=no])
-# Static case.
-old_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS -static"
-AC_LINK_IFELSE([AC_LANG_SOURCE([
+if test $static_cxx_link_check$static_cxx_tests = yesyes; then
+  # Static case.
+  old_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS -static"
+  AC_LINK_IFELSE([AC_LANG_SOURCE([
 #include <iostream>
 
 int
@@ -74,9 +91,10 @@ main()
   return 0;
 }
 ])],
-	       [],
-	       [libc_cv_cxx_link_ok=no])
-LDFLAGS="$old_LDFLAGS"
+		 [],
+		 [libc_cv_cxx_link_ok=no])
+  LDFLAGS="$old_LDFLAGS"
+fi
 AC_LANG_POP([C++])])
 AS_IF([test $libc_cv_cxx_link_ok != yes], [CXX=])
 
@@ -84,9 +102,6 @@ if test "`cd $srcdir; pwd -P`" = "`pwd -P`"; then
   AC_MSG_ERROR([you must configure in a separate build directory])
 fi
 
-# This will get text that should go into config.make.
-config_vars=
-
 # Check for a --with-gd argument and set libgd-LDFLAGS in config.make.
 AC_ARG_WITH([gd],
 	    AS_HELP_STRING([--with-gd=DIR],
diff --git a/elf/Makefile b/elf/Makefile
index bb6cd06dec..24ad5221c2 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -445,6 +445,7 @@ tests += \
   tst-p_align1 \
   tst-p_align2 \
   tst-p_align3 \
+  tst-recursive-tls \
   tst-relsort1 \
   tst-ro-dynamic \
   tst-rtld-run-static \
@@ -632,13 +633,19 @@ $(objpfx)tst-rtld-does-not-exist.out: tst-rtld-does-not-exist.sh $(objpfx)ld.so
 tests += $(tests-execstack-$(have-z-execstack))
 ifeq ($(run-built-tests),yes)
 tests-special += \
-  $(objpfx)noload-mem.out \
   $(objpfx)tst-ldconfig-X.out \
   $(objpfx)tst-ldconfig-p.out \
   $(objpfx)tst-ldconfig-soname.out \
-  $(objpfx)tst-leaks1-mem.out \
   $(objpfx)tst-rtld-help.out \
   # tests-special
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+tests-special += \
+  $(objpfx)noload-mem.out \
+  $(objpfx)tst-leaks1-mem.out \
+  # tests-special
+endif
+endif
 endif
 tlsmod17a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
 tlsmod18a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
@@ -886,6 +893,23 @@ modules-names += \
   tst-null-argv-lib \
   tst-p_alignmod-base \
   tst-p_alignmod3 \
+  tst-recursive-tlsmallocmod \
+  tst-recursive-tlsmod0 \
+  tst-recursive-tlsmod1 \
+  tst-recursive-tlsmod2 \
+  tst-recursive-tlsmod3 \
+  tst-recursive-tlsmod4 \
+  tst-recursive-tlsmod5 \
+  tst-recursive-tlsmod6 \
+  tst-recursive-tlsmod7 \
+  tst-recursive-tlsmod8 \
+  tst-recursive-tlsmod9 \
+  tst-recursive-tlsmod10 \
+  tst-recursive-tlsmod11 \
+  tst-recursive-tlsmod12 \
+  tst-recursive-tlsmod13 \
+  tst-recursive-tlsmod14 \
+  tst-recursive-tlsmod15 \
   tst-relsort1mod1 \
   tst-relsort1mod2 \
   tst-ro-dynamic-mod \
@@ -3093,3 +3117,11 @@ CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=$(have-mtls-descriptor)
 CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=$(have-mtls-descriptor)
 CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=$(have-mtls-descriptor)
 endif
+
+$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so
+# More objects than DTV_SURPLUS, to trigger DTV reallocation.
+$(objpfx)tst-recursive-tls.out: \
+  $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \
+    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c
+	$(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$*
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 670dbc42fc..3d221273f1 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -75,6 +75,31 @@
 /* Default for dl_tls_static_optional.  */
 #define OPTIONAL_TLS 512
 
+/* Used to count the number of threads currently executing dynamic TLS
+   updates.  Used to avoid recursive malloc calls in __tls_get_addr
+   for an interposed malloc that uses global-dynamic TLS (which is not
+   recommended); see _dl_tls_allocate_active checks.  This could be a
+   per-thread flag, but would need TLS access in the dynamic linker.  */
+unsigned int _dl_tls_threads_in_update;
+
+static inline void
+_dl_tls_allocate_begin (void)
+{
+  atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1);
+}
+
+static inline void
+_dl_tls_allocate_end (void)
+{
+  atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1);
+}
+
+static inline bool
+_dl_tls_allocate_active (void)
+{
+  return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0;
+}
+
 /* Compute the static TLS surplus based on the namespace count and the
    TLS space that can be used for optimizations.  */
 static inline int
@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void)
   size += TLS_PRE_TCB_SIZE;
 #endif
 
-  /* Perform the allocation.  Reserve space for the required alignment
-     and the pointer to the original allocation.  */
+  /* Reserve space for the required alignment and the pointer to the
+     original allocation.  */
   size_t alignment = GLRO (dl_tls_static_align);
+
+  /* Perform the allocation.  */
+  _dl_tls_allocate_begin ();
   void *allocated = malloc (size + alignment + sizeof (void *));
   if (__glibc_unlikely (allocated == NULL))
-    return NULL;
+    {
+      _dl_tls_allocate_end ();
+      return NULL;
+    }
 
   /* Perform alignment and allocate the DTV.  */
 #if TLS_TCB_AT_TP
@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void)
   result = allocate_dtv (result);
   if (result == NULL)
     free (allocated);
+
+  _dl_tls_allocate_end ();
   return result;
 }
 
@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
   size_t newsize = max_modid + DTV_SURPLUS;
   size_t oldsize = dtv[-1].counter;
 
+  _dl_tls_allocate_begin ();
   if (dtv == GL(dl_initial_dtv))
     {
       /* This is the initial dtv that was either statically allocated in
@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
       if (newp == NULL)
 	oom ();
     }
+  _dl_tls_allocate_end ();
 
   newp[0].counter = newsize;
 
@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size)
   if (powerof2 (alignment) && alignment <= _Alignof (max_align_t))
     {
       /* The alignment is supported by malloc.  */
+      _dl_tls_allocate_begin ();
       void *ptr = malloc (size);
+      _dl_tls_allocate_end ();
       return (struct dtv_pointer) { ptr, ptr };
     }
 
@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size)
 
   /* Perform the allocation.  This is the pointer we need to free
      later.  */
+  _dl_tls_allocate_begin ();
   void *start = malloc (alloc_size);
+  _dl_tls_allocate_end ();
+
   if (start == NULL)
     return (struct dtv_pointer) {};
 
@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
 		 free implementation.  Checking here papers over at
 		 least some dynamic TLS usage by interposed mallocs.  */
 	      if (dtv[modid].pointer.to_free != NULL)
-		free (dtv[modid].pointer.to_free);
+		{
+		  _dl_tls_allocate_begin ();
+		  free (dtv[modid].pointer.to_free);
+		  _dl_tls_allocate_end ();
+		}
 	      dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
 	      dtv[modid].pointer.to_free = NULL;
 
@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS)
   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
   if (__glibc_unlikely (dtv[0].counter != gen))
     {
-      /* Update DTV up to the global generation, see CONCURRENCY NOTES
-         in _dl_update_slotinfo.  */
-      gen = atomic_load_acquire (&GL(dl_tls_generation));
-      return update_get_addr (GET_ADDR_PARAM, gen);
+      if (_dl_tls_allocate_active ()
+	  && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)
+	  /* This is a reentrant __tls_get_addr call, but we can
+	     satisfy it because it's an initially-loaded module ID.
+	     These TLS slotinfo slots do not change, so the
+	     out-of-date generation counter does not matter.  However,
+	     if not in a TLS update, still update_get_addr below, to
+	     get off the slow path eventually.  */
+	;
+      else
+	{
+	  /* Update DTV up to the global generation, see CONCURRENCY NOTES
+	     in _dl_update_slotinfo.  */
+	  gen = atomic_load_acquire (&GL(dl_tls_generation));
+	  return update_get_addr (GET_ADDR_PARAM, gen);
+	}
     }
 
   void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS)
 
   return (char *) p + GET_ADDR_OFFSET;
 }
-#endif
+#endif /* SHARED */
 
 
 /* Look up the module's TLS block as for __tls_get_addr,
@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l)
   return data;
 }
 
+size_t _dl_tls_initial_modid_limit;
+
+void
+_dl_tls_initial_modid_limit_setup (void)
+{
+  struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
+  size_t idx;
+  for (idx = 0; idx < listp->len; ++idx)
+    {
+      struct link_map *l = listp->slotinfo[idx].map;
+      if (l == NULL
+	  /* The object can be unloaded, so its modid can be
+	     reassociated.  */
+	  || !(l->l_type == lt_executable || l->l_type == lt_library))
+	break;
+    }
+  _dl_tls_initial_modid_limit = idx;
+}
+
 
 void
 _dl_add_to_slotinfo (struct link_map *l, bool do_add)
@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add)
 	 the first slot.  */
       assert (idx == 0);
 
+      _dl_tls_allocate_begin ();
       listp = (struct dtv_slotinfo_list *)
 	malloc (sizeof (struct dtv_slotinfo_list)
 		+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+      _dl_tls_allocate_end ();
       if (listp == NULL)
 	{
 	  /* We ran out of memory while resizing the dtv slotinfo list.  */
diff --git a/elf/rtld.c b/elf/rtld.c
index e9525ea987..6352ba76c5 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -788,6 +788,8 @@ init_tls (size_t naudit)
     _dl_fatal_printf ("\
 cannot allocate TLS data structures for initial thread\n");
 
+  _dl_tls_initial_modid_limit_setup ();
+
   /* Store for detection of the special case by __tls_get_addr
      so it knows not to pass this dtv to the normal realloc.  */
   GL(dl_initial_dtv) = GET_DTV (tcbp);
diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c
new file mode 100644
index 0000000000..716d1f783a
--- /dev/null
+++ b/elf/tst-recursive-tls.c
@@ -0,0 +1,60 @@
+/* Test with interposed malloc with dynamic TLS.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+/* Defined in tst-recursive-tlsmallocmod.so.  */
+extern __thread unsigned int malloc_subsytem_counter;
+
+static int
+do_test (void)
+{
+  /* 16 is large enough to exercise the DTV resizing case.  */
+  void *handles[16];
+
+  for (unsigned int i = 0; i < array_length (handles); ++i)
+    {
+      /* Re-use the TLS slot for module 0.  */
+      if (i > 0)
+        xdlclose (handles[0]);
+
+      char soname[30];
+      snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i);
+      handles[i] = xdlopen (soname, RTLD_NOW);
+
+      if (i > 0)
+        {
+          handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW);
+          int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0");
+          /* May trigger TLS storage allocation using malloc.  */
+          TEST_COMPARE (fptr (), 0);
+        }
+    }
+
+  for (unsigned int i = 0; i < array_length (handles); ++i)
+    xdlclose (handles[i]);
+
+  printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter);
+  TEST_VERIFY (malloc_subsytem_counter > 0);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c
new file mode 100644
index 0000000000..c24e9945d1
--- /dev/null
+++ b/elf/tst-recursive-tlsmallocmod.c
@@ -0,0 +1,64 @@
+/* Interposed malloc with dynamic TLS.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <dlfcn.h>
+
+__thread unsigned int malloc_subsytem_counter;
+
+static __typeof (malloc) *malloc_fptr;
+static __typeof (free) *free_fptr;
+static __typeof (calloc) *calloc_fptr;
+static __typeof (realloc) *realloc_fptr;
+
+static void __attribute__ ((constructor))
+init (void)
+{
+  malloc_fptr = dlsym (RTLD_NEXT, "malloc");
+  free_fptr = dlsym (RTLD_NEXT, "free");
+  calloc_fptr = dlsym (RTLD_NEXT, "calloc");
+  realloc_fptr = dlsym (RTLD_NEXT, "realloc");
+}
+
+void *
+malloc (size_t size)
+{
+  ++malloc_subsytem_counter;
+  return malloc_fptr (size);
+}
+
+void
+free (void *ptr)
+{
+  ++malloc_subsytem_counter;
+  return free_fptr (ptr);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+  ++malloc_subsytem_counter;
+  return calloc_fptr (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+  ++malloc_subsytem_counter;
+  return realloc_fptr (ptr, size);
+}
diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c
new file mode 100644
index 0000000000..bb7592aee6
--- /dev/null
+++ b/elf/tst-recursive-tlsmodN.c
@@ -0,0 +1,28 @@
+/* Test module with global-dynamic TLS.  Used to trigger DTV reallocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Compiled with VAR and FUNC set via -D.  FUNC requires some
+   relocation against TLS variable VAR.  */
+
+__thread int VAR;
+
+int
+FUNC (void)
+{
+  return VAR;
+}
diff --git a/io/bits/fcntl2.h b/io/bits/fcntl2.h
index 26f1792fd1..0cced392e7 100644
--- a/io/bits/fcntl2.h
+++ b/io/bits/fcntl2.h
@@ -61,13 +61,8 @@ open (const char *__path, int __oflag, ...)
   return __open_alias (__path, __oflag, __va_arg_pack ());
 }
 #elif __fortify_use_clang
-__fortify_function_error_function __attribute_overloadable__ int
-open (const char *__path, int __oflag, mode_t __mode, ...)
-     __fortify_clang_unavailable ("open can be called either with 2 or 3 arguments, not more");
-
 __fortify_function __attribute_overloadable__ int
 open (__fortify_clang_overload_arg (const char *, ,__path), int __oflag)
-     __fortify_clang_prefer_this_overload
      __fortify_clang_error (__OPEN_NEEDS_MODE (__oflag),
 			    "open with O_CREAT or O_TMPFILE in second argument needs 3 arguments")
 {
diff --git a/libio/Makefile b/libio/Makefile
index f607edbefb..8720381fdc 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -261,15 +261,28 @@ tst-bz22415-ENV = MALLOC_TRACE=$(objpfx)tst-bz22415.mtrace \
 tst-bz24228-ENV = MALLOC_TRACE=$(objpfx)tst-bz24228.mtrace \
 		  LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
 
-generated += test-fmemopen.mtrace test-fmemopen.check
-generated += tst-fdopen-seek-failure.mtrace tst-fdopen-seek-failure.check
-generated += tst-fopenloc.mtrace tst-fopenloc.check
-generated += tst-bz22415.mtrace tst-bz22415.check
-
 aux	:= fileops genops stdfiles stdio strops
 
+ifeq ($(run-built-tests),yes)
+ifeq ($(build-shared),yes)
+ifneq ($(PERL),no)
+generated += \
+  test-fmemopen.check \
+  test-fmemopen.mtrace \
+  tst-bz22415.check \
+  tst-bz22415.mtrace \
+  tst-bz24228.check \
+  tst-bz24228.mtrace \
+  tst-fdopen-seek-failure.check \
+  tst-fdopen-seek-failure.mtrace \
+  tst-fopenloc.check \
+  tst-fopenloc.mtrace \
+  # generated
+endif
+endif
+endif
+
 ifeq ($(build-shared),yes)
-generated += tst-bz24228.mtrace tst-bz24228.check
 aux	+= oldfileops oldstdfiles
 tests += \
   tst-stderr-compat \
@@ -286,16 +299,23 @@ shared-only-routines = oldiofopen oldiofdopen oldiofclose oldfileops	\
 
 ifeq ($(run-built-tests),yes)
 tests-special += \
-  $(objpfx)test-fmemopen-mem.out \
   $(objpfx)test-freopen.out \
-  $(objpfx)tst-bz22415-mem.out \
-  $(objpfx)tst-fdopen-seek-failure-mem.out \
   # tests-special
 ifeq (yes,$(build-shared))
 # Run tst-fopenloc-cmp.out and tst-openloc-mem.out only if shared
 # library is enabled since they depend on tst-fopenloc.out.
-tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \
-		 $(objpfx)tst-bz24228-mem.out
+tests-special += $(objpfx)tst-fopenloc-cmp.out
+ifeq ($(build-shared),yes)
+ifneq ($(PERL),no)
+tests-special += \
+  $(objpfx)test-fmemopen-mem.out \
+  $(objpfx)tst-bz22415-mem.out \
+  $(objpfx)tst-bz24228-mem.out \
+  $(objpfx)tst-fdopen-seek-failure-mem.out \
+  $(objpfx)tst-fopenloc-mem.out \
+  # tests-special
+endif
+endif
 endif
 
 tests += \
diff --git a/manual/install.texi b/manual/install.texi
index 6504d02c62..a7847b02c0 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -252,6 +252,22 @@ configure with @option{--disable-werror}.
 By default for x86_64, @theglibc{} is built with the vector math library.
 Use this option to disable the vector math library.
 
+@item --disable-static-c++-tests
+By default, if the C++ toolchain lacks support for static linking,
+configure fails to find the C++ header files and the glibc build fails.
+@option{--disable-static-c++-link-check} allows the glibc build to finish,
+but static C++ tests will fail if the C++ toolchain doesn't have the
+necessary static C++ libraries.  Use this option to skip the static C++
+tests.  This option implies @option{--disable-static-c++-link-check}.
+
+@item --disable-static-c++-link-check
+By default, if the C++ toolchain lacks support for static linking,
+configure fails to find the C++ header files and the glibc build fails.
+Use this option to disable the static C++ link check so that the C++
+header files can be located.  The newly built libc.a can be used to
+create static C++ tests if the C++ toolchain has the necessary static
+C++ libraries.
+
 @item --disable-scv
 Disable using @code{scv} instruction for syscalls. All syscalls will use
 @code{sc} instead, even if the kernel supports @code{scv}. PowerPC only.
diff --git a/manual/llio.texi b/manual/llio.texi
index fe1807a849..78c7c79913 100644
--- a/manual/llio.texi
+++ b/manual/llio.texi
@@ -1573,10 +1573,15 @@ permitted.  They include @code{PROT_READ}, @code{PROT_WRITE}, and
 of address space for future use.  The @code{mprotect} function can be
 used to change the protection flags.  @xref{Memory Protection}.
 
-@var{flags} contains flags that control the nature of the map.
-One of @code{MAP_SHARED} or @code{MAP_PRIVATE} must be specified.
+The @var{flags} parameter contains flags that control the nature of
+the map.  One of @code{MAP_SHARED}, @code{MAP_SHARED_VALIDATE}, or
+@code{MAP_PRIVATE} must be specified.  Additional flags may be bitwise
+OR'd to further define the mapping.
 
-They include:
+Note that, aside from @code{MAP_PRIVATE} and @code{MAP_SHARED}, not
+all flags are supported on all versions of all operating systems.
+Consult the kernel-specific documentation for details.  The flags
+include:
 
 @vtable @code
 @item MAP_PRIVATE
@@ -1598,9 +1603,19 @@ Note that actual writing may take place at any time.  You need to use
 @code{msync}, described below, if it is important that other processes
 using conventional I/O get a consistent view of the file.
 
+@item MAP_SHARED_VALIDATE
+Similar to @code{MAP_SHARED} except that additional flags will be
+validated by the kernel, and the call will fail if an unrecognized
+flag is provided.  With @code{MAP_SHARED} using a flag on a kernel
+that doesn't support it causes the flag to be ignored.
+@code{MAP_SHARED_VALIDATE} should be used when the behavior of all
+flags is required.
+
 @item MAP_FIXED
 This forces the system to use the exact mapping address specified in
-@var{address} and fail if it can't.
+@var{address} and fail if it can't.  Note that if the new mapping
+would overlap an existing mapping, the overlapping portion of the
+existing map is unmapped.
 
 @c One of these is official - the other is obviously an obsolete synonym
 @c Which is which?
@@ -1641,10 +1656,73 @@ The @code{MAP_HUGETLB} flag is specific to Linux.
 @c There is a mechanism to select different hugepage sizes; see
 @c include/uapi/asm-generic/hugetlb_encode.h in the kernel sources.
 
-@c Linux has some other MAP_ options, which I have not discussed here.
-@c MAP_DENYWRITE, MAP_EXECUTABLE and MAP_GROWSDOWN don't seem applicable to
-@c user programs (and I don't understand the last two).  MAP_LOCKED does
-@c not appear to be implemented.
+@item MAP_32BIT
+Require addresses that can be accessed with a signed 32 bit pointer,
+i.e., within the first 2 GiB.  Ignored if MAP_FIXED is specified.
+
+@item MAP_DENYWRITE
+@itemx MAP_EXECUTABLE
+@itemx MAP_FILE
+
+Provided for compatibility.  Ignored by the Linux kernel.
+
+@item MAP_FIXED_NOREPLACE
+Similar to @code{MAP_FIXED} except the call will fail with
+@code{EEXIST} if the new mapping would overwrite an existing mapping.
+To test for support for this flag, specify MAP_FIXED_NOREPLACE without
+MAP_FIXED, and (if the call was successful) check the actual address
+returned.  If it does not match the address passed, then this flag is
+not supported.
+
+@item MAP_GROWSDOWN
+This flag is used to make stacks, and is typically only needed inside
+the program loader to set up the main stack for the running process.
+The mapping is created according to the other flags, except an
+additional page just prior to the mapping is marked as a ``guard
+page''.  If a write is attempted inside this guard page, that page is
+mapped, the mapping is extended, and a new guard page is created.
+Thus, the mapping continues to grow towards lower addresses until it
+encounters some other mapping.
+
+Note that accessing memory beyond the guard page will not trigger this
+feature.  In gcc, use @code{-fstack-clash-protection} to ensure the
+guard page is always touched.
+
+@item MAP_LOCKED
+A hint that requests that mapped pages are locked in memory (i.e. not
+paged out).  Note that this is a request and not a requirement; use
+@code{mlock} if locking is required.
+
+@item MAP_POPULATE
+@itemx MAP_NONBLOCK
+@code{MAP_POPULATE} is a hint that requests that the kernel read-ahead
+a file-backed mapping, causing pages to be mapped before they're
+needed.  @code{MAP_NONBLOCK} is a hint that requests that the kernel
+@emph{not} attempt such except for pages are already in memory.  Note
+that neither of these hints affects future paging activity, use
+@code{mlock} if such needs to be controlled.
+
+@item MAP_NORESERVE
+Asks the kernel to not reserve physical backing (i.e. space in a swap
+device) for a mapping.  This would be useful for, for example, a very
+large but sparsely used mapping which need not be limited in total
+length by available RAM, but with very few mapped pages.  Note that
+writes to such a mapping may cause a @code{SIGSEGV} if the system is
+unable to map a page due to lack of resources.
+
+On Linux, this flag's behavior may be overwridden by
+@file{/proc/sys/vm/overcommit_memory} as documented in the proc(5) man
+page.
+
+@item MAP_STACK
+Ensures that the resulting mapping is suitable for use as a program
+stack.  For example, the use of huge pages might be precluded.
+
+@item MAP_SYNC
+This is a special flag for DAX devices, which tells the kernel to
+write dirty metadata out whenever dirty data is written out.  Unlike
+most other flags, this one will fail unless @code{MAP_SHARED_VALIDATE}
+is also given.
 
 @end vtable
 
@@ -1655,6 +1733,24 @@ Possible errors include:
 
 @table @code
 
+@item EACCES
+
+@var{filedes} was not open for the type of access specified in @var{protect}.
+
+@item EAGAIN
+
+The system has temporarily run out of resources.
+
+@item EBADF
+
+The @var{fd} passed is invalid, and a valid file descriptor is
+required (i.e. MAP_ANONYMOUS was not specified).
+
+@item EEXIST
+
+@code{MAP_FIXED_NOREPLACE} was specified and an existing mapping was
+found overlapping the requested address range.
+
 @item EINVAL
 
 Either @var{address} was unusable (because it is not a multiple of the
@@ -1663,23 +1759,37 @@ applicable page size), or inconsistent @var{flags} were given.
 If @code{MAP_HUGETLB} was specified, the file or system does not support
 large page sizes.
 
-@item EACCES
+@item ENODEV
 
-@var{filedes} was not open for the type of access specified in @var{protect}.
+This file is of a type that doesn't support mapping, the process has
+exceeded its data space limit, or the map request would exceed the
+process's virtual address space.
 
 @item ENOMEM
 
-Either there is not enough memory for the operation, or the process is
-out of address space.
-
-@item ENODEV
-
-This file is of a type that doesn't support mapping.
+There is not enough memory for the operation, the process is out of
+address space, or there are too many mappings.  On Linux, the maximum
+number of mappings can be controlled via
+@file{/proc/sys/vm/max_map_count} or, if your OS supports it, via
+the @code{vm.max_map_count} @code{sysctl} setting.
 
 @item ENOEXEC
 
 The file is on a filesystem that doesn't support mapping.
 
+@item EPERM
+
+@code{PROT_EXEC} was requested but the file is on a filesystem that
+was mounted with execution denied, a file seal prevented the mapping,
+or the caller set MAP_HUDETLB but does not have the required
+priviledges.
+
+@item EOVERFLOW
+
+Either the offset into the file plus the length of the mapping causes
+internal page counts to overflow, or the offset requested exceeds the
+length of the file.
+
 @c On Linux, EAGAIN will appear if the file has a conflicting mandatory lock.
 @c However mandatory locks are not discussed in this manual.
 @c
diff --git a/misc/Makefile b/misc/Makefile
index c273ec6974..5d17c562fe 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -214,12 +214,18 @@ routines_no_fortify += \
   syslog \
   # routines_no_fortify
 
+ifeq ($(run-built-tests),yes)
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
 generated += \
   tst-allocate_once-mem.out \
   tst-allocate_once.mtrace \
   tst-error1-mem.out \
   tst-error1.mtrace \
   # generated
+endif
+endif
+endif
 
 aux := init-misc
 install-lib := libg.a
@@ -285,8 +291,14 @@ tests-internal += tst-fd_to_filename
 tests-static += tst-fd_to_filename
 
 ifeq ($(run-built-tests),yes)
-tests-special += $(objpfx)tst-error1-mem.out \
-  $(objpfx)tst-allocate_once-mem.out
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+tests-special += \
+  $(objpfx)tst-allocate_once-mem.out \
+  $(objpfx)tst-error1-mem.out \
+  # tests-special
+endif
+endif
 endif
 
 tests-container := \
diff --git a/nptl/Makefile b/nptl/Makefile
index b3f8af2e1c..c4c27e0d23 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -545,6 +545,9 @@ tests-static += \
   # tests-static
 
 tests += tst-cancel24-static
+ifeq ($(static-cxx-tests),no)
+tests-unsupported += tst-cancel24-static
+endif
 
 tests-internal += \
   tst-sem11-static \
@@ -556,10 +559,12 @@ xtests-static += tst-setuid1-static
 ifeq ($(run-built-tests),yes)
 tests-special += \
   $(objpfx)tst-oddstacklimit.out \
-  $(objpfx)tst-stack3-mem.out \
   # tests-special
 ifeq ($(build-shared),yes)
 tests-special += $(objpfx)tst-tls6.out
+ifneq ($(PERL),no)
+tests-special += $(objpfx)tst-stack3-mem.out
+endif
 endif
 endif
 
@@ -617,10 +622,17 @@ tst-stack3-ENV = MALLOC_TRACE=$(objpfx)tst-stack3.mtrace \
 $(objpfx)tst-stack3-mem.out: $(objpfx)tst-stack3.out
 	$(common-objpfx)malloc/mtrace $(objpfx)tst-stack3.mtrace > $@; \
 	$(evaluate-test)
+
+ifeq ($(run-built-tests),yes)
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
 generated += \
   tst-stack3-mem.out \
   tst-stack3.mtrace \
   # generated
+endif
+endif
+endif
 
 tst-stack4mod.sos=$(shell for i in 0 1 2 3 4 5 6 7 8 9 10 \
 				   11 12 13 14 15 16 17 18 19; do \
diff --git a/posix/Makefile b/posix/Makefile
index a1e84853a8..2c598cd20a 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -418,6 +418,17 @@ generated += \
   $(addprefix wordexp-test-result, 1 2 3 4 5 6 7 8 9 10) \
   annexc \
   annexc.out \
+  getconf.speclist \
+  ptestcases.h \
+  testcases.h \
+  tst-getconf.out \
+  wordexp-tst.out \
+  # generated
+
+ifeq ($(run-built-tests),yes)
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+generated += \
   bug-ga2-mem.out \
   bug-ga2.mtrace \
   bug-glob2-mem.out \
@@ -430,23 +441,22 @@ generated += \
   bug-regex21.mtrace \
   bug-regex31-mem.out \
   bug-regex31.mtrace \
+  bug-regex36-mem.out \
   bug-regex36.mtrace \
-  getconf.speclist \
-  ptestcases.h \
-  testcases.h \
   tst-boost-mem.out \
   tst-boost.mtrace \
   tst-fnmatch-mem.out \
   tst-fnmatch.mtrace \
-  tst-getconf.out \
   tst-pcre-mem.out \
   tst-pcre.mtrace \
   tst-rxspencer-no-utf8-mem.out \
   tst-rxspencer-no-utf8.mtrace \
   tst-vfork3-mem.out \
   tst-vfork3.mtrace \
-  wordexp-tst.out \
   # generated
+endif
+endif
+endif
 
 ifeq ($(run-built-tests),yes)
 ifeq (yes,$(build-shared))
@@ -461,6 +471,9 @@ endif
 # XXX Please note that for now we ignore the result of this test.
 tests-special += $(objpfx)annexc.out
 ifeq ($(run-built-tests),yes)
+tests-special += $(objpfx)tst-getconf.out
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
 tests-special += \
   $(objpfx)bug-ga2-mem.out \
   $(objpfx)bug-glob2-mem.out \
@@ -471,13 +484,14 @@ tests-special += \
   $(objpfx)bug-regex36-mem.out \
   $(objpfx)tst-boost-mem.out \
   $(objpfx)tst-fnmatch-mem.out \
-  $(objpfx)tst-getconf.out \
   $(objpfx)tst-glob-tilde-mem.out \
   $(objpfx)tst-pcre-mem.out \
   $(objpfx)tst-rxspencer-no-utf8-mem.out \
   $(objpfx)tst-vfork3-mem.out \
   # tests-special
 endif
+endif
+endif
 
 include ../Rules
 
diff --git a/signal/Makefile b/signal/Makefile
index e8e3dce0cf..7cddbc3c65 100644
--- a/signal/Makefile
+++ b/signal/Makefile
@@ -46,11 +46,22 @@ routines	:= signal raise killpg \
 		   allocrtsig sigtimedwait sigwaitinfo sigqueue \
 		   sighold sigrelse sigignore sigset
 
-tests		:= tst-signal tst-sigset tst-sigsimple tst-raise tst-sigset2 \
-		   tst-sigwait-eintr tst-sigaction \
-		   tst-minsigstksz-1 tst-minsigstksz-2 tst-minsigstksz-3 \
-		   tst-minsigstksz-3a tst-minsigstksz-4 tst-minsigstksz-5 \
-		   tst-sigisemptyset
+tests := \
+  tst-minsigstksz-1 \
+  tst-minsigstksz-2 \
+  tst-minsigstksz-3 \
+  tst-minsigstksz-3a \
+  tst-minsigstksz-4 \
+  tst-minsigstksz-5 \
+  tst-raise \
+  tst-sigaction \
+  tst-sigisemptyset \
+  tst-signal \
+  tst-sigset \
+  tst-sigset2 \
+  tst-sigsimple \
+  tst-sigwait-eintr \
+# tests
 
 include ../Rules
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 6bc972af1a..a63c05a120 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -229,10 +229,6 @@ tests := \
   tst-popen \
   tst-popen2 \
   tst-printf-binary \
-  tst-printf-bz18872 \
-  tst-printf-bz25691 \
-  tst-printf-fp-free \
-  tst-printf-fp-leak \
   tst-printf-intn \
   tst-printf-oct \
   tst-printf-round \
@@ -261,7 +257,6 @@ tests := \
   tst-vfprintf-mbs-prec \
   tst-vfprintf-user-type \
   tst-vfprintf-width-i18n \
-  tst-vfprintf-width-prec \
   tst-vfprintf-width-prec-alloc \
   tst-wc-printf \
   tstdiomisc \
@@ -270,6 +265,20 @@ tests := \
   xbug \
   # tests
 
+ifeq ($(run-built-tests),yes)
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+tests += \
+  tst-printf-bz18872 \
+  tst-printf-bz25691 \
+  tst-printf-fp-free \
+  tst-printf-fp-leak \
+  tst-vfprintf-width-prec \
+  # tests
+endif
+endif
+endif
+
 tests-container += \
   tst-popen3
   # tests-container
@@ -293,14 +302,19 @@ test-srcs = \
 
 ifeq ($(run-built-tests),yes)
 tests-special += \
-  $(objpfx)tst-printf-bz18872-mem.out \
-  $(objpfx)tst-printf-bz25691-mem.out \
-  $(objpfx)tst-printf-fp-free-mem.out \
-  $(objpfx)tst-printf-fp-leak-mem.out \
   $(objpfx)tst-printf.out \
   $(objpfx)tst-printfsz-islongdouble.out \
   $(objpfx)tst-setvbuf1-cmp.out \
   $(objpfx)tst-unbputc.out \
+  # tests-special
+
+ifeq (yes,$(build-shared))
+ifneq ($(PERL),no)
+tests-special += \
+  $(objpfx)tst-printf-bz18872-mem.out \
+  $(objpfx)tst-printf-bz25691-mem.out \
+  $(objpfx)tst-printf-fp-free-mem.out \
+  $(objpfx)tst-printf-fp-leak-mem.out \
   $(objpfx)tst-vfprintf-width-prec-mem.out \
   # tests-special
 
@@ -317,6 +331,8 @@ generated += \
   tst-vfprintf-width-prec-mem.out \
   tst-vfprintf-width-prec.mtrace \
   # generated
+endif
+endif
 endif # $(run-built-tests)
 
 tests-special += $(objpfx)tst-errno-manual.out
diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h
index 31782b66f9..bc8d842238 100644
--- a/sysdeps/aarch64/cpu-features.h
+++ b/sysdeps/aarch64/cpu-features.h
@@ -1,6 +1,7 @@
 /* Initialize CPU feature data.  AArch64 version.
    This file is part of the GNU C Library.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -56,6 +57,11 @@
 #define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'			      \
 			&& MIDR_PARTNUM(midr) == 0x001)
 
+#define IS_ORYON1(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'			\
+		         && (MIDR_PARTNUM(midr) == 0x001		\
+			     || (MIDR_PARTNUM(midr) == 0x002		\
+			         && MIDR_VARIANT(midr) == 0)))
+
 struct cpu_features
 {
   uint64_t midr_el1;
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..3e251cc234 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -5,6 +5,7 @@ sysdep_routines += \
   memcpy_a64fx \
   memcpy_generic \
   memcpy_mops \
+  memcpy_oryon1 \
   memcpy_sve \
   memcpy_thunderx \
   memcpy_thunderx2 \
@@ -14,6 +15,7 @@ sysdep_routines += \
   memset_generic \
   memset_kunpeng \
   memset_mops \
+  memset_oryon1 \
   memset_zva64 \
   strlen_asimd \
   strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index ecd0f87de6..b2fda541f9 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,6 @@
 /* Enumerate available IFUNC implementations of a function.  AARCH64 version.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -35,6 +36,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)
 	      IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx)
@@ -44,6 +46,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1)
 	      IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx)
@@ -53,6 +56,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_oryon1)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 #if HAVE_AARCH64_SVE_ASM
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index ce53567dab..15c954778b 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -1,5 +1,6 @@
 /* Multiple versions of memcpy. AARCH64 version.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -34,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_oryon1 attribute_hidden;
 
 static inline __typeof (__redirect_memcpy) *
 select_memcpy_ifunc (void)
@@ -50,6 +52,9 @@ select_memcpy_ifunc (void)
       return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic;
     }
 
+  if (IS_ORYON1 (midr))
+    return __memcpy_oryon1;
+
   if (IS_THUNDERX (midr))
     return __memcpy_thunderx;
 
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
new file mode 100644
index 0000000000..4efc43df28
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -0,0 +1,301 @@
+/* A oryon-1 core Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define G_l	count
+#define G_h	dst
+#define tmp1	x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   In order to share code with memmove, small and medium copies read all
+   data before writing, allowing any kind of overlap. So small, medium
+   and large backwards memmoves are handled by falling through into memcpy.
+   Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+ENTRY (__memmove_oryon1)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	/* Common case falls through into memcpy.  */
+END (__memmove_oryon1)
+
+ENTRY (__memcpy_oryon1)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 6
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 6
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 6
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 6
+L(copy_long):
+
+	/* On oryon1 cores, large memcpy's are helped by using ldnp/stnp.
+	   This loop is identical to the one below it but using ldnp/stnp
+	   instructions.  For loops that are less than 32768 bytes,
+	   the ldnp/stnp instructions will not help and will cause a slow
+	   down so only use the ldnp/stnp loop for the largest sizes.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_nontemp)
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldnp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldnp	A_l, A_h, [src, 16]
+	stnp	D_l, D_h, [dstin]
+	ldnp	B_l, B_h, [src, 32]
+	ldnp	C_l, C_h, [src, 48]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+
+L(nontemp_loop64):
+	tbz	src, #6, 1f
+1:
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [src, 16]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [src, 32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [src, 48]
+	stnp	D_l, D_h, [dst, 64]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hi	L(nontemp_loop64)
+	b	L(last64)
+
+L(copy_long_without_nontemp):
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+L(last64):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 6
+L(move_long):
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	nop
+1:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+END (__memcpy_oryon1)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 34bce045dd..bd063c16c9 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -1,5 +1,6 @@
 /* Multiple versions of memset. AARCH64 version.
    Copyright (C) 2017-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -34,6 +35,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden;
 
 static inline __typeof (__redirect_memset) *
 select_memset_ifunc (void)
@@ -49,6 +51,9 @@ select_memset_ifunc (void)
 	return __memset_a64fx;
     }
 
+  if (IS_ORYON1 (midr) && zva_size == 64)
+    return __memset_oryon1;
+
   if (IS_KUNPENG920 (midr))
     return __memset_kunpeng;
 
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
new file mode 100644
index 0000000000..b43a43b54e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -0,0 +1,169 @@
+/* Optimized memset for Qualcomm's oyron-1 core.
+   Copyright (C) 2018-2024 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "memset-reg.h"
+
+/* Assumptions:
+   ARMv8-a, AArch64, unaligned accesses
+ */
+
+ENTRY (__memset_oryon1)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	bfi	valw, valw, 8, 8
+	bfi	valw, valw, 16, 16
+	bfi	val, val, 32, 32
+
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+
+	.p2align 3
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	.p2align 3
+	/* Set 16..96 bytes.  */
+L(set_medium):
+	stp	val, val, [dstin]
+	tbnz	count, 6, L(set96)
+	stp	val, val, [dstend, -16]
+	tbz	count, 5, 1f
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstend, -32]
+1:	ret
+
+	.p2align 6
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstin, 32]
+	stp	val, val, [dstin, 48]
+	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
+	ret
+
+	.p2align 6
+L(set_long):
+	stp	val, val, [dstin]
+	bic	dst, dstin, 15
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+	cmp	count, #32768
+	b.hi	L(set_long_with_nontemp)
+	/* Small-size or non-zero memset does not use DC ZVA. */
+	sub	count, dstend, dst
+
+	/* Adjust count and bias for loop. By subtracting extra 1 from count,
+	  it is easy to use tbz instruction to check whether loop tailing
+	  count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
+	sub	count, count, 64+16+1
+
+1:	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
+	stp	val, val, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b
+
+	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+1:	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
+	ret
+
+L(set_long_with_nontemp):
+	/* Small-size or non-zero memset does not use DC ZVA. */
+	sub	count, dstend, dst
+
+	/* Adjust count and bias for loop. By subtracting extra 1 from count,
+	   it is easy to use tbz instruction to check whether loop tailing
+	   count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
+	sub	count, count, 64+16+1
+
+1:	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+	stnp	val, val, [dst, 48]
+	stnp	val, val, [dst, 64]
+	add	dst, dst, #64
+	subs	count, count, 64
+	b.hs	1b
+
+	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
+	stnp	val, val, [dst, 16]
+	stnp	val, val, [dst, 32]
+1:	stnp	val, val, [dstend, -32]
+	stnp	val, val, [dstend, -16]
+	ret
+
+L(try_zva):
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA as it is faster. */
+	.p2align 6
+L(zva_64):
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
+	bic	dst, dst, 63
+	stp	val, val, [dst, 64]
+	stp	val, val, [dst, 64+16]
+	stp	val, val, [dst, 96]
+	stp	val, val, [dst, 96+16]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	val, val, [dst, 0]
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
+
+	stp	val, val, [dstend, -64]
+	stp	val, val, [dstend, -64+16]
+	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
+	ret
+
+END (__memset_oryon1)
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 50f58a60e3..656e8a3fa0 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1256,6 +1256,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
 					     size_t gen)
      attribute_hidden;
 
+/* The last TLS module ID that is initially loaded, plus 1.  TLS
+   addresses for modules with IDs lower than that can be obtained from
+   the DTV even if its generation is outdated.  */
+extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro;
+
+/* Compute _dl_tls_initial_modid_limit.  To be called after initial
+   relocation.  */
+void _dl_tls_initial_modid_limit_setup (void) attribute_hidden;
+
+/* Number of threads currently in a TLS update.  This is used to
+   detect reentrant __tls_get_addr calls without a per-thread
+   flag.  */
+extern unsigned int _dl_tls_threads_in_update attribute_hidden;
+
 /* Look up the module's TLS block as for __tls_get_addr,
    but never touch anything.  Return null if it's not allocated yet.  */
 extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden;
diff --git a/sysdeps/mips/fpu/math-use-builtins-fma.h b/sysdeps/mips/fpu/math-use-builtins-fma.h
new file mode 100644
index 0000000000..57108f968e
--- /dev/null
+++ b/sysdeps/mips/fpu/math-use-builtins-fma.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* MIPSr6 has MADDF.s/MADDF.d instructions, which are fused.  In MIPS
+   ISA, double support can be subsetted.  Only FMAF is enabled for this
+   case.  */
+
+#include <sysdep.h>
+
+#if __mips_isa_rev >= 6
+# ifdef __mips_single_float
+#  define USE_FMA_BUILTIN 0
+# else
+#  define USE_FMA_BUILTIN 1
+# endif
+# define USE_FMAF_BUILTIN 1
+#else
+# define USE_FMA_BUILTIN 0
+# define USE_FMAF_BUILTIN 0
+#endif
+#define USE_FMAL_BUILTIN 0
+#define USE_FMAF128_BUILTIN 0
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3d7c2819d7..e501e084ef 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1023,39 +1023,59 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
 
       model += extended_model;
       if (family == 0x6)
-        {
-          if (model == 0xf || model == 0x19)
-            {
+	{
+	  /* Tuning for older Zhaoxin processors.  */
+	  if (model == 0xf || model == 0x19)
+	    {
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
-              cpu_features->preferred[index_arch_Slow_SSE4_2]
-                |= bit_arch_Slow_SSE4_2;
+	      cpu_features->preferred[index_arch_Slow_SSE4_2]
+		  |= bit_arch_Slow_SSE4_2;
 
+	      /*  Unaligned AVX loads are slower.  */
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
-            }
-        }
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	    }
+	}
       else if (family == 0x7)
-        {
-	  if (model == 0x1b)
+	{
+	  switch (model)
 	    {
+	      /* Wudaokou microarch tuning.  */
+	    case 0x1b:
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
 	      cpu_features->preferred[index_arch_Slow_SSE4_2]
-		|= bit_arch_Slow_SSE4_2;
+		  |= bit_arch_Slow_SSE4_2;
 
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
-	    }
-	  else if (model == 0x3b)
-	    {
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	      break;
+
+	      /* Lujiazui microarch tuning.  */
+	    case 0x3b:
 	      CPU_FEATURE_UNSET (cpu_features, AVX);
 	      CPU_FEATURE_UNSET (cpu_features, AVX2);
 
 	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
-		&= ~bit_arch_AVX_Fast_Unaligned_Load;
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+	      break;
+
+	      /* Yongfeng and Shijidadao mircoarch tuning.  */
+	    case 0x5b:
+	      cpu_features->cachesize_non_temporal_divisor = 2;
+	    case 0x6b:
+	      cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+		  &= ~bit_arch_AVX_Fast_Unaligned_Load;
+
+	      /* To use sse2_unaligned versions of memset, strcpy and strcat.
+	       */
+	      cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+		  |= (bit_arch_Prefer_No_VZEROUPPER
+		      | bit_arch_Fast_Unaligned_Load);
+	      break;
 	    }
 	}
     }
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 3a6ec4ef9f..5e77345a6e 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -934,8 +934,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
      a higher risk of actually thrashing the cache as they don't have a HW LRU
      hint. As well, their performance in highly parallel situations is
-     noticeably worse.  */
-  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+     noticeably worse. Zhaoxin processors are an exception, the lowbound is not
+     suitable for them based on actual test data.  */
+  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+      && cpu_features->basic.kind != arch_kind_zhaoxin)
     non_temporal_threshold = non_temporal_threshold_lowbound;
   /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
      'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 869023bbba..b3c1e4fcd7 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
   dtv_t *dtv = THREAD_DTV ();
 
   size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
-  if (__glibc_unlikely (dtv[0].counter != gen))
+  if (__glibc_unlikely (dtv[0].counter != gen)
+      /* See comment in __tls_get_addr in elf/dl-tls.c.  */
+      && !(_dl_tls_allocate_active ()
+           && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
     return update_get_addr (GET_ADDR_PARAM, gen);
 
   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 048d015712..01008fd981 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -151,13 +151,10 @@ L(more_2x_vec):
 	   loop.  */
 	movups	%xmm0, (%rdi)
 
-# ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
-# else
-	cmp	__x86_shared_cache_size_half(%rip), %rdx
-# endif
+	cmp	__x86_shared_non_temporal_threshold(%rip), %rdx
 	ja	L(large_memcpy)
 
+L(loop_fwd):
 	leaq	-64(%rdi, %rdx), %r8
 	andq	$-16, %rdi
 	movl	$48, %edx
@@ -199,6 +196,13 @@ L(large_memcpy):
 	movups	-64(%r9, %rdx), %xmm10
 	movups	-80(%r9, %rdx), %xmm11
 
+	/* Check if src and dst overlap. If they do use cacheable
+	   writes to potentially gain positive interference between
+	   the loads during the memmove.  */
+	subq	%rdi, %r9
+	cmpq	%rdx, %r9
+	jb	L(loop_fwd)
+
 	sall	$5, %ecx
 	leal	(%rcx, %rcx, 2), %r8d
 	leaq	-96(%rdi, %rdx), %rcx