diff options
32 files changed, 1221 insertions, 109 deletions
diff --git a/INSTALL b/INSTALL index 208ec98d4b..4bd3d53676 100644 --- a/INSTALL +++ b/INSTALL @@ -224,6 +224,23 @@ if 'CFLAGS' is specified it must enable optimization. For example: By default for x86_64, the GNU C Library is built with the vector math library. Use this option to disable the vector math library. +'--disable-static-c++-tests' + By default, if the C++ toolchain lacks support for static linking, + configure fails to find the C++ header files and the glibc build + fails. '--disable-static-c++-link-check' allows the glibc build to + finish, but static C++ tests will fail if the C++ toolchain doesn't + have the necessary static C++ libraries. Use this option to skip + the static C++ tests. This option implies + '--disable-static-c++-link-check'. + +'--disable-static-c++-link-check' + By default, if the C++ toolchain lacks support for static linking, + configure fails to find the C++ header files and the glibc build + fails. Use this option to disable the static C++ link check so + that the C++ header files can be located. The newly built libc.a + can be used to create static C++ tests if the C++ toolchain has the + necessary static C++ libraries. + '--disable-scv' Disable using 'scv' instruction for syscalls. All syscalls will use 'sc' instead, even if the kernel supports 'scv'. PowerPC only. 
diff --git a/catgets/Makefile b/catgets/Makefile index 24b4560d5f..40c65eac95 100644 --- a/catgets/Makefile +++ b/catgets/Makefile @@ -43,8 +43,12 @@ tests-special += \ $(objpfx)test-gencat.out \ $(objpfx)test1.cat \ $(objpfx)test2.cat \ - $(objpfx)tst-catgets-mem.out # tests-special +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +tests-special += $(objpfx)tst-catgets-mem.out +endif +endif endif gencat-modules = xmalloc @@ -68,9 +72,17 @@ generated += \ test1.h \ test2.cat \ test2.h \ + # generated +ifeq ($(run-built-tests),yes) +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +generated += \ tst-catgets-mem.out \ tst-catgets.mtrace \ # generated +endif +endif +endif generated-dirs += \ de \ diff --git a/configure b/configure index 1df2f2e6d1..1bae55b45b 100755 --- a/configure +++ b/configure @@ -771,6 +771,8 @@ ac_user_opts=' enable_option_checking with_pkgversion with_bugurl +enable_static_c___tests +enable_static_c___link_check with_gd with_gd_include with_gd_lib @@ -1440,6 +1442,10 @@ Optional Features: --disable-option-checking ignore unrecognized --enable/--with options --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --disable-static-c++-tests + disable static C++ tests[default=no] + --disable-static-c++-link-check + disable static C++ link check [default=no] --disable-sanity-checks really do not use threads (should not be used except in special situations) [default=yes] --enable-shared build shared library [default=yes if GNU ld] @@ -3855,6 +3861,29 @@ if test -z "$CPP"; then fi +# This will get text that should go into config.make. +config_vars= + +# Check whether --enable-static-c++-tests was given. +if test ${enable_static_c___tests+y} +then : + enableval=$enable_static_c___tests; static_cxx_tests=$enableval +else $as_nop + static_cxx_tests=yes +fi + +config_vars="$config_vars +static-cxx-tests = $static_cxx_tests" + +# Check whether --enable-static-c++-link-check was given. 
+if test ${enable_static_c___link_check+y} +then : + enableval=$enable_static_c___link_check; static_cxx_link_check=$enableval +else $as_nop + static_cxx_link_check=yes +fi + + # We need the C++ compiler only for testing. @@ -4279,10 +4308,11 @@ esac fi rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext -# Static case. -old_LDFLAGS="$LDFLAGS" -LDFLAGS="$LDFLAGS -static" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext +if test $static_cxx_link_check$static_cxx_tests = yesyes; then + # Static case. + old_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -static" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include <iostream> @@ -4304,7 +4334,8 @@ esac fi rm -f core conftest.err conftest.$ac_objext conftest.beam \ conftest$ac_exeext conftest.$ac_ext -LDFLAGS="$old_LDFLAGS" + LDFLAGS="$old_LDFLAGS" +fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -4324,9 +4355,6 @@ if test "`cd $srcdir; pwd -P`" = "`pwd -P`"; then as_fn_error $? "you must configure in a separate build directory" "$LINENO" 5 fi -# This will get text that should go into config.make. -config_vars= - # Check for a --with-gd argument and set libgd-LDFLAGS in config.make. # Check whether --with-gd was given. diff --git a/configure.ac b/configure.ac index bdc385d03c..e48957f318 100644 --- a/configure.ac +++ b/configure.ac @@ -52,6 +52,22 @@ fi AC_SUBST(cross_compiling) AC_PROG_CPP +# This will get text that should go into config.make. 
+config_vars= + +AC_ARG_ENABLE([static-c++-tests], + AS_HELP_STRING([--disable-static-c++-tests], + [disable static C++ tests@<:@default=no@:>@]), + [static_cxx_tests=$enableval], + [static_cxx_tests=yes]) +LIBC_CONFIG_VAR([static-cxx-tests], [$static_cxx_tests]) + +AC_ARG_ENABLE([static-c++-link-check], + AS_HELP_STRING([--disable-static-c++-link-check], + [disable static C++ link check @<:@default=no@:>@]), + [static_cxx_link_check=$enableval], + [static_cxx_link_check=yes]) + # We need the C++ compiler only for testing. AC_PROG_CXX # It's useless to us if it can't link programs (e.g. missing -lstdc++). @@ -61,10 +77,11 @@ AC_LANG_PUSH([C++]) AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])], [libc_cv_cxx_link_ok=yes], [libc_cv_cxx_link_ok=no]) -# Static case. -old_LDFLAGS="$LDFLAGS" -LDFLAGS="$LDFLAGS -static" -AC_LINK_IFELSE([AC_LANG_SOURCE([ +if test $static_cxx_link_check$static_cxx_tests = yesyes; then + # Static case. + old_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -static" + AC_LINK_IFELSE([AC_LANG_SOURCE([ #include <iostream> int @@ -74,9 +91,10 @@ main() return 0; } ])], - [], - [libc_cv_cxx_link_ok=no]) -LDFLAGS="$old_LDFLAGS" + [], + [libc_cv_cxx_link_ok=no]) + LDFLAGS="$old_LDFLAGS" +fi AC_LANG_POP([C++])]) AS_IF([test $libc_cv_cxx_link_ok != yes], [CXX=]) @@ -84,9 +102,6 @@ if test "`cd $srcdir; pwd -P`" = "`pwd -P`"; then AC_MSG_ERROR([you must configure in a separate build directory]) fi -# This will get text that should go into config.make. -config_vars= - # Check for a --with-gd argument and set libgd-LDFLAGS in config.make. 
AC_ARG_WITH([gd], AS_HELP_STRING([--with-gd=DIR], diff --git a/elf/Makefile b/elf/Makefile index bb6cd06dec..24ad5221c2 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -445,6 +445,7 @@ tests += \ tst-p_align1 \ tst-p_align2 \ tst-p_align3 \ + tst-recursive-tls \ tst-relsort1 \ tst-ro-dynamic \ tst-rtld-run-static \ @@ -632,13 +633,19 @@ $(objpfx)tst-rtld-does-not-exist.out: tst-rtld-does-not-exist.sh $(objpfx)ld.so tests += $(tests-execstack-$(have-z-execstack)) ifeq ($(run-built-tests),yes) tests-special += \ - $(objpfx)noload-mem.out \ $(objpfx)tst-ldconfig-X.out \ $(objpfx)tst-ldconfig-p.out \ $(objpfx)tst-ldconfig-soname.out \ - $(objpfx)tst-leaks1-mem.out \ $(objpfx)tst-rtld-help.out \ # tests-special +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +tests-special += \ + $(objpfx)noload-mem.out \ + $(objpfx)tst-leaks1-mem.out \ + # tests-special +endif +endif endif tlsmod17a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 tlsmod18a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 @@ -886,6 +893,23 @@ modules-names += \ tst-null-argv-lib \ tst-p_alignmod-base \ tst-p_alignmod3 \ + tst-recursive-tlsmallocmod \ + tst-recursive-tlsmod0 \ + tst-recursive-tlsmod1 \ + tst-recursive-tlsmod2 \ + tst-recursive-tlsmod3 \ + tst-recursive-tlsmod4 \ + tst-recursive-tlsmod5 \ + tst-recursive-tlsmod6 \ + tst-recursive-tlsmod7 \ + tst-recursive-tlsmod8 \ + tst-recursive-tlsmod9 \ + tst-recursive-tlsmod10 \ + tst-recursive-tlsmod11 \ + tst-recursive-tlsmod12 \ + tst-recursive-tlsmod13 \ + tst-recursive-tlsmod14 \ + tst-recursive-tlsmod15 \ tst-relsort1mod1 \ tst-relsort1mod2 \ tst-ro-dynamic-mod \ @@ -3093,3 +3117,11 @@ CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=$(have-mtls-descriptor) CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=$(have-mtls-descriptor) CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=$(have-mtls-descriptor) endif + +$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so +# More objects than DTV_SURPLUS, to trigger DTV 
reallocation. +$(objpfx)tst-recursive-tls.out: \ + $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \ + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) +$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c + $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$* diff --git a/elf/dl-tls.c b/elf/dl-tls.c index 670dbc42fc..3d221273f1 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -75,6 +75,31 @@ /* Default for dl_tls_static_optional. */ #define OPTIONAL_TLS 512 +/* Used to count the number of threads currently executing dynamic TLS + updates. Used to avoid recursive malloc calls in __tls_get_addr + for an interposed malloc that uses global-dynamic TLS (which is not + recommended); see _dl_tls_allocate_active checks. This could be a + per-thread flag, but would need TLS access in the dynamic linker. */ +unsigned int _dl_tls_threads_in_update; + +static inline void +_dl_tls_allocate_begin (void) +{ + atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1); +} + +static inline void +_dl_tls_allocate_end (void) +{ + atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1); +} + +static inline bool +_dl_tls_allocate_active (void) +{ + return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0; +} + /* Compute the static TLS surplus based on the namespace count and the TLS space that can be used for optimizations. */ static inline int @@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void) size += TLS_PRE_TCB_SIZE; #endif - /* Perform the allocation. Reserve space for the required alignment - and the pointer to the original allocation. */ + /* Reserve space for the required alignment and the pointer to the + original allocation. */ size_t alignment = GLRO (dl_tls_static_align); + + /* Perform the allocation. */ + _dl_tls_allocate_begin (); void *allocated = malloc (size + alignment + sizeof (void *)); if (__glibc_unlikely (allocated == NULL)) - return NULL; + { + _dl_tls_allocate_end (); + return NULL; + } /* Perform alignment and allocate the DTV. 
*/ #if TLS_TCB_AT_TP @@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void) result = allocate_dtv (result); if (result == NULL) free (allocated); + + _dl_tls_allocate_end (); return result; } @@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) size_t newsize = max_modid + DTV_SURPLUS; size_t oldsize = dtv[-1].counter; + _dl_tls_allocate_begin (); if (dtv == GL(dl_initial_dtv)) { /* This is the initial dtv that was either statically allocated in @@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) if (newp == NULL) oom (); } + _dl_tls_allocate_end (); newp[0].counter = newsize; @@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size) if (powerof2 (alignment) && alignment <= _Alignof (max_align_t)) { /* The alignment is supported by malloc. */ + _dl_tls_allocate_begin (); void *ptr = malloc (size); + _dl_tls_allocate_end (); return (struct dtv_pointer) { ptr, ptr }; } @@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size) /* Perform the allocation. This is the pointer we need to free later. */ + _dl_tls_allocate_begin (); void *start = malloc (alloc_size); + _dl_tls_allocate_end (); + if (start == NULL) return (struct dtv_pointer) {}; @@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) free implementation. Checking here papers over at least some dynamic TLS usage by interposed mallocs. */ if (dtv[modid].pointer.to_free != NULL) - free (dtv[modid].pointer.to_free); + { + _dl_tls_allocate_begin (); + free (dtv[modid].pointer.to_free); + _dl_tls_allocate_end (); + } dtv[modid].pointer.val = TLS_DTV_UNALLOCATED; dtv[modid].pointer.to_free = NULL; @@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS) size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); if (__glibc_unlikely (dtv[0].counter != gen)) { - /* Update DTV up to the global generation, see CONCURRENCY NOTES - in _dl_update_slotinfo. 
*/ - gen = atomic_load_acquire (&GL(dl_tls_generation)); - return update_get_addr (GET_ADDR_PARAM, gen); + if (_dl_tls_allocate_active () + && GET_ADDR_MODULE < _dl_tls_initial_modid_limit) + /* This is a reentrant __tls_get_addr call, but we can + satisfy it because it's an initially-loaded module ID. + These TLS slotinfo slots do not change, so the + out-of-date generation counter does not matter. However, + if not in a TLS update, still update_get_addr below, to + get off the slow path eventually. */ + ; + else + { + /* Update DTV up to the global generation, see CONCURRENCY NOTES + in _dl_update_slotinfo. */ + gen = atomic_load_acquire (&GL(dl_tls_generation)); + return update_get_addr (GET_ADDR_PARAM, gen); + } } void *p = dtv[GET_ADDR_MODULE].pointer.val; @@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS) return (char *) p + GET_ADDR_OFFSET; } -#endif +#endif /* SHARED */ /* Look up the module's TLS block as for __tls_get_addr, @@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l) return data; } +size_t _dl_tls_initial_modid_limit; + +void +_dl_tls_initial_modid_limit_setup (void) +{ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + size_t idx; + for (idx = 0; idx < listp->len; ++idx) + { + struct link_map *l = listp->slotinfo[idx].map; + if (l == NULL + /* The object can be unloaded, so its modid can be + reassociated. */ + || !(l->l_type == lt_executable || l->l_type == lt_library)) + break; + } + _dl_tls_initial_modid_limit = idx; +} + void _dl_add_to_slotinfo (struct link_map *l, bool do_add) @@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add) the first slot. */ assert (idx == 0); + _dl_tls_allocate_begin (); listp = (struct dtv_slotinfo_list *) malloc (sizeof (struct dtv_slotinfo_list) + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); + _dl_tls_allocate_end (); if (listp == NULL) { /* We ran out of memory while resizing the dtv slotinfo list. 
*/ diff --git a/elf/rtld.c b/elf/rtld.c index e9525ea987..6352ba76c5 100644 --- a/elf/rtld.c +++ b/elf/rtld.c @@ -788,6 +788,8 @@ init_tls (size_t naudit) _dl_fatal_printf ("\ cannot allocate TLS data structures for initial thread\n"); + _dl_tls_initial_modid_limit_setup (); + /* Store for detection of the special case by __tls_get_addr so it knows not to pass this dtv to the normal realloc. */ GL(dl_initial_dtv) = GET_DTV (tcbp); diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c new file mode 100644 index 0000000000..716d1f783a --- /dev/null +++ b/elf/tst-recursive-tls.c @@ -0,0 +1,60 @@ +/* Test with interposed malloc with dynamic TLS. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <array_length.h> +#include <stdio.h> +#include <support/check.h> +#include <support/xdlfcn.h> + +/* Defined in tst-recursive-tlsmallocmod.so. */ +extern __thread unsigned int malloc_subsytem_counter; + +static int +do_test (void) +{ + /* 16 is large enough to exercise the DTV resizing case. */ + void *handles[16]; + + for (unsigned int i = 0; i < array_length (handles); ++i) + { + /* Re-use the TLS slot for module 0. 
*/ + if (i > 0) + xdlclose (handles[0]); + + char soname[30]; + snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i); + handles[i] = xdlopen (soname, RTLD_NOW); + + if (i > 0) + { + handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW); + int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0"); + /* May trigger TLS storage allocation using malloc. */ + TEST_COMPARE (fptr (), 0); + } + } + + for (unsigned int i = 0; i < array_length (handles); ++i) + xdlclose (handles[i]); + + printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter); + TEST_VERIFY (malloc_subsytem_counter > 0); + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c new file mode 100644 index 0000000000..c24e9945d1 --- /dev/null +++ b/elf/tst-recursive-tlsmallocmod.c @@ -0,0 +1,64 @@ +/* Interposed malloc with dynamic TLS. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <stdlib.h> +#include <dlfcn.h> + +__thread unsigned int malloc_subsytem_counter; + +static __typeof (malloc) *malloc_fptr; +static __typeof (free) *free_fptr; +static __typeof (calloc) *calloc_fptr; +static __typeof (realloc) *realloc_fptr; + +static void __attribute__ ((constructor)) +init (void) +{ + malloc_fptr = dlsym (RTLD_NEXT, "malloc"); + free_fptr = dlsym (RTLD_NEXT, "free"); + calloc_fptr = dlsym (RTLD_NEXT, "calloc"); + realloc_fptr = dlsym (RTLD_NEXT, "realloc"); +} + +void * +malloc (size_t size) +{ + ++malloc_subsytem_counter; + return malloc_fptr (size); +} + +void +free (void *ptr) +{ + ++malloc_subsytem_counter; + return free_fptr (ptr); +} + +void * +calloc (size_t a, size_t b) +{ + ++malloc_subsytem_counter; + return calloc_fptr (a, b); +} + +void * +realloc (void *ptr, size_t size) +{ + ++malloc_subsytem_counter; + return realloc_fptr (ptr, size); +} diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c new file mode 100644 index 0000000000..bb7592aee6 --- /dev/null +++ b/elf/tst-recursive-tlsmodN.c @@ -0,0 +1,28 @@ +/* Test module with global-dynamic TLS. Used to trigger DTV reallocation. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Compiled with VAR and FUNC set via -D. 
FUNC requires some + relocation against TLS variable VAR. */ + +__thread int VAR; + +int +FUNC (void) +{ + return VAR; +} diff --git a/io/bits/fcntl2.h b/io/bits/fcntl2.h index 26f1792fd1..0cced392e7 100644 --- a/io/bits/fcntl2.h +++ b/io/bits/fcntl2.h @@ -61,13 +61,8 @@ open (const char *__path, int __oflag, ...) return __open_alias (__path, __oflag, __va_arg_pack ()); } #elif __fortify_use_clang -__fortify_function_error_function __attribute_overloadable__ int -open (const char *__path, int __oflag, mode_t __mode, ...) - __fortify_clang_unavailable ("open can be called either with 2 or 3 arguments, not more"); - __fortify_function __attribute_overloadable__ int open (__fortify_clang_overload_arg (const char *, ,__path), int __oflag) - __fortify_clang_prefer_this_overload __fortify_clang_error (__OPEN_NEEDS_MODE (__oflag), "open with O_CREAT or O_TMPFILE in second argument needs 3 arguments") { diff --git a/libio/Makefile b/libio/Makefile index f607edbefb..8720381fdc 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -261,15 +261,28 @@ tst-bz22415-ENV = MALLOC_TRACE=$(objpfx)tst-bz22415.mtrace \ tst-bz24228-ENV = MALLOC_TRACE=$(objpfx)tst-bz24228.mtrace \ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so -generated += test-fmemopen.mtrace test-fmemopen.check -generated += tst-fdopen-seek-failure.mtrace tst-fdopen-seek-failure.check -generated += tst-fopenloc.mtrace tst-fopenloc.check -generated += tst-bz22415.mtrace tst-bz22415.check - aux := fileops genops stdfiles stdio strops +ifeq ($(run-built-tests),yes) +ifeq ($(build-shared),yes) +ifneq ($(PERL),no) +generated += \ + test-fmemopen.check \ + test-fmemopen.mtrace \ + tst-bz22415.check \ + tst-bz22415.mtrace \ + tst-bz24228.check \ + tst-bz24228.mtrace \ + tst-fdopen-seek-failure.check \ + tst-fdopen-seek-failure.mtrace \ + tst-fopenloc.check \ + tst-fopenloc.mtrace \ + # generated +endif +endif +endif + ifeq ($(build-shared),yes) -generated += tst-bz24228.mtrace tst-bz24228.check aux += oldfileops 
oldstdfiles tests += \ tst-stderr-compat \ @@ -286,16 +299,23 @@ shared-only-routines = oldiofopen oldiofdopen oldiofclose oldfileops \ ifeq ($(run-built-tests),yes) tests-special += \ - $(objpfx)test-fmemopen-mem.out \ $(objpfx)test-freopen.out \ - $(objpfx)tst-bz22415-mem.out \ - $(objpfx)tst-fdopen-seek-failure-mem.out \ # tests-special ifeq (yes,$(build-shared)) # Run tst-fopenloc-cmp.out and tst-openloc-mem.out only if shared # library is enabled since they depend on tst-fopenloc.out. -tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \ - $(objpfx)tst-bz24228-mem.out +tests-special += $(objpfx)tst-fopenloc-cmp.out +ifeq ($(build-shared),yes) +ifneq ($(PERL),no) +tests-special += \ + $(objpfx)test-fmemopen-mem.out \ + $(objpfx)tst-bz22415-mem.out \ + $(objpfx)tst-bz24228-mem.out \ + $(objpfx)tst-fdopen-seek-failure-mem.out \ + $(objpfx)tst-fopenloc-mem.out \ + # tests-special +endif +endif endif tests += \ diff --git a/manual/install.texi b/manual/install.texi index 6504d02c62..a7847b02c0 100644 --- a/manual/install.texi +++ b/manual/install.texi @@ -252,6 +252,22 @@ configure with @option{--disable-werror}. By default for x86_64, @theglibc{} is built with the vector math library. Use this option to disable the vector math library. +@item --disable-static-c++-tests +By default, if the C++ toolchain lacks support for static linking, +configure fails to find the C++ header files and the glibc build fails. +@option{--disable-static-c++-link-check} allows the glibc build to finish, +but static C++ tests will fail if the C++ toolchain doesn't have the +necessary static C++ libraries. Use this option to skip the static C++ +tests. This option implies @option{--disable-static-c++-link-check}. + +@item --disable-static-c++-link-check +By default, if the C++ toolchain lacks support for static linking, +configure fails to find the C++ header files and the glibc build fails. 
+Use this option to disable the static C++ link check so that the C++ +header files can be located. The newly built libc.a can be used to +create static C++ tests if the C++ toolchain has the necessary static +C++ libraries. + @item --disable-scv Disable using @code{scv} instruction for syscalls. All syscalls will use @code{sc} instead, even if the kernel supports @code{scv}. PowerPC only. diff --git a/manual/llio.texi b/manual/llio.texi index fe1807a849..78c7c79913 100644 --- a/manual/llio.texi +++ b/manual/llio.texi @@ -1573,10 +1573,15 @@ permitted. They include @code{PROT_READ}, @code{PROT_WRITE}, and of address space for future use. The @code{mprotect} function can be used to change the protection flags. @xref{Memory Protection}. -@var{flags} contains flags that control the nature of the map. -One of @code{MAP_SHARED} or @code{MAP_PRIVATE} must be specified. +The @var{flags} parameter contains flags that control the nature of +the map. One of @code{MAP_SHARED}, @code{MAP_SHARED_VALIDATE}, or +@code{MAP_PRIVATE} must be specified. Additional flags may be bitwise +OR'd to further define the mapping. -They include: +Note that, aside from @code{MAP_PRIVATE} and @code{MAP_SHARED}, not +all flags are supported on all versions of all operating systems. +Consult the kernel-specific documentation for details. The flags +include: @vtable @code @item MAP_PRIVATE @@ -1598,9 +1603,19 @@ Note that actual writing may take place at any time. You need to use @code{msync}, described below, if it is important that other processes using conventional I/O get a consistent view of the file. +@item MAP_SHARED_VALIDATE +Similar to @code{MAP_SHARED} except that additional flags will be +validated by the kernel, and the call will fail if an unrecognized +flag is provided. With @code{MAP_SHARED} using a flag on a kernel +that doesn't support it causes the flag to be ignored. +@code{MAP_SHARED_VALIDATE} should be used when the behavior of all +flags is required. 
+ @item MAP_FIXED This forces the system to use the exact mapping address specified in -@var{address} and fail if it can't. +@var{address} and fail if it can't. Note that if the new mapping +would overlap an existing mapping, the overlapping portion of the +existing map is unmapped. @c One of these is official - the other is obviously an obsolete synonym @c Which is which? @@ -1641,10 +1656,73 @@ The @code{MAP_HUGETLB} flag is specific to Linux. @c There is a mechanism to select different hugepage sizes; see @c include/uapi/asm-generic/hugetlb_encode.h in the kernel sources. -@c Linux has some other MAP_ options, which I have not discussed here. -@c MAP_DENYWRITE, MAP_EXECUTABLE and MAP_GROWSDOWN don't seem applicable to -@c user programs (and I don't understand the last two). MAP_LOCKED does -@c not appear to be implemented. +@item MAP_32BIT +Require addresses that can be accessed with a signed 32 bit pointer, +i.e., within the first 2 GiB. Ignored if MAP_FIXED is specified. + +@item MAP_DENYWRITE +@itemx MAP_EXECUTABLE +@itemx MAP_FILE + +Provided for compatibility. Ignored by the Linux kernel. + +@item MAP_FIXED_NOREPLACE +Similar to @code{MAP_FIXED} except the call will fail with +@code{EEXIST} if the new mapping would overwrite an existing mapping. +To test for support for this flag, specify MAP_FIXED_NOREPLACE without +MAP_FIXED, and (if the call was successful) check the actual address +returned. If it does not match the address passed, then this flag is +not supported. + +@item MAP_GROWSDOWN +This flag is used to make stacks, and is typically only needed inside +the program loader to set up the main stack for the running process. +The mapping is created according to the other flags, except an +additional page just prior to the mapping is marked as a ``guard +page''. If a write is attempted inside this guard page, that page is +mapped, the mapping is extended, and a new guard page is created. 
+Thus, the mapping continues to grow towards lower addresses until it +encounters some other mapping. + +Note that accessing memory beyond the guard page will not trigger this +feature. In gcc, use @code{-fstack-clash-protection} to ensure the +guard page is always touched. + +@item MAP_LOCKED +A hint that requests that mapped pages are locked in memory (i.e. not +paged out). Note that this is a request and not a requirement; use +@code{mlock} if locking is required. + +@item MAP_POPULATE +@itemx MAP_NONBLOCK +@code{MAP_POPULATE} is a hint that requests that the kernel read-ahead +a file-backed mapping, causing pages to be mapped before they're +needed. @code{MAP_NONBLOCK} is a hint that requests that the kernel +@emph{not} attempt such except for pages that are already in memory. Note +that neither of these hints affects future paging activity; use +@code{mlock} if such needs to be controlled. + +@item MAP_NORESERVE +Asks the kernel to not reserve physical backing (i.e. space in a swap +device) for a mapping. This would be useful for, for example, a very +large but sparsely used mapping which need not be limited in total +length by available RAM, but with very few mapped pages. Note that +writes to such a mapping may cause a @code{SIGSEGV} if the system is +unable to map a page due to lack of resources. + +On Linux, this flag's behavior may be overridden by +@file{/proc/sys/vm/overcommit_memory} as documented in the proc(5) man +page. + +@item MAP_STACK +Ensures that the resulting mapping is suitable for use as a program +stack. For example, the use of huge pages might be precluded. + +@item MAP_SYNC +This is a special flag for DAX devices, which tells the kernel to +write dirty metadata out whenever dirty data is written out. Unlike +most other flags, this one will fail unless @code{MAP_SHARED_VALIDATE} +is also given. 
@end vtable @@ -1655,6 +1733,24 @@ Possible errors include: @table @code +@item EACCES + +@var{filedes} was not open for the type of access specified in @var{protect}. + +@item EAGAIN + +The system has temporarily run out of resources. + +@item EBADF + +The @var{fd} passed is invalid, and a valid file descriptor is +required (i.e. MAP_ANONYMOUS was not specified). + +@item EEXIST + +@code{MAP_FIXED_NOREPLACE} was specified and an existing mapping was +found overlapping the requested address range. + @item EINVAL Either @var{address} was unusable (because it is not a multiple of the @@ -1663,23 +1759,37 @@ applicable page size), or inconsistent @var{flags} were given. If @code{MAP_HUGETLB} was specified, the file or system does not support large page sizes. -@item EACCES +@item ENODEV -@var{filedes} was not open for the type of access specified in @var{protect}. +This file is of a type that doesn't support mapping, the process has +exceeded its data space limit, or the map request would exceed the +process's virtual address space. @item ENOMEM -Either there is not enough memory for the operation, or the process is -out of address space. - -@item ENODEV - -This file is of a type that doesn't support mapping. +There is not enough memory for the operation, the process is out of +address space, or there are too many mappings. On Linux, the maximum +number of mappings can be controlled via +@file{/proc/sys/vm/max_map_count} or, if your OS supports it, via +the @code{vm.max_map_count} @code{sysctl} setting. @item ENOEXEC The file is on a filesystem that doesn't support mapping. +@item EPERM + +@code{PROT_EXEC} was requested but the file is on a filesystem that +was mounted with execution denied, a file seal prevented the mapping, +or the caller set MAP_HUGETLB but does not have the required +privileges. 
+ +@item EOVERFLOW + +Either the offset into the file plus the length of the mapping causes +internal page counts to overflow, or the offset requested exceeds the +length of the file. + @c On Linux, EAGAIN will appear if the file has a conflicting mandatory lock. @c However mandatory locks are not discussed in this manual. @c diff --git a/misc/Makefile b/misc/Makefile index c273ec6974..5d17c562fe 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -214,12 +214,18 @@ routines_no_fortify += \ syslog \ # routines_no_fortify +ifeq ($(run-built-tests),yes) +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) generated += \ tst-allocate_once-mem.out \ tst-allocate_once.mtrace \ tst-error1-mem.out \ tst-error1.mtrace \ # generated +endif +endif +endif aux := init-misc install-lib := libg.a @@ -285,8 +291,14 @@ tests-internal += tst-fd_to_filename tests-static += tst-fd_to_filename ifeq ($(run-built-tests),yes) -tests-special += $(objpfx)tst-error1-mem.out \ - $(objpfx)tst-allocate_once-mem.out +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +tests-special += \ + $(objpfx)tst-allocate_once-mem.out \ + $(objpfx)tst-error1-mem.out \ + # tests-special +endif +endif endif tests-container := \ diff --git a/nptl/Makefile b/nptl/Makefile index b3f8af2e1c..c4c27e0d23 100644 --- a/nptl/Makefile +++ b/nptl/Makefile @@ -545,6 +545,9 @@ tests-static += \ # tests-static tests += tst-cancel24-static +ifeq ($(static-cxx-tests),no) +tests-unsupported += tst-cancel24-static +endif tests-internal += \ tst-sem11-static \ @@ -556,10 +559,12 @@ xtests-static += tst-setuid1-static ifeq ($(run-built-tests),yes) tests-special += \ $(objpfx)tst-oddstacklimit.out \ - $(objpfx)tst-stack3-mem.out \ # tests-special ifeq ($(build-shared),yes) tests-special += $(objpfx)tst-tls6.out +ifneq ($(PERL),no) +tests-special += $(objpfx)tst-stack3-mem.out +endif endif endif @@ -617,10 +622,17 @@ tst-stack3-ENV = MALLOC_TRACE=$(objpfx)tst-stack3.mtrace \ $(objpfx)tst-stack3-mem.out: $(objpfx)tst-stack3.out 
$(common-objpfx)malloc/mtrace $(objpfx)tst-stack3.mtrace > $@; \ $(evaluate-test) + +ifeq ($(run-built-tests),yes) +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) generated += \ tst-stack3-mem.out \ tst-stack3.mtrace \ # generated +endif +endif +endif tst-stack4mod.sos=$(shell for i in 0 1 2 3 4 5 6 7 8 9 10 \ 11 12 13 14 15 16 17 18 19; do \ diff --git a/posix/Makefile b/posix/Makefile index a1e84853a8..2c598cd20a 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -418,6 +418,17 @@ generated += \ $(addprefix wordexp-test-result, 1 2 3 4 5 6 7 8 9 10) \ annexc \ annexc.out \ + getconf.speclist \ + ptestcases.h \ + testcases.h \ + tst-getconf.out \ + wordexp-tst.out \ + # generated + +ifeq ($(run-built-tests),yes) +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +generated += \ bug-ga2-mem.out \ bug-ga2.mtrace \ bug-glob2-mem.out \ @@ -430,23 +441,22 @@ generated += \ bug-regex21.mtrace \ bug-regex31-mem.out \ bug-regex31.mtrace \ + bug-regex36-mem.out \ bug-regex36.mtrace \ - getconf.speclist \ - ptestcases.h \ - testcases.h \ tst-boost-mem.out \ tst-boost.mtrace \ tst-fnmatch-mem.out \ tst-fnmatch.mtrace \ - tst-getconf.out \ tst-pcre-mem.out \ tst-pcre.mtrace \ tst-rxspencer-no-utf8-mem.out \ tst-rxspencer-no-utf8.mtrace \ tst-vfork3-mem.out \ tst-vfork3.mtrace \ - wordexp-tst.out \ # generated +endif +endif +endif ifeq ($(run-built-tests),yes) ifeq (yes,$(build-shared)) @@ -461,6 +471,9 @@ endif # XXX Please note that for now we ignore the result of this test. 
tests-special += $(objpfx)annexc.out ifeq ($(run-built-tests),yes) +tests-special += $(objpfx)tst-getconf.out +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) tests-special += \ $(objpfx)bug-ga2-mem.out \ $(objpfx)bug-glob2-mem.out \ @@ -471,13 +484,14 @@ tests-special += \ $(objpfx)bug-regex36-mem.out \ $(objpfx)tst-boost-mem.out \ $(objpfx)tst-fnmatch-mem.out \ - $(objpfx)tst-getconf.out \ $(objpfx)tst-glob-tilde-mem.out \ $(objpfx)tst-pcre-mem.out \ $(objpfx)tst-rxspencer-no-utf8-mem.out \ $(objpfx)tst-vfork3-mem.out \ # tests-special endif +endif +endif include ../Rules diff --git a/signal/Makefile b/signal/Makefile index e8e3dce0cf..7cddbc3c65 100644 --- a/signal/Makefile +++ b/signal/Makefile @@ -46,11 +46,22 @@ routines := signal raise killpg \ allocrtsig sigtimedwait sigwaitinfo sigqueue \ sighold sigrelse sigignore sigset -tests := tst-signal tst-sigset tst-sigsimple tst-raise tst-sigset2 \ - tst-sigwait-eintr tst-sigaction \ - tst-minsigstksz-1 tst-minsigstksz-2 tst-minsigstksz-3 \ - tst-minsigstksz-3a tst-minsigstksz-4 tst-minsigstksz-5 \ - tst-sigisemptyset +tests := \ + tst-minsigstksz-1 \ + tst-minsigstksz-2 \ + tst-minsigstksz-3 \ + tst-minsigstksz-3a \ + tst-minsigstksz-4 \ + tst-minsigstksz-5 \ + tst-raise \ + tst-sigaction \ + tst-sigisemptyset \ + tst-signal \ + tst-sigset \ + tst-sigset2 \ + tst-sigsimple \ + tst-sigwait-eintr \ +# tests include ../Rules diff --git a/stdio-common/Makefile b/stdio-common/Makefile index 6bc972af1a..a63c05a120 100644 --- a/stdio-common/Makefile +++ b/stdio-common/Makefile @@ -229,10 +229,6 @@ tests := \ tst-popen \ tst-popen2 \ tst-printf-binary \ - tst-printf-bz18872 \ - tst-printf-bz25691 \ - tst-printf-fp-free \ - tst-printf-fp-leak \ tst-printf-intn \ tst-printf-oct \ tst-printf-round \ @@ -261,7 +257,6 @@ tests := \ tst-vfprintf-mbs-prec \ tst-vfprintf-user-type \ tst-vfprintf-width-i18n \ - tst-vfprintf-width-prec \ tst-vfprintf-width-prec-alloc \ tst-wc-printf \ tstdiomisc \ @@ -270,6 +265,20 @@ tests := \ 
xbug \ # tests +ifeq ($(run-built-tests),yes) +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +tests += \ + tst-printf-bz18872 \ + tst-printf-bz25691 \ + tst-printf-fp-free \ + tst-printf-fp-leak \ + tst-vfprintf-width-prec \ + # tests +endif +endif +endif + tests-container += \ tst-popen3 # tests-container @@ -293,14 +302,19 @@ test-srcs = \ ifeq ($(run-built-tests),yes) tests-special += \ - $(objpfx)tst-printf-bz18872-mem.out \ - $(objpfx)tst-printf-bz25691-mem.out \ - $(objpfx)tst-printf-fp-free-mem.out \ - $(objpfx)tst-printf-fp-leak-mem.out \ $(objpfx)tst-printf.out \ $(objpfx)tst-printfsz-islongdouble.out \ $(objpfx)tst-setvbuf1-cmp.out \ $(objpfx)tst-unbputc.out \ + # tests-special + +ifeq (yes,$(build-shared)) +ifneq ($(PERL),no) +tests-special += \ + $(objpfx)tst-printf-bz18872-mem.out \ + $(objpfx)tst-printf-bz25691-mem.out \ + $(objpfx)tst-printf-fp-free-mem.out \ + $(objpfx)tst-printf-fp-leak-mem.out \ $(objpfx)tst-vfprintf-width-prec-mem.out \ # tests-special @@ -317,6 +331,8 @@ generated += \ tst-vfprintf-width-prec-mem.out \ tst-vfprintf-width-prec.mtrace \ # generated +endif +endif endif # $(run-built-tests) tests-special += $(objpfx)tst-errno-manual.out diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h index 31782b66f9..bc8d842238 100644 --- a/sysdeps/aarch64/cpu-features.h +++ b/sysdeps/aarch64/cpu-features.h @@ -1,6 +1,7 @@ /* Initialize CPU feature data. AArch64 version. This file is part of the GNU C Library. Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. 
The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -56,6 +57,11 @@ #define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F' \ && MIDR_PARTNUM(midr) == 0x001) +#define IS_ORYON1(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \ + && (MIDR_PARTNUM(midr) == 0x001 \ + || (MIDR_PARTNUM(midr) == 0x002 \ + && MIDR_VARIANT(midr) == 0))) + struct cpu_features { uint64_t midr_el1; diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index e4720b7468..3e251cc234 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -5,6 +5,7 @@ sysdep_routines += \ memcpy_a64fx \ memcpy_generic \ memcpy_mops \ + memcpy_oryon1 \ memcpy_sve \ memcpy_thunderx \ memcpy_thunderx2 \ @@ -14,6 +15,7 @@ sysdep_routines += \ memset_generic \ memset_kunpeng \ memset_mops \ + memset_oryon1 \ memset_zva64 \ strlen_asimd \ strlen_generic \ diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index ecd0f87de6..b2fda541f9 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -1,5 +1,6 @@ /* Enumerate available IFUNC implementations of a function. AARCH64 version. Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -35,6 +36,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. 
*/ IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1) IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx) @@ -44,6 +46,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1) IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx) @@ -53,6 +56,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL (i, name, memset, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64) + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_oryon1) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) #if HAVE_AARCH64_SVE_ASM diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index ce53567dab..15c954778b 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -1,5 +1,6 @@ /* Multiple versions of memcpy. AARCH64 version. Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -34,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_oryon1 attribute_hidden; static inline __typeof (__redirect_memcpy) * select_memcpy_ifunc (void) @@ -50,6 +52,9 @@ select_memcpy_ifunc (void) return prefer_sve_ifuncs ? __memcpy_sve : __memcpy_generic; } + if (IS_ORYON1 (midr)) + return __memcpy_oryon1; + if (IS_THUNDERX (midr)) return __memcpy_thunderx; diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S new file mode 100644 index 0000000000..4efc43df28 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S @@ -0,0 +1,301 @@ +/* An oryon-1 core Optimized memcpy implementation for AARCH64. + Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. 
+ * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. +*/ + +ENTRY (__memmove_oryon1) + + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (__memmove_oryon1) + +ENTRY (__memcpy_oryon1) + + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 6 + /* Small copies: 0..16 bytes. 
*/ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 6 +1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 6 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 6 +L(copy_long): + + /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp. + This loop is identical to the one below it but using ldnp/stnp + instructions. For loops that are less than 32768 bytes, + the ldnp/stnp instructions will not help and will cause a slow + down so only use the ldnp/stnp loop for the largest sizes. */ + + cmp count, #32768 + b.lo L(copy_long_without_nontemp) + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldnp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. 
*/ + ldnp A_l, A_h, [src, 16] + stnp D_l, D_h, [dstin] + ldnp B_l, B_h, [src, 32] + ldnp C_l, C_h, [src, 48] + ldnp D_l, D_h, [src, 64] + add src, src, #64 + subs count, count, 128 + 16 /* Test and readjust count. */ + +L(nontemp_loop64): + tbz src, #6, 1f +1: + stnp A_l, A_h, [dst, 16] + ldnp A_l, A_h, [src, 16] + stnp B_l, B_h, [dst, 32] + ldnp B_l, B_h, [src, 32] + stnp C_l, C_h, [dst, 48] + ldnp C_l, C_h, [src, 48] + stnp D_l, D_h, [dst, 64] + ldnp D_l, D_h, [src, 64] + add src, src, #64 + add dst, dst, #64 + subs count, count, 64 + b.hi L(nontemp_loop64) + b L(last64) + +L(copy_long_without_nontemp): + + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 6 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. 
There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (__memcpy_oryon1) diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index 34bce045dd..bd063c16c9 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -1,5 +1,6 @@ /* Multiple versions of memset. AARCH64 version. Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -34,6 +35,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden; extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; extern __typeof (__redirect_memset) __memset_generic attribute_hidden; extern __typeof (__redirect_memset) __memset_mops attribute_hidden; +extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden; static inline __typeof (__redirect_memset) * select_memset_ifunc (void) @@ -49,6 +51,9 @@ select_memset_ifunc (void) return __memset_a64fx; } + if (IS_ORYON1 (midr) && zva_size == 64) + return __memset_oryon1; + if (IS_KUNPENG920 (midr)) return __memset_kunpeng; diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S new file mode 100644 index 0000000000..b43a43b54e --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_oryon1.S @@ -0,0 +1,169 @@ +/* Optimized memset for Qualcomm's oryon-1 core. + Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright The GNU Toolchain Authors. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "memset-reg.h" + +/* Assumptions: + ARMv8-a, AArch64, unaligned accesses + */ + +ENTRY (__memset_oryon1) + + PTR_ARG (0) + SIZE_ARG (2) + + bfi valw, valw, 8, 8 + bfi valw, valw, 16, 16 + bfi val, val, 32, 32 + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + + .p2align 3 +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + .p2align 3 + /* Set 16..96 bytes. */ +L(set_medium): + stp val, val, [dstin] + tbnz count, 6, L(set96) + stp val, val, [dstend, -16] + tbz count, 5, 1f + stp val, val, [dstin, 16] + stp val, val, [dstend, -32] +1: ret + + .p2align 6 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + stp val, val, [dstin, 16] + stp val, val, [dstin, 32] + stp val, val, [dstin, 48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret + + .p2align 6 +L(set_long): + stp val, val, [dstin] + bic dst, dstin, 15 + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq L(try_zva) + cmp count, #32768 + b.hi L(set_long_with_nontemp) + /* Small-size or non-zero memset does not use DC ZVA. */ + sub count, dstend, dst + + /* Adjust count and bias for loop. By subtracting extra 1 from count, + it is easy to use tbz instruction to check whether loop tailing + count is less than 33 bytes, so as to bypass 2 unnecessary stps. */ + sub count, count, 64+16+1 + +1: stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + stp val, val, [dst, 64]! + subs count, count, 64 + b.hs 1b + + tbz count, 5, 1f /* Remaining count is less than 33 bytes? 
*/ + stp val, val, [dst, 16] + stp val, val, [dst, 32] +1: stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret + +L(set_long_with_nontemp): + /* Small-size or non-zero memset does not use DC ZVA. */ + sub count, dstend, dst + + /* Adjust count and bias for loop. By subtracting extra 1 from count, + it is easy to use tbz instruction to check whether loop tailing + count is less than 33 bytes, so as to bypass 2 unnecessary stps. */ + sub count, count, 64+16+1 + +1: stnp val, val, [dst, 16] + stnp val, val, [dst, 32] + stnp val, val, [dst, 48] + stnp val, val, [dst, 64] + add dst, dst, #64 + subs count, count, 64 + b.hs 1b + + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ + stnp val, val, [dst, 16] + stnp val, val, [dst, 32] +1: stnp val, val, [dstend, -32] + stnp val, val, [dstend, -16] + ret + +L(try_zva): + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA as it is faster. */ + .p2align 6 +L(zva_64): + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + bic dst, dst, 63 + stp val, val, [dst, 64] + stp val, val, [dst, 64+16] + stp val, val, [dst, 96] + stp val, val, [dst, 96+16] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. 
*/ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp val, val, [dst, 0] + stp val, val, [dst, 16] + stp val, val, [dst, 32] + stp val, val, [dst, 48] + + stp val, val, [dstend, -64] + stp val, val, [dstend, -64+16] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret + +END (__memset_oryon1) diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index 50f58a60e3..656e8a3fa0 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -1256,6 +1256,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, size_t gen) attribute_hidden; +/* The last TLS module ID that is initially loaded, plus 1. TLS + addresses for modules with IDs lower than that can be obtained from + the DTV even if its generation is outdated. */ +extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro; + +/* Compute _dl_tls_initial_modid_limit. To be called after initial + relocation. */ +void _dl_tls_initial_modid_limit_setup (void) attribute_hidden; + +/* Number of threads currently in a TLS update. This is used to + detect reentrant __tls_get_addr calls without a per-thread + flag. */ +extern unsigned int _dl_tls_threads_in_update attribute_hidden; + /* Look up the module's TLS block as for __tls_get_addr, but never touch anything. Return null if it's not allocated yet. */ extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden; diff --git a/sysdeps/mips/fpu/math-use-builtins-fma.h b/sysdeps/mips/fpu/math-use-builtins-fma.h new file mode 100644 index 0000000000..57108f968e --- /dev/null +++ b/sysdeps/mips/fpu/math-use-builtins-fma.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* MIPSr6 has MADDF.s/MADDF.d instructions, which are fused. In MIPS + ISA, double support can be subsetted. Only FMAF is enabled for this + case. */ + +#include <sysdep.h> + +#if __mips_isa_rev >= 6 +# ifdef __mips_single_float +# define USE_FMA_BUILTIN 0 +# else +# define USE_FMA_BUILTIN 1 +# endif +# define USE_FMAF_BUILTIN 1 +#else +# define USE_FMA_BUILTIN 0 +# define USE_FMAF_BUILTIN 0 +#endif +#define USE_FMAL_BUILTIN 0 +#define USE_FMAF128_BUILTIN 0 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 3d7c2819d7..e501e084ef 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -1023,39 +1023,59 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht model += extended_model; if (family == 0x6) - { - if (model == 0xf || model == 0x19) - { + { + /* Tuning for older Zhaoxin processors. */ + if (model == 0xf || model == 0x19) + { CPU_FEATURE_UNSET (cpu_features, AVX); CPU_FEATURE_UNSET (cpu_features, AVX2); - cpu_features->preferred[index_arch_Slow_SSE4_2] - |= bit_arch_Slow_SSE4_2; + cpu_features->preferred[index_arch_Slow_SSE4_2] + |= bit_arch_Slow_SSE4_2; + /* Unaligned AVX loads are slower. 
*/ cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] - &= ~bit_arch_AVX_Fast_Unaligned_Load; - } - } + &= ~bit_arch_AVX_Fast_Unaligned_Load; + } + } else if (family == 0x7) - { - if (model == 0x1b) + { + switch (model) { + /* Wudaokou microarch tuning. */ + case 0x1b: CPU_FEATURE_UNSET (cpu_features, AVX); CPU_FEATURE_UNSET (cpu_features, AVX2); cpu_features->preferred[index_arch_Slow_SSE4_2] - |= bit_arch_Slow_SSE4_2; + |= bit_arch_Slow_SSE4_2; cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] - &= ~bit_arch_AVX_Fast_Unaligned_Load; - } - else if (model == 0x3b) - { + &= ~bit_arch_AVX_Fast_Unaligned_Load; + break; + + /* Lujiazui microarch tuning. */ + case 0x3b: CPU_FEATURE_UNSET (cpu_features, AVX); CPU_FEATURE_UNSET (cpu_features, AVX2); cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] - &= ~bit_arch_AVX_Fast_Unaligned_Load; + &= ~bit_arch_AVX_Fast_Unaligned_Load; + break; + + /* Yongfeng and Shijidadao microarch tuning. */ + case 0x5b: + cpu_features->cachesize_non_temporal_divisor = 2; + case 0x6b: + cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] + &= ~bit_arch_AVX_Fast_Unaligned_Load; + + /* To use sse2_unaligned versions of memset, strcpy and strcat. + */ + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= (bit_arch_Prefer_No_VZEROUPPER + | bit_arch_Fast_Unaligned_Load); + break; } } } diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 3a6ec4ef9f..5e77345a6e 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -934,8 +934,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run a higher risk of actually thrashing the cache as they don't have a HW LRU hint. As well, their performance in highly parallel situations is - noticeably worse. 
Zhaoxin processors are an exception, the lowbound is not + suitable for them based on actual test data. */ + if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS) + && cpu_features->basic.kind != arch_kind_zhaoxin) non_temporal_threshold = non_temporal_threshold_lowbound; /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c index 869023bbba..b3c1e4fcd7 100644 --- a/sysdeps/x86_64/dl-tls.c +++ b/sysdeps/x86_64/dl-tls.c @@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS) dtv_t *dtv = THREAD_DTV (); size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); - if (__glibc_unlikely (dtv[0].counter != gen)) + if (__glibc_unlikely (dtv[0].counter != gen) + /* See comment in __tls_get_addr in elf/dl-tls.c. */ + && !(_dl_tls_allocate_active () + && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)) return update_get_addr (GET_ADDR_PARAM, gen); return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S index 048d015712..01008fd981 100644 --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S +++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -151,13 +151,10 @@ L(more_2x_vec): loop. */ movups %xmm0, (%rdi) -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP -# else - cmp __x86_shared_cache_size_half(%rip), %rdx -# endif + cmp __x86_shared_non_temporal_threshold(%rip), %rdx ja L(large_memcpy) +L(loop_fwd): leaq -64(%rdi, %rdx), %r8 andq $-16, %rdi movl $48, %edx @@ -199,6 +196,13 @@ L(large_memcpy): movups -64(%r9, %rdx), %xmm10 movups -80(%r9, %rdx), %xmm11 + /* Check if src and dst overlap. If they do use cacheable + writes to potentially gain positive interference between + the loads during the memmove. 
*/ + subq %rdi, %r9 + cmpq %rdx, %r9 + jb L(loop_fwd) + sall $5, %ecx leal (%rcx, %rcx, 2), %r8d leaq -96(%rdi, %rdx), %rcx |