author     Matheus Castanho <msc@linux.ibm.com>  2020-03-02 14:16:40 -0300
committer  Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>  2020-03-03 17:34:21 -0300
commit     fe5012e47407914ec1a66f8337f6adfba6c42680 (patch)
tree       9738b1212eefb4f1429e8559c2fda0ce853e96d7
parent     c49ad9bdc9ecc49a4e9811448697ae3773d472a0 (diff)
parent     2dc2d678e91f3f093d0f4855ac086efb288a5e23 (diff)
Merge branch release/2.26/master into ibm/2.26/master
-rw-r--r--  ChangeLog | 256
-rw-r--r--  NEWS | 26
-rw-r--r--  benchtests/bench-strcasestr.c | 1
-rw-r--r--  benchtests/bench-strstr.c | 3
-rw-r--r--  elf/elf.h | 7
-rw-r--r--  libio/Makefile | 3
-rw-r--r--  libio/oldstdfiles.c | 5
-rw-r--r--  libio/tst-wfile-sync.c | 39
-rw-r--r--  libio/tst-wfile-sync.input | 1
-rw-r--r--  libio/wfileops.c | 5
-rw-r--r--  malloc/malloc.c | 9
-rw-r--r--  manual/tunables.texi | 6
-rw-r--r--  nptl/pthread_mutex_trylock.c | 57
-rw-r--r--  posix/tst-mmap-offset.c | 9
-rw-r--r--  string/memmem.c | 124
-rw-r--r--  string/str-two-way.h | 65
-rw-r--r--  string/strcasestr.c | 42
-rw-r--r--  string/strstr.c | 176
-rw-r--r--  string/test-strcasestr.c | 1
-rw-r--r--  string/test-strstr.c | 31
-rw-r--r--  sysdeps/aarch64/dl-machine.h | 35
-rw-r--r--  sysdeps/aarch64/memcmp.S | 214
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c | 2
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_falkor.S | 123
-rw-r--r--  sysdeps/aarch64/strcmp.S | 31
-rw-r--r--  sysdeps/aarch64/strncmp.S | 95
-rw-r--r--  sysdeps/generic/mmap_info.h | 16
-rw-r--r--  sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 1
-rw-r--r--  sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 3
-rw-r--r--  sysdeps/unix/sysv/linux/mips/Makefile | 21
-rw-r--r--  sysdeps/unix/sysv/linux/mips/configure | 41
-rw-r--r--  sysdeps/unix/sysv/linux/mips/configure.ac | 32
-rw-r--r--  sysdeps/unix/sysv/linux/mips/mmap_info.h | 13
-rw-r--r--  sysdeps/unix/sysv/linux/mmap64.c | 9
-rw-r--r--  sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h | 3
-rw-r--r--  sysdeps/x86_64/memchr.S | 10
-rw-r--r--  sysdeps/x86_64/memcmp.S | 20
-rw-r--r--  sysdeps/x86_64/memrchr.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/memchr-avx2.S | 8
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S | 9
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S | 16
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 50
-rw-r--r--  sysdeps/x86_64/multiarch/memrchr-avx2.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 32
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse42.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/strlen-avx2.S | 9
-rw-r--r--  sysdeps/x86_64/strcmp.S | 6
-rw-r--r--  sysdeps/x86_64/strlen.S | 12
-rw-r--r--  sysdeps/x86_64/x32/Makefile | 12
-rw-r--r--  sysdeps/x86_64/x32/test-size_t.h | 35
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memchr.c | 72
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-memset.c | 73
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20
-rw-r--r--  sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20

72 files changed, 2130 insertions(+), 454 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 60b0364037..5f3df320bc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,259 @@
+2019-09-13  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* string/memmem.c (__memmem): Rewrite to improve performance.
+
+2019-06-12  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* string/str-two-way.h (two_way_short_needle): Add inline to avoid
+	warning.
+	(two_way_long_needle): Block inlining.
+	* string/strstr.c (strstr2): Add new function.
+	(strstr3): Likewise.
+	(STRSTR): Completely rewrite strstr to improve performance.
+
+2019-09-13  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	[BZ #23637]
+	* string/test-strstr.c (pr23637): New function.
+	(test_main): Add tests with longer needles.
+	* string/strcasestr.c (AVAILABLE): Fix readahead distance.
+	* string/strstr.c (AVAILABLE): Likewise.
+
+2019-09-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+	* string/memmem.c: Use memcmp for first match.
+
+2019-09-13  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* string/strcasestr.c (STRCASESTR): Simplify and speedup first match.
+	* string/strstr.c (AVAILABLE): Likewise.
+
+2019-09-13  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* benchtests/bench-strcasestr.c: Rename __strnlen to strnlen.
+	* benchtests/bench-strstr.c: Likewise.
+	* string/memmem.c (FASTSEARCH): Define.
+	* string/str-two-way.h (two_way_short_needle): Minor cleanups.
+	Add support for FASTSEARCH.
+	* string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen.
+	* string/strstr.c (AVAILABLE): Use read-ahead __strnlen.
+	(FASTSEARCH): Define.
+	* string/test-strcasestr.c: Rename __strnlen to strnlen.
+	* string/test-strstr.c: Likewise.
+
+2019-09-06  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* manual/tunables.texi (glibc.cpu.name): Add ares tunable.
+	* sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use
+	__memcpy_falkor for ares.
+	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_ARES):
+	Add new define.
+	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c (cpu_list):
+	Add ares cpu.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+	Use vector registers.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+	Use multiple registers to copy data in loop tail.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
+	mov + lsr.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift
+	instruction to unbreak builds with binutils 2.26 and older.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strncmp.S (count): New macro.
+	(strncmp): Store misaligned length in SRC1 in COUNT.
+	(mutual_align): Adjust.
+	(misaligned8): Load dword at a time when it is safe.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to
+	do_misaligned, not misaligned8.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a
+	time whenever possible.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target.
+
+	* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
+	time.
+
+2019-09-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/memcmp.S: Use L() macro for labels.
+
+2019-09-06  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* sysdeps/aarch64/memcmp.S (memcmp):
+	Rewrite of optimized memcmp.
+
+2019-07-12  Adhemerval Zanella  <adhemerval.zanella@linaro.org>
+
+	[BZ #24699]
+	* posix/tst-mmap-offset.c: Mention BZ #24699.
+	(do_test_bz21270): Rename to do_test_large_offset and use
+	mmap64_maximum_offset to check for maximum expected offset value.
+	* sysdeps/generic/mmap_info.h: New file.
+	* sysdeps/unix/sysv/linux/mips/mmap_info.h: Likewise.
+	* sysdeps/unix/sysv/linux/mmap64.c (MMAP_OFF_HIGH_MASK): Define iff
+	__NR_mmap2 is used.
+
+2019-07-12  Szabolcs Nagy  <szabolcs.nagy@arm.com>
+
+	* sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check
+	STO_AARCH64_VARIANT_PCS and bind such symbols at load time.
+
+2019-06-13  Szabolcs Nagy  <szabolcs.nagy@arm.com>
+
+	* elf/elf.h (STO_AARCH64_VARIANT_PCS): Define.
+	(DT_AARCH64_VARIANT_PCS): Define.
+
+2019-05-22  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	[BZ #24531]
+	* malloc/malloc.c (MAX_TCACHE_COUNT): New define.
+	(do_set_tcache_count): Only update if count is small enough.
+	* manual/tunables.texi (glibc.malloc.tcache_count): Document max value.
+
+2019-05-15  Andreas Schwab  <schwab@suse.de>
+
+	[BZ #20568]
+	* libio/wfileops.c (_IO_wfile_sync): Correct last argument to
+	__codecvt_do_length.
+	* libio/Makefile (tests): Add tst-wfile-sync.
+	($(objpfx)tst-wfile-sync.out): Depend on $(gen-locales).
+	* libio/tst-wfile-sync.c: New file.
+	* libio/tst-wfile-sync.input: New file.
+
+2019-02-07  Stefan Liebler  <stli@linux.ibm.com>
+
+	[BZ #24180]
+	* nptl/pthread_mutex_trylock.c (__pthread_mutex_trylock):
+	Add compiler barriers and comments.
+
+2019-02-04  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24155]
+	CVE-2019-7309
+	* NEWS: Updated for CVE-2019-7309.
+	* sysdeps/x86_64/memcmp.S: Use RDX_LP for size.  Clear the
+	upper 32 bits of RDX register for x32.  Use unsigned Jcc
+	instructions, instead of signed.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
+	Clear the upper 32 bits of RSI register.
+	* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
+	and tst-size_t-wcsnlen.
+	* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP
+	for length.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
+	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Use RDX_LP for length.
+	* sysdeps/x86_64/strcmp.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
+	tst-size_t-strncmp and tst-size_t-wcsncmp.
+	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
+	RDX_LP for length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
+	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
+	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
+	Likewise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
+	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
+	tst-size_t-wmemcmp.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
+
+2019-02-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
+	upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/test-size_t.h: New file.
+	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
+
 2019-01-11  Gabriel F. T. Gomes  <gabriel@inconstante.eti.br>
 
 	* sysdeps/powerpc/fpu/libm-test-ulps: Regenerate.
diff --git a/NEWS b/NEWS
index 49895f81bd..3ccaae3968 100644
--- a/NEWS
+++ b/NEWS
@@ -86,6 +86,26 @@ Security related changes:
   denial of service due to resource exhaustion when processing getaddrinfo
   calls with crafted host names.  Reported by Guido Vranken.
 
+  CVE-2019-6488: On x32, the size_t parameter may be passed in the lower
+  32 bits of a 64-bit register with non-zero upper 32 bits.  When this
+  happened, accessing the 32-bit size_t value as the full 64-bit register
+  in the assembly string/memory functions would cause a buffer overflow.
+  Reported by H.J. Lu.
+
+  CVE-2019-7309: x86-64 memcmp used signed Jcc instructions to check
+  size.  For x86-64, memcmp on an object size larger than SSIZE_MAX
+  has undefined behavior.  On x32, the size_t argument may be passed
+  in the lower 32 bits of the 64-bit RDX register with non-zero upper
+  32 bits.  When this happened with the sign bit of the RDX register set,
+  memcmp gave the wrong result since it treated the size argument as
+  zero.  Reported by H.J. Lu.
+
+  CVE-2019-19126: ld.so failed to ignore the LD_PREFER_MAP_32BIT_EXEC
+  environment variable during program execution after a security
+  transition, allowing local attackers to restrict the possible mapping
+  addresses for loaded libraries and thus bypass ASLR for a setuid
+  program.  Reported by Marcin Kościelnicki.
+
 The following bugs are resolved with this release:
 
   [16750] ldd: Never run file directly.
@@ -93,6 +113,7 @@ The following bugs are resolved with this release:
   [17956] crypt: Use NSPR header files in addition to NSS header files
   [20419] elf: Fix stack overflow with huge PT_NOTE segment
   [20532] getaddrinfo: More robust handling of dlopen failures
+  [20568] Fix crash in _IO_wfile_sync
   [21242] assert: Suppress pedantic warning caused by statement expression
   [21265] x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve
   [21269] i386 sigaction sa_restorer handling is wrong
@@ -165,6 +186,11 @@ The following bugs are resolved with this release:
   [23927] Linux if_nametoindex() does not close descriptor (CVE-2018-19591)
   [24018] gettext may return NULL
   [24027] malloc: Integer overflow in realloc
+  [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488)
+  [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309)
+  [25203] libio: Disable vtable validation for pre-2.1 interposed handles
+  [25204] Ignore LD_PREFER_MAP_32BIT_EXEC for SUID programs
+
 
 Version 2.26
 
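All three x32 CVEs above share one root cause: a 32-bit size_t argument arrives in a 64-bit register whose upper half is not guaranteed to be zero. The tst-size_t-* tests added in this merge exercise exactly that. Below is a minimal sketch of one way to construct such a call; the names and union layout are illustrative only (the real harness lives in sysdeps/x86_64/x32/test-size_t.h, which is not shown in this excerpt):

#include <string.h>

/* Sketch only: overlay the 32-bit length with a 64-bit pointer so the
   register slot carrying the size_t has non-zero upper bits when the
   struct is passed by value on x32.  */
typedef struct
{
  union
    {
      size_t len;   /* The length the callee should see (low 32 bits).  */
      void *ptr;    /* 64-bit member that dirties the upper half.  */
    };
  void *p;          /* Buffer to operate on.  */
} parameter_t;

__attribute__ ((noinline))
static int
do_memcmp (parameter_t a, parameter_t b)
{
  /* An implementation that reads the full 64-bit register instead of
     the 32-bit size_t would see a huge length here and overrun.  */
  return memcmp (a.p, b.p, a.len);
}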
diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c
index 4e6f480c84..9a031b3064 100644
--- a/benchtests/bench-strcasestr.c
+++ b/benchtests/bench-strcasestr.c
@@ -24,6 +24,7 @@
 #define STRCASESTR simple_strcasestr
 #define NO_ALIAS
 #define __strncasecmp strncasecmp
+#define __strnlen strnlen
 #include "../string/strcasestr.c"
 
 
diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c
index e63659f136..2fa64118f4 100644
--- a/benchtests/bench-strstr.c
+++ b/benchtests/bench-strstr.c
@@ -22,6 +22,9 @@
 
 
 #define STRSTR simple_strstr
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(X)
+#define __strnlen strnlen
 #include "../string/strstr.c"
 
 
diff --git a/elf/elf.h b/elf/elf.h
index 3900b4c9f0..f80506c562 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2759,6 +2759,13 @@ enum
 #define R_AARCH64_TLSDESC      1031	/* TLS Descriptor.  */
 #define R_AARCH64_IRELATIVE	1032	/* STT_GNU_IFUNC relocation.  */
 
+/* AArch64 specific values for the Dyn d_tag field.  */
+#define DT_AARCH64_VARIANT_PCS	(DT_LOPROC + 5)
+#define DT_AARCH64_NUM		6
+
+/* AArch64 specific values for the st_other field.  */
+#define STO_AARCH64_VARIANT_PCS 0x80
+
 /* ARM relocs.  */
 
 #define R_ARM_NONE		0	/* No reloc */
diff --git a/libio/Makefile b/libio/Makefile
index 74bf5279f1..79158f3ebd 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -62,7 +62,7 @@ tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc   \
 	bug-memstream1 bug-wmemstream1 \
 	tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \
 	tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \
-	tst-ftell-append tst-fputws
+	tst-ftell-append tst-fputws tst-wfile-sync
 
 tests-internal = tst-vtables tst-vtables-interposed
 
@@ -202,6 +202,7 @@ $(objpfx)tst-ungetwc1.out: $(gen-locales)
 $(objpfx)tst-ungetwc2.out: $(gen-locales)
 $(objpfx)tst-widetext.out: $(gen-locales)
 $(objpfx)tst_wprintf2.out: $(gen-locales)
+$(objpfx)tst-wfile-sync.out: $(gen-locales)
 endif
 
 $(objpfx)test-freopen.out: test-freopen.sh $(objpfx)test-freopen
diff --git a/libio/oldstdfiles.c b/libio/oldstdfiles.c
index bed7bceca2..82ba986367 100644
--- a/libio/oldstdfiles.c
+++ b/libio/oldstdfiles.c
@@ -87,6 +87,11 @@ _IO_check_libio (void)
 	stdout->_vtable_offset = stderr->_vtable_offset =
 	((int) sizeof (struct _IO_FILE)
 	 - (int) sizeof (struct _IO_FILE_complete));
+
+      if (_IO_stdin_.vtable != &_IO_old_file_jumps
+	  || _IO_stdout_.vtable != &_IO_old_file_jumps
+	  || _IO_stderr_.vtable != &_IO_old_file_jumps)
+	IO_set_accept_foreign_vtables (&_IO_vtable_check);
     }
 }
 
diff --git a/libio/tst-wfile-sync.c b/libio/tst-wfile-sync.c
new file mode 100644
index 0000000000..618682064d
--- /dev/null
+++ b/libio/tst-wfile-sync.c
@@ -0,0 +1,39 @@
+/* Test that _IO_wfile_sync does not crash (bug 20568).
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <locale.h>
+#include <stdio.h>
+#include <wchar.h>
+#include <support/check.h>
+#include <support/xunistd.h>
+
+static int
+do_test (void)
+{
+  TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL);
+  /* Fill the stdio buffer and advance the read pointer.  */
+  TEST_VERIFY_EXIT (fgetwc (stdin) != WEOF);
+  /* This calls _IO_wfile_sync, it should not crash.  */
+  TEST_VERIFY_EXIT (setvbuf (stdin, NULL, _IONBF, 0) == 0);
+  /* Verify that the external file offset has been synchronized.  */
+  TEST_COMPARE (xlseek (0, 0, SEEK_CUR), 1);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/libio/tst-wfile-sync.input b/libio/tst-wfile-sync.input
new file mode 100644
index 0000000000..12d0958f7a
--- /dev/null
+++ b/libio/tst-wfile-sync.input
@@ -0,0 +1 @@
+This is a test of _IO_wfile_sync.
diff --git a/libio/wfileops.c b/libio/wfileops.c
index fb94f45040..727e1b23b9 100644
--- a/libio/wfileops.c
+++ b/libio/wfileops.c
@@ -526,11 +526,12 @@ _IO_wfile_sync (_IO_FILE *fp)
 	     generate the wide characters up to the current reading
 	     position.  */
 	  int nread;
-
+	  size_t wnread = (fp->_wide_data->_IO_read_ptr
+			   - fp->_wide_data->_IO_read_base);
 	  fp->_wide_data->_IO_state = fp->_wide_data->_IO_last_state;
 	  nread = (*cv->__codecvt_do_length) (cv, &fp->_wide_data->_IO_state,
 					      fp->_IO_read_base,
-					      fp->_IO_read_end, delta);
+					      fp->_IO_read_end, wnread);
 	  fp->_IO_read_ptr = fp->_IO_read_base + nread;
 	  delta = -(fp->_IO_read_end - fp->_IO_read_base - nread);
 	}
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 49e8ed69c2..9896230b21 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2924,6 +2924,8 @@ typedef struct tcache_perthread_struct
   tcache_entry *entries[TCACHE_MAX_BINS];
 } tcache_perthread_struct;
 
+#define MAX_TCACHE_COUNT 127	/* Maximum value of counts[] entries.  */
+
 static __thread bool tcache_shutting_down = false;
 static __thread tcache_perthread_struct *tcache = NULL;
 
@@ -5097,8 +5099,11 @@ static inline int
 __always_inline
 do_set_tcache_count (size_t value)
 {
-  LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count);
-  mp_.tcache_count = value;
+  if (value <= MAX_TCACHE_COUNT)
+    {
+      LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count);
+      mp_.tcache_count = value;
+    }
   return 1;
 }
 
diff --git a/manual/tunables.texi b/manual/tunables.texi
index b09e3fe791..b230cde556 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -187,8 +187,8 @@ per-thread cache.  The default (and maximum) value is 1032 bytes on
 
 @deftp Tunable glibc.malloc.tcache_count
 The maximum number of chunks of each size to cache. The default is 7.
-There is no upper limit, other than available system memory.  If set
-to zero, the per-thread cache is effectively disabled.
+The upper limit is 127.  If set to zero, the per-thread cache is effectively
+disabled.
 
 The approximate maximum overhead of the per-thread cache is thus equal
 to the number of bins times the chunk count in each bin times the size
@@ -253,7 +253,7 @@ This tunable is specific to i386 and x86-64.
 @deftp Tunable glibc.tune.cpu
 The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
-@code{generic}, @code{falkor}, @code{thunderxt88}.
+@code{generic}, @code{falkor}, @code{thunderxt88}, @code{ares}.
 
 This tunable is specific to aarch64.
 @end deftp
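Both tunables touched here are set through the GLIBC_TUNABLES environment variable at program startup, for example (values illustrative):

  GLIBC_TUNABLES=glibc.malloc.tcache_count=0 ./app    # disable the tcache
  GLIBC_TUNABLES=glibc.tune.cpu=ares ./app            # AArch64 only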
diff --git a/nptl/pthread_mutex_trylock.c b/nptl/pthread_mutex_trylock.c
index ec7da61c73..d478eca21b 100644
--- a/nptl/pthread_mutex_trylock.c
+++ b/nptl/pthread_mutex_trylock.c
@@ -92,6 +92,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
     case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP:
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 		     &mutex->__data.__list.__next);
+      /* We need to set op_pending before starting the operation.  Also
+	 see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
 
       oldval = mutex->__data.__lock;
       do
@@ -117,7 +120,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	      /* But it is inconsistent unless marked otherwise.  */
 	      mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	      /* We must not enqueue the mutex before we have acquired it.
+		 Also see comments at ENQUEUE_MUTEX.  */
+	      __asm ("" ::: "memory");
 	      ENQUEUE_MUTEX (mutex);
+	      /* We need to clear op_pending after we enqueue the mutex.  */
+	      __asm ("" ::: "memory");
 	      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
	      /* Note that we deliberately exit here.  If we fall
@@ -133,6 +141,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	      int kind = PTHREAD_MUTEX_TYPE (mutex);
 	      if (kind == PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
		     access.  Also see comments at ENQUEUE_MUTEX.  */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 		  return EDEADLK;
@@ -140,6 +150,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 
 	      if (kind == PTHREAD_MUTEX_ROBUST_RECURSIVE_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
+		     access.  */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 
@@ -158,6 +170,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 							id, 0);
 	  if (oldval != 0 && (oldval & FUTEX_OWNER_DIED) == 0)
 	    {
+	      /* We haven't acquired the lock as it is already acquired by
+		 another owner.  We do not need to ensure ordering wrt another
+		 memory access.  */
 	      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	      return EBUSY;
@@ -171,13 +186,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	      if (oldval == id)
 		lll_unlock (mutex->__data.__lock,
 			    PTHREAD_ROBUST_MUTEX_PSHARED (mutex));
+	      /* FIXME This violates the mutex destruction requirements.  See
+		 __pthread_mutex_unlock_full.  */
 	      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	      return ENOTRECOVERABLE;
 	    }
 	}
       while ((oldval & FUTEX_OWNER_DIED) != 0);
 
+      /* We must not enqueue the mutex before we have acquired it.
+	 Also see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
       ENQUEUE_MUTEX (mutex);
+      /* We need to clear op_pending after we enqueue the mutex.  */
+      __asm ("" ::: "memory");
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
       mutex->__data.__owner = id;
@@ -203,10 +225,15 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP;
 
 	if (robust)
-	  /* Note: robust PI futexes are signaled by setting bit 0.  */
-	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
-			 (void *) (((uintptr_t) &mutex->__data.__list.__next)
-				   | 1));
+	  {
+	    /* Note: robust PI futexes are signaled by setting bit 0.  */
+	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
+			   (void *) (((uintptr_t) &mutex->__data.__list.__next)
+				     | 1));
+	    /* We need to set op_pending before starting the operation.  Also
+	       see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
+	  }
 
 	oldval = mutex->__data.__lock;
 
@@ -215,12 +242,16 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	  {
 	    if (kind == PTHREAD_MUTEX_ERRORCHECK_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 		return EDEADLK;
 	      }
 
 	    if (kind == PTHREAD_MUTEX_RECURSIVE_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 		/* Just bump the counter.  */
@@ -242,6 +273,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	  {
 	    if ((oldval & FUTEX_OWNER_DIED) == 0)
 	      {
+		/* We haven't acquired the lock as it is already acquired by
+		   another owner.  We do not need to ensure ordering wrt another
+		   memory access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 		return EBUSY;
@@ -262,6 +296,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	    if (INTERNAL_SYSCALL_ERROR_P (e, __err)
 		&& INTERNAL_SYSCALL_ERRNO (e, __err) == EWOULDBLOCK)
 	      {
+		/* The kernel has not yet finished the mutex owner death.
+		   We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 		return EBUSY;
@@ -279,7 +316,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 	    /* But it is inconsistent unless marked otherwise.  */
 	    mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	    /* Note that we deliberately exit here.  If we fall
@@ -302,13 +344,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex)
 						  PTHREAD_ROBUST_MUTEX_PSHARED (mutex)),
 			      0, 0);
 
+	    /* To the kernel, this will be visible after the kernel has
+	       acquired the mutex in the syscall.  */
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	    return ENOTRECOVERABLE;
 	  }
 
 	if (robust)
 	  {
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX_PI (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	  }
 
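The barriers added throughout this function are empty asm statements with a "memory" clobber: they emit no instructions, but they stop the compiler from moving the list_op_pending stores across the operations they bracket. A minimal sketch of the pattern, detached from the glibc internals:

/* An empty asm with a "memory" clobber is a compiler-only barrier: GCC
   must complete all pending memory accesses before it and may not cache
   values across it.  No CPU fence instruction is emitted.  */
#define compiler_barrier() __asm__ ("" ::: "memory")

static int op_pending;
static int enqueued;

static void
robust_op (void)
{
  op_pending = 1;       /* Announce the pending operation first...  */
  compiler_barrier ();  /* ...and forbid sinking that store below the op.  */
  enqueued = 1;         /* Stands in for ENQUEUE_MUTEX.  */
  compiler_barrier ();  /* Forbid hoisting the clearing store above it.  */
  op_pending = 0;
}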
diff --git a/posix/tst-mmap-offset.c b/posix/tst-mmap-offset.c
index 5bb88aab10..cfd82484f6 100644
--- a/posix/tst-mmap-offset.c
+++ b/posix/tst-mmap-offset.c
@@ -1,4 +1,4 @@
-/* BZ #18877 and #21270 mmap offset test.
+/* BZ #18877, BZ #21270, and BZ #24699 mmap offset test.
 
    Copyright (C) 2015-2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -24,6 +24,7 @@
 #include <unistd.h>
 #include <errno.h>
 #include <sys/mman.h>
+#include <mmap_info.h>
 
 #include <support/check.h>
 
@@ -76,7 +77,7 @@ do_test_bz18877 (void)
 
 /* Check if invalid offsets are handled correctly by mmap.  */
 static int
-do_test_bz21270 (void)
+do_test_large_offset (void)
 {
   /* For architectures with sizeof (off_t) < sizeof (off64_t) mmap is
      implemented with __SYS_mmap2 syscall and the offset is represented in
@@ -90,7 +91,7 @@ do_test_bz21270 (void)
   const size_t length = 4096;
 
   void *addr = mmap64 (NULL, length, prot, flags, fd, offset);
-  if (sizeof (off_t) < sizeof (off64_t))
+  if (mmap64_maximum_offset (page_shift) < UINT64_MAX)
     {
       if ((addr != MAP_FAILED) && (errno != EINVAL))
 	FAIL_RET ("mmap succeed");
@@ -110,7 +111,7 @@ do_test (void)
   int ret = 0;
 
   ret += do_test_bz18877 ();
-  ret += do_test_bz21270 ();
+  ret += do_test_large_offset ();
 
   return ret;
 }
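sysdeps/generic/mmap_info.h itself is not part of this excerpt. As a sketch only, assuming the generic mmap64_maximum_offset derives the limit from the page-counted offset encoding of the mmap2 syscall (the real file may differ):

#define _LARGEFILE64_SOURCE   /* For off64_t.  */
#include <stdint.h>
#include <sys/types.h>

/* Sketch: with the mmap2 syscall the offset argument counts pages, so
   the largest byte offset fits in 8 * sizeof (off_t) + page_shift bits;
   a native 64-bit mmap accepts any 64-bit offset.  */
static inline uint64_t
mmap64_maximum_offset (long int page_shift)
{
  if (sizeof (off_t) < sizeof (off64_t))
    return ((uint64_t) 1 << (page_shift + 8 * sizeof (off_t))) - 1;
  return UINT64_MAX;
}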
diff --git a/string/memmem.c b/string/memmem.c
index 54fca4966d..fba7fe33f7 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -15,67 +15,115 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of memmem.  */
 #include <string.h>
 
 #ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
 # define __memmem	memmem
 #endif
 
 #define RETURN_TYPE void *
 #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
+#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
 #include "str-two-way.h"
 
 #undef memmem
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+/* Hash character pairs so a small shift table can be used.  All bits of
+   p[0] are included, but not all bits from p[-1].  So if two equal hashes
+   match on p[-1], p[0] matches too.  Hash collisions are harmless and result
+   in smaller shifts.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-__memmem (const void *haystack_start, size_t haystack_len,
-	  const void *needle_start, size_t needle_len)
+__memmem (const void *haystack, size_t hs_len,
+	  const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__glibc_unlikely (haystack_len < needle_len))
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
+    {
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; )
+	hw = hw << 16 | *++hs;
+      return hw == nw ? (void *)hs - 1 : NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  /* Shift1 is the amount we can skip after matching the hash of the
+     needle end but not the full needle.  */
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the hash of the needle end.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      /* Hash of the last 2 characters matches.  If the needle is long,
+	 try to quickly filter out mismatches.  */
+      if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= 8 ? offset : m1) - 8;
+	}
+
+      /* Skip based on matching the hash of the needle end.  */
+      hs += shift1;
     }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
+  return NULL;
 }
 libc_hidden_def (__memmem)
 weak_alias (__memmem, memmem)
 libc_hidden_weak (memmem)
-
-#undef LONG_NEEDLE_THRESHOLD
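To make the shift-table setup above concrete, here is a small standalone program (a sketch for illustration, reusing the hash2 definition from the patch) that builds the table for one needle and prints the resulting skip distance:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* 8-bit shift table: each entry records the rightmost needle position
   of a character pair with that hash.  */
static uint8_t shift[256];

#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))

int
main (void)
{
  const unsigned char ne[] = "abracadabra";
  size_t m1 = strlen ((const char *) ne) - 1;

  memset (shift, 0, sizeof (shift));
  for (size_t i = 1; i < m1; i++)
    shift[hash2 (ne + i)] = i;          /* Rightmost occurrence wins.  */

  /* shift1: how far to skip after matching the hash of the final pair
     but not the full needle.  */
  size_t shift1 = m1 - shift[hash2 (ne + m1)];
  shift[hash2 (ne + m1)] = m1;

  printf ("needle length %zu, shift1 = %zu\n", m1 + 1, shift1);
  return 0;
}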
diff --git a/string/str-two-way.h b/string/str-two-way.h
index 599c867ffd..30aca30c40 100644
--- a/string/str-two-way.h
+++ b/string/str-two-way.h
@@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len,
    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
-static RETURN_TYPE
+static inline RETURN_TYPE
 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 		      const unsigned char *needle, size_t needle_len)
 {
@@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
     }
   else
     {
-      const unsigned char *phaystack = &haystack[suffix];
+      const unsigned char *phaystack;
       /* The comparison always starts from needle[suffix], so cache it
 	 and use an optimized first-character loop.  */
       unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]);
 
-#if CHECK_EOL
-      /* We start matching from the SUFFIX'th element, so make sure we
-	 don't hit '\0' before that.  */
-      if (haystack_len < suffix + 1
-	  && !AVAILABLE (haystack, haystack_len, 0, suffix + 1))
-	return NULL;
-#endif
-
       /* The two halves of needle are distinct; no extra memory is
 	 required, and any mismatch results in a maximal shift.  */
       period = MAX (suffix, needle_len - suffix) + 1;
       j = 0;
-      while (1
-#if !CHECK_EOL
-	     && AVAILABLE (haystack, haystack_len, j, needle_len)
-#endif
-	     )
+      while (AVAILABLE (haystack, haystack_len, j, needle_len))
 	{
 	  unsigned char haystack_char;
 	  const unsigned char *pneedle;
 
-	  /* TODO: The first-character loop can be sped up by adapting
-	     longword-at-a-time implementation of memchr/strchr.  */
-	  if (needle_suffix
+	  phaystack = &haystack[suffix + j];
+
+#ifdef FASTSEARCH
+	  if (*phaystack++ != needle_suffix)
+	    {
+	      phaystack = FASTSEARCH (phaystack, needle_suffix,
+				      haystack_len - needle_len - j);
+	      if (phaystack == NULL)
+		goto ret0;
+	      j = phaystack - &haystack[suffix];
+	      phaystack++;
+	    }
+#else
+	  while (needle_suffix
 	      != (haystack_char = CANON_ELEMENT (*phaystack++)))
 	    {
 	      RET0_IF_0 (haystack_char);
-#if !CHECK_EOL
+# if !CHECK_EOL
 	      ++j;
-#endif
-	      continue;
+	      if (!AVAILABLE (haystack, haystack_len, j, needle_len))
+		goto ret0;
+# endif
 	    }
 
-#if CHECK_EOL
+# if CHECK_EOL
 	  /* Calculate J if it wasn't kept up-to-date in the first-character
 	     loop.  */
 	  j = phaystack - &haystack[suffix] - 1;
+# endif
 #endif
-
 	  /* Scan for matches in right half.  */
 	  i = suffix + 1;
 	  pneedle = &needle[i];
@@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 		}
 	      ++i;
 	    }
+#if CHECK_EOL
+	  /* Update minimal length of haystack.  */
+	  if (phaystack > haystack + haystack_len)
+	    haystack_len = phaystack - haystack;
+#endif
 	  if (needle_len <= i)
 	    {
 	      /* Scan for matches in left half.  */
@@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
 	    }
 	  else
 	    j += i - suffix + 1;
-
-#if CHECK_EOL
-	  if (!AVAILABLE (haystack, haystack_len, j, needle_len))
-	    break;
-#endif
-
-	  phaystack = &haystack[suffix + j];
 	}
     }
  ret0: __attribute__ ((unused))
@@ -384,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
-   sublinear performance is not possible.  */
-static RETURN_TYPE
+   sublinear performance is not possible.
+
+   Since this function is large and complex, block inlining to avoid
+   slowing down the common case of small needles.  */
+__attribute__((noinline)) static RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
 		     const unsigned char *needle, size_t needle_len)
 {
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 2acf003155..19ea1d4bbf 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -37,8 +37,9 @@
 /* Two-Way algorithm.  */
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+  (((j) + (n_l) <= (h_l)) \
+   || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \
+       (j) + (n_l) <= (h_l)))
 #define CHECK_EOL (1)
 #define RET0_IF_0(a) if (!a) goto ret0
 #define CANON_ELEMENT(c) TOLOWER (c)
@@ -58,31 +59,22 @@
    case-insensitive comparison.  This function gives unspecified
    results in multibyte locales.  */
 char *
-STRCASESTR (const char *haystack_start, const char *needle_start)
+STRCASESTR (const char *haystack, const char *needle)
 {
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
   size_t needle_len; /* Length of NEEDLE.  */
   size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    {
-      ok &= (TOLOWER ((unsigned char) *haystack)
-	     == TOLOWER ((unsigned char) *needle));
-      haystack++;
-      needle++;
-    }
-  if (*needle)
+
+  /* Handle empty NEEDLE special case.  */
+  if (needle[0] == '\0')
+    return (char *) haystack;
+
+  /* Ensure HAYSTACK length is at least as long as NEEDLE length.
+     Since a match may occur early on in a huge HAYSTACK, use strnlen
+     and read ahead a few cachelines for improved performance.  */
+  needle_len = strlen (needle);
+  haystack_len = __strnlen (haystack, needle_len + 256);
+  if (haystack_len < needle_len)
     return NULL;
-  if (ok)
-    return (char *) haystack_start;
-  needle_len = needle - needle_start;
-  haystack = haystack_start + 1;
-  haystack_len = needle_len - 1;
 
   /* Perform the search.  Abstract memory is considered to be an array
      of 'unsigned char' values, not an array of 'char' values.  See
@@ -90,10 +82,10 @@ STRCASESTR (const char *haystack_start, const char *needle_start)
   if (needle_len < LONG_NEEDLE_THRESHOLD)
     return two_way_short_needle ((const unsigned char *) haystack,
 				 haystack_len,
-				 (const unsigned char *) needle_start,
+				 (const unsigned char *) needle,
 				 needle_len);
   return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle_start,
+			      (const unsigned char *) needle,
 			      needle_len);
 }
 
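The new AVAILABLE macro above stops pre-scanning the whole haystack: the known length h_l is extended with __strnlen only when the next comparison at offset j would read past it, looking ahead n_l + 512 bytes at a time. The same logic written as a plain function, as a readability sketch rather than glibc's actual macro:

#include <string.h>
#include <stdbool.h>

/* Sketch of the lazy haystack-length discovery in AVAILABLE: only scan
   for the terminator when the next match attempt at offset j would need
   bytes beyond the currently known length *h_len.  */
static bool
available (const char *h, size_t *h_len, size_t j, size_t n_len)
{
  if (j + n_len <= *h_len)
    return true;                        /* Enough bytes already known.  */
  /* Extend by up to n_len + 512 bytes past the known end.  */
  *h_len += strnlen (h + *h_len, n_len + 512);
  return j + n_len <= *h_len;
}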
diff --git a/string/strstr.c b/string/strstr.c
index 88f1d5de36..4d72ffbfc9 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -16,27 +16,17 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This particular implementation was written by Eric Blake, 2008.  */
-
 #ifndef _LIBC
 # include <config.h>
 #endif
 
-/* Specification of strstr.  */
 #include <string.h>
 
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
-#endif
-
 #define RETURN_TYPE char *
 #define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
-#define CHECK_EOL (1)
-#define RET0_IF_0(a) if (!a) goto ret0
+  (((j) + (n_l) <= (h_l)) \
+   || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \
+       (j) + (n_l) <= (h_l)))
 #include "str-two-way.h"
 
 #undef strstr
@@ -45,48 +35,128 @@
 #define STRSTR strstr
 #endif
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
+static inline char *
+strstr2 (const unsigned char *hs, const unsigned char *ne)
+{
+  uint32_t h1 = (ne[0] << 16) | ne[1];
+  uint32_t h2 = 0;
+  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+      h2 = (h2 << 16) | c;
+  return h1 == h2 ? (char *)hs - 2 : NULL;
+}
+
+static inline char *
+strstr3 (const unsigned char *hs, const unsigned char *ne)
+{
+  uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
+  uint32_t h2 = 0;
+  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
+      h2 = (h2 | c) << 8;
+  return h1 == h2 ? (char *)hs - 3 : NULL;
+}
+
+/* Hash character pairs so a small shift table can be used.  All bits of
+   p[0] are included, but not all bits from p[-1].  So if two equal hashes
+   match on p[-1], p[0] matches too.  Hash collisions are harmless and result
+   in smaller shifts.  */
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast strstr algorithm with guaranteed linear-time performance.
+   Small needles up to size 3 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.  The main search loop only
+   exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+   and allowing for a larger skip if there is no match.  A self-adapting
+   filtering check is used to quickly detect mismatches in long needles.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 char *
-STRSTR (const char *haystack_start, const char *needle_start)
+STRSTR (const char *haystack, const char *needle)
 {
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    ok &= *haystack++ == *needle++;
-  if (*needle)
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
+
+  /* Handle short needle special cases first.  */
+  if (ne[0] == '\0')
+    return (char *)hs;
+  hs = (const unsigned char *)strchr ((const char*)hs, ne[0]);
+  if (hs == NULL || ne[1] == '\0')
+    return (char*)hs;
+  if (ne[2] == '\0')
+    return strstr2 (hs, ne);
+  if (ne[3] == '\0')
+    return strstr3 (hs, ne);
+
+  /* Ensure haystack length is at least as long as needle length.
+     Since a match may occur early on in a huge haystack, use strnlen
+     and read ahead a few cachelines for improved performance.  */
+  size_t ne_len = strlen ((const char*)ne);
+  size_t hs_len = __strnlen ((const char*)hs, ne_len | 512);
+  if (hs_len < ne_len)
     return NULL;
-  if (ok)
-    return (char *) haystack_start;
-
-  /* Reduce the size of haystack using strchr, since it has a smaller
-     linear coefficient than the Two-Way algorithm.  */
-  needle_len = needle - needle_start;
-  haystack = strchr (haystack_start + 1, *needle_start);
-  if (!haystack || __builtin_expect (needle_len == 1, 0))
-    return (char *) haystack;
-  needle -= needle_len;
-  haystack_len = (haystack > haystack_start + needle_len ? 1
-		  : needle_len + haystack_start - haystack);
-
-  /* Perform the search.  Abstract memory is considered to be an array
-     of 'unsigned char' values, not an array of 'char' values.  See
-     ISO C 99 section 6.2.6.1.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) needle, needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle, needle_len);
+
+  /* Check whether we have a match.  This improves performance since we
+     avoid initialization overheads.  */
+  if (memcmp (hs, ne, ne_len) == 0)
+    return (char *) hs;
+
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__glibc_unlikely (ne_len > 256))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
+
+  const unsigned char *end = hs + hs_len - ne_len;
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
+
+  /* Initialize bad character shift hash table.  */
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  /* Shift1 is the amount we can skip after matching the hash of the
+     needle end but not the full needle.  */
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
+
+  while (1)
+    {
+      if (__glibc_unlikely (hs > end))
+	{
+	  end += __strnlen ((const char*)end + m1 + 1, 2048);
+	  if (hs > end)
+	    return NULL;
+	}
+
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the hash of the needle end.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      /* Hash of the last 2 characters matches.  If the needle is long,
+	 try to quickly filter out mismatches.  */
+      if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter offset when it doesn't find the mismatch.  */
+	  offset = (offset >= 8 ? offset : m1) - 8;
+	}
+
+      /* Skip based on matching the hash of the needle end.  */
+      hs += shift1;
+    }
 }
 libc_hidden_builtin_def (strstr)
-
-#undef LONG_NEEDLE_THRESHOLD
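strstr2 above packs the two needle bytes into the 16-bit lanes of a uint32_t and slides an equally packed haystack window forward one byte at a time until the lanes match or a NUL ends the scan. A standalone copy of the idea (find2 is a hypothetical wrapper name):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Two-byte needle search: h1 holds the needle pair in 16-bit lanes,
   h2 holds the last two haystack bytes seen; shifting left by 16 each
   step slides the window by one byte.  */
static char *
find2 (const unsigned char *hs, const unsigned char *ne)
{
  uint32_t h1 = (ne[0] << 16) | ne[1];
  uint32_t h2 = 0;
  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
    h2 = (h2 << 16) | c;
  return h1 == h2 ? (char *) hs - 2 : NULL;
}

int
main (void)
{
  const char *h = "haystack";
  /* Prints "stack": the window matches "st" at offset 3.  */
  printf ("%s\n", find2 ((const unsigned char *) h,
                         (const unsigned char *) "st"));
  return 0;
}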
diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c
index abb3916732..78e03da7c4 100644
--- a/string/test-strcasestr.c
+++ b/string/test-strcasestr.c
@@ -25,6 +25,7 @@
 #define STRCASESTR simple_strcasestr
 #define NO_ALIAS
 #define __strncasecmp strncasecmp
+#define __strnlen strnlen
 #include "strcasestr.c"
 
 
diff --git a/string/test-strstr.c b/string/test-strstr.c
index 33f221149a..5bce73b0bd 100644
--- a/string/test-strstr.c
+++ b/string/test-strstr.c
@@ -24,6 +24,7 @@
 
 #define STRSTR simple_strstr
 #define libc_hidden_builtin_def(arg) /* nothing */
+#define __strnlen strnlen
 #include "strstr.c"
 
 
@@ -150,6 +151,32 @@ check2 (void)
     }
 }
 
+#define N 1024
+
+static void
+pr23637 (void)
+{
+  char *h = (char*) buf1;
+  char *n = (char*) buf2;
+
+  for (int i = 0; i < N; i++)
+    {
+      n[i] = 'x';
+      h[i] = ' ';
+      h[i + N] = 'x';
+    }
+
+  n[N] = '\0';
+  h[N * 2] = '\0';
+
+  /* Ensure we don't match at the first 'x'.  */
+  h[0] = 'x';
+
+  char *exp_result = stupid_strstr (h, n);
+  FOR_EACH_IMPL (impl, 0)
+    check_result (impl, h, n, exp_result);
+}
+
 static int
 test_main (void)
 {
@@ -157,6 +184,7 @@ test_main (void)
 
   check1 ();
   check2 ();
+  pr23637 ();
 
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
@@ -201,6 +229,9 @@ test_main (void)
 	do_test (15, 9, hlen, klen, 1);
 	do_test (15, 15, hlen, klen, 0);
 	do_test (15, 15, hlen, klen, 1);
+
+	do_test (15, 15, hlen + klen * 4, klen * 4, 0);
+	do_test (15, 15, hlen + klen * 4, klen * 4, 1);
       }
 
   do_test (0, 0, page_size - 1, 16, 0);
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 3fb00e6e2d..9ffc2e4c9d 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -391,10 +391,37 @@ elf_machine_lazy_rel (struct link_map *map,
   /* Check for unexpected PLT reloc type.  */
   if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
     {
-      if (__builtin_expect (map->l_mach.plt, 0) == 0)
-	*reloc_addr += l_addr;
-      else
-	*reloc_addr = map->l_mach.plt;
+      if (map->l_mach.plt == 0)
+	{
+	  /* Prelinking.  */
+	  *reloc_addr += l_addr;
+	  return;
+	}
+
+      if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check.  */
+	{
+	  /* Check the symbol table for variant PCS symbols.  */
+	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
+	  const ElfW (Sym) *symtab =
+	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);
+	  const ElfW (Sym) *sym = &symtab[symndx];
+	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))
+	    {
+	      /* Avoid lazy resolution of variant PCS symbols.  */
+	      const struct r_found_version *version = NULL;
+	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+		{
+		  const ElfW (Half) *vernum =
+		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+		  version = &map->l_versions[vernum[symndx] & 0x7fff];
+		}
+	      elf_machine_rela (map, reloc, sym, version, reloc_addr,
+				skip_ifunc);
+	      return;
+	    }
+	}
+
+      *reloc_addr = map->l_mach.plt;
     }
   else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1))
     {
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89297..a741e7b17f 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,132 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */
 
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0
 
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-L(not_limit):
 
-#ifndef	__AARCH64EB__
-	rev	diff, diff
+	subs	limit, limit, 8
+	b.lo	L(less8)
+
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	.p2align 4
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
+
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp     data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	RET
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	L(start_realigned)
-
-L(ret0):
-	mov	result, #0
-	RET
-
-	.p2align 6
-L(misaligned8):
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	RET
+	cmp     data1, data2
+L(ret_eq):
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+	adds	limit, limit, 4
+	b.lo	L(less4)
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	L(return)
+	sub	limit, limit, 4
+L(less4):
+	adds	limit, limit, 4
+	beq	L(ret_eq)
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+	sub	result, data1w, data2w
+	ret
+
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)
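The rewrite above drops the old mask-and-shift epilogue in favor of overlapping tail loads: once fewer than 16 bytes remain, both pointers are re-anchored so the final 16-byte compare ends exactly at the buffer end, re-reading a few bytes instead of masking them. A minimal C model of that idea (a sketch assuming n >= 16; the function name is illustrative, not part of the patch):

#include <stddef.h>
#include <string.h>

/* Model of the overlapping-tail strategy: full 16-byte blocks, then one
   final 16-byte compare anchored at the end of the buffers.  Bytes in the
   overlap are compared twice, which is harmless.  */
static int
memcmp_tail_model (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  size_t i = 0;
  int r;
  /* Assumes n >= 16; the real code handles shorter inputs separately.  */
  while (i + 16 < n)
    {
      r = memcmp (s1 + i, s2 + i, 16);
      if (r != 0)
        return r;
      i += 16;
    }
  return memcmp (s1 + n - 16, s2 + n - 16, 16);
}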
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index b395df1c63..ee4d78ea1d 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -35,7 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
 	     ? __memcpy_thunderx
-	     : (IS_FALKOR (midr)
+	     : (IS_FALKOR (midr) || IS_ARES (midr)
 		? __memcpy_falkor
 		: __memcpy_generic)));
 
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index dea4f225ee..9cde8dcbd6 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,11 +29,19 @@
 #define dst	x3
 #define srcend	x4
 #define dstend	x5
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
 #define tmp1	x14
+#define A_x	x6
+#define B_x	x7
+#define A_w	w6
+#define B_w	w7
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
 
 /* Copies are split into 3 main cases:
 
@@ -53,9 +61,9 @@
    bumping the small copies up to 32 bytes allows us to do that without
    cost and also allows us to reduce the size of the prep code before loop64.
 
-   All copies are done only via two registers r6 and r7.  This is to ensure
-   that all loads hit a single hardware prefetcher which can get correctly
-   trained to prefetch a single stream.
+   The copy loop uses only one register q0.  This is to ensure that all loads
+   hit a single hardware prefetcher which can get correctly trained to prefetch
+   a single stream.
 
    The non-temporal stores help optimize cache utilization.  */
 
@@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	b.ls	L(copy32)
-	ldp	A_l, A_h, [src]
+	ldr	A_q, [src]
 	cmp	count, 128
-	stp	A_l, A_h, [dstin]
+	str	A_q, [dstin]
 	b.hi	L(copy_long)
 
 	/* Medium copies: 33..128 bytes.  */
 	sub	tmp1, count, 1
-	ldp	A_l, A_h, [src, 16]
-	stp	A_l, A_h, [dstin, 16]
+	ldr	A_q, [src, 16]
+	ldr	B_q, [srcend, -32]
+	ldr	C_q, [srcend, -16]
 	tbz	tmp1, 6, 1f
-	ldp	A_l, A_h, [src, 32]
-	stp	A_l, A_h, [dstin, 32]
-	ldp	A_l, A_h, [src, 48]
-	stp	A_l, A_h, [dstin, 48]
-	ldp	A_l, A_h, [srcend, -64]
-	stp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	A_l, A_h, [dstend, -48]
+	ldr	D_q, [src, 32]
+	ldr	E_q, [src, 48]
+	str	D_q, [dstin, 32]
+	str	E_q, [dstin, 48]
+	ldr	F_q, [srcend, -64]
+	ldr	G_q, [srcend, -48]
+	str	F_q, [dstend, -64]
+	str	G_q, [dstend, -48]
 1:
-	ldp	A_l, A_h, [srcend, -32]
-	stp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	str	A_q, [dstin, 16]
+	str	B_q, [dstend, -32]
+	str	C_q, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -97,44 +105,44 @@ L(copy32):
 	/* 16-32 */
 	cmp	count, 16
 	b.lo	1f
-	ldp	A_l, A_h, [src]
-	stp	A_l, A_h, [dstin]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 	.p2align 4
 1:
 	/* 8-15 */
 	tbz	count, 3, 1f
-	ldr	A_l, [src]
-	str	A_l, [dstin]
-	ldr	A_l, [srcend, -8]
-	str	A_l, [dstend, -8]
+	ldr	A_x, [src]
+	ldr	B_x, [srcend, -8]
+	str	A_x, [dstin]
+	str	B_x, [dstend, -8]
 	ret
 	.p2align 4
 1:
 	/* 4-7 */
 	tbz	count, 2, 1f
-	ldr	A_lw, [src]
-	str	A_lw, [dstin]
-	ldr	A_lw, [srcend, -4]
-	str	A_lw, [dstend, -4]
+	ldr	A_w, [src]
+	ldr	B_w, [srcend, -4]
+	str	A_w, [dstin]
+	str	B_w, [dstend, -4]
 	ret
 	.p2align 4
 1:
 	/* 2-3 */
 	tbz	count, 1, 1f
-	ldrh	A_lw, [src]
-	strh	A_lw, [dstin]
-	ldrh	A_lw, [srcend, -2]
-	strh	A_lw, [dstend, -2]
+	ldrh	A_w, [src]
+	ldrh	B_w, [srcend, -2]
+	strh	A_w, [dstin]
+	strh	B_w, [dstend, -2]
 	ret
 	.p2align 4
 1:
 	/* 0-1 */
 	tbz	count, 0, 1f
-	ldrb	A_lw, [src]
-	strb	A_lw, [dstin]
+	ldrb	A_w, [src]
+	strb	A_w, [dstin]
 1:
 	ret
 
@@ -153,30 +161,29 @@ L(copy_long):
 	add	count, count, tmp1
 
 L(loop64):
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]!
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 16]
+	ldr	A_q, [src, 16]!
 	subs	count, count, 64
-	stnp	A_l, A_h, [dst, 32]
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 48]
-	ldp	A_l, A_h, [src, 16]!
-	stnp	A_l, A_h, [dst, 64]
-	add	dst, dst, 64
+	str	A_q, [dst, 32]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 48]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 64]!
 	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes, so it is safe to always copy 64 bytes from the end even if
 	   there is just 1 byte left.  */
 L(last64):
-	ldp	A_l, A_h, [srcend, -64]
-	stnp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stnp	A_l, A_h, [dstend, -48]
-	ldp	A_l, A_h, [srcend, -32]
-	stnp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stnp	A_l, A_h, [dstend, -16]
+	ldr	E_q, [srcend, -64]
+	str	E_q, [dstend, -64]
+	ldr	D_q, [srcend, -48]
+	str	D_q, [dstend, -48]
+	ldr	C_q, [srcend, -32]
+	str	C_q, [dstend, -32]
+	ldr	B_q, [srcend, -16]
+	str	B_q, [dstend, -16]
 	ret
 
 END (__memcpy_falkor)
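The medium-copy path above uses the same overlap technique for copying: all tail loads are issued before any overlapping store, so re-covered bytes are always written from already-loaded data. A hedged C sketch of the 33..64-byte case (helper name hypothetical):

#include <stddef.h>
#include <string.h>

/* Copy n bytes, 33 <= n <= 64, as four 16-byte chunks: two anchored at the
   start and two at the end.  The middle chunks may overlap; correctness
   holds because every load happens before any overlapping store.  */
static void
copy_33_64_model (char *dst, const char *src, size_t n)
{
  char a[16], b[16], c[16], d[16];
  memcpy (a, src, 16);
  memcpy (b, src + 16, 16);
  memcpy (c, src + n - 32, 16);
  memcpy (d, src + n - 16, 16);
  memcpy (dst, a, 16);
  memcpy (dst + 16, b, 16);
  memcpy (dst + n - 32, c, 16);
  memcpy (dst + n - 16, d, 16);
}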
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index e99d6625b7..7eed82cee7 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -72,6 +72,7 @@ L(start_realigned):
 	cbz	syndrome, L(loop_aligned)
 	/* End of performance-critical section  -- one 64B cache line.  */
 
+L(end):
 #ifndef	__AARCH64EB__
 	rev	syndrome, syndrome
 	rev	data1, data1
@@ -145,12 +146,38 @@ L(mutual_align):
 	b	L(start_realigned)
 
 L(misaligned8):
-	/* We can do better than this.  */
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking that we do not read beyond a page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(loop_misaligned)
+L(do_misaligned):
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	cmp	data1w, #1
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	L(misaligned8)
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of a 4K page.  If so, jump
+	   back to the misaligned loop to compare a byte at a time.  */
+	and	tmp1, src2, #0xff8
+	eor	tmp1, tmp1, #0xff8
+	cbz	tmp1, L(do_misaligned)
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_misaligned)
+	b	L(end)
+
+L(done):
 	sub	result, data1, data2
 	RET
 END(strcmp)
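The and/eor/cbz sequence in the misaligned loop is a page-crossing guard: SRC2 may be arbitrarily aligned, so a dword load from it is attempted only when it cannot spill into the next 4 KiB page. The same test in C (a sketch assuming 4096-byte pages, as the 0xff8 constant does):

#include <stdbool.h>
#include <stdint.h>

/* True when P is within the last 8 bytes of a 4 KiB page, i.e. when an
   8-byte load at P could touch the following page.  Mirrors
   `and tmp1, src2, #0xff8; eor tmp1, tmp1, #0xff8; cbz tmp1, ...'.  */
static bool
dword_may_cross_page (const void *p)
{
  return ((uintptr_t) p & 0xff8) == 0xff8;
}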
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index 3e4d88a5d7..fc1f633348 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
 #define limit_wd	x13
 #define mask		x14
 #define endloop		x15
+#define count		mask
 
 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
+	and	count, src1, #7
 	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
+	cbnz	count, L(mutual_align)
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
@@ -165,43 +166,107 @@ L(mutual_align):
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
-L(ret0):
-	mov	result, #0
-	RET
-
 	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
-	sub	limit, limit, #1
-1:
+	cmp	limit, #16
+	b.hs	L(try_misaligned_words)
+
+L(byte_loop):
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
+	b.eq	L(byte_loop)
+L(done):
 	sub	result, data1, data2
 	RET
+
+	/* Align SRC1 to a dword boundary with a bytewise compare, then run
+	   the dword loop.  */
+L(try_misaligned_words):
+	lsr	limit_wd, limit, #3
+	cbz	count, L(do_misaligned)
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	lsr	limit_wd, limit, #3
+
+L(page_end_loop):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	subs	count, count, #1
+	b.hi	L(page_end_loop)
+
+L(do_misaligned):
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	L(done_loop)
+L(loop_misaligned):
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, L(page_end_loop)
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+	subs	limit_wd, limit_wd, #1
+	b.pl	L(loop_misaligned)
+
+L(done_loop):
+	/* The limit was reached; check any bytes left in the last dword.  */
+	and	limit, limit, #7
+	cbz	limit, L(not_limit)
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+
+L(ret0):
+	mov	result, #0
+	RET
+
 END (strncmp)
 libc_hidden_builtin_def (strncmp)
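Both the strcmp and strncmp loops detect an embedded NUL inside a dword with the classic `zeroones'/`REP8_7f' bit trick: subtract 0x01 from every byte, then mask with the complement of (x | 0x7f...), leaving 0x80 exactly in the zero bytes. The same test in portable C:

#include <stdbool.h>
#include <stdint.h>

/* Nonzero iff some byte of X is 0x00: after the two masks, each byte of
   the intermediate value is 0x80 where the corresponding byte of X is
   zero, and 0x00 everywhere else.  */
static bool
has_nul_byte (uint64_t x)
{
  const uint64_t zeroones = UINT64_C (0x0101010101010101); /* REP8_01 */
  const uint64_t sevenf   = UINT64_C (0x7f7f7f7f7f7f7f7f); /* REP8_7f */
  return ((x - zeroones) & ~(x | sevenf)) != 0;
}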
diff --git a/sysdeps/generic/mmap_info.h b/sysdeps/generic/mmap_info.h
new file mode 100644
index 0000000000..b3087df2d3
--- /dev/null
+++ b/sysdeps/generic/mmap_info.h
@@ -0,0 +1,16 @@
+/* By default, on architectures with sizeof (off_t) < sizeof (off64_t), mmap
+   is implemented with the __NR_mmap2 syscall and the offset is represented
+   in multiples of the page size.  For offsets larger than
+   '1 << (page_shift + 8 * sizeof (off_t))' (that is, 1 << 44 on systems
+   with a page size of 4096 bytes) the system call silently truncates the
+   offset.  In this case, the glibc mmap implementation returns EINVAL.  */
+
+/* Return the maximum value expected as offset argument in mmap64 call.  */
+static inline uint64_t
+mmap64_maximum_offset (long int page_shift)
+{
+  if (sizeof (off_t) < sizeof (off64_t))
+    return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1;
+  else
+    return UINT64_MAX;
+}
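For the common 32-bit off_t configuration (page_shift == 12) the limit is (1 << 44) - 1, matching the comment above. A quick check of the arithmetic (assuming those sizes rather than querying the real off_t):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* page_shift = 12 and sizeof (off_t) = 4 give 1 << (12 + 32).  */
  uint64_t max = (UINT64_C (1) << (12 + 8 * 4)) - 1;
  assert (max == (UINT64_C (1) << 44) - 1);
  return 0;
}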
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index 0c7e13f4fa..50297bc409 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -30,6 +30,7 @@ struct cpu_list
 static struct cpu_list cpu_list[] = {
       {"falkor",	0x510FC000},
       {"thunderxt88",	0x430F0A10},
+      {"ares",		0x411FD0C0},
       {"generic", 	0x0}
 };
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53da9a..d2ad5c63b9 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -44,6 +44,9 @@
 #define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'			      \
                         && MIDR_PARTNUM(midr) == 0xc00)
 
+#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A'			      \
+			&& MIDR_PARTNUM(midr) == 0xd0c)
+
 struct cpu_features
 {
   uint64_t midr_el1;
diff --git a/sysdeps/unix/sysv/linux/mips/Makefile b/sysdeps/unix/sysv/linux/mips/Makefile
index bca11d39e0..a58e6954a7 100644
--- a/sysdeps/unix/sysv/linux/mips/Makefile
+++ b/sysdeps/unix/sysv/linux/mips/Makefile
@@ -99,14 +99,25 @@ sysdep-dl-routines += dl-static
 
 sysdep_routines += dl-vdso
 endif
-
-# Supporting non-executable stacks on MIPS requires changes to both
-# the Linux kernel and glibc.  See
-# <https://sourceware.org/ml/libc-alpha/2016-01/msg00567.html> and
-# <https://sourceware.org/ml/libc-alpha/2016-01/msg00719.html>.
+# If the compiler does not emit a GNU-stack note,
+# this test is expected to fail.
+ifneq ($(mips-has-gnustack),yes)
 test-xfail-check-execstack = yes
 endif
+endif
 
 ifeq ($(subdir),stdlib)
 gen-as-const-headers += ucontext_i.sym
 endif
+
+ifeq ($(mips-force-execstack),yes)
+CFLAGS-.o += -Wa,-execstack
+CFLAGS-.os += -Wa,-execstack
+CFLAGS-.op += -Wa,-execstack
+CFLAGS-.oS += -Wa,-execstack
+
+ASFLAGS-.o += -Wa,-execstack
+ASFLAGS-.os += -Wa,-execstack
+ASFLAGS-.op += -Wa,-execstack
+ASFLAGS-.oS += -Wa,-execstack
+endif
diff --git a/sysdeps/unix/sysv/linux/mips/configure b/sysdeps/unix/sysv/linux/mips/configure
index a5513fad48..2ec86a3121 100644
--- a/sysdeps/unix/sysv/linux/mips/configure
+++ b/sysdeps/unix/sysv/linux/mips/configure
@@ -475,3 +475,44 @@ if test -z "$arch_minimum_kernel"; then
     arch_minimum_kernel=4.5.0
   fi
 fi
+
+# Check if we are supposed to run on kernels older than 4.8.0. If so,
+# force executable stack to avoid potential runtime problems with fpu
+# emulation.
+# NOTE: The check below assumes that in the absence of a user-provided minimum_kernel
+# we will default to arch_minimum_kernel which is currently less than 4.8.0 for
+# all known configurations. If this changes, the check must be updated.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler must use executable stack" >&5
+$as_echo_n "checking whether the compiler must use executable stack... " >&6; }
+if ${libc_cv_mips_force_execstack+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  libc_cv_mips_force_execstack=no
+  if test $libc_mips_float = hard; then
+    if test -n "$minimum_kernel"; then
+
+       min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`))
+
+       if test $min_version -lt 264192; then
+         libc_cv_mips_force_execstack=yes
+       fi
+    else
+      libc_cv_mips_force_execstack=yes
+    fi
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mips_force_execstack" >&5
+$as_echo "$libc_cv_mips_force_execstack" >&6; }
+
+libc_mips_has_gnustack=$libc_cv_as_noexecstack
+
+if test $libc_cv_mips_force_execstack = yes; then
+  libc_mips_has_gnustack=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&5
+$as_echo "$as_me: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&2;}
+fi
+
+config_vars="$config_vars
+mips-force-execstack = ${libc_cv_mips_force_execstack}"
+config_vars="$config_vars
+mips-has-gnustack = ${libc_mips_has_gnustack}"
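For reference, the sed pipeline encodes a version a.b.c as a * 65536 + b * 256 + c, so the threshold 264192 is exactly 4 * 65536 + 8 * 256 + 0, i.e. Linux 4.8.0; a configured minimum kernel of, say, 4.7.9 encodes to 263945 and therefore forces the executable stack.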
diff --git a/sysdeps/unix/sysv/linux/mips/configure.ac b/sysdeps/unix/sysv/linux/mips/configure.ac
index 9147aa4582..3db1b32b08 100644
--- a/sysdeps/unix/sysv/linux/mips/configure.ac
+++ b/sysdeps/unix/sysv/linux/mips/configure.ac
@@ -134,3 +134,35 @@ if test -z "$arch_minimum_kernel"; then
     arch_minimum_kernel=4.5.0
   fi
 fi
+
+# Check if we are supposed to run on kernels older than 4.8.0. If so,
+# force executable stack to avoid potential runtime problems with fpu
+# emulation.
+# NOTE: The check below assumes that in the absence of a user-provided minimum_kernel
+# we will default to arch_minimum_kernel which is currently less than 4.8.0 for
+# all known configurations. If this changes, the check must be updated.
+AC_CACHE_CHECK([whether the compiler must use executable stack],
+        libc_cv_mips_force_execstack, [dnl
+libc_cv_mips_force_execstack=no
+  if test $libc_mips_float = hard; then
+    if test -n "$minimum_kernel"; then
+       changequote(,)
+       min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`))
+       changequote([,])
+       if test $min_version -lt 264192; then
+         libc_cv_mips_force_execstack=yes
+       fi
+    else
+      libc_cv_mips_force_execstack=yes
+    fi
+  fi])
+
+libc_mips_has_gnustack=$libc_cv_as_noexecstack
+
+if test $libc_cv_mips_force_execstack = yes; then
+  libc_mips_has_gnustack=no
+  AC_MSG_WARN([forcing executable stack for pre-4.8.0 Linux kernels])
+fi
+
+LIBC_CONFIG_VAR([mips-force-execstack],[${libc_cv_mips_force_execstack}])
+LIBC_CONFIG_VAR([mips-has-gnustack],[${libc_mips_has_gnustack}])
diff --git a/sysdeps/unix/sysv/linux/mips/mmap_info.h b/sysdeps/unix/sysv/linux/mips/mmap_info.h
new file mode 100644
index 0000000000..07c9e3a044
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/mips/mmap_info.h
@@ -0,0 +1,13 @@
+/* mips64n32 uses __NR_mmap for mmap64 while still having sizeof (off_t)
+   smaller than sizeof (off64_t).  It can therefore map larger offsets
+   with mmap64 than 32-bit archs, which use __NR_mmap2.  */
+
+static inline uint64_t
+mmap64_maximum_offset (long int page_shift)
+{
+#if _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
+  return UINT64_MAX;
+#else
+  return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1;
+#endif
+}
diff --git a/sysdeps/unix/sysv/linux/mmap64.c b/sysdeps/unix/sysv/linux/mmap64.c
index e8d519b17a..8441a9caa7 100644
--- a/sysdeps/unix/sysv/linux/mmap64.c
+++ b/sysdeps/unix/sysv/linux/mmap64.c
@@ -23,11 +23,18 @@
 #include <sysdep.h>
 #include <mmap_internal.h>
 
+#ifdef __NR_mmap2
 /* To avoid silent truncation of the offset when using mmap2, do not accept
    offsets larger than 1 << (page_shift + off_t bits).  For architectures with
    a 32-bit off_t and a page size of 4096 it would be 1 << 44.  */
-#define MMAP_OFF_HIGH_MASK \
+# define MMAP_OFF_HIGH_MASK \
   ((-(MMAP2_PAGE_UNIT << 1) << (8 * sizeof (off_t) - 1)))
+#else
+/* Some ABIs might use __NR_mmap while having sizeof (off_t) smaller than
+   sizeof (off64_t) (currently only MIPS64n32).  In this case just zero
+   the higher bits so mmap with a large offset does not fail.  */
+# define MMAP_OFF_HIGH_MASK  0x0
+#endif
 
 #define MMAP_OFF_MASK (MMAP_OFF_HIGH_MASK | MMAP_OFF_LOW_MASK)
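With MMAP2_PAGE_UNIT == 4096 and a 4-byte off_t, the mask expression evaluates to a value with all bits 44 and above set, so any offset needing those bits is rejected. A sketch verifying the arithmetic under those assumptions:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* -(4096 << 1) = -(1 << 13); shifting left by 8 * sizeof (off_t) - 1
     = 31 more bits leaves exactly bits 44..63 set.  */
  uint64_t mask = (0 - (UINT64_C (4096) << 1)) << 31;
  assert (mask == ~((UINT64_C (1) << 44) - 1));
  return 0;
}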
 
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
index 8d474d0d04..37f0b14adb 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
+++ b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
@@ -31,7 +31,8 @@
    environment variable, LD_PREFER_MAP_32BIT_EXEC.  */
 #define EXTRA_LD_ENVVARS \
   case 21:								  \
-    if (memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0)		  \
+    if (!__libc_enable_secure						  \
+	&& memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0)		  \
       GLRO(dl_x86_cpu_features).feature[index_arch_Prefer_MAP_32BIT_EXEC] \
 	|= bit_arch_Prefer_MAP_32BIT_EXEC;				  \
     break;
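The added __libc_enable_secure test applies the usual AT_SECURE rule: set-user-ID and similar binaries must not honor tuning environment variables. A hedged sketch of the pattern (helper name hypothetical; the loader itself parses the LD_ environment block rather than calling getenv):

#include <stdbool.h>
#include <stdlib.h>

/* Honor the performance hint only in non-secure processes.  */
static bool
prefer_map_32bit_exec (bool is_secure)
{
  return !is_secure && getenv ("LD_PREFER_MAP_32BIT_EXEC") != NULL;
}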
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index f5f05f6c8c..fd20f64f9b 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
 	mov	%edi, %ecx
 
 #ifdef USE_AS_WMEMCHR
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
 #else
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
 	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
 	punpcklbw %xmm1, %xmm1
 #endif
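Every x86_64 hunk in this series fixes the same x32 bug: size_t is 32 bits there, so the ABI leaves the upper half of the length register undefined, and `movl %edx, %edx' (or using the 32-bit half of the RDX_LP alias) zero-extends it before any 64-bit arithmetic. The truncation expressed in C (a model of the register behavior, not the assembly itself):

#include <stdint.h>

/* Writing a 32-bit x86-64 register clears bits 32..63 of the full
   register; `movl %edx, %edx' therefore implements this truncation.  */
static uint64_t
clear_upper_32 (uint64_t reg)
{
  return (uint32_t) reg;
}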
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index 0828a22534..9ae90661c8 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -21,14 +21,18 @@
 
 	.text
 ENTRY (memcmp)
-	test	%rdx, %rdx
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
+	test	%RDX_LP, %RDX_LP
 	jz	L(finz)
 	cmpq	$1, %rdx
-	jle	L(finr1b)
+	jbe	L(finr1b)
 	subq	%rdi, %rsi
 	movq	%rdx, %r10
 	cmpq	$32, %r10
-	jge	L(gt32)
+	jae	L(gt32)
 	/* Handle small chunks and last block of less than 32 bytes.  */
 L(small):
 	testq	$1, %r10
@@ -156,7 +160,7 @@ L(A32):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
-        jge	L(mt16)
+        jae	L(mt16)
 	/* Pre-unroll to be ready for unrolled 64B loop.  */
 	testq	$32, %rdi
 	jz	L(A64)
@@ -178,7 +182,7 @@ L(A64):
 	movq	%r11, %r10
 	andq	$-64, %r10
 	cmpq	%r10, %rdi
-        jge	L(mt32)
+        jae	L(mt32)
 
 L(A64main):
 	movdqu    (%rdi,%rsi), %xmm0
@@ -216,7 +220,7 @@ L(mt32):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
-        jge	L(mt16)
+        jae	L(mt16)
 
 L(A32main):
 	movdqu    (%rdi,%rsi), %xmm0
@@ -254,7 +258,7 @@ L(ATR):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
-        jge	L(mt16)
+        jae	L(mt16)
 	testq	$16, %rdi
 	jz	L(ATR32)
 
@@ -325,7 +329,7 @@ L(ATR64main):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
-        jge	L(mt16)
+        jae	L(mt16)
 
 L(ATR32res):
 	movdqa    (%rdi,%rsi), %xmm0
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index 5fa0fe9c1c..44ef5c213a 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -24,13 +24,13 @@
 ENTRY (__memrchr)
 	movd	%esi, %xmm1
 
-	sub	$16, %rdx
+	sub	$16, %RDX_LP
 	jbe	L(length_less16)
 
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 
-	add	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
 	pshufd	$0, %xmm1, %xmm1
 
 	movdqu	(%rdi), %xmm0
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index a7275ed7e1..85bdca5b4f 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
 	movl	%edi, %ecx
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
 	vpbroadcastd %xmm0, %ymm0
 # else
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
 	vpbroadcastb %xmm0, %ymm0
 # endif
 	/* Check if we may cross page boundary with one vector load.  */
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16f46301ca..fb12e13ddf 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
 	.section .text.avx,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
 # endif
-	cmpq	$VEC_SIZE, %rdx
+	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 771639f662..834b84cf72 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
 	.section .text.sse4.1,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 # endif
 	pxor	%xmm0, %xmm0
-	cmp	$79, %rdx
+	cmp	$79, %RDX_LP
 	ja	L(79bytesormore)
 # ifndef USE_AS_WMEMCMP
-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
 	je	L(firstbyte)
 # endif
 	add	%rdx, %rsi
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 8d7d2fe67b..af8724e9db 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
 	atom_text_section
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
-	test	%rdx, %rdx
+	shl	$2, %RDX_LP
+	test	%RDX_LP, %RDX_LP
 	jz	L(equal)
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 # endif
 	mov	%rdx, %rcx
 	mov	%rdi, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 4e060a27fd..7388e7412c 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -48,28 +48,33 @@
 	.section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 
 ENTRY (MEMPCPY)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY)
 #endif
 
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 
 ENTRY (MEMCPY)
-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 #endif
 
 #ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index f3ea52a46c..74306d7daf 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -48,28 +48,33 @@
 	.section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 
 ENTRY (MEMPCPY)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY)
 #endif
 
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 
 ENTRY (MEMCPY)
-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 #endif
 
 #ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index ae84ddc667..dae0616019 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -25,30 +25,34 @@
 	.section .text.avx512,"ax",@progbits
 # if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_avx512_no_vzeroupper)
 
 ENTRY (__mempcpy_avx512_no_vzeroupper)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (__mempcpy_avx512_no_vzeroupper)
 # endif
 
 # ifdef SHARED
 ENTRY (__memmove_chk_avx512_no_vzeroupper)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_avx512_no_vzeroupper)
 # endif
 
 ENTRY (__memmove_avx512_no_vzeroupper)
-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
 # ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
 # endif
 L(start):
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
 	lea	(%rsi, %rdx), %rcx
 	lea	(%rdi, %rdx), %r9
 	cmp	$512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index d694e8b2be..9225fea1d7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -107,22 +107,22 @@
 	.section SECTION(.text),"ax",@progbits
 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 #endif
 
 #if VEC_SIZE == 16 || defined SHARED
 ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 #endif
 
 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 #endif
@@ -130,9 +130,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 	movq	%rdi, %rax
 L(start):
-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(last_2x_vec):
@@ -153,33 +157,33 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
 # if VEC_SIZE == 16
 #  if defined SHARED
 ENTRY (__mempcpy_chk_erms)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_erms)
 
 /* Only used to measure performance of REP MOVSB.  */
 ENTRY (__mempcpy_erms)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start_movsb)
 END (__mempcpy_erms)
 #  endif
 
 ENTRY (__memmove_chk_erms)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_erms)
 
 ENTRY (__memmove_erms)
 	movq	%rdi, %rax
 L(start_movsb):
-	movq	%rdx, %rcx
-	cmpq	%rsi, %rdi
+	mov	%RDX_LP, %RCX_LP
+	cmp	%RSI_LP, %RDI_LP
 	jb	1f
 	/* Source == destination is less common.  */
 	je	2f
-	leaq	(%rsi,%rcx), %rdx
-	cmpq	%rdx, %rdi
+	lea	(%rsi,%rcx), %RDX_LP
+	cmp	%RDX_LP, %RDI_LP
 	jb	L(movsb_backward)
 1:
 	rep movsb
@@ -201,18 +205,18 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
 
 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 
 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
 	jmp	L(start_erms)
 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
@@ -220,9 +224,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 	movq	%rdi, %rax
 L(start_erms):
-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
@@ -249,7 +257,7 @@ L(movsb):
 # endif
 	jb	L(more_8x_vec_backward)
 1:
-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
 	rep movsb
 L(nop):
 	ret
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index 3ee02e1cc3..40e1dba301 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
 	vmovd	%esi, %xmm0
 	vpbroadcastb %xmm0, %ymm0
 
-	subq	$VEC_SIZE, %rdx
+	sub	$VEC_SIZE, %RDX_LP
 	jbe	L(last_vec_or_less)
 
-	addq	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
 
 	/* Check the last VEC_SIZE bytes.  */
 	vpcmpeqb (%rdi), %ymm0, %ymm1
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
index 1f66602398..5be12bd06b 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -29,12 +29,16 @@
 	.section .text.avx512,"ax",@progbits
 #if defined PIC
 ENTRY (MEMSET_CHK)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMSET_CHK)
 #endif
 
 ENTRY (MEMSET)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
 	vpxor	%xmm0, %xmm0, %xmm0
 	vmovd	%esi, %xmm1
 	lea	(%rdi, %rdx), %rsi
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 8ed470283e..2023a8e108 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -75,8 +75,8 @@
 	.section SECTION(.text),"ax",@progbits
 #if VEC_SIZE == 16 && IS_IN (libc)
 ENTRY (__bzero)
-	movq	%rdi, %rax /* Set return value.  */
-	movq	%rsi, %rdx /* Set n.  */
+	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+	mov	%RSI_LP, %RDX_LP /* Set n.  */
 	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -86,13 +86,13 @@ weak_alias (__bzero, bzero)
 #if IS_IN (libc)
 # if defined SHARED
 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 # endif
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
-	shlq	$2, %rdx
+	shl	$2, %RDX_LP
 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	jmp	L(entry_from_bzero)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
@@ -100,13 +100,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
 
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
@@ -122,7 +126,7 @@ END (MEMSET_SYMBOL (__memset, unaligned))
 
 # if VEC_SIZE == 16
 ENTRY (__memset_chk_erms)
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memset_chk_erms)
 
@@ -135,11 +139,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
 L(stosb):
 	/* Issue vzeroupper before rep stosb.  */
 	VZEROUPPER
-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
 	movzbl	%sil, %eax
-	movq	%rdi, %rdx
+	mov	%RDI_LP, %RDX_LP
 	rep stosb
-	movq	%rdx, %rax
+	mov	%RDX_LP, %RAX_LP
 	ret
 # if VEC_SIZE == 16
 END (__memset_erms)
@@ -149,16 +153,20 @@ END (MEMSET_SYMBOL (__memset, erms))
 
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 4aeb14e175..9e5f93cb86 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -155,11 +155,11 @@ STRCMP_SSE42:
 #endif
 
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	je	LABEL(strcmp_exitz)
-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
 	je	LABEL(Byte0)
-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
 #endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 6a5ab7ab26..b46f6f8aed 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -40,8 +40,8 @@
 .text
 ENTRY (STRCPY)
 #  ifdef USE_AS_STRNCPY
-	mov	%rdx, %r8
-	test	%r8, %r8
+	mov	%RDX_LP, %R8_LP
+	test	%R8_LP, %R8_LP
 	jz	L(ExitZero)
 #  endif
 	mov	%rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index 47aaeae671..83134f3b2c 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -31,13 +31,13 @@ ENTRY (STRCPY)
 
 	mov	%rsi, %rcx
 #  ifdef USE_AS_STRNCPY
-	mov	%rdx, %r8
+	mov	%RDX_LP, %R8_LP
 #  endif
 	mov	%rdi, %rdx
 #  ifdef USE_AS_STRNCPY
-	test	%r8, %r8
+	test	%R8_LP, %R8_LP
 	jz	L(Exit0)
-	cmp	$8, %r8
+	cmp	$8, %R8_LP
 	jbe	L(StrncpyExit8Bytes)
 # endif
 	cmpb	$0, (%rcx)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 1dc823af0a..2ce10f59e6 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -42,12 +42,15 @@
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check for zero length.  */
-	testq	%rsi, %rsi
+	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
 #  ifdef USE_AS_WCSLEN
-	shl	$2, %rsi
+	shl	$2, %RSI_LP
+#  elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
 #  endif
-	movq	%rsi, %r8
+	mov	%RSI_LP, %R8_LP
 # endif
 	movl	%edi, %ecx
 	movq	%rdi, %rdx
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 076be04df5..2aa301997f 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	je	LABEL(strcmp_exitz)
-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
 	je	LABEL(Byte0)
-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
 #endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index b5ab117c79..30cec693c1 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -59,21 +59,21 @@ ENTRY(strlen)
 
 #ifdef AS_STRNLEN
 /* Do not read anything when n==0.  */
-	test	%rsi, %rsi
+	test	%RSI_LP, %RSI_LP
 	jne	L(n_nonzero)
 	xor	%rax, %rax
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shlq	$2, %rsi
+	shl	$2, %RSI_LP
 # endif
 
 /* Initialize long lived registers.  */
 
-	add	%rdi, %rsi
-	mov	%rsi, %r10
-	and	$-64, %r10
-	mov	%rsi, %r11
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
 #endif
 
 	pxor	%xmm0, %xmm0
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index f2ebc24fb0..8748956563 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,15 @@ ifeq ($(subdir),math)
 # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
 CFLAGS-s_llround.c += -fno-builtin-lround
 endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
+	 tst-size_t-memcmp-2
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+	 tst-size_t-wcsncmp tst-size_t-wcsnlen
+endif
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
new file mode 100644
index 0000000000..78a940863e
--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of a
+   64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+   field in the lower 32 bits.  When the LEN field of such a register is
+   passed to a string/memory function as the size_t parameter, only the
+   lower 32 bits can be used.  */
+typedef struct
+{
+  union
+    {
+      size_t len;
+      void (*fn) (void);
+    };
+  void *p;
+} parameter_t;
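The anonymous union is what arms the test: on x32 both size_t and void * are 4 bytes, so parameter_t occupies one 8-byte unit and is passed by value in a single 64-bit register, len in the low half and p in the high half, deliberately leaving non-zero bits above the size. A sketch of that layout assumption (meaningful only on an ILP32 target):

#include <assert.h>
#include <stddef.h>

typedef struct
{
  union
    {
      size_t len;
      void (*fn) (void);
    };
  void *p;
} parameter_t;

int
main (void)
{
  /* On x32 this struct is one 8-byte unit; a by-value argument then puts
     LEN in the lower 32 bits of a register and P in the upper 32 bits.  */
  if (sizeof (size_t) == 4 && sizeof (void *) == 4)
    assert (sizeof (parameter_t) == 8);
  return 0;
}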
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
new file mode 100644
index 0000000000..29a3daf102
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *res = do_memchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
new file mode 100644
index 0000000000..d8ae1a0813
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
@@ -0,0 +1,79 @@
+/* Test memcmp with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  CHAR *p = (CHAR *) buf1;
+  p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res >= 0)
+	{
+	  error (0, 0, "Wrong result in function %s: %i >= 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
new file mode 100644
index 0000000000..9bd6fdb45a
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
new file mode 100644
index 0000000000..66b71e1749
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_memcpy (dest, src);
+      int res = memcmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
new file mode 100644
index 0000000000..c83699c097
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
@@ -0,0 +1,57 @@
+/* Test memrchr with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memrchr"
+#include "test-size_t.h"
+
+IMPL (memrchr, 1)
+
+typedef void * (*proto_t) (const void *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memrchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      void * res = do_memrchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
new file mode 100644
index 0000000000..2c367af6cd
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
@@ -0,0 +1,73 @@
+/* Test memset with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wmemset"
+#else
+# define TEST_NAME "memset"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+#else
+# define MEMSET memset
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memset (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  CHAR ch = 0x23;
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *p = (CHAR *) do_memset (src, c);
+      size_t i;
+      for (i = 0; i < src.len; i++)
+	if (p[i] != ch)
+	  {
+	    error (0, 0, "Wrong result in function %s", impl->name);
+	    ret = 1;
+	  }
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
new file mode 100644
index 0000000000..862335937b
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
@@ -0,0 +1,59 @@
+/* Test strncasecmp with size_t in the lower 32 bits of a 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncasecmp"
+#include "test-size_t.h"
+
+IMPL (strncasecmp, 1)
+
+typedef int (*proto_t) (const char *, const char *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncasecmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  strncpy ((char *) buf1, (const char *) buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncasecmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
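
The comparison tests first make the two buffers identical, so any nonzero return flags a bug. Just as important, buf1 and buf2 come from the string test harness, which is assumed here to place an inaccessible page directly after each buffer; an implementation that read the count as 64 bits would then walk off the mapped page and fault rather than silently compare extra bytes. A sketch of that assumed guard-page layout (alloc_guarded is an illustrative name, not harness API):

#include <stddef.h>
#include <sys/mman.h>

/* Map PAGE_SIZE usable bytes followed by an inaccessible page, so a
   read past the buffer faults instead of going unnoticed.  */
static void *
alloc_guarded (size_t page_size)
{
  char *p = mmap (NULL, 2 * page_size, PROT_READ | PROT_WRITE,
		  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED)
    return NULL;
  if (mprotect (p + page_size, page_size, PROT_NONE) != 0)
    return NULL;
  return p;
}
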
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
new file mode 100644
index 0000000000..54e6bd83ef
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
@@ -0,0 +1,77 @@
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsncmp"
+#else
+# define TEST_NAME "strncmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+
+# define STRNCMP wcsncmp
+# define STRNCPY wcsncpy
+# define CHAR wchar_t
+#else
+# define STRNCMP strncmp
+# define STRNCPY strncpy
+# define CHAR char
+#endif
+
+IMPL (STRNCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR);
+  parameter_t dest = { { size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
new file mode 100644
index 0000000000..4dec71e6b3
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
@@ -0,0 +1,58 @@
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncpy"
+#include "test-size_t.h"
+
+IMPL (strncpy, 1)
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_strncpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_strncpy (dest, src);
+      int res = strncmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
new file mode 100644
index 0000000000..690a4a8a31
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
@@ -0,0 +1,72 @@
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsnlen"
+#else
+# define TEST_NAME "strnlen"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define STRNLEN wcsnlen
+# define CHAR wchar_t
+#else
+# define STRNLEN strnlen
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (STRNLEN, 1)
+
+typedef size_t (*proto_t) (const CHAR *, size_t);
+
+static size_t
+__attribute__ ((noinline, noclone))
+do_strnlen (parameter_t a, parameter_t b)
+{
+  return CALL (&a, a.p, b.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR);
+  parameter_t src = { { 0 }, buf2 };
+  parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      size_t res = do_strnlen (src, c);
+      if (res != size)
+	{
+	  error (0, 0, "Wrong result in function %s: 0x%zx != 0x%zx",
+		 impl->name, res, size);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
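
Note that the routing here is inverted relative to the earlier tests: the implementation pointer rides in src (FOR_EACH_IMPL sets src.fn), while the maxlen travels in c.len with the character 'a' parked in c.p purely to dirty the upper half of the maxlen register. Assuming the harness fills buf2 with non-NUL bytes, a conforming strnlen must return exactly size, i.e. behave like this reference loop that honors only the 32-bit maxlen (ref_strnlen is a hypothetical helper; CHAR is the macro defined above):

static size_t
ref_strnlen (const CHAR *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen && s[i] != 0; i++)
    continue;
  return i;
}
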
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
new file mode 100644
index 0000000000..4829647c19
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
@@ -0,0 +1,20 @@
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strncmp.c"
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
new file mode 100644
index 0000000000..093b4bbe1b
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
@@ -0,0 +1,20 @@
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strnlen.c"
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
new file mode 100644
index 0000000000..877801d646
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
new file mode 100644
index 0000000000..e8b5ffd0d5
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
new file mode 100644
index 0000000000..955eb488c2
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
@@ -0,0 +1,20 @@
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memset.c"