From bff8346b0184b15fbb80863112133f48a7bd62a9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:17:09 -0800 Subject: x86-64 memchr/wmemchr: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and tst-size_t-wmemchr. * sysdeps/x86_64/x32/test-size_t.h: New file. * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. (cherry picked from commit 97700a34f36721b11a754cf37a1cc40695ece1fd) --- ChangeLog | 13 ++++++ NEWS | 7 ++++ sysdeps/x86_64/memchr.S | 10 +++-- sysdeps/x86_64/multiarch/memchr-avx2.S | 8 +++- sysdeps/x86_64/x32/Makefile | 8 ++++ sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++++ 8 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 sysdeps/x86_64/x32/test-size_t.h create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c diff --git a/ChangeLog b/ChangeLog index 60b0364037..d5a95a474c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the + upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/test-size_t.h: New file. + * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. + 2019-01-11 Gabriel F. T. Gomes * sysdeps/powerpc/fpu/libm-test-ulps: Regenerate. diff --git a/NEWS b/NEWS index 49895f81bd..1bea44cc8d 100644 --- a/NEWS +++ b/NEWS @@ -86,6 +86,12 @@ Security related changes: denial of service due to resource exhaustion when processing getaddrinfo calls with crafted host names. Reported by Guido Vranken. + CVE-2019-6488: On x32, the size_t parameter may be passed in the lower + 32 bits of a 64-bit register with with non-zero upper 32 bit. When it + happened, accessing the 32-bit size_t value as the full 64-bit register + in the assembly string/memory functions would cause a buffer overflow. + Reported by H.J. Lu. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. 
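A note on two idioms that recur throughout this series, before the assembly hunks below: RDX_LP and friends are, as far as I can tell, macros from glibc's x86-64 sysdep.h that expand to the 32-bit register name (%edx) when __ILP32__ is defined for x32 and to the 64-bit name (%rdx) otherwise; and the seemingly redundant "movl %edx, %edx" works because any write to a 32-bit register on x86-64 zero-extends into the full 64-bit register, wiping whatever the caller left in the upper half. A minimal C model of that effect, using hypothetical names that are not part of the patch:

/* Model of the x32 length fix: keep only the low 32 bits of the
   incoming register, which is exactly what "movl %edx, %edx" does.  */
#include <stddef.h>
#include <stdint.h>

static inline size_t
effective_length (uint64_t raw_rdx)   /* raw 64-bit register contents */
{
#ifdef __ILP32__
  return (uint32_t) raw_rdx;          /* 32-bit write zero-extends */
#else
  return raw_rdx;                     /* size_t is already 64 bits wide */
#endif
}

This is presumably also why the wmemchr path below needs no explicit clear: its "shl $2, %RDX_LP" becomes a 32-bit shift on x32, and the shifted result is zero-extended as a side effect.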
@@ -165,6 +171,7 @@ The following bugs are resolved with this release: [23927] Linux if_nametoindex() does not close descriptor (CVE-2018-19591) [24018] gettext may return NULL [24027] malloc: Integer overflow in realloc + [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) Version 2.26 diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index f5f05f6c8c..fd20f64f9b 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -34,12 +34,16 @@ ENTRY(MEMCHR) mov %edi, %ecx #ifdef USE_AS_WMEMCHR - test %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %rdx + shl $2, %RDX_LP #else +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif punpcklbw %xmm1, %xmm1 - test %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(return_null) punpcklbw %xmm1, %xmm1 #endif diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index a7275ed7e1..85bdca5b4f 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -40,16 +40,20 @@ ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ - testq %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(null) # endif movl %edi, %ecx /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 # ifdef USE_AS_WMEMCHR - shl $2, %rdx + shl $2, %RDX_LP vpbroadcastd %xmm0, %ymm0 # else +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif vpbroadcastb %xmm0, %ymm0 # endif /* Check if we may cross page boundary with one vector load. */ diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index f2ebc24fb0..7d528889c6 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -4,3 +4,11 @@ ifeq ($(subdir),math) # 64-bit llround. Add -fno-builtin-lround to silence the compiler. CFLAGS-s_llround.c += -fno-builtin-lround endif + +ifeq ($(subdir),string) +tests += tst-size_t-memchr +endif + +ifeq ($(subdir),wcsmbs) +tests += tst-size_t-wmemchr +endif diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h new file mode 100644 index 0000000000..78a940863e --- /dev/null +++ b/sysdeps/x86_64/x32/test-size_t.h @@ -0,0 +1,35 @@ +/* Test string/memory functions with size_t in the lower 32 bits of + 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_MAIN +#include + +/* On x32, parameter_t may be passed in a 64-bit register with the LEN + field in the lower 32 bits. When the LEN field of 64-bit register + is passed to string/memory function as the size_t parameter, only + the lower 32 bits can be used. 
*/ +typedef struct +{ + union + { + size_t len; + void (*fn) (void); + }; + void *p; +} parameter_t; diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c new file mode 100644 index 0000000000..29a3daf102 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c @@ -0,0 +1,72 @@ +/* Test memchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef WIDE +# define TEST_NAME "memchr" +#else +# define TEST_NAME "wmemchr" +#endif /* WIDE */ +#include "test-size_t.h" + +#ifndef WIDE +# define MEMCHR memchr +# define CHAR char +# define UCHAR unsigned char +#else +# include +# define MEMCHR wmemchr +# define CHAR wchar_t +# define UCHAR wchar_t +#endif /* WIDE */ + +IMPL (MEMCHR, 1) + +typedef CHAR * (*proto_t) (const CHAR*, int, size_t); + +static CHAR * +__attribute__ ((noinline, noclone)) +do_memchr (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + CHAR *res = do_memchr (src, c); + if (res) + { + error (0, 0, "Wrong result in function %s: %p != NULL", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c new file mode 100644 index 0000000000..877801d646 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c @@ -0,0 +1,20 @@ +/* Test wmemchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memchr.c" -- cgit 1.4.1 From 232a7628f01c1ed93144e3115b240950ec07e8f3 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:19:07 -0800 Subject: x86-64 memcmp/wmemcmp: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. 
The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and tst-size_t-wmemcmp. * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. (cherry picked from commit b304fc201d2f6baf52ea790df8643e99772243cd) --- ChangeLog | 13 +++++ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 ++- sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-- sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 ++- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++++ 7 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c diff --git a/ChangeLog b/ChangeLog index d5a95a474c..dfa52bf21d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. + * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and + tst-size_t-wmemcmp. + * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index 16f46301ca..fb12e13ddf 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -58,9 +58,12 @@ .section .text.avx,"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmpq $VEC_SIZE, %rdx + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 771639f662..834b84cf72 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -42,13 +42,16 @@ .section .text.sse4.1,"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx # endif pxor %xmm0, %xmm0 - cmp $79, %rdx + cmp $79, %RDX_LP ja L(79bytesormore) # ifndef USE_AS_WMEMCMP - cmp $1, %rdx + cmp $1, %RDX_LP je L(firstbyte) # endif add %rdx, %rsi diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index 8d7d2fe67b..af8724e9db 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -33,9 +33,12 @@ atom_text_section ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx - test %rdx, %rdx + shl $2, %RDX_LP + test %RDX_LP, %RDX_LP jz L(equal) +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx # endif mov %rdx, %rcx mov %rdi, %rdx diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 7d528889c6..ddec7f0466 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr +tests += tst-size_t-memchr tst-size_t-memcmp endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr +tests += tst-size_t-wmemchr tst-size_t-wmemcmp endif diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c new file mode 100644 index 0000000000..9bd6fdb45a --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c @@ -0,0 +1,76 @@ +/* Test memcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_MAIN +#ifdef WIDE +# define TEST_NAME "wmemcmp" +#else +# define TEST_NAME "memcmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include +# include + +# define MEMCMP wmemcmp +# define CHAR wchar_t +#else +# define MEMCMP memcmp +# define CHAR char +#endif + +IMPL (MEMCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_memcmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + memcpy (buf1, buf2, page_size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_memcmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c new file mode 100644 index 0000000000..e8b5ffd0d5 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c @@ -0,0 +1,20 @@ +/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memcmp.c" -- cgit 1.4.1 From 94b88894b15d8a0dcce3d3b17e04880d72f2a9a7 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:20:06 -0800 Subject: x86-64 memcpy: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise. * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. tst-size_t-wmemchr. * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. (cherry picked from commit 231c56760c1e2ded21ad96bbb860b1f08c556c7a) --- ChangeLog | 15 ++++++ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--- sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--- .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--- .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 50 +++++++++++-------- sysdeps/x86_64/x32/Makefile | 2 +- sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 ++++++++++++++++++++++ 7 files changed, 135 insertions(+), 40 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c diff --git a/ChangeLog b/ChangeLog index dfa52bf21d..857cc9364f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. + * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: + Likewise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. + 2019-02-01 H.J. 
Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index 4e060a27fd..7388e7412c 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -48,28 +48,33 @@ .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index f3ea52a46c..74306d7daf 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -48,28 +48,33 @@ .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S index ae84ddc667..dae0616019 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -25,30 +25,34 @@ .section .text.avx512,"ax",@progbits # if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (__mempcpy_chk_avx512_no_vzeroupper) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_avx512_no_vzeroupper) ENTRY (__mempcpy_avx512_no_vzeroupper) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (__mempcpy_avx512_no_vzeroupper) # endif # ifdef SHARED ENTRY (__memmove_chk_avx512_no_vzeroupper) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_avx512_no_vzeroupper) # endif ENTRY (__memmove_avx512_no_vzeroupper) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP # ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP # endif L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx +# endif lea (%rsi, %rdx), %rcx lea (%rdi, %rdx), %r9 cmp $512, %rdx diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index d694e8b2be..9225fea1d7 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -107,22 +107,22 @@ .section SECTION(.text),"ax",@progbits #if defined SHARED && IS_IN (libc) ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) #endif #if VEC_SIZE == 16 || defined SHARED ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) #endif #if defined SHARED && IS_IN (libc) ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) #endif @@ -130,9 +130,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) movq %rdi, %rax L(start): - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(more_2x_vec) #if !defined USE_MULTIARCH || !IS_IN (libc) L(last_2x_vec): @@ -153,33 +157,33 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned)) # if VEC_SIZE == 16 # if defined SHARED ENTRY (__mempcpy_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_erms) /* Only used to measure performance of REP MOVSB. */ ENTRY (__mempcpy_erms) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start_movsb) END (__mempcpy_erms) # endif ENTRY (__memmove_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_erms) ENTRY (__memmove_erms) movq %rdi, %rax L(start_movsb): - movq %rdx, %rcx - cmpq %rsi, %rdi + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP jb 1f /* Source == destination is less common. */ je 2f - leaq (%rsi,%rcx), %rdx - cmpq %rdx, %rdi + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP jb L(movsb_backward) 1: rep movsb @@ -201,18 +205,18 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start_erms) END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif @@ -220,9 +224,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) movq %rdi, %rax L(start_erms): - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(movsb_more_2x_vec) L(last_2x_vec): /* From VEC and to 2 * VEC. 
No branch when size == VEC_SIZE. */ @@ -249,7 +257,7 @@ L(movsb): # endif jb L(more_8x_vec_backward) 1: - movq %rdx, %rcx + mov %RDX_LP, %RCX_LP rep movsb L(nop): ret diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index ddec7f0466..2fe1e5ac5a 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr tst-size_t-memcmp +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c new file mode 100644 index 0000000000..66b71e1749 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c @@ -0,0 +1,58 @@ +/* Test memcpy with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_NAME "memcpy" +#include "test-size_t.h" + +IMPL (memcpy, 1) + +typedef void *(*proto_t) (void *, const void *, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memcpy (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + do_memcpy (dest, src); + int res = memcmp (dest.p, src.p, dest.len); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From c9ea2e82d4f4002b162f427c4761c5bec6ee6876 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:20:54 -0800 Subject: x86-64 memrchr: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. 
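One detail worth calling out about the memrchr change below: unlike memchr, it adds no explicit "clear the upper 32 bits" instruction. As I read the diff, that is because the very first use of the length is "sub $16, %RDX_LP", which on x32 is a 32-bit subtraction; its result is zero-extended into the full register and the flags consumed by the following jbe reflect the 32-bit length, so the later 64-bit uses of RDX/RDI are already safe. A rough C model of that side effect (hypothetical helper, not part of the patch):

/* A 32-bit ALU operation on x86-64 zero-extends its result, so the
   subtraction doubles as the upper-half clear.  */
#include <stdint.h>

static inline uint64_t
sub16_then_use_64bit (uint64_t raw_rdx)
{
  uint32_t len = (uint32_t) raw_rdx;  /* only the low half is the length */
  uint32_t r = len - 16;              /* sub $16, %edx on x32 */
  return r;                           /* implicitly zero-extended */
}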
(cherry picked from commit ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0) --- ChangeLog | 9 ++++++ sysdeps/x86_64/memrchr.S | 4 +-- sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +-- sysdeps/x86_64/x32/Makefile | 3 +- sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c diff --git a/ChangeLog b/ChangeLog index 857cc9364f..2cfeeabe66 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. + * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index 5fa0fe9c1c..44ef5c213a 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -24,13 +24,13 @@ ENTRY (__memrchr) movd %esi, %xmm1 - sub $16, %rdx + sub $16, %RDX_LP jbe L(length_less16) punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1 - add %rdx, %rdi + add %RDX_LP, %RDI_LP pshufd $0, %xmm1, %xmm1 movdqu (%rdi), %xmm0 diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index 3ee02e1cc3..40e1dba301 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2) vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 - subq $VEC_SIZE, %rdx + sub $VEC_SIZE, %RDX_LP jbe L(last_vec_or_less) - addq %rdx, %rdi + add %RDX_LP, %RDI_LP /* Check the last VEC_SIZE bytes. */ vpcmpeqb (%rdi), %ymm0, %ymm1 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 2fe1e5ac5a..e99dbd7c86 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c new file mode 100644 index 0000000000..c83699c097 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c @@ -0,0 +1,57 @@ +/* Test memrchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_NAME "memrchr" +#include "test-size_t.h" + +IMPL (memchr, 1) + +typedef void * (*proto_t) (const void *, int, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memrchr (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t src = { { page_size }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + void * res = do_memrchr (src, c); + if (res) + { + error (0, 0, "Wrong result in function %s: %p != NULL", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From 885e4af2ac9b272d6ddd7f9c1954d0364d7ebab3 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:21:41 -0800 Subject: x86-64 memset/wmemset: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. (cherry picked from commit 82d0b4a4d76db554eb6757acb790fcea30b19965) --- ChangeLog | 11 ++++ .../x86_64/multiarch/memset-avx512-no-vzeroupper.S | 6 +- .../x86_64/multiarch/memset-vec-unaligned-erms.S | 32 ++++++---- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-memset.c | 73 ++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 ++++++ 6 files changed, 131 insertions(+), 15 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c diff --git a/ChangeLog b/ChangeLog index 2cfeeabe66..c9a5db6f8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use + RDX_LP for length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. + * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S index 1f66602398..5be12bd06b 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -29,12 +29,16 @@ .section .text.avx512,"ax",@progbits #if defined PIC ENTRY (MEMSET_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMSET_CHK) #endif ENTRY (MEMSET) +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx +# endif vpxor %xmm0, %xmm0, %xmm0 vmovd %esi, %xmm1 lea (%rdi, %rdx), %rsi diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 8ed470283e..2023a8e108 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -75,8 +75,8 @@ .section SECTION(.text),"ax",@progbits #if VEC_SIZE == 16 && IS_IN (libc) ENTRY (__bzero) - movq %rdi, %rax /* Set return value. */ - movq %rsi, %rdx /* Set n. */ + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ pxor %xmm0, %xmm0 jmp L(entry_from_bzero) END (__bzero) @@ -86,13 +86,13 @@ weak_alias (__bzero, bzero) #if IS_IN (libc) # if defined SHARED ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) # endif ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) - shlq $2, %rdx + shl $2, %RDX_LP WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) jmp L(entry_from_bzero) END (WMEMSET_SYMBOL (__wmemset, unaligned)) @@ -100,13 +100,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned)) #if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) @@ -122,7 +126,7 @@ END (MEMSET_SYMBOL (__memset, unaligned)) # if VEC_SIZE == 16 ENTRY (__memset_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memset_chk_erms) @@ -135,11 +139,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms)) L(stosb): /* Issue vzeroupper before rep stosb. */ VZEROUPPER - movq %rdx, %rcx + mov %RDX_LP, %RCX_LP movzbl %sil, %eax - movq %rdi, %rdx + mov %RDI_LP, %RDX_LP rep stosb - movq %rdx, %rax + mov %RDX_LP, %RAX_LP ret # if VEC_SIZE == 16 END (__memset_erms) @@ -149,16 +153,20 @@ END (MEMSET_SYMBOL (__memset, erms)) # if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(stosb_more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index e99dbd7c86..98bd9ae9cd 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -7,9 +7,9 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ - tst-size_t-memrchr + tst-size_t-memrchr tst-size_t-memset endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr tst-size_t-wmemcmp +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset endif diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c new file mode 100644 index 0000000000..2c367af6cd --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memset.c @@ -0,0 +1,73 @@ +/* Test memset with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wmemset" +#else +# define TEST_NAME "memset" +#endif /* WIDE */ + +#include "test-size_t.h" + +#ifdef WIDE +# include +# define MEMSET wmemset +# define CHAR wchar_t +#else +# define MEMSET memset +# define CHAR char +#endif /* WIDE */ + +IMPL (MEMSET, 1) + +typedef CHAR *(*proto_t) (CHAR *, int, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memset (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + CHAR ch = 0x23; + parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) ch }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + CHAR *p = (CHAR *) do_memset (src, c); + size_t i; + for (i = 0; i < src.len; i++) + if (p[i] != ch) + { + error (0, 0, "Wrong result in function %s", impl->name); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c new file mode 100644 index 0000000000..955eb488c2 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c @@ -0,0 +1,20 @@ +/* Test wmemset with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memset.c" -- cgit 1.4.1 From 15ce2f62f6bc04b87152e583549147073d71b1d2 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:22:33 -0800 Subject: x86-64 strncmp family: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes the strncmp family for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strcmp-sse42.S: Use RDX_LP for length. * sysdeps/x86_64/strcmp.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, tst-size_t-strncmp and tst-size_t-wcsncmp. * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. (cherry picked from commit ee915088a0231cd421054dbd8abab7aadf331153) --- ChangeLog | 12 +++++ sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-- sysdeps/x86_64/strcmp.S | 6 +-- sysdeps/x86_64/x32/Makefile | 6 ++- sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++++ 7 files changed, 179 insertions(+), 8 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c diff --git a/ChangeLog b/ChangeLog index c9a5db6f8e..84813b6d9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcmp-sse42.S: Use RDX_LP for length. + * sysdeps/x86_64/strcmp.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, + tst-size_t-strncmp and tst-size_t-wcsncmp. + * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 4aeb14e175..9e5f93cb86 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -155,11 +155,11 @@ STRCMP_SSE42: #endif #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %rdx, %rdx + test %RDX_LP, %RDX_LP je LABEL(strcmp_exitz) - cmp $1, %rdx + cmp $1, %RDX_LP je LABEL(Byte0) - mov %rdx, %r11 + mov %RDX_LP, %R11_LP #endif mov %esi, %ecx mov %edi, %eax diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 076be04df5..2aa301997f 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -135,11 +135,11 @@ ENTRY (STRCMP) * This implementation uses SSE to compare up to 16 bytes at a time. 
*/ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %rdx, %rdx + test %RDX_LP, %RDX_LP je LABEL(strcmp_exitz) - cmp $1, %rdx + cmp $1, %RDX_LP je LABEL(Byte0) - mov %rdx, %r11 + mov %RDX_LP, %R11_LP #endif mov %esi, %ecx mov %edi, %eax diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 98bd9ae9cd..db30283977 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -7,9 +7,11 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ - tst-size_t-memrchr tst-size_t-memset + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ + tst-size_t-strncmp endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ + tst-size_t-wcsncmp endif diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c new file mode 100644 index 0000000000..862335937b --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c @@ -0,0 +1,59 @@ +/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_NAME "strncasecmp" +#include "test-size_t.h" + +IMPL (strncasecmp, 1) + +typedef int (*proto_t) (const char *, const char *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_strncasecmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + strncpy ((char *) buf1, (const char *) buf2, page_size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_strncasecmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c new file mode 100644 index 0000000000..54e6bd83ef --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c @@ -0,0 +1,78 @@ +/* Test strncmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wcsncmp" +#else +# define TEST_NAME "strncmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include + +# define STRNCMP wcsncmp +# define STRNCPY wcsncpy +# define CHAR wchar_t +#else +# define STRNCMP strncmp +# define STRNCPY strncpy +# define CHAR char +#endif + +IMPL (STRNCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + + +static int +__attribute__ ((noinline, noclone)) +do_strncmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR); + parameter_t dest = { { size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_strncmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c new file mode 100644 index 0000000000..4829647c19 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c @@ -0,0 +1,20 @@ +/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-strncmp.c" -- cgit 1.4.1 From 40575878cd2cb00c7fe6de1da9811e1c5ab2c0c6 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:23:23 -0800 Subject: x86-64 strncpy: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP for length. * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. 
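A short aside on how the tst-size_t-* cases in this series (including tst-size_t-strncpy below) provoke the bug at all: the parameter_t type quoted earlier from test-size_t.h packs the 32-bit length and a 32-bit pointer into one 8-byte struct, which the x32 psABI passes in a single 64-bit register. When the noinline/noclone helpers then forward a.len as the size_t argument, the neighbouring pointer bits can ride along in the upper half of the length register; whether they actually survive depends on the code the compiler emits, which is presumably why the helpers are kept out of line. A standalone sketch of the layout assumption (illustrative only; the real definition is the one in test-size_t.h above):

/* On x32, size_t and void * are both 4 bytes, so len and p share one
   64-bit register slot when the struct is passed by value.  */
#include <assert.h>
#include <stddef.h>

typedef struct
{
  union { size_t len; void (*fn) (void); };  /* low half on x32 */
  void *p;                                   /* high half on x32 */
} parameter_t;

int
main (void)
{
#ifdef __ILP32__
  assert (sizeof (parameter_t) == 8);
#endif
  return 0;
}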
(cherry picked from commit c7c54f65b080affb87a1513dee449c8ad6143c8b) --- ChangeLog | 10 ++++ sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +- sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-- sysdeps/x86_64/x32/Makefile | 2 +- sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 ++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c diff --git a/ChangeLog b/ChangeLog index 84813b6d9e..77f977c36d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP + for length. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. + * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index 6a5ab7ab26..b46f6f8aed 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -40,8 +40,8 @@ .text ENTRY (STRCPY) # ifdef USE_AS_STRNCPY - mov %rdx, %r8 - test %r8, %r8 + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP jz L(ExitZero) # endif mov %rsi, %rcx diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index 47aaeae671..83134f3b2c 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -31,13 +31,13 @@ ENTRY (STRCPY) mov %rsi, %rcx # ifdef USE_AS_STRNCPY - mov %rdx, %r8 + mov %RDX_LP, %R8_LP # endif mov %rdi, %rdx # ifdef USE_AS_STRNCPY - test %r8, %r8 + test %R8_LP, %R8_LP jz L(Exit0) - cmp $8, %r8 + cmp $8, %R8_LP jbe L(StrncpyExit8Bytes) # endif cmpb $0, (%rcx) diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index db30283977..2a9e20a90b 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,7 +8,7 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp + tst-size_t-strncmp tst-size_t-strncpy endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c new file mode 100644 index 0000000000..4dec71e6b3 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c @@ -0,0 +1,58 @@ +/* Test strncpy with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_NAME "strncpy" +#include "test-size_t.h" + +IMPL (strncpy, 1) + +typedef char *(*proto_t) (char *, const char*, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_strncpy (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + do_strncpy (dest, src); + int res = strncmp (dest.p, src.p, dest.len); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From dc968f557398565d416dab69224f722c1d2beb9b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:24:08 -0800 Subject: x86-64 strnlen/wcsnlen: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. Clear the upper 32 bits of RSI register. * sysdeps/x86_64/strlen.S: Use RSI_LP for length. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen and tst-size_t-wcsnlen. * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. (cherry picked from commit 5165de69c0908e28a380cbd4bb054e55ea4abc95) --- ChangeLog | 12 ++++++ sysdeps/x86_64/multiarch/strlen-avx2.S | 9 +++-- sysdeps/x86_64/strlen.S | 12 +++--- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++++ 6 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c diff --git a/ChangeLog b/ChangeLog index 77f977c36d..bc1f461a48 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. + Clear the upper 32 bits of RSI register. + * sysdeps/x86_64/strlen.S: Use RSI_LP for length. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen + and tst-size_t-wcsnlen. + * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 1dc823af0a..2ce10f59e6 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -42,12 +42,15 @@ ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check for zero length. */ - testq %rsi, %rsi + test %RSI_LP, %RSI_LP jz L(zero) # ifdef USE_AS_WCSLEN - shl $2, %rsi + shl $2, %RSI_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. 
*/ + movl %esi, %esi # endif - movq %rsi, %r8 + mov %RSI_LP, %R8_LP # endif movl %edi, %ecx movq %rdi, %rdx diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index b5ab117c79..30cec693c1 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -59,21 +59,21 @@ ENTRY(strlen) #ifdef AS_STRNLEN /* Do not read anything when n==0. */ - test %rsi, %rsi + test %RSI_LP, %RSI_LP jne L(n_nonzero) xor %rax, %rax ret L(n_nonzero): # ifdef AS_WCSLEN - shlq $2, %rsi + shl $2, %RSI_LP # endif /* Initialize long lived registers. */ - add %rdi, %rsi - mov %rsi, %r10 - and $-64, %r10 - mov %rsi, %r11 + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP #endif pxor %xmm0, %xmm0 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 2a9e20a90b..1557724b0c 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,10 +8,10 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp tst-size_t-strncpy + tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen endif ifeq ($(subdir),wcsmbs) tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ - tst-size_t-wcsncmp + tst-size_t-wcsncmp tst-size_t-wcsnlen endif diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c new file mode 100644 index 0000000000..690a4a8a31 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c @@ -0,0 +1,72 @@ +/* Test strnlen with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wcsnlen" +#else +# define TEST_NAME "strnlen" +#endif /* WIDE */ + +#include "test-size_t.h" + +#ifdef WIDE +# include +# define STRNLEN wcsnlen +# define CHAR wchar_t +#else +# define STRNLEN strnlen +# define CHAR char +#endif /* WIDE */ + +IMPL (STRNLEN, 1) + +typedef size_t (*proto_t) (const CHAR *, size_t); + +static size_t +__attribute__ ((noinline, noclone)) +do_strnlen (parameter_t a, parameter_t b) +{ + return CALL (&a, a.p, b.len); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR); + parameter_t src = { { 0 }, buf2 }; + parameter_t c = { { size }, (void *) (uintptr_t) 'a' }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + size_t res = do_strnlen (src, c); + if (res != size) + { + error (0, 0, "Wrong result in function %s: 0x%x != 0x%x", + impl->name, res, size); + ret = 1; + } + } + + return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c new file mode 100644 index 0000000000..093b4bbe1b --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c @@ -0,0 +1,20 @@ +/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-strnlen.c" -- cgit 1.4.1 From 04e767b59b16befce93c6086362acbc1fae63f4e Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 4 Feb 2019 08:55:52 -0800 Subject: x86-64 memcmp: Use unsigned Jcc instructions on size [BZ #24155] Since the size argument is unsigned. we should use unsigned Jcc instructions, instead of signed, to check size. Tested on x86-64 and x32, with and without --disable-multi-arch. [BZ #24155] CVE-2019-7309 * NEWS: Updated for CVE-2019-7309. * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the upper 32 bits of RDX register for x32. Use unsigned Jcc instructions, instead of signed. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. (cherry picked from commit 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d) --- ChangeLog | 11 +++++ NEWS | 10 ++++ sysdeps/x86_64/memcmp.S | 20 ++++---- sysdeps/x86_64/x32/Makefile | 3 +- sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 9 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c diff --git a/ChangeLog b/ChangeLog index bc1f461a48..6570bc66bd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-02-04 H.J. Lu + + [BZ #24155] + CVE-2019-7309 + * NEWS: Updated for CVE-2019-7309. + * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the + upper 32 bits of RDX register for x32. Use unsigned Jcc + instructions, instead of signed. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. + * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/NEWS b/NEWS index 1bea44cc8d..735c578726 100644 --- a/NEWS +++ b/NEWS @@ -92,6 +92,14 @@ Security related changes: in the assembly string/memory functions would cause a buffer overflow. Reported by H.J. Lu. + CVE-2019-7309: x86-64 memcmp used signed Jcc instructions to check + size. For x86-64, memcmp on an object size larger than SSIZE_MAX + has undefined behavior. On x32, the size_t argument may be passed + in the lower 32 bits of the 64-bit RDX register with non-zero upper + 32 bits. When it happened with the sign bit of RDX register set, + memcmp gave the wrong result since it treated the size argument as + zero. Reported by H.J. Lu. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. 
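
To make the failure mode described above concrete, here is a stand-alone C sketch (illustrative only, not glibc code; the constant is arbitrary) of why a signed check mishandles a length register whose sign bit is set:

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* The 64-bit value memcmp saw in RDX when an x32 caller left the
       upper 32 bits, including the sign bit, uncleared; the caller's
       real size_t is only the low 32 bits (0x100 here).  */
    uint64_t size = UINT64_C (0x8000000000000100);

    /* Unsigned check, as with jbe/jae: the value is huge, the
       small-size path is not taken.  */
    printf ("unsigned: size <= 1 -> %d\n", size <= 1);

    /* Signed check, as with the old jle/jge: the value looks negative,
       so the small-size path is taken and the length is mishandled.  */
    printf ("signed:   size <= 1 -> %d\n", (int64_t) size <= 1);

    return 0;
  }
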
@@ -172,6 +180,8 @@ The following bugs are resolved with this release: [24018] gettext may return NULL [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) + [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + Version 2.26 diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 0828a22534..9ae90661c8 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -21,14 +21,18 @@ .text ENTRY (memcmp) - test %rdx, %rdx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif + test %RDX_LP, %RDX_LP jz L(finz) cmpq $1, %rdx - jle L(finr1b) + jbe L(finr1b) subq %rdi, %rsi movq %rdx, %r10 cmpq $32, %r10 - jge L(gt32) + jae L(gt32) /* Handle small chunks and last block of less than 32 bytes. */ L(small): testq $1, %r10 @@ -156,7 +160,7 @@ L(A32): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) /* Pre-unroll to be ready for unrolled 64B loop. */ testq $32, %rdi jz L(A64) @@ -178,7 +182,7 @@ L(A64): movq %r11, %r10 andq $-64, %r10 cmpq %r10, %rdi - jge L(mt32) + jae L(mt32) L(A64main): movdqu (%rdi,%rsi), %xmm0 @@ -216,7 +220,7 @@ L(mt32): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) L(A32main): movdqu (%rdi,%rsi), %xmm0 @@ -254,7 +258,7 @@ L(ATR): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) testq $16, %rdi jz L(ATR32) @@ -325,7 +329,7 @@ L(ATR64main): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) L(ATR32res): movdqa (%rdi,%rsi), %xmm0 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 1557724b0c..8748956563 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,7 +8,8 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen + tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \ + tst-size_t-memcmp-2 endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c new file mode 100644 index 0000000000..d8ae1a0813 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c @@ -0,0 +1,79 @@ +/* Test memcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_MAIN +#ifdef WIDE +# define TEST_NAME "wmemcmp" +#else +# define TEST_NAME "memcmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include +# include + +# define MEMCMP wmemcmp +# define CHAR wchar_t +#else +# define MEMCMP memcmp +# define CHAR char +#endif + +IMPL (MEMCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_memcmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + memcpy (buf1, buf2, page_size); + + CHAR *p = (CHAR *) buf1; + p[page_size / sizeof (CHAR) - 1] = (CHAR) 1; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_memcmp (dest, src); + if (res >= 0) + { + error (0, 0, "Wrong result in function %s: %i >= 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From c165427d55d4eefca2dd324073c52f54db0db15a Mon Sep 17 00:00:00 2001 From: Stefan Liebler Date: Thu, 7 Feb 2019 15:18:36 +0100 Subject: Add compiler barriers around modifications of the robust mutex list for pthread_mutex_trylock. [BZ #24180] While debugging a kernel warning, Thomas Gleixner, Sebastian Sewior and Heiko Carstens found a bug in pthread_mutex_trylock due to misordered instructions: 140: a5 1b 00 01 oill %r1,1 144: e5 48 a0 f0 00 00 mvghi 240(%r10),0 <--- THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); 14a: e3 10 a0 e0 00 24 stg %r1,224(%r10) <--- last THREAD_SETMEM of ENQUEUE_MUTEX_PI vs (with compiler barriers): 140: a5 1b 00 01 oill %r1,1 144: e3 10 a0 e0 00 24 stg %r1,224(%r10) 14a: e5 48 a0 f0 00 00 mvghi 240(%r10),0 Please have a look at the discussion: "Re: WARN_ON_ONCE(!new_owner) within wake_futex_pi() triggerede" (https://lore.kernel.org/lkml/20190202112006.GB3381@osiris/) This patch is introducing the same compiler barriers and comments for pthread_mutex_trylock as introduced for pthread_mutex_lock and pthread_mutex_timedlock by commit 8f9450a0b7a9e78267e8ae1ab1000ebca08e473e "Add compiler barriers around modifications of the robust mutex list." ChangeLog: [BZ #24180] * nptl/pthread_mutex_trylock.c (__pthread_mutex_trylock): Add compiler barriers and comments. (cherry picked from commit 823624bdc47f1f80109c9c52dee7939b9386d708) --- ChangeLog | 6 +++++ nptl/pthread_mutex_trylock.c | 57 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6570bc66bd..b0afd11b08 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2019-02-07 Stefan Liebler + + [BZ #24180] + * nptl/pthread_mutex_trylock.c (__pthread_mutex_trylock): + Add compiler barriers and comments. + 2019-02-04 H.J. Lu [BZ #24155] diff --git a/nptl/pthread_mutex_trylock.c b/nptl/pthread_mutex_trylock.c index ec7da61c73..d478eca21b 100644 --- a/nptl/pthread_mutex_trylock.c +++ b/nptl/pthread_mutex_trylock.c @@ -92,6 +92,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP: THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, &mutex->__data.__list.__next); + /* We need to set op_pending before starting the operation. Also + see comments at ENQUEUE_MUTEX. 
*/ + __asm ("" ::: "memory"); oldval = mutex->__data.__lock; do @@ -117,7 +120,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) /* But it is inconsistent unless marked otherwise. */ mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT; + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Note that we deliberately exist here. If we fall @@ -133,6 +141,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) int kind = PTHREAD_MUTEX_TYPE (mutex); if (kind == PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP) { + /* We do not need to ensure ordering wrt another memory + access. Also see comments at ENQUEUE_MUTEX. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EDEADLK; @@ -140,6 +150,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (kind == PTHREAD_MUTEX_ROBUST_RECURSIVE_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); @@ -158,6 +170,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) id, 0); if (oldval != 0 && (oldval & FUTEX_OWNER_DIED) == 0) { + /* We haven't acquired the lock as it is already acquired by + another owner. We do not need to ensure ordering wrt another + memory access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -171,13 +186,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (oldval == id) lll_unlock (mutex->__data.__lock, PTHREAD_ROBUST_MUTEX_PSHARED (mutex)); + /* FIXME This violates the mutex destruction requirements. See + __pthread_mutex_unlock_full. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return ENOTRECOVERABLE; } } while ((oldval & FUTEX_OWNER_DIED) != 0); + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); mutex->__data.__owner = id; @@ -203,10 +225,15 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP; if (robust) - /* Note: robust PI futexes are signaled by setting bit 0. */ - THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, - (void *) (((uintptr_t) &mutex->__data.__list.__next) - | 1)); + { + /* Note: robust PI futexes are signaled by setting bit 0. */ + THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, + (void *) (((uintptr_t) &mutex->__data.__list.__next) + | 1)); + /* We need to set op_pending before starting the operation. Also + see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); + } oldval = mutex->__data.__lock; @@ -215,12 +242,16 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) { if (kind == PTHREAD_MUTEX_ERRORCHECK_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EDEADLK; } if (kind == PTHREAD_MUTEX_RECURSIVE_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Just bump the counter. 
*/ @@ -242,6 +273,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) { if ((oldval & FUTEX_OWNER_DIED) == 0) { + /* We haven't acquired the lock as it is already acquired by + another owner. We do not need to ensure ordering wrt another + memory access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -262,6 +296,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (INTERNAL_SYSCALL_ERROR_P (e, __err) && INTERNAL_SYSCALL_ERRNO (e, __err) == EWOULDBLOCK) { + /* The kernel has not yet finished the mutex owner death. + We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -279,7 +316,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) /* But it is inconsistent unless marked otherwise. */ mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT; + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Note that we deliberately exit here. If we fall @@ -302,13 +344,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) PTHREAD_ROBUST_MUTEX_PSHARED (mutex)), 0, 0); + /* To the kernel, this will be visible after the kernel has + acquired the mutex in the syscall. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return ENOTRECOVERABLE; } if (robust) { + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX_PI (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); } -- cgit 1.4.1 From 4385ec1d8af4203b23dce8c9dc2f1aff5acaf094 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 14 May 2019 17:14:59 +0200 Subject: Fix crash in _IO_wfile_sync (bug 20568) When computing the length of the converted part of the stdio buffer, use the number of consumed wide characters, not the (negative) distance to the end of the wide buffer. (cherry picked from commit 32ff397533715988c19cbf3675dcbd727ec13e18) --- ChangeLog | 10 ++++++++++ NEWS | 1 + libio/Makefile | 3 ++- libio/tst-wfile-sync.c | 39 +++++++++++++++++++++++++++++++++++++++ libio/tst-wfile-sync.input | 1 + libio/wfileops.c | 5 +++-- 6 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 libio/tst-wfile-sync.c create mode 100644 libio/tst-wfile-sync.input diff --git a/ChangeLog b/ChangeLog index b0afd11b08..78924ef13e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-05-15 Andreas Schwab + + [BZ #20568] + * libio/wfileops.c (_IO_wfile_sync): Correct last argument to + __codecvt_do_length. + * libio/Makefile (tests): Add tst-wfile-sync. + ($(objpfx)tst-wfile-sync.out): Depend on $(gen-locales). + * libio/tst-wfile-sync.c: New file. + * libio/tst-wfile-sync.input: New file. 
+ 2019-02-07 Stefan Liebler [BZ #24180] diff --git a/NEWS b/NEWS index 735c578726..c6c44a3914 100644 --- a/NEWS +++ b/NEWS @@ -107,6 +107,7 @@ The following bugs are resolved with this release: [17956] crypt: Use NSPR header files in addition to NSS header files [20419] elf: Fix stack overflow with huge PT_NOTE segment [20532] getaddrinfo: More robust handling of dlopen failures + [20568] Fix crash in _IO_wfile_sync [21242] assert: Suppress pedantic warning caused by statement expression [21265] x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [21269] i386 sigaction sa_restorer handling is wrong diff --git a/libio/Makefile b/libio/Makefile index 74bf5279f1..79158f3ebd 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -62,7 +62,7 @@ tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \ bug-memstream1 bug-wmemstream1 \ tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \ tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \ - tst-ftell-append tst-fputws + tst-ftell-append tst-fputws tst-wfile-sync tests-internal = tst-vtables tst-vtables-interposed @@ -202,6 +202,7 @@ $(objpfx)tst-ungetwc1.out: $(gen-locales) $(objpfx)tst-ungetwc2.out: $(gen-locales) $(objpfx)tst-widetext.out: $(gen-locales) $(objpfx)tst_wprintf2.out: $(gen-locales) +$(objpfx)tst-wfile-sync.out: $(gen-locales) endif $(objpfx)test-freopen.out: test-freopen.sh $(objpfx)test-freopen diff --git a/libio/tst-wfile-sync.c b/libio/tst-wfile-sync.c new file mode 100644 index 0000000000..618682064d --- /dev/null +++ b/libio/tst-wfile-sync.c @@ -0,0 +1,39 @@ +/* Test that _IO_wfile_sync does not crash (bug 20568). + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +static int +do_test (void) +{ + TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL); + /* Fill the stdio buffer and advance the read pointer. */ + TEST_VERIFY_EXIT (fgetwc (stdin) != WEOF); + /* This calls _IO_wfile_sync, it should not crash. */ + TEST_VERIFY_EXIT (setvbuf (stdin, NULL, _IONBF, 0) == 0); + /* Verify that the external file offset has been synchronized. */ + TEST_COMPARE (xlseek (0, 0, SEEK_CUR), 1); + + return 0; +} + +#include diff --git a/libio/tst-wfile-sync.input b/libio/tst-wfile-sync.input new file mode 100644 index 0000000000..12d0958f7a --- /dev/null +++ b/libio/tst-wfile-sync.input @@ -0,0 +1 @@ +This is a test of _IO_wfile_sync. diff --git a/libio/wfileops.c b/libio/wfileops.c index fb94f45040..727e1b23b9 100644 --- a/libio/wfileops.c +++ b/libio/wfileops.c @@ -526,11 +526,12 @@ _IO_wfile_sync (_IO_FILE *fp) generate the wide characters up to the current reading position. 
*/ int nread; - + size_t wnread = (fp->_wide_data->_IO_read_ptr + - fp->_wide_data->_IO_read_base); fp->_wide_data->_IO_state = fp->_wide_data->_IO_last_state; nread = (*cv->__codecvt_do_length) (cv, &fp->_wide_data->_IO_state, fp->_IO_read_base, - fp->_IO_read_end, delta); + fp->_IO_read_end, wnread); fp->_IO_read_ptr = fp->_IO_read_base + nread; delta = -(fp->_IO_read_end - fp->_IO_read_base - nread); } -- cgit 1.4.1 From ac92c66821fae2ae6587114acc454f97a6bc7859 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Fri, 10 May 2019 16:38:21 +0100 Subject: Fix tcache count maximum (BZ #24531) The tcache counts[] array is a char, which has a very small range and thus may overflow. When setting tcache_count tunable, there is no overflow check. However the tunable must not be larger than the maximum value of the tcache counts[] array, otherwise it can overflow when filling the tcache. [BZ #24531] * malloc/malloc.c (MAX_TCACHE_COUNT): New define. (do_set_tcache_count): Only update if count is small enough. * manual/tunables.texi (glibc.malloc.tcache_count): Document max value. (cherry picked from commit 5ad533e8e65092be962e414e0417112c65d154fb) --- ChangeLog | 7 +++++++ malloc/malloc.c | 9 +++++++-- manual/tunables.texi | 4 ++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 78924ef13e..fff671fd43 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2019-05-22 Wilco Dijkstra + + [BZ #24531] + * malloc/malloc.c (MAX_TCACHE_COUNT): New define. + (do_set_tcache_count): Only update if count is small enough. + * manual/tunables.texi (glibc.malloc.tcache_count): Document max value. + 2019-05-15 Andreas Schwab [BZ #20568] diff --git a/malloc/malloc.c b/malloc/malloc.c index 49e8ed69c2..9896230b21 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -2924,6 +2924,8 @@ typedef struct tcache_perthread_struct tcache_entry *entries[TCACHE_MAX_BINS]; } tcache_perthread_struct; +#define MAX_TCACHE_COUNT 127 /* Maximum value of counts[] entries. */ + static __thread bool tcache_shutting_down = false; static __thread tcache_perthread_struct *tcache = NULL; @@ -5097,8 +5099,11 @@ static inline int __always_inline do_set_tcache_count (size_t value) { - LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count); - mp_.tcache_count = value; + if (value <= MAX_TCACHE_COUNT) + { + LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count); + mp_.tcache_count = value; + } return 1; } diff --git a/manual/tunables.texi b/manual/tunables.texi index b09e3fe791..a23c8d076a 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -187,8 +187,8 @@ per-thread cache. The default (and maximum) value is 1032 bytes on @deftp Tunable glibc.malloc.tcache_count The maximum number of chunks of each size to cache. The default is 7. -There is no upper limit, other than available system memory. If set -to zero, the per-thread cache is effectively disabled. +The upper limit is 127. If set to zero, the per-thread cache is effectively +disabled. 
The approximate maximum overhead of the per-thread cache is thus equal to the number of bins times the chunk count in each bin times the size -- cgit 1.4.1 From 71c2578a9b0ea0e5a37c88ee01885213f40eaeea Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 25 Apr 2019 15:35:35 +0100 Subject: aarch64: add STO_AARCH64_VARIANT_PCS and DT_AARCH64_VARIANT_PCS STO_AARCH64_VARIANT_PCS is a non-visibility st_other flag for marking symbols that reference functions that may follow a variant PCS with different register usage convention from the base PCS. DT_AARCH64_VARIANT_PCS is a dynamic tag that marks ELF modules that have R_*_JUMP_SLOT relocations for symbols marked with STO_AARCH64_VARIANT_PCS (i.e. have variant PCS calls via a PLT). * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. (DT_AARCH64_VARIANT_PCS): Define. --- ChangeLog | 5 +++++ elf/elf.h | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index fff671fd43..55dde7df0c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-06-13 Szabolcs Nagy + + * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. + (DT_AARCH64_VARIANT_PCS): Define. + 2019-05-22 Wilco Dijkstra [BZ #24531] diff --git a/elf/elf.h b/elf/elf.h index 3900b4c9f0..f80506c562 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -2759,6 +2759,13 @@ enum #define R_AARCH64_TLSDESC 1031 /* TLS Descriptor. */ #define R_AARCH64_IRELATIVE 1032 /* STT_GNU_IFUNC relocation. */ +/* AArch64 specific values for the Dyn d_tag field. */ +#define DT_AARCH64_VARIANT_PCS (DT_LOPROC + 5) +#define DT_AARCH64_NUM 6 + +/* AArch64 specific values for the st_other field. */ +#define STO_AARCH64_VARIANT_PCS 0x80 + /* ARM relocs. */ #define R_ARM_NONE 0 /* No reloc */ -- cgit 1.4.1 From f2f501ff397953d711ce249bc705fb6139ab1f7b Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 25 Apr 2019 15:35:35 +0100 Subject: aarch64: handle STO_AARCH64_VARIANT_PCS Backport of commit 82bc69c012838a381c4167c156a06f4598f34227 and commit 30ba0375464f34e4bf8129f3d3dc14d0c09add17 without using DT_AARCH64_VARIANT_PCS for optimizing the symbol table check. This is needed so the internal abi between ld.so and libc.so is unchanged. Avoid lazy binding of symbols that may follow a variant PCS with different register usage convention from the base PCS. Currently the lazy binding entry code does not preserve all the registers required for AdvSIMD and SVE vector calls. Saving and restoring all registers unconditionally may break existing binaries, even if they never use vector calls, because of the larger stack requirement for lazy resolution, which can be significant on an SVE system. The solution is to mark all symbols in the symbol table that may follow a variant PCS so the dynamic linker can handle them specially. In this patch such symbols are always resolved at load time, not lazily. So currently LD_AUDIT for variant PCS symbols are not supported, for that the _dl_runtime_profile entry needs to be changed e.g. to unconditionally save/restore all registers (but pass down arg and retval registers to pltentry/exit callbacks according to the base PCS). This patch also removes a __builtin_expect from the modified code because the branch prediction hint did not seem useful. * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check STO_AARCH64_VARIANT_PCS and bind such symbols at load time. 
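
To make the new constants concrete, the sketch below shows the kind of per-symbol test a loader or ELF tool can apply; it is illustrative only and assumes an elf.h that already carries the definitions added here. The next patch wires an equivalent check into ld.so's lazy-relocation path so that marked symbols are bound at load time.

  #include <elf.h>
  #include <stdbool.h>

  /* Return true if an AArch64 dynamic symbol is marked as possibly
     following a variant procedure-call standard (for example AdvSIMD
     or SVE vector calls), in which case lazy binding through the
     ordinary PLT resolver is unsafe because not all argument registers
     are preserved there.  */
  static bool
  aarch64_sym_is_variant_pcs (const Elf64_Sym *sym)
  {
    return (sym->st_other & STO_AARCH64_VARIANT_PCS) != 0;
  }
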
--- ChangeLog | 5 +++++ sysdeps/aarch64/dl-machine.h | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 55dde7df0c..8a2fad8626 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-07-12 Szabolcs Nagy + + * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check + STO_AARCH64_VARIANT_PCS and bind such symbols at load time. + 2019-06-13 Szabolcs Nagy * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 3fb00e6e2d..9ffc2e4c9d 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -391,10 +391,37 @@ elf_machine_lazy_rel (struct link_map *map, /* Check for unexpected PLT reloc type. */ if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { - if (__builtin_expect (map->l_mach.plt, 0) == 0) - *reloc_addr += l_addr; - else - *reloc_addr = map->l_mach.plt; + if (map->l_mach.plt == 0) + { + /* Prelinking. */ + *reloc_addr += l_addr; + return; + } + + if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check. */ + { + /* Check the symbol table for variant PCS symbols. */ + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info); + const ElfW (Sym) *symtab = + (const void *)D_PTR (map, l_info[DT_SYMTAB]); + const ElfW (Sym) *sym = &symtab[symndx]; + if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS)) + { + /* Avoid lazy resolution of variant PCS symbols. */ + const struct r_found_version *version = NULL; + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW (Half) *vernum = + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]); + version = &map->l_versions[vernum[symndx] & 0x7fff]; + } + elf_machine_rela (map, reloc, sym, version, reloc_addr, + skip_ifunc); + return; + } + } + + *reloc_addr = map->l_mach.plt; } else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { -- cgit 1.4.1 From 54194d8b4d097fd3a9cb9528a0ad686622d6c182 Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Tue, 18 Jun 2019 14:09:09 -0300 Subject: posix: Fix large mmap64 offset for mips64n32 (BZ#24699) The fix for BZ#21270 (commit 158d5fa0e19) added a mask to avoid offset larger than 1^44 to be used along __NR_mmap2. However mips64n32 users __NR_mmap, as mips64n64, but still defines off_t as old non-LFS type (other ILP32, such x32, defines off_t being equal to off64_t). This leads to use the same mask meant only for __NR_mmap2 call for __NR_mmap, thus limiting the maximum offset it can use with mmap64. This patch fixes by setting the high mask only for __NR_mmap2 usage. The posix/tst-mmap-offset.c already tests it and also fails for mips64n32. The patch also change the test to check for an arch-specific header that defines the maximum supported offset. Checked on x86_64-linux-gnu, i686-linux-gnu, and I also tests tst-mmap-offset on qemu simulated mips64 with kernel 3.2.0 kernel for both mips-linux-gnu and mips64-n32-linux-gnu. [BZ #24699] * posix/tst-mmap-offset.c: Mention BZ #24699. (do_test_bz21270): Rename to do_test_large_offset and use mmap64_maximum_offset to check for maximum expected offset value. * sysdeps/generic/mmap_info.h: New file. * sysdeps/unix/sysv/linux/mips/mmap_info.h: Likewise. * sysdeps/unix/sysv/linux/mmap64.c (MMAP_OFF_HIGH_MASK): Define iff __NR_mmap2 is used. 
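
For reference, the limit being enforced can be spelled out as a small stand-alone program. This mirrors the generic mmap_info.h logic added below; it is illustrative only, uses sizeof (off_t) < 8 as a stand-in for the off_t versus off64_t comparison, and derives the page shift with __builtin_ctzl.

  #include <stdint.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <unistd.h>

  int
  main (void)
  {
    long page_size = sysconf (_SC_PAGESIZE);
    int page_shift = __builtin_ctzl ((unsigned long) page_size);
    uint64_t max_offset;

    if (sizeof (off_t) < 8)
      /* Generic rule for ABIs that go through __NR_mmap2: the offset is
         passed in page-size units, so offsets at or above
         1 << (page_shift + 8 * sizeof (off_t)) cannot be represented
         and glibc's mmap64 rejects them with EINVAL.  With 4096-byte
         pages and a 32-bit off_t that is 1 << 44.  mips64n32 is the
         exception this patch adds: it has a 32-bit off_t but uses
         __NR_mmap, so its own mmap_info.h reports the full 64-bit
         range instead.  */
      max_offset = (UINT64_C (1) << (page_shift + 8 * sizeof (off_t))) - 1;
    else
      max_offset = UINT64_MAX;

    printf ("largest usable mmap64 offset here: %#llx\n",
            (unsigned long long) max_offset);
    return 0;
  }
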
(cherry picked from commit a008c76b56e4f958cf5a0d6f67d29fade89421b7) --- ChangeLog | 11 +++++++++++ posix/tst-mmap-offset.c | 9 +++++---- sysdeps/generic/mmap_info.h | 16 ++++++++++++++++ sysdeps/unix/sysv/linux/mips/mmap_info.h | 13 +++++++++++++ sysdeps/unix/sysv/linux/mmap64.c | 9 ++++++++- 5 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 sysdeps/generic/mmap_info.h create mode 100644 sysdeps/unix/sysv/linux/mips/mmap_info.h diff --git a/ChangeLog b/ChangeLog index 8a2fad8626..4cd747a6f0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-07-12 Adhemerval Zanella + + [BZ #24699] + * posix/tst-mmap-offset.c: Mention BZ #24699. + (do_test_bz21270): Rename to do_test_large_offset and use + mmap64_maximum_offset to check for maximum expected offset value. + * sysdeps/generic/mmap_info.h: New file. + * sysdeps/unix/sysv/linux/mips/mmap_info.h: Likewise. + * sysdeps/unix/sysv/linux/mmap64.c (MMAP_OFF_HIGH_MASK): Define iff + __NR_mmap2 is used. + 2019-07-12 Szabolcs Nagy * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check diff --git a/posix/tst-mmap-offset.c b/posix/tst-mmap-offset.c index 5bb88aab10..cfd82484f6 100644 --- a/posix/tst-mmap-offset.c +++ b/posix/tst-mmap-offset.c @@ -1,4 +1,4 @@ -/* BZ #18877 and #21270 mmap offset test. +/* BZ #18877, BZ #21270, and BZ #24699 mmap offset test. Copyright (C) 2015-2017 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -76,7 +77,7 @@ do_test_bz18877 (void) /* Check if invalid offset are handled correctly by mmap. */ static int -do_test_bz21270 (void) +do_test_large_offset (void) { /* For architectures with sizeof (off_t) < sizeof (off64_t) mmap is implemented with __SYS_mmap2 syscall and the offset is represented in @@ -90,7 +91,7 @@ do_test_bz21270 (void) const size_t length = 4096; void *addr = mmap64 (NULL, length, prot, flags, fd, offset); - if (sizeof (off_t) < sizeof (off64_t)) + if (mmap64_maximum_offset (page_shift) < UINT64_MAX) { if ((addr != MAP_FAILED) && (errno != EINVAL)) FAIL_RET ("mmap succeed"); @@ -110,7 +111,7 @@ do_test (void) int ret = 0; ret += do_test_bz18877 (); - ret += do_test_bz21270 (); + ret += do_test_large_offset (); return ret; } diff --git a/sysdeps/generic/mmap_info.h b/sysdeps/generic/mmap_info.h new file mode 100644 index 0000000000..b3087df2d3 --- /dev/null +++ b/sysdeps/generic/mmap_info.h @@ -0,0 +1,16 @@ +/* As default architectures with sizeof (off_t) < sizeof (off64_t) the mmap is + implemented with __SYS_mmap2 syscall and the offset is represented in + multiples of page size. For offset larger than + '1 << (page_shift + 8 * sizeof (off_t))' (that is, 1<<44 on system with + page size of 4096 bytes) the system call silently truncates the offset. + For this case, glibc mmap implementation returns EINVAL. */ + +/* Return the maximum value expected as offset argument in mmap64 call. */ +static inline uint64_t +mmap64_maximum_offset (long int page_shift) +{ + if (sizeof (off_t) < sizeof (off64_t)) + return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1; + else + return UINT64_MAX; +} diff --git a/sysdeps/unix/sysv/linux/mips/mmap_info.h b/sysdeps/unix/sysv/linux/mips/mmap_info.h new file mode 100644 index 0000000000..07c9e3a044 --- /dev/null +++ b/sysdeps/unix/sysv/linux/mips/mmap_info.h @@ -0,0 +1,13 @@ +/* mips64n32 uses __NR_mmap for mmap64 while still having sizeof (off_t) + smaller than sizeof (off64_t). 
So it allows mapping large offsets + using mmap64 than 32-bit archs which uses __NR_mmap2. */ + +static inline uint64_t +mmap64_maximum_offset (long int page_shift) +{ +#if _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64 + return UINT64_MAX; +#else + return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1; +#endif +} diff --git a/sysdeps/unix/sysv/linux/mmap64.c b/sysdeps/unix/sysv/linux/mmap64.c index e8d519b17a..8441a9caa7 100644 --- a/sysdeps/unix/sysv/linux/mmap64.c +++ b/sysdeps/unix/sysv/linux/mmap64.c @@ -23,11 +23,18 @@ #include #include +#ifdef __NR_mmap2 /* To avoid silent truncation of offset when using mmap2, do not accept offset larger than 1 << (page_shift + off_t bits). For archictures with 32 bits off_t and page size of 4096 it would be 1^44. */ -#define MMAP_OFF_HIGH_MASK \ +# define MMAP_OFF_HIGH_MASK \ ((-(MMAP2_PAGE_UNIT << 1) << (8 * sizeof (off_t) - 1))) +#else +/* Some ABIs might use __NR_mmap while having sizeof (off_t) smaller than + sizeof (off64_t) (currently only MIPS64n32). For this case just set + zero the higher bits so mmap with large offset does not fail. */ +# define MMAP_OFF_HIGH_MASK 0x0 +#endif #define MMAP_OFF_MASK (MMAP_OFF_HIGH_MASK | MMAP_OFF_LOW_MASK) -- cgit 1.4.1 From 1896de3d926d299a1ed5c9f0a4f03f5a81969200 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Aug 2017 17:00:38 +0100 Subject: [AArch64] Optimized memcmp. This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. * sysdeps/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. (cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5) --- ChangeLog | 5 ++ sysdeps/aarch64/memcmp.S | 176 +++++++++++++++++++---------------------------- 2 files changed, 76 insertions(+), 105 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4cd747a6f0..5783090424 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Wilco Dijkstra + + * sysdeps/aarch64/memcmp.S (memcmp): + Rewrite of optimized memcmp. + 2019-07-12 Adhemerval Zanella [BZ #24699] diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 4cfcb89297..b99c081bba 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -22,132 +22,98 @@ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. 
*/ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define tmp1 x5 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) DELOUSE (1) DELOUSE (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): -#ifndef __AARCH64EB__ - rev diff, diff + subs limit, limit, 8 + b.lo .Lless8 + + /* Limit >= 8, so check first 8 bytes using unaligned loads. */ + ldr data1, [src1], 8 + ldr data2, [src2], 8 + and tmp1, src1, 7 + add limit, limit, tmp1 + cmp data1, data2 + bne .Lreturn + + /* Align src1 and adjust src2 with bytes not yet done. */ + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + subs limit, limit, 8 + b.ls .Llast_bytes + + /* Loop performing 8 bytes per iteration using aligned src1. + Limit is pre-decremented by 8 and must be larger than zero. + Exit if <= 8 bytes left to do or if the data is not equal. */ + .p2align 4 +.Lloop8: + ldr data1, [src1], 8 + ldr data2, [src2], 8 + subs limit, limit, 8 + ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ + b.eq .Lloop8 + + cmp data1, data2 + bne .Lreturn + + /* Compare last 1-8 bytes using unaligned access. */ +.Llast_bytes: + ldr data1, [src1, limit] + ldr data2, [src2, limit] + + /* Compare data bytes and set return value to 0, -1 or 1. */ +.Lreturn: +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). 
*/ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +.Lret_eq: + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +.Lless8: + adds limit, limit, 4 + b.lo .Lless4 + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne .Lreturn + sub limit, limit, 4 +.Lless4: + adds limit, limit, 4 + beq .Lret_eq +.Lbyte_loop: + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq .Lbyte_loop + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp) -- cgit 1.4.1 From 600e4e866c4de0cc0b16aec482c65da732960367 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 2 Feb 2018 10:15:20 +0530 Subject: aarch64: Use the L() macro for labels in memcmp The L() macro makes the assembly a bit more readable. * sysdeps/aarch64/memcmp.S: Use L() macro for labels. (cherry picked from commit 84c94d2fd90d84ae7e67657ee8e22c2d1b796f63) --- ChangeLog | 4 ++++ sysdeps/aarch64/memcmp.S | 32 ++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5783090424..8674417ed7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Use L() macro for labels. + 2019-09-06 Wilco Dijkstra * sysdeps/aarch64/memcmp.S (memcmp): diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index b99c081bba..708c8278b7 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -44,7 +44,7 @@ ENTRY_ALIGN (memcmp, 6) DELOUSE (2) subs limit, limit, 8 - b.lo .Lless8 + b.lo L(less8) /* Limit >= 8, so check first 8 bytes using unaligned loads. */ ldr data1, [src1], 8 @@ -52,65 +52,65 @@ ENTRY_ALIGN (memcmp, 6) and tmp1, src1, 7 add limit, limit, tmp1 cmp data1, data2 - bne .Lreturn + bne L(return) /* Align src1 and adjust src2 with bytes not yet done. */ sub src1, src1, tmp1 sub src2, src2, tmp1 subs limit, limit, 8 - b.ls .Llast_bytes + b.ls L(last_bytes) /* Loop performing 8 bytes per iteration using aligned src1. Limit is pre-decremented by 8 and must be larger than zero. Exit if <= 8 bytes left to do or if the data is not equal. */ .p2align 4 -.Lloop8: +L(loop8): ldr data1, [src1], 8 ldr data2, [src2], 8 subs limit, limit, 8 ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ - b.eq .Lloop8 + b.eq L(loop8) cmp data1, data2 - bne .Lreturn + bne L(return) /* Compare last 1-8 bytes using unaligned access. */ -.Llast_bytes: +L(last_bytes): ldr data1, [src1, limit] ldr data2, [src2, limit] /* Compare data bytes and set return value to 0, -1 or 1. */ -.Lreturn: +L(return): #ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif cmp data1, data2 -.Lret_eq: +L(ret_eq): cset result, ne cneg result, result, lo ret .p2align 4 /* Compare up to 8 bytes. Limit is [-8..-1]. 
*/ -.Lless8: +L(less8): adds limit, limit, 4 - b.lo .Lless4 + b.lo L(less4) ldr data1w, [src1], 4 ldr data2w, [src2], 4 cmp data1w, data2w - b.ne .Lreturn + b.ne L(return) sub limit, limit, 4 -.Lless4: +L(less4): adds limit, limit, 4 - beq .Lret_eq -.Lbyte_loop: + beq L(ret_eq) +L(byte_loop): ldrb data1w, [src1], 1 ldrb data2w, [src2], 1 subs limit, limit, 1 ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq .Lbyte_loop + b.eq L(byte_loop) sub result, data1w, data2w ret -- cgit 1.4.1 From ec4512194f035856b8a231476c9139d72f47c58f Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 6 Mar 2018 19:22:39 +0530 Subject: aarch64: Optimized memcmp for medium to large sizes This improved memcmp provides a fast path for compares up to 16 bytes and then compares 16 bytes at a time, thus optimizing loads from both sources. The glibc memcmp microbenchmark retains performance (with an error of ~1ns) for smaller compare sizes and reduces up to 31% of execution time for compares up to 4K on the APM Mustang. On Qualcomm Falkor this improves to almost 48%, i.e. it is almost 2x improvement for sizes of 2K and above. * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time. (cherry picked from commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585) --- ChangeLog | 5 ++++ sysdeps/aarch64/memcmp.S | 76 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8674417ed7..204d047c0d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a + time. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/memcmp.S: Use L() macro for labels. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 708c8278b7..d074c98615 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -34,9 +34,12 @@ /* Internal variables. */ #define data1 x3 #define data1w w3 -#define data2 x4 -#define data2w w4 -#define tmp1 x5 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) @@ -46,39 +49,70 @@ ENTRY_ALIGN (memcmp, 6) subs limit, limit, 8 b.lo L(less8) - /* Limit >= 8, so check first 8 bytes using unaligned loads. */ ldr data1, [src1], 8 ldr data2, [src2], 8 - and tmp1, src1, 7 - add limit, limit, tmp1 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 cmp data1, data2 bne L(return) + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop8) + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 sub src1, src1, tmp1 sub src2, src2, tmp1 - subs limit, limit, 8 - b.ls L(last_bytes) - - /* Loop performing 8 bytes per iteration using aligned src1. - Limit is pre-decremented by 8 and must be larger than zero. - Exit if <= 8 bytes left to do or if the data is not equal. */ + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. 
*/ .p2align 4 -L(loop8): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - subs limit, limit, 8 - ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ - b.eq L(loop8) +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 bne L(return) - /* Compare last 1-8 bytes using unaligned access. */ + /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - ldr data1, [src1, limit] - ldr data2, [src2, limit] + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): -- cgit 1.4.1 From 8569357e11aa7b8e912142727eac1d106c785433 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 6 Mar 2018 22:56:35 +0530 Subject: aarch64: Fix branch target to loop16 I goofed up when changing the loop8 name to loop16 and missed on out the branch instance. Fixed and actually build tested this time. * sysdeps/aarch64/memcmp.S (more16): Fix branch target loop16. (cherry picked from commit 4e54d918630ea53e29dd70d3bdffcb00d29ed3d4) --- ChangeLog | 2 ++ sysdeps/aarch64/memcmp.S | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 204d047c0d..18a01ed12b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2019-09-06 Siddhesh Poyarekar + * sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target. + * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index d074c98615..a741e7b17f 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -75,7 +75,7 @@ L(more16): /* We overlap loads between 0-32 bytes at either side of SRC1 when we try to align, so limit it only to strings larger than 128 bytes. */ cmp limit, 96 - b.ls L(loop8) + b.ls L(loop16) /* Align src1 and adjust src2 with bytes not yet done. */ and tmp1, src1, 15 -- cgit 1.4.1 From 4e75091d6ce3f7ac8b1750ca6135bc37d6707caf Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 13 Dec 2017 18:50:27 +0530 Subject: aarch64: Improve strcmp unaligned performance Replace the simple byte-wise compare in the misaligned case with a dword compare with page boundary checks in place. For simplicity I've chosen a 4K page boundary so that we don't have to query the actual page size on the system. This results in up to 3x improvement in performance in the unaligned case on falkor and about 2.5x improvement on mustang as measured using bench-strcmp. * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a time whenever possible. (cherry picked from commit 2bce01ebbaf8db52ba4a5635eb5744f989cdbf69) --- ChangeLog | 5 +++++ sysdeps/aarch64/strcmp.S | 31 +++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 18a01ed12b..cd0c1db768 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a + time whenever possible. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target. 
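
The strcmp.S diff that follows is the real change; as a rough C rendering of the technique (illustrative only: the actual code first aligns src1 with a byte loop so that only src2 needs the page test, and it resolves mismatches through its syndrome/rev/clz path instead of a byte loop), the shape is:

  #include <stdint.h>
  #include <string.h>

  /* True when P lies in the last 8 bytes of a 4 KiB page, i.e. when an
     unaligned 8-byte load from P might cross into the next page; this
     is the "and ..., #0xff8; eor ..., #0xff8" test in the assembly.  */
  static int
  near_page_end (const unsigned char *p)
  {
    return ((uintptr_t) p & 0xff8) == 0xff8;
  }

  static int
  strcmp_misaligned_sketch (const unsigned char *s1, const unsigned char *s2)
  {
    for (;;)
      {
        if (near_page_end (s1) || near_page_end (s2))
          {
            /* Byte compares for the next (at most) 8 bytes so that the
               dword loads below can never fault past a page boundary.  */
            for (int i = 0; i < 8; i++)
              if (s1[i] != s2[i] || s1[i] == '\0')
                return s1[i] - s2[i];
            s1 += 8;
            s2 += 8;
            continue;
          }

        uint64_t d1, d2;
        memcpy (&d1, s1, 8);    /* unaligned-safe dword loads */
        memcpy (&d2, s2, 8);
        /* Nonzero iff some byte of d1 is zero: the zeroones/REP8_7f
           computation from the assembly.  */
        uint64_t has_nul = (d1 - UINT64_C (0x0101010101010101)) & ~d1
                           & UINT64_C (0x8080808080808080);
        if (d1 != d2 || has_nul != 0)
          break;
        s1 += 8;
        s2 += 8;
      }

    /* A difference or the terminator lies within the current 8 bytes.  */
    for (;; s1++, s2++)
      if (*s1 != *s2 || *s1 == '\0')
        return *s1 - *s2;
  }
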
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index e99d6625b7..c260e1d8ac 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -72,6 +72,7 @@ L(start_realigned): cbz syndrome, L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ +L(end): #ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 @@ -145,12 +146,38 @@ L(mutual_align): b L(start_realigned) L(misaligned8): - /* We can do better than this. */ + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(misaligned8) + b.ne L(done) + tst src1, #7 + b.ne L(misaligned8) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) + b L(end) + +L(done): sub result, data1, data2 RET END(strcmp) -- cgit 1.4.1 From 01de24dbca4374665fb2a439be39c05427c0a24a Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Thu, 22 Feb 2018 23:48:13 +0530 Subject: aarch64/strcmp: fix misaligned loop jump target I accidentally set the loop jump back label as misaligned8 instead of do_misaligned. The typo is harmless but it's always nice to not have to unnecessarily execute those two instructions. * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to do_misaligned, not misaligned8. (cherry picked from commit 6ca24c43481e2c93a6eec362b04c3e77a35b28e3) --- ChangeLog | 5 +++++ sysdeps/aarch64/strcmp.S | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index cd0c1db768..29f9e1bac5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to + do_misaligned, not misaligned8. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index c260e1d8ac..7eed82cee7 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -158,7 +158,7 @@ L(do_misaligned): ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.ne L(done) tst src1, #7 - b.ne L(misaligned8) + b.ne L(do_misaligned) L(loop_misaligned): /* Test if we are within the last dword of the end of a 4K page. If -- cgit 1.4.1 From af9381b734e2218f84235e000cd20d2824a6ed42 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 13 Mar 2018 23:57:03 +0530 Subject: aarch64: Improve strncmp for mutually misaligned inputs The mutually misaligned inputs on aarch64 are compared with a simple byte copy, which is not very efficient. Enhance the comparison similar to strcmp by loading a double-word at a time. The peak performance improvement (i.e. 
4k maxlen comparisons) due to this on the strncmp microbenchmark is as follows: falkor: 3.5x (up to 72% time reduction) cortex-a73: 3.5x (up to 71% time reduction) cortex-a53: 3.5x (up to 71% time reduction) All mutually misaligned inputs from 16 bytes maxlen onwards show upwards of 15% improvement and there is no measurable effect on the performance of aligned/mutually aligned inputs. * sysdeps/aarch64/strncmp.S (count): New macro. (strncmp): Store misaligned length in SRC1 in COUNT. (mutual_align): Adjust. (misaligned8): Load dword at a time when it is safe. (cherry picked from commit 7108f1f944792ac68332967015d5e6418c5ccc88) --- ChangeLog | 7 ++++ sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 29f9e1bac5..b1de9c05c8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (count): New macro. + (strncmp): Store misaligned length in SRC1 in COUNT. + (mutual_align): Adjust. + (misaligned8): Load dword at a time when it is safe. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index 3e4d88a5d7..33c07befd8 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -49,6 +49,7 @@ #define limit_wd x13 #define mask x14 #define endloop x15 +#define count mask ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) DELOUSE (0) @@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 + and count, src1, #7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -165,43 +166,107 @@ L(mutual_align): bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ #ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ #else /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ #endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) -L(ret0): - mov result, #0 - RET - .p2align 6 + /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): - sub limit, limit, #1 -1: + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b + b.eq L(byte_loop) +L(done): sub result, data1, data2 RET + + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. 
*/ +L(try_misaligned_words): + mov limit_wd, limit, lsr #3 + cbz count, L(do_misaligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + mov limit_wd, limit, lsr #3 + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) + + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) + +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + +L(ret0): + mov result, #0 + RET + END (strncmp) libc_hidden_builtin_def (strncmp) -- cgit 1.4.1 From 00fd3acde1475e1bc488ee13ae124d0340da856e Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 14 Mar 2018 18:51:05 +0530 Subject: aarch64/strncmp: Unbreak builds with old binutils Binutils 2.26.* and older do not support moves with shifted registers, so use a separate shift instruction instead. (cherry picked from commit d46f84de745db8f3f06a37048261f4e5ceacf0a3) --- ChangeLog | 5 +++++ sysdeps/aarch64/strncmp.S | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index b1de9c05c8..2c388920dc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift + instruction to unbreak builds with binutils 2.26 and older. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (count): New macro. diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index 33c07befd8..d1bfa27ca4 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -208,13 +208,15 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - mov limit_wd, limit, lsr #3 + mov limit_wd, limit + lsr limit_wd, limit_wd, #3 cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count - mov limit_wd, limit, lsr #3 + mov limit_wd, limit + lsr limit_wd, limit_wd, #3 L(page_end_loop): ldrb data1w, [src1], #1 -- cgit 1.4.1 From e3c35100d32f83aa3c0ec57b83746fea9b98bc2f Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Thu, 15 Mar 2018 08:06:21 +0530 Subject: aarch64/strncmp: Use lsr instead of mov+lsr A lsr can do what the mov and lsr did. 
(cherry picked from commit b47c3e7637efb77818cbef55dcd0ed1f0ea0ddf1) --- ChangeLog | 5 +++++ sysdeps/aarch64/strncmp.S | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2c388920dc..e9557b8c85 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of + mov + lsr. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index d1bfa27ca4..fc1f633348 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -208,15 +208,13 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - mov limit_wd, limit - lsr limit_wd, limit_wd, #3 + lsr limit_wd, limit, #3 cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count - mov limit_wd, limit - lsr limit_wd, limit_wd, #3 + lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 -- cgit 1.4.1 From d3c05bfffa65b39ba64d62e24ed4a6c118fab3ef Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 11 May 2018 00:11:52 +0530 Subject: aarch64,falkor: Ignore prefetcher tagging for smaller copies For smaller and medium sized copies, the effect of hardware prefetching are not as dominant as instruction level parallelism. Hence it makes more sense to load data into multiple registers than to try and route them to the same prefetch unit. This is also the case for the loop exit where we are unable to latch on to the same prefetch unit anyway so it makes more sense to have data loaded in parallel. The performance results are a bit mixed with memcpy-random, with numbers jumping between -1% and +3%, i.e. the numbers don't seem repeatable. memcpy-walk sees a 70% improvement (i.e. > 2x) for 128 bytes and that improvement reduces down as the impact of the tail copy decreases in comparison to the loop. * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use multiple registers to copy data in loop tail. (cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06) --- ChangeLog | 5 +++ sysdeps/aarch64/multiarch/memcpy_falkor.S | 68 +++++++++++++++++++------------ 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index e9557b8c85..65b46ef409 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use multiple registers to copy data in loop tail. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S index dea4f225ee..3b8601f87e 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -35,6 +35,20 @@ #define A_hw w7 #define tmp1 x14 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l dst +#define E_h tmp1 +#define F_l src +#define F_h count +#define G_l srcend +#define G_h x15 + /* Copies are split into 3 main cases: 1. Small copies of up to 32 bytes @@ -74,21 +88,21 @@ ENTRY_ALIGN (__memcpy_falkor, 6) /* Medium copies: 33..128 bytes. 
*/ sub tmp1, count, 1 ldp A_l, A_h, [src, 16] - stp A_l, A_h, [dstin, 16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -16] tbz tmp1, 6, 1f - ldp A_l, A_h, [src, 32] - stp A_l, A_h, [dstin, 32] - ldp A_l, A_h, [src, 48] - stp A_l, A_h, [dstin, 48] - ldp A_l, A_h, [srcend, -64] - stp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stp A_l, A_h, [dstend, -48] + ldp D_l, D_h, [src, 32] + ldp E_l, E_h, [src, 48] + stp D_l, D_h, [dstin, 32] + stp E_l, E_h, [dstin, 48] + ldp F_l, F_h, [srcend, -64] + ldp G_l, G_h, [srcend, -48] + stp F_l, F_h, [dstend, -64] + stp G_l, G_h, [dstend, -48] 1: - ldp A_l, A_h, [srcend, -32] - stp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + stp A_l, A_h, [dstin, 16] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] ret .p2align 4 @@ -98,36 +112,36 @@ L(copy32): cmp count, 16 b.lo 1f ldp A_l, A_h, [src] + ldp B_l, B_h, [srcend, -16] stp A_l, A_h, [dstin] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + stp B_l, B_h, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f ldr A_l, [src] + ldr B_l, [srcend, -8] str A_l, [dstin] - ldr A_l, [srcend, -8] - str A_l, [dstend, -8] + str B_l, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f ldr A_lw, [src] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - ldr A_lw, [srcend, -4] - str A_lw, [dstend, -4] + str B_lw, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f ldrh A_lw, [src] + ldrh B_lw, [srcend, -2] strh A_lw, [dstin] - ldrh A_lw, [srcend, -2] - strh A_lw, [dstend, -2] + strh B_lw, [dstend, -2] ret .p2align 4 1: @@ -171,12 +185,12 @@ L(loop64): L(last64): ldp A_l, A_h, [srcend, -64] stnp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stnp A_l, A_h, [dstend, -48] - ldp A_l, A_h, [srcend, -32] - stnp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stnp A_l, A_h, [dstend, -16] + ldp B_l, B_h, [srcend, -48] + stnp B_l, B_h, [dstend, -48] + ldp C_l, C_h, [srcend, -32] + stnp C_l, C_h, [dstend, -32] + ldp D_l, D_h, [srcend, -16] + stnp D_l, D_h, [dstend, -16] ret END (__memcpy_falkor) -- cgit 1.4.1 From ad64510e5c74729108a02a6c22f03aa8ee07a8d3 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 29 Jun 2018 22:45:59 +0530 Subject: aarch64,falkor: Use vector registers for memcpy Vector registers perform better than scalar register pairs for copying data so prefer them instead. This results in a time reduction of over 50% (i.e. 2x speed improvemnet) for some smaller sizes for memcpy-walk. Larger sizes show improvements of around 1% to 2%. memcpy-random shows a very small improvement, in the range of 1-2%. * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use vector registers. (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be) --- ChangeLog | 5 ++ sysdeps/aarch64/multiarch/memcpy_falkor.S | 137 ++++++++++++++---------------- 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/ChangeLog b/ChangeLog index 65b46ef409..0482b0c435 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use vector registers. 
+ 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S index 3b8601f87e..9cde8dcbd6 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -29,25 +29,19 @@ #define dst x3 #define srcend x4 #define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 #define tmp1 x14 - -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l dst -#define E_h tmp1 -#define F_l src -#define F_h count -#define G_l srcend -#define G_h x15 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 /* Copies are split into 3 main cases: @@ -67,9 +61,9 @@ bumping up the small copies up to 32 bytes allows us to do that without cost and also allows us to reduce the size of the prep code before loop64. - All copies are done only via two registers r6 and r7. This is to ensure - that all loads hit a single hardware prefetcher which can get correctly - trained to prefetch a single stream. + The copy loop uses only one register q0. This is to ensure that all loads + hit a single hardware prefetcher which can get correctly trained to prefetch + a single stream. The non-temporal stores help optimize cache utilization. */ @@ -80,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6) add srcend, src, count add dstend, dstin, count b.ls L(copy32) - ldp A_l, A_h, [src] + ldr A_q, [src] cmp count, 128 - stp A_l, A_h, [dstin] + str A_q, [dstin] b.hi L(copy_long) /* Medium copies: 33..128 bytes. */ sub tmp1, count, 1 - ldp A_l, A_h, [src, 16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -16] + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] tbz tmp1, 6, 1f - ldp D_l, D_h, [src, 32] - ldp E_l, E_h, [src, 48] - stp D_l, D_h, [dstin, 32] - stp E_l, E_h, [dstin, 48] - ldp F_l, F_h, [srcend, -64] - ldp G_l, G_h, [srcend, -48] - stp F_l, F_h, [dstend, -64] - stp G_l, G_h, [dstend, -48] + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] 1: - stp A_l, A_h, [dstin, 16] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] ret .p2align 4 @@ -111,44 +105,44 @@ L(copy32): /* 16-32 */ cmp count, 16 b.lo 1f - ldp A_l, A_h, [src] - ldp B_l, B_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f - ldr A_l, [src] - ldr B_l, [srcend, -8] - str A_l, [dstin] - str B_l, [dstend, -8] + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f - ldr A_lw, [src] - ldr B_lw, [srcend, -4] - str A_lw, [dstin] - str B_lw, [dstend, -4] + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f - ldrh A_lw, [src] - ldrh B_lw, [srcend, -2] - strh A_lw, [dstin] - strh B_lw, [dstend, -2] + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] ret .p2align 4 1: /* 0-1 */ tbz count, 0, 1f - ldrb A_lw, [src] - strb 
A_lw, [dstin] + ldrb A_w, [src] + strb A_w, [dstin] 1: ret @@ -167,30 +161,29 @@ L(copy_long): add count, count, tmp1 L(loop64): - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16]! + ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! subs count, count, 64 - stnp A_l, A_h, [dst, 32] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 48] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 64] - add dst, dst, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! b.hi L(loop64) /* Write the last full set of 64 bytes. The remainder is at most 64 bytes, so it is safe to always copy 64 bytes from the end even if there is just 1 byte left. */ L(last64): - ldp A_l, A_h, [srcend, -64] - stnp A_l, A_h, [dstend, -64] - ldp B_l, B_h, [srcend, -48] - stnp B_l, B_h, [dstend, -48] - ldp C_l, C_h, [srcend, -32] - stnp C_l, C_h, [dstend, -32] - ldp D_l, D_h, [srcend, -16] - stnp D_l, D_h, [dstend, -16] + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] ret END (__memcpy_falkor) -- cgit 1.4.1 From d6613ad24f708706c24bffa38351e26e1dd5b5d1 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 19 Dec 2018 18:28:24 +0000 Subject: [AArch64] Add ifunc support for Ares Add Ares to the midr_el0 list and support ifunc dispatch. Since Ares supports 2 128-bit loads/stores, use Neon registers for memcpy by selecting __memcpy_falkor by default (we should rename this to __memcpy_simd or similar). * manual/tunables.texi (glibc.cpu.name): Add ares tunable. * sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use __memcpy_falkor for ares. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_ARES): Add new define. * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (cpu_list): Add ares cpu. (cherry picked from commit 02f440c1ef5d5d79552a524065aa3e2fabe469b9) --- ChangeLog | 10 ++++++++++ manual/tunables.texi | 2 +- sysdeps/aarch64/multiarch/memcpy.c | 2 +- sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 1 + sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 3 +++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0482b0c435..2a9b6ed7ef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-01-09 Wilco Dijkstra + + * manual/tunables.texi (glibc.cpu.name): Add ares tunable. + * sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use + __memcpy_falkor for ares. + * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_ARES): + Add new define. + * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (cpu_list): + Add ares cpu. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): diff --git a/manual/tunables.texi b/manual/tunables.texi index a23c8d076a..b230cde556 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -253,7 +253,7 @@ This tunable is specific to i386 and x86-64. @deftp Tunable glibc.tune.cpu The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to assume that the CPU is @code{xxx} where xxx may have one of these values: -@code{generic}, @code{falkor}, @code{thunderxt88}. +@code{generic}, @code{falkor}, @code{thunderxt88}, @code{ares}. This tunable is specific to aarch64. 
@end deftp diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index b395df1c63..ee4d78ea1d 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -35,7 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; libc_ifunc (__libc_memcpy, (IS_THUNDERX (midr) ? __memcpy_thunderx - : (IS_FALKOR (midr) + : (IS_FALKOR (midr) || IS_ARES (midr) ? __memcpy_falkor : __memcpy_generic))); diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c index 0c7e13f4fa..50297bc409 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c @@ -30,6 +30,7 @@ struct cpu_list static struct cpu_list cpu_list[] = { {"falkor", 0x510FC000}, {"thunderxt88", 0x430F0A10}, + {"ares", 0x411FD0C0}, {"generic", 0x0} }; diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index 73cb53da9a..d2ad5c63b9 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -44,6 +44,9 @@ #define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \ && MIDR_PARTNUM(midr) == 0xc00) +#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd0c) + struct cpu_features { uint64_t midr_el1; -- cgit 1.4.1 From 55a280689e61cb8a7879ebbe0586d031559f1ba4 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 16 Jul 2018 17:50:09 +0100 Subject: Improve strstr performance Improve strstr performance. Strstr tends to be slow because it uses many calls to memchr and a slow byte loop to scan for the next match. Performance is significantly improved by using strnlen on larger blocks and using strchr to search for the next matching character. strcasestr can also use strnlen to scan ahead, and memmem can use memchr to check for the next match. On the GLIBC bench tests the performance gains on Cortex-A72 are: strstr: +25% strcasestr: +4.3% memmem: +18% On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%. Reviewd-by: Adhemerval Zanella (cherry picked from commit 3ae725dfb6d7f61447d27d00ed83e573bd5454f4) --- ChangeLog | 15 +++++++++++- benchtests/bench-strcasestr.c | 1 + benchtests/bench-strstr.c | 3 +++ string/memmem.c | 1 + string/str-two-way.h | 56 +++++++++++++++++++++---------------------- string/strcasestr.c | 4 ++-- string/strstr.c | 5 ++-- string/test-strcasestr.c | 1 + string/test-strstr.c | 1 + 9 files changed, 53 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2a9b6ed7ef..d21bdbcce8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,17 @@ -2019-01-09 Wilco Dijkstra +2019-09-13 Wilco Dijkstra + + * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. + * benchtests/bench-strstr.c: Likewise. + * string/memmem.c (FASTSEARCH): Define. + * string/str-two-way.h (two_way_short_needle): Minor cleanups. + Add support for FASTSEARCH. + * string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen. + * string/strstr.c (AVAILABLE): Use read-ahead __strnlen. + (FASTSEARCH): Define. + * string/test-strcasestr.c: Rename __strnlen to strnlen. + * string/test-strstr.c: Likewise. + +2019-09-06 Wilco Dijkstra * manual/tunables.texi (glibc.cpu.name): Add ares tunable. 
* sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c index 4e6f480c84..9a031b3064 100644 --- a/benchtests/bench-strcasestr.c +++ b/benchtests/bench-strcasestr.c @@ -24,6 +24,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "../string/strcasestr.c" diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c index e63659f136..2fa64118f4 100644 --- a/benchtests/bench-strstr.c +++ b/benchtests/bench-strstr.c @@ -22,6 +22,9 @@ #define STRSTR simple_strstr +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(X) +#define __strnlen strnlen #include "../string/strstr.c" diff --git a/string/memmem.c b/string/memmem.c index 54fca4966d..34299b8864 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -31,6 +31,7 @@ #define RETURN_TYPE void * #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) #include "str-two-way.h" #undef memmem diff --git a/string/str-two-way.h b/string/str-two-way.h index 599c867ffd..f433c76478 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else { - const unsigned char *phaystack = &haystack[suffix]; + const unsigned char *phaystack; /* The comparison always starts from needle[suffix], so cache it and use an optimized first-character loop. */ unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); -#if CHECK_EOL - /* We start matching from the SUFFIX'th element, so make sure we - don't hit '\0' before that. */ - if (haystack_len < suffix + 1 - && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) - return NULL; -#endif - /* The two halves of needle are distinct; no extra memory is required, and any mismatch results in a maximal shift. */ period = MAX (suffix, needle_len - suffix) + 1; j = 0; - while (1 -#if !CHECK_EOL - && AVAILABLE (haystack, haystack_len, j, needle_len) -#endif - ) + while (AVAILABLE (haystack, haystack_len, j, needle_len)) { unsigned char haystack_char; const unsigned char *pneedle; - /* TODO: The first-character loop can be sped up by adapting - longword-at-a-time implementation of memchr/strchr. */ - if (needle_suffix + phaystack = &haystack[suffix + j]; + +#ifdef FASTSEARCH + if (*phaystack++ != needle_suffix) + { + phaystack = FASTSEARCH (phaystack, needle_suffix, + haystack_len - needle_len - j); + if (phaystack == NULL) + goto ret0; + j = phaystack - &haystack[suffix]; + phaystack++; + } +#else + while (needle_suffix != (haystack_char = CANON_ELEMENT (*phaystack++))) { RET0_IF_0 (haystack_char); -#if !CHECK_EOL +# if !CHECK_EOL ++j; -#endif - continue; + if (!AVAILABLE (haystack, haystack_len, j, needle_len)) + goto ret0; +# endif } -#if CHECK_EOL +# if CHECK_EOL /* Calculate J if it wasn't kept up-to-date in the first-character loop. */ j = phaystack - &haystack[suffix] - 1; +# endif #endif - /* Scan for matches in right half. */ i = suffix + 1; pneedle = &needle[i]; @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } ++i; } +#if CHECK_EOL + /* Update minimal length of haystack. */ + if (phaystack > haystack + haystack_len) + haystack_len = phaystack - haystack; +#endif if (needle_len <= i) { /* Scan for matches in left half. 
*/ @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else j += i - suffix + 1; - -#if CHECK_EOL - if (!AVAILABLE (haystack, haystack_len, j, needle_len)) - break; -#endif - - phaystack = &haystack[suffix + j]; } } ret0: __attribute__ ((unused)) diff --git a/string/strcasestr.c b/string/strcasestr.c index 2acf003155..7caaade5d2 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,8 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) diff --git a/string/strstr.c b/string/strstr.c index 88f1d5de36..63facae4a5 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -33,10 +33,11 @@ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c index abb3916732..78e03da7c4 100644 --- a/string/test-strcasestr.c +++ b/string/test-strcasestr.c @@ -25,6 +25,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "strcasestr.c" diff --git a/string/test-strstr.c b/string/test-strstr.c index 33f221149a..8d04134e0b 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -24,6 +24,7 @@ #define STRSTR simple_strstr #define libc_hidden_builtin_def(arg) /* nothing */ +#define __strnlen strnlen #include "strstr.c" -- cgit 1.4.1 From c60bf879b21aefedaf632f585b9c39af8532bc71 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Fri, 3 Aug 2018 17:24:12 +0100 Subject: Simplify and speedup strstr/strcasestr first match Looking at the benchtests, both strstr and strcasestr spend a lot of time in a slow initialization loop handling one character per iteration. This can be simplified and use the much faster strlen/strnlen/strchr/memcmp. Read ahead a few cachelines to reduce the number of strnlen calls, which improves performance by ~3-4%. This patch improves the time taken for the full strstr benchtest by >40%. * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. * string/strstr.c (AVAILABLE): Likewise. (cherry picked from commit 284f42bc778e487dfd5dff5c01959f93b9e0c4f5) --- ChangeLog | 5 +++++ string/strcasestr.c | 37 ++++++++++++++----------------------- string/strstr.c | 43 +++++++++++++++++++++---------------------- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/ChangeLog b/ChangeLog index d21bdbcce8..22012fb458 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-13 Wilco Dijkstra + + * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. + * string/strstr.c (AVAILABLE): Likewise. + 2019-09-13 Wilco Dijkstra * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. diff --git a/string/strcasestr.c b/string/strcasestr.c index 7caaade5d2..772e513626 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -58,31 +58,22 @@ case-insensitive comparison. 
This function gives unspecified results in multibyte locales. */ char * -STRCASESTR (const char *haystack_start, const char *needle_start) +STRCASESTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - { - ok &= (TOLOWER ((unsigned char) *haystack) - == TOLOWER ((unsigned char) *needle)); - haystack++; - needle++; - } - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. */ + needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - needle_len = needle - needle_start; - haystack = haystack_start + 1; - haystack_len = needle_len - 1; /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See @@ -90,10 +81,10 @@ STRCASESTR (const char *haystack_start, const char *needle_start) if (needle_len < LONG_NEEDLE_THRESHOLD) return two_way_short_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); } diff --git a/string/strstr.c b/string/strstr.c index 63facae4a5..1eb4dc11db 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -50,33 +50,32 @@ if NEEDLE is empty, otherwise NULL if NEEDLE is not found in HAYSTACK. */ char * -STRSTR (const char *haystack_start, const char *needle_start) +STRSTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - ok &= *haystack++ == *needle++; - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Skip until we find the first matching char from NEEDLE. */ + haystack = strchr (haystack, needle[0]); + if (haystack == NULL || needle[1] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. */ + needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - - /* Reduce the size of haystack using strchr, since it has a smaller - linear coefficient than the Two-Way algorithm. 
*/ - needle_len = needle - needle_start; - haystack = strchr (haystack_start + 1, *needle_start); - if (!haystack || __builtin_expect (needle_len == 1, 0)) + + /* Check whether we have a match. This improves performance since we avoid + the initialization overhead of the two-way algorithm. */ + if (memcmp (haystack, needle, needle_len) == 0) return (char *) haystack; - needle -= needle_len; - haystack_len = (haystack > haystack_start + needle_len ? 1 - : needle_len + haystack_start - haystack); /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See -- cgit 1.4.1 From ceeba1d73c84f1a551677149ce3b3ed3372fb3ec Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 28 Aug 2018 12:42:19 +0530 Subject: Speedup first memmem match As done in commit 284f42bc778e487dfd5dff5c01959f93b9e0c4f5, memcmp can be used after memchr to avoid the initialization overhead of the two-way algorithm for the first match. This has shown improvement >40% for first match. (cherry picked from commit c8dd67e7c958de04c3783cbea7c384431707b5f8) --- ChangeLog | 4 ++++ string/memmem.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22012fb458..12e7d27e49 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2019-09-13 Rajalakshmi Srinivasaraghavan + + * string/memmem.c: Use memcmp for first match. + 2019-09-13 Wilco Dijkstra * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. diff --git a/string/memmem.c b/string/memmem.c index 34299b8864..645b6d1a26 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -70,6 +70,10 @@ __memmem (const void *haystack_start, size_t haystack_len, haystack_len -= haystack - (const unsigned char *) haystack_start; if (haystack_len < needle_len) return NULL; + /* Check whether we have a match. This improves performance since we + avoid the initialization overhead of the two-way algorithm. */ + if (memcmp (haystack, needle, needle_len) == 0) + return (void *) haystack; return two_way_short_needle (haystack, haystack_len, needle, needle_len); } else -- cgit 1.4.1 From cd3487afa276f817749d3a418e81849130e2dbce Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 19 Sep 2018 16:50:18 +0100 Subject: Fix strstr bug with huge needles (bug 23637) The generic strstr in GLIBC 2.28 fails to match huge needles. The optimized AVAILABLE macro reads ahead a large fixed amount to reduce the overhead of repeatedly checking for the end of the string. However if the needle length is larger than this, two_way_long_needle may confuse this as meaning the end of the string and return NULL. This is fixed by adding the needle length to the amount to read ahead. [BZ #23637] * string/test-strstr.c (pr23637): New function. (test_main): Add tests with longer needles. * string/strcasestr.c (AVAILABLE): Fix readahead distance. * string/strstr.c (AVAILABLE): Likewise. (cherry picked from commit 83a552b0bb9fc2a5e80a0ab3723c0a80ce1db9f2) --- ChangeLog | 8 ++++++++ string/strcasestr.c | 5 +++-- string/strstr.c | 5 +++-- string/test-strstr.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 12e7d27e49..a759cf936c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2019-09-13 Wilco Dijkstra + + [BZ #23637] + * string/test-strstr.c (pr23637): New function. + (test_main): Add tests with longer needles. + * string/strcasestr.c (AVAILABLE): Fix readahead distance. + * string/strstr.c (AVAILABLE): Likewise. 
+ 2019-09-13 Rajalakshmi Srinivasaraghavan * string/memmem.c: Use memcmp for first match. diff --git a/string/strcasestr.c b/string/strcasestr.c index 772e513626..19ea1d4bbf 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,9 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ - (j) + (n_l) <= (h_l))) + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) diff --git a/string/strstr.c b/string/strstr.c index 1eb4dc11db..9cfcfdf77d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -33,8 +33,9 @@ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ - (j) + (n_l) <= (h_l))) + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) diff --git a/string/test-strstr.c b/string/test-strstr.c index 8d04134e0b..5bce73b0bd 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -151,6 +151,32 @@ check2 (void) } } +#define N 1024 + +static void +pr23637 (void) +{ + char *h = (char*) buf1; + char *n = (char*) buf2; + + for (int i = 0; i < N; i++) + { + n[i] = 'x'; + h[i] = ' '; + h[i + N] = 'x'; + } + + n[N] = '\0'; + h[N * 2] = '\0'; + + /* Ensure we don't match at the first 'x'. */ + h[0] = 'x'; + + char *exp_result = stupid_strstr (h, n); + FOR_EACH_IMPL (impl, 0) + check_result (impl, h, n, exp_result); +} + static int test_main (void) { @@ -158,6 +184,7 @@ test_main (void) check1 (); check2 (); + pr23637 (); printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) @@ -202,6 +229,9 @@ test_main (void) do_test (15, 9, hlen, klen, 1); do_test (15, 15, hlen, klen, 0); do_test (15, 15, hlen, klen, 1); + + do_test (15, 15, hlen + klen * 4, klen * 4, 0); + do_test (15, 15, hlen + klen * 4, klen * 4, 1); } do_test (0, 0, page_size - 1, 16, 0); -- cgit 1.4.1 From 796c5ee030deac07ed846d9531c1322d57c0a6c7 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 12 Jun 2019 11:38:52 +0100 Subject: Improve performance of strstr This patch significantly improves performance of strstr using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 3 use a dedicated linear search. Very long needles use the Two-Way algorithm. The performance gain using the improved bench-strstr on Cortex-A72 is 5.8 times basic_strstr and 3.7 times twoway_strstr. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). Reviewed-by: Szabolcs Nagy * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. * string/strstr.c (strstr2): Add new function. (strstr3): Likewise. 
(STRSTR): Completely rewrite strstr to improve performance. (cherry picked from commit 5e0a7ecb6629461b28adc1a5aabcc0ede122f201) --- ChangeLog | 9 +++ string/str-two-way.h | 9 ++- string/strstr.c | 165 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 132 insertions(+), 51 deletions(-) diff --git a/ChangeLog b/ChangeLog index a759cf936c..f0ce3a594a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-09-13 Wilco Dijkstra + + * string/str-two-way.h (two_way_short_needle): Add inline to avoid + warning. + (two_way_long_needle): Block inlining. + * string/strstr.c (strstr2): Add new function. + (strstr3): Likewise. + (STRSTR): Completely rewrite strstr to improve performance. + 2019-09-13 Wilco Dijkstra [BZ #23637] diff --git a/string/str-two-way.h b/string/str-two-way.h index f433c76478..30aca30c40 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -382,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and - sublinear performance is not possible. */ -static RETURN_TYPE + sublinear performance is not possible. + + Since this function is large and complex, block inlining to avoid + slowing down the common case of small needles. */ +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index 9cfcfdf77d..4d72ffbfc9 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,29 +16,17 @@ License along with the GNU C Library; if not, see . */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of strstr. */ #include -#include - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ (j) + (n_l) <= (h_l))) -#define CHECK_EOL (1) -#define RET0_IF_0(a) if (!a) goto ret0 -#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr @@ -47,47 +35,128 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? 
(char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 3 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. */ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. 
*/ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD -- cgit 1.4.1 From 612fba2fe9036732c5ee08f691c52365e5bd32c0 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 12 Jun 2019 11:42:34 +0100 Subject: Improve performance of memmem This patch significantly improves performance of memmem using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 2 use a dedicated linear search. Very long needles use the Two-Way algorithm (to avoid increasing stack size or slowing down the common case, inlining is disabled). The performance gain is 6.6 times on English text on AArch64 using random needles with average size 8. Tested against GLIBC testsuite and randomized tests. Reviewed-by: Szabolcs Nagy * string/memmem.c (__memmem): Rewrite to improve performance. (cherry picked from commit 680942b0167715e123d934b609060cd382f8e39f) --- ChangeLog | 4 ++ string/memmem.c | 127 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 89 insertions(+), 42 deletions(-) diff --git a/ChangeLog b/ChangeLog index f0ce3a594a..5f3df320bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2019-09-13 Wilco Dijkstra + * string/memmem.c (__memmem): Rewrite to improve performance. + +2019-06-12 Wilco Dijkstra + * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. diff --git a/string/memmem.c b/string/memmem.c index 645b6d1a26..fba7fe33f7 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -15,17 +15,13 @@ License along with the GNU C Library; if not, see . 
*/ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of memmem. */ #include #ifndef _LIBC -# define __builtin_expect(expr, val) (expr) # define __memmem memmem #endif @@ -36,51 +32,98 @@ #undef memmem -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast memmem algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem (const void *haystack_start, size_t haystack_len, - const void *needle_start, size_t needle_len) +__memmem (const void *haystack, size_t hs_len, + const void *needle, size_t ne_len) { - /* Abstract memory is considered to be an array of 'unsigned char' values, - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ - const unsigned char *haystack = (const unsigned char *) haystack_start; - const unsigned char *needle = (const unsigned char *) needle_start; - - if (needle_len == 0) - /* The first occurrence of the empty string is deemed to occur at - the beginning of the string. */ - return (void *) haystack; - - /* Sanity check, otherwise the loop might search through the whole - memory. */ - if (__glibc_unlikely (haystack_len < needle_len)) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + if (ne_len == 0) + return (void *) hs; + if (ne_len == 1) + return (void *) memchr (hs, ne[0], hs_len); + + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) return NULL; - /* Use optimizations in memchr when possible, to reduce the search - size of haystack using a linear algorithm with a smaller - coefficient. However, avoid memchr for long needles, since we - can often achieve sublinear performance. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) + const unsigned char *end = hs + hs_len - ne_len; + + if (ne_len == 2) + { + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1]; + for (hs++; hs <= end && hw != nw; ) + hw = hw << 16 | *++hs; + return hw == nw ? (void *)hs - 1 : NULL; + } + + /* Use Two-Way algorithm for very long needles. 
*/ + if (__builtin_expect (ne_len > 256, 0)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + for ( ; hs <= end; ) { - haystack = memchr (haystack, *needle, haystack_len); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (void *) haystack; - haystack_len -= haystack - (const unsigned char *) haystack_start; - if (haystack_len < needle_len) - return NULL; - /* Check whether we have a match. This improves performance since we - avoid the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (void *) haystack; - return two_way_short_needle (haystack, haystack_len, needle, needle_len); + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; } - else - return two_way_long_needle (haystack, haystack_len, needle, needle_len); + return NULL; } libc_hidden_def (__memmem) weak_alias (__memmem, memmem) libc_hidden_weak (memmem) - -#undef LONG_NEEDLE_THRESHOLD -- cgit 1.4.1 From aaf2f25b61b570f0ae746488ab677f3a5159f038 Mon Sep 17 00:00:00 2001 From: Dragan Mladjenovic Date: Fri, 23 Aug 2019 16:38:04 +0000 Subject: mips: Force RWX stack for hard-float builds that can run on pre-4.8 kernels Linux/Mips kernels prior to 4.8 could potentially crash the user process when doing FPU emulation while running on non-executable user stack. Currently, gcc doesn't emit .note.GNU-stack for mips, but that will change in the future. To ensure that glibc can be used with such future gcc, without silently resulting in binaries that might crash in runtime, this patch forces RWX stack for all built objects if configured to run against minimum kernel version less than 4.8. * sysdeps/unix/sysv/linux/mips/Makefile (test-xfail-check-execstack): Move under mips-has-gnustack != yes. (CFLAGS-.o*, ASFLAGS-.o*): New rules. Apply -Wa,-execstack if mips-force-execstack == yes. * sysdeps/unix/sysv/linux/mips/configure: Regenerated. * sysdeps/unix/sysv/linux/mips/configure.ac (mips-force-execstack): New var. Set to yes for hard-float builds with minimum_kernel < 4.8.0 or minimum_kernel not set at all. (mips-has-gnustack): New var. Use value of libc_cv_as_noexecstack if mips-force-execstack != yes, otherwise set to no. 
(cherry picked from commit 33bc9efd91de1b14354291fc8ebd5bce96379f12) --- sysdeps/unix/sysv/linux/mips/Makefile | 21 ++++++++++++---- sysdeps/unix/sysv/linux/mips/configure | 41 +++++++++++++++++++++++++++++++ sysdeps/unix/sysv/linux/mips/configure.ac | 32 ++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 5 deletions(-) diff --git a/sysdeps/unix/sysv/linux/mips/Makefile b/sysdeps/unix/sysv/linux/mips/Makefile index bca11d39e0..a58e6954a7 100644 --- a/sysdeps/unix/sysv/linux/mips/Makefile +++ b/sysdeps/unix/sysv/linux/mips/Makefile @@ -99,14 +99,25 @@ sysdep-dl-routines += dl-static sysdep_routines += dl-vdso endif - -# Supporting non-executable stacks on MIPS requires changes to both -# the Linux kernel and glibc. See -# and -# . +# If the compiler doesn't use GNU.stack note, +# this test is expected to fail. +ifneq ($(mips-has-gnustack),yes) test-xfail-check-execstack = yes endif +endif ifeq ($(subdir),stdlib) gen-as-const-headers += ucontext_i.sym endif + +ifeq ($(mips-force-execstack),yes) +CFLAGS-.o += -Wa,-execstack +CFLAGS-.os += -Wa,-execstack +CFLAGS-.op += -Wa,-execstack +CFLAGS-.oS += -Wa,-execstack + +ASFLAGS-.o += -Wa,-execstack +ASFLAGS-.os += -Wa,-execstack +ASFLAGS-.op += -Wa,-execstack +ASFLAGS-.oS += -Wa,-execstack +endif diff --git a/sysdeps/unix/sysv/linux/mips/configure b/sysdeps/unix/sysv/linux/mips/configure index a5513fad48..2ec86a3121 100644 --- a/sysdeps/unix/sysv/linux/mips/configure +++ b/sysdeps/unix/sysv/linux/mips/configure @@ -475,3 +475,44 @@ if test -z "$arch_minimum_kernel"; then arch_minimum_kernel=4.5.0 fi fi + +# Check if we are supposed to run on kernels older than 4.8.0. If so, +# force executable stack to avoid potential runtime problems with fpu +# emulation. +# NOTE: The check below assumes that in absence of user-provided minumum_kernel +# we will default to arch_minimum_kernel which is currently less than 4.8.0 for +# all known configurations. If this changes, the check must be updated. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler must use executable stack" >&5 +$as_echo_n "checking whether the compiler must use executable stack... 
" >&6; } +if ${libc_cv_mips_force_execstack+:} false; then : + $as_echo_n "(cached) " >&6 +else + libc_cv_mips_force_execstack=no + if test $libc_mips_float = hard; then + if test -n "$minimum_kernel"; then + + min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`)) + + if test $min_version -lt 264192; then + libc_cv_mips_force_execstack=yes + fi + else + libc_cv_mips_force_execstack=yes + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mips_force_execstack" >&5 +$as_echo "$libc_cv_mips_force_execstack" >&6; } + +libc_mips_has_gnustack=$libc_cv_as_noexecstack + +if test $libc_cv_mips_force_execstack = yes; then + libc_mips_has_gnustack=no + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&5 +$as_echo "$as_me: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&2;} +fi + +config_vars="$config_vars +mips-force-execstack = ${libc_cv_mips_force_execstack}" +config_vars="$config_vars +mips-has-gnustack = ${libc_mips_has_gnustack}" diff --git a/sysdeps/unix/sysv/linux/mips/configure.ac b/sysdeps/unix/sysv/linux/mips/configure.ac index 9147aa4582..3db1b32b08 100644 --- a/sysdeps/unix/sysv/linux/mips/configure.ac +++ b/sysdeps/unix/sysv/linux/mips/configure.ac @@ -134,3 +134,35 @@ if test -z "$arch_minimum_kernel"; then arch_minimum_kernel=4.5.0 fi fi + +# Check if we are supposed to run on kernels older than 4.8.0. If so, +# force executable stack to avoid potential runtime problems with fpu +# emulation. +# NOTE: The check below assumes that in absence of user-provided minumum_kernel +# we will default to arch_minimum_kernel which is currently less than 4.8.0 for +# all known configurations. If this changes, the check must be updated. +AC_CACHE_CHECK([whether the compiler must use executable stack], + libc_cv_mips_force_execstack, [dnl +libc_cv_mips_force_execstack=no + if test $libc_mips_float = hard; then + if test -n "$minimum_kernel"; then + changequote(,) + min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`)) + changequote([,]) + if test $min_version -lt 264192; then + libc_cv_mips_force_execstack=yes + fi + else + libc_cv_mips_force_execstack=yes + fi + fi]) + +libc_mips_has_gnustack=$libc_cv_as_noexecstack + +if test $libc_cv_mips_force_execstack = yes; then + libc_mips_has_gnustack=no + AC_MSG_WARN([forcing executable stack for pre-4.8.0 Linux kernels]) +fi + +LIBC_CONFIG_VAR([mips-force-execstack],[${libc_cv_mips_force_execstack}]) +LIBC_CONFIG_VAR([mips-has-gnustack],[${libc_mips_has_gnustack}]) -- cgit 1.4.1 From bc42e3bd44e7e9f616162d4550f1d35e04bddb5b Mon Sep 17 00:00:00 2001 From: Marcin Koƛcielnicki Date: Thu, 21 Nov 2019 00:20:15 +0100 Subject: rtld: Check __libc_enable_secure before honoring LD_PREFER_MAP_32BIT_EXEC (CVE-2019-19126) [BZ #25204] The problem was introduced in glibc 2.23, in commit b9eb92ab05204df772eb4929eccd018637c9f3e9 ("Add Prefer_MAP_32BIT_EXEC to map executable pages with MAP_32BIT"). (cherry picked from commit d5dfad4326fc683c813df1e37bbf5cf920591c8e) --- NEWS | 7 +++++++ sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index c6c44a3914..89adc868bb 100644 --- a/NEWS +++ b/NEWS @@ -100,6 +100,12 @@ Security related changes: memcmp gave the wrong result since it treated the size argument as zero. Reported by H.J. Lu. 
+ CVE-2019-19126: ld.so failed to ignore the LD_PREFER_MAP_32BIT_EXEC + environment variable during program execution after a security + transition, allowing local attackers to restrict the possible mapping + addresses for loaded libraries and thus bypass ASLR for a setuid + program. Reported by Marcin Koƛcielnicki. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. @@ -182,6 +188,7 @@ The following bugs are resolved with this release: [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + [25204] Ignore LD_PREFER_MAP_32BIT_EXEC for SUID programs Version 2.26 diff --git a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h index 8d474d0d04..37f0b14adb 100644 --- a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h +++ b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h @@ -31,7 +31,8 @@ environment variable, LD_PREFER_MAP_32BIT_EXEC. */ #define EXTRA_LD_ENVVARS \ case 21: \ - if (memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0) \ + if (!__libc_enable_secure \ + && memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0) \ GLRO(dl_x86_cpu_features).feature[index_arch_Prefer_MAP_32BIT_EXEC] \ |= bit_arch_Prefer_MAP_32BIT_EXEC; \ break; -- cgit 1.4.1 From 2dc2d678e91f3f093d0f4855ac086efb288a5e23 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 28 Nov 2019 14:44:48 +0100 Subject: libio: Disable vtable validation for pre-2.1 interposed handles [BZ #25203] Commit c402355dfa7807b8e0adb27c009135a7e2b9f1b0 ("libio: Disable vtable validation in case of interposition [BZ #23313]") only covered the interposable glibc 2.1 handles, in libio/stdfiles.c. The parallel code in libio/oldstdfiles.c needs similar detection logic. Fixes (again) commit db3476aff19b75c4fdefbe65fcd5f0a90588ba51 ("libio: Implement vtable verification [BZ #20191]"). Change-Id: Ief6f9f17e91d1f7263421c56a7dc018f4f595c21 (cherry picked from commit cb61630ed712d033f54295f776967532d3f4b46a) --- NEWS | 1 + libio/oldstdfiles.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/NEWS b/NEWS index 89adc868bb..3ccaae3968 100644 --- a/NEWS +++ b/NEWS @@ -188,6 +188,7 @@ The following bugs are resolved with this release: [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + [25203] libio: Disable vtable validation for pre-2.1 interposed handles [25204] Ignore LD_PREFER_MAP_32BIT_EXEC for SUID programs diff --git a/libio/oldstdfiles.c b/libio/oldstdfiles.c index bed7bceca2..82ba986367 100644 --- a/libio/oldstdfiles.c +++ b/libio/oldstdfiles.c @@ -87,6 +87,11 @@ _IO_check_libio (void) stdout->_vtable_offset = stderr->_vtable_offset = ((int) sizeof (struct _IO_FILE) - (int) sizeof (struct _IO_FILE_complete)); + + if (_IO_stdin_.vtable != &_IO_old_file_jumps + || _IO_stdout_.vtable != &_IO_old_file_jumps + || _IO_stderr_.vtable != &_IO_old_file_jumps) + IO_set_accept_foreign_vtables (&_IO_vtable_check); } } -- cgit 1.4.1
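
The strstr and memmem rewrites above (commits 796c5ee0 and 612fba2f) share one core idea: hash each pair of adjacent bytes into a 256-entry table of 8-bit shifts, so that most haystack positions are skipped after a single table lookup and the needle is only verified when the hash of its final pair matches. The standalone sketch below illustrates that idea under stated assumptions; it is not the glibc code. The names pair_hash and hashed_horspool are invented for this sketch, it handles only needles of 2 to 255 bytes, and it omits the strnlen read-ahead, the "shift1" end-pair refinement, and the Two-Way fallback used by the actual patches, using a conservative shift of 1 after a failed verification instead (correct, just slower).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hash two adjacent bytes into an index for a 256-entry shift table.
   All bits of p[0] are used, only some of p[-1]; hash collisions merely
   produce smaller (still safe) shifts.  */
static size_t
pair_hash (const unsigned char *p)
{
  return ((size_t) p[0] - ((size_t) p[-1] << 3)) % 256;
}

/* Illustrative sketch only, not the glibc implementation.  Find NE
   (length NE_LEN, 2..255 bytes) in HS (length HS_LEN); return a pointer
   to the first match or NULL.  */
static const char *
hashed_horspool (const char *hs_start, size_t hs_len,
                 const char *ne_start, size_t ne_len)
{
  const unsigned char *hs = (const unsigned char *) hs_start;
  const unsigned char *ne = (const unsigned char *) ne_start;
  uint8_t shift[256];
  size_t m1 = ne_len - 1;

  if (ne_len < 2 || ne_len > 255 || hs_len < ne_len)
    return NULL;   /* Shorter and longer needles are out of scope here.  */

  /* shift[h] = rightmost needle index i (1..m1) whose pair
     (ne[i-1], ne[i]) hashes to h, or 0 if no pair in the needle does.  */
  memset (shift, 0, sizeof shift);
  for (size_t i = 1; i <= m1; i++)
    shift[pair_hash (ne + i)] = i;

  for (size_t j = 0; j + ne_len <= hs_len; )
    {
      /* Look at the haystack pair aligned with the end of the needle.  */
      size_t skip = m1 - shift[pair_hash (hs + j + m1)];

      if (skip == 0)
        {
          /* The end-pair hash matches: verify the whole candidate.  */
          if (memcmp (hs + j, ne, ne_len) == 0)
            return (const char *) (hs + j);
          skip = 1;   /* The real code instead skips by the distance to
                         the next-rightmost occurrence of the end pair.  */
        }
      j += skip;
    }
  return NULL;
}

int
main (void)
{
  const char *hay = "performance of strstr and memmem is improved";
  const char *hit = hashed_horspool (hay, strlen (hay), "memmem", 6);
  printf ("%s\n", hit != NULL ? hit : "(not found)");
  return 0;
}

Because the table stores only the rightmost position of each pair hash, a shift of m1 - shift[h] can never step over a real match, which is why collisions and the 8-bit entries are harmless; limiting needles to 256 bytes is what lets the table stay one byte per entry, as the commit messages above explain.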