From bff8346b0184b15fbb80863112133f48a7bd62a9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:17:09 -0800 Subject: x86-64 memchr/wmemchr: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and tst-size_t-wmemchr. * sysdeps/x86_64/x32/test-size_t.h: New file. * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. (cherry picked from commit 97700a34f36721b11a754cf37a1cc40695ece1fd) --- ChangeLog | 13 ++++++ NEWS | 7 ++++ sysdeps/x86_64/memchr.S | 10 +++-- sysdeps/x86_64/multiarch/memchr-avx2.S | 8 +++- sysdeps/x86_64/x32/Makefile | 8 ++++ sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++++ 8 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 sysdeps/x86_64/x32/test-size_t.h create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c diff --git a/ChangeLog b/ChangeLog index 60b0364037..d5a95a474c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the + upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/test-size_t.h: New file. + * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. + 2019-01-11 Gabriel F. T. Gomes * sysdeps/powerpc/fpu/libm-test-ulps: Regenerate. diff --git a/NEWS b/NEWS index 49895f81bd..1bea44cc8d 100644 --- a/NEWS +++ b/NEWS @@ -86,6 +86,12 @@ Security related changes: denial of service due to resource exhaustion when processing getaddrinfo calls with crafted host names. Reported by Guido Vranken. + CVE-2019-6488: On x32, the size_t parameter may be passed in the lower + 32 bits of a 64-bit register with with non-zero upper 32 bit. When it + happened, accessing the 32-bit size_t value as the full 64-bit register + in the assembly string/memory functions would cause a buffer overflow. + Reported by H.J. Lu. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. 
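A note on two idioms that recur throughout this series, before the assembly hunks below: RDX_LP and friends are, as far as I can tell, macros from glibc's x86-64 sysdep.h that expand to the 32-bit register name (%edx) when __ILP32__ is defined for x32 and to the 64-bit name (%rdx) otherwise; and the seemingly redundant "movl %edx, %edx" works because any write to a 32-bit register on x86-64 zero-extends into the full 64-bit register, wiping whatever the caller left in the upper half. A minimal C model of that effect, using hypothetical names that are not part of the patch:

/* Model of the x32 length fix: keep only the low 32 bits of the
   incoming register, which is exactly what "movl %edx, %edx" does.  */
#include <stddef.h>
#include <stdint.h>

static inline size_t
effective_length (uint64_t raw_rdx)   /* raw 64-bit register contents */
{
#ifdef __ILP32__
  return (uint32_t) raw_rdx;          /* 32-bit write zero-extends */
#else
  return raw_rdx;                     /* size_t is already 64 bits wide */
#endif
}

This is presumably also why the wmemchr path below needs no explicit clear: its "shl $2, %RDX_LP" becomes a 32-bit shift on x32, and the shifted result is zero-extended as a side effect.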
@@ -165,6 +171,7 @@ The following bugs are resolved with this release: [23927] Linux if_nametoindex() does not close descriptor (CVE-2018-19591) [24018] gettext may return NULL [24027] malloc: Integer overflow in realloc + [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) Version 2.26 diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index f5f05f6c8c..fd20f64f9b 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -34,12 +34,16 @@ ENTRY(MEMCHR) mov %edi, %ecx #ifdef USE_AS_WMEMCHR - test %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(return_null) - shl $2, %rdx + shl $2, %RDX_LP #else +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif punpcklbw %xmm1, %xmm1 - test %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(return_null) punpcklbw %xmm1, %xmm1 #endif diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index a7275ed7e1..85bdca5b4f 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -40,16 +40,20 @@ ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ - testq %rdx, %rdx + test %RDX_LP, %RDX_LP jz L(null) # endif movl %edi, %ecx /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 # ifdef USE_AS_WMEMCHR - shl $2, %rdx + shl $2, %RDX_LP vpbroadcastd %xmm0, %ymm0 # else +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif vpbroadcastb %xmm0, %ymm0 # endif /* Check if we may cross page boundary with one vector load. */ diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index f2ebc24fb0..7d528889c6 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -4,3 +4,11 @@ ifeq ($(subdir),math) # 64-bit llround. Add -fno-builtin-lround to silence the compiler. CFLAGS-s_llround.c += -fno-builtin-lround endif + +ifeq ($(subdir),string) +tests += tst-size_t-memchr +endif + +ifeq ($(subdir),wcsmbs) +tests += tst-size_t-wmemchr +endif diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h new file mode 100644 index 0000000000..78a940863e --- /dev/null +++ b/sysdeps/x86_64/x32/test-size_t.h @@ -0,0 +1,35 @@ +/* Test string/memory functions with size_t in the lower 32 bits of + 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_MAIN +#include + +/* On x32, parameter_t may be passed in a 64-bit register with the LEN + field in the lower 32 bits. When the LEN field of 64-bit register + is passed to string/memory function as the size_t parameter, only + the lower 32 bits can be used. 
*/ +typedef struct +{ + union + { + size_t len; + void (*fn) (void); + }; + void *p; +} parameter_t; diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c new file mode 100644 index 0000000000..29a3daf102 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c @@ -0,0 +1,72 @@ +/* Test memchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef WIDE +# define TEST_NAME "memchr" +#else +# define TEST_NAME "wmemchr" +#endif /* WIDE */ +#include "test-size_t.h" + +#ifndef WIDE +# define MEMCHR memchr +# define CHAR char +# define UCHAR unsigned char +#else +# include +# define MEMCHR wmemchr +# define CHAR wchar_t +# define UCHAR wchar_t +#endif /* WIDE */ + +IMPL (MEMCHR, 1) + +typedef CHAR * (*proto_t) (const CHAR*, int, size_t); + +static CHAR * +__attribute__ ((noinline, noclone)) +do_memchr (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + CHAR *res = do_memchr (src, c); + if (res) + { + error (0, 0, "Wrong result in function %s: %p != NULL", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c new file mode 100644 index 0000000000..877801d646 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c @@ -0,0 +1,20 @@ +/* Test wmemchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memchr.c" -- cgit 1.4.1 From 232a7628f01c1ed93144e3115b240950ec07e8f3 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:19:07 -0800 Subject: x86-64 memcmp/wmemcmp: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. 
The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and tst-size_t-wmemcmp. * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. (cherry picked from commit b304fc201d2f6baf52ea790df8643e99772243cd) --- ChangeLog | 13 +++++ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 ++- sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-- sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 ++- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++++ 7 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c diff --git a/ChangeLog b/ChangeLog index d5a95a474c..dfa52bf21d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. + * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and + tst-size_t-wmemcmp. + * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index 16f46301ca..fb12e13ddf 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -58,9 +58,12 @@ .section .text.avx,"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - cmpq $VEC_SIZE, %rdx + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 771639f662..834b84cf72 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -42,13 +42,16 @@ .section .text.sse4.1,"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx + shl $2, %RDX_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx # endif pxor %xmm0, %xmm0 - cmp $79, %rdx + cmp $79, %RDX_LP ja L(79bytesormore) # ifndef USE_AS_WMEMCMP - cmp $1, %rdx + cmp $1, %RDX_LP je L(firstbyte) # endif add %rdx, %rsi diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index 8d7d2fe67b..af8724e9db 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -33,9 +33,12 @@ atom_text_section ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP - shl $2, %rdx - test %rdx, %rdx + shl $2, %RDX_LP + test %RDX_LP, %RDX_LP jz L(equal) +# elif defined __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx # endif mov %rdx, %rcx mov %rdi, %rdx diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 7d528889c6..ddec7f0466 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr +tests += tst-size_t-memchr tst-size_t-memcmp endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr +tests += tst-size_t-wmemchr tst-size_t-wmemcmp endif diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c new file mode 100644 index 0000000000..9bd6fdb45a --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c @@ -0,0 +1,76 @@ +/* Test memcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_MAIN +#ifdef WIDE +# define TEST_NAME "wmemcmp" +#else +# define TEST_NAME "memcmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include +# include + +# define MEMCMP wmemcmp +# define CHAR wchar_t +#else +# define MEMCMP memcmp +# define CHAR char +#endif + +IMPL (MEMCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_memcmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + memcpy (buf1, buf2, page_size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_memcmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c new file mode 100644 index 0000000000..e8b5ffd0d5 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c @@ -0,0 +1,20 @@ +/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memcmp.c" -- cgit 1.4.1 From 94b88894b15d8a0dcce3d3b17e04880d72f2a9a7 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:20:06 -0800 Subject: x86-64 memcpy: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise. * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. tst-size_t-wmemchr. * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. (cherry picked from commit 231c56760c1e2ded21ad96bbb860b1f08c556c7a) --- ChangeLog | 15 ++++++ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--- sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--- .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--- .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 50 +++++++++++-------- sysdeps/x86_64/x32/Makefile | 2 +- sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 ++++++++++++++++++++++ 7 files changed, 135 insertions(+), 40 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c diff --git a/ChangeLog b/ChangeLog index dfa52bf21d..857cc9364f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. + * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: + Likewise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. + 2019-02-01 H.J. 
Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index 4e060a27fd..7388e7412c 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -48,28 +48,33 @@ .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index f3ea52a46c..74306d7daf 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -48,28 +48,33 @@ .section .text.ssse3,"ax",@progbits #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (MEMPCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMPCPY_CHK) ENTRY (MEMPCPY) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY) #endif #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) #endif ENTRY (MEMCPY) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP #ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP +#endif + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx #endif #ifdef USE_AS_MEMMOVE diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S index ae84ddc667..dae0616019 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -25,30 +25,34 @@ .section .text.avx512,"ax",@progbits # if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE ENTRY (__mempcpy_chk_avx512_no_vzeroupper) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_avx512_no_vzeroupper) ENTRY (__mempcpy_avx512_no_vzeroupper) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (__mempcpy_avx512_no_vzeroupper) # endif # ifdef SHARED ENTRY (__memmove_chk_avx512_no_vzeroupper) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_avx512_no_vzeroupper) # endif ENTRY (__memmove_avx512_no_vzeroupper) - mov %rdi, %rax + mov %RDI_LP, %RAX_LP # ifdef USE_AS_MEMPCPY - add %rdx, %rax + add %RDX_LP, %RAX_LP # endif L(start): +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx +# endif lea (%rsi, %rdx), %rcx lea (%rdi, %rdx), %r9 cmp $512, %rdx diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index d694e8b2be..9225fea1d7 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -107,22 +107,22 @@ .section SECTION(.text),"ax",@progbits #if defined SHARED && IS_IN (libc) ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) #endif #if VEC_SIZE == 16 || defined SHARED ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start) END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) #endif #if defined SHARED && IS_IN (libc) ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) #endif @@ -130,9 +130,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) movq %rdi, %rax L(start): - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(more_2x_vec) #if !defined USE_MULTIARCH || !IS_IN (libc) L(last_2x_vec): @@ -153,33 +157,33 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned)) # if VEC_SIZE == 16 # if defined SHARED ENTRY (__mempcpy_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__mempcpy_chk_erms) /* Only used to measure performance of REP MOVSB. */ ENTRY (__mempcpy_erms) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start_movsb) END (__mempcpy_erms) # endif ENTRY (__memmove_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memmove_chk_erms) ENTRY (__memmove_erms) movq %rdi, %rax L(start_movsb): - movq %rdx, %rcx - cmpq %rsi, %rdi + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP jb 1f /* Source == destination is less common. */ je 2f - leaq (%rsi,%rcx), %rdx - cmpq %rdx, %rdi + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP jb L(movsb_backward) 1: rep movsb @@ -201,18 +205,18 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) - movq %rdi, %rax - addq %rdx, %rax + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP jmp L(start_erms) END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) # endif @@ -220,9 +224,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) movq %rdi, %rax L(start_erms): - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(movsb_more_2x_vec) L(last_2x_vec): /* From VEC and to 2 * VEC. 
No branch when size == VEC_SIZE. */ @@ -249,7 +257,7 @@ L(movsb): # endif jb L(more_8x_vec_backward) 1: - movq %rdx, %rcx + mov %RDX_LP, %RCX_LP rep movsb L(nop): ret diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index ddec7f0466..2fe1e5ac5a 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr tst-size_t-memcmp +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c new file mode 100644 index 0000000000..66b71e1749 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c @@ -0,0 +1,58 @@ +/* Test memcpy with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_NAME "memcpy" +#include "test-size_t.h" + +IMPL (memcpy, 1) + +typedef void *(*proto_t) (void *, const void *, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memcpy (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + do_memcpy (dest, src); + int res = memcmp (dest.p, src.p, dest.len); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From c9ea2e82d4f4002b162f427c4761c5bec6ee6876 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:20:54 -0800 Subject: x86-64 memrchr: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. 
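One detail worth calling out about the memrchr change below: unlike memchr, it adds no explicit "clear the upper 32 bits" instruction. As I read the diff, that is because the very first use of the length is "sub $16, %RDX_LP", which on x32 is a 32-bit subtraction; its result is zero-extended into the full register and the flags consumed by the following jbe reflect the 32-bit length, so the later 64-bit uses of RDX/RDI are already safe. A rough C model of that side effect (hypothetical helper, not part of the patch):

/* A 32-bit ALU operation on x86-64 zero-extends its result, so the
   subtraction doubles as the upper-half clear.  */
#include <stdint.h>

static inline uint64_t
sub16_then_use_64bit (uint64_t raw_rdx)
{
  uint32_t len = (uint32_t) raw_rdx;  /* only the low half is the length */
  uint32_t r = len - 16;              /* sub $16, %edx on x32 */
  return r;                           /* implicitly zero-extended */
}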
(cherry picked from commit ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0) --- ChangeLog | 9 ++++++ sysdeps/x86_64/memrchr.S | 4 +-- sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +-- sysdeps/x86_64/x32/Makefile | 3 +- sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c diff --git a/ChangeLog b/ChangeLog index 857cc9364f..2cfeeabe66 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. + * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index 5fa0fe9c1c..44ef5c213a 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -24,13 +24,13 @@ ENTRY (__memrchr) movd %esi, %xmm1 - sub $16, %rdx + sub $16, %RDX_LP jbe L(length_less16) punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1 - add %rdx, %rdi + add %RDX_LP, %RDI_LP pshufd $0, %xmm1, %xmm1 movdqu (%rdi), %xmm0 diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index 3ee02e1cc3..40e1dba301 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2) vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 - subq $VEC_SIZE, %rdx + sub $VEC_SIZE, %RDX_LP jbe L(last_vec_or_less) - addq %rdx, %rdi + add %RDX_LP, %RDI_LP /* Check the last VEC_SIZE bytes. */ vpcmpeqb (%rdi), %ymm0, %ymm1 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 2fe1e5ac5a..e99dbd7c86 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround endif ifeq ($(subdir),string) -tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c new file mode 100644 index 0000000000..c83699c097 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c @@ -0,0 +1,57 @@ +/* Test memrchr with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_NAME "memrchr" +#include "test-size_t.h" + +IMPL (memchr, 1) + +typedef void * (*proto_t) (const void *, int, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memrchr (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t src = { { page_size }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + void * res = do_memrchr (src, c); + if (res) + { + error (0, 0, "Wrong result in function %s: %p != NULL", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From 885e4af2ac9b272d6ddd7f9c1954d0364d7ebab3 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:21:41 -0800 Subject: x86-64 memset/wmemset: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use RDX_LP for length. Clear the upper 32 bits of RDX register. * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. (cherry picked from commit 82d0b4a4d76db554eb6757acb790fcea30b19965) --- ChangeLog | 11 ++++ .../x86_64/multiarch/memset-avx512-no-vzeroupper.S | 6 +- .../x86_64/multiarch/memset-vec-unaligned-erms.S | 32 ++++++---- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-memset.c | 73 ++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 ++++++ 6 files changed, 131 insertions(+), 15 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c diff --git a/ChangeLog b/ChangeLog index 2cfeeabe66..c9a5db6f8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use + RDX_LP for length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. + * sysdeps/x86_64/x32/tst-size_t-memset.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S index 1f66602398..5be12bd06b 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -29,12 +29,16 @@ .section .text.avx512,"ax",@progbits #if defined PIC ENTRY (MEMSET_CHK) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMSET_CHK) #endif ENTRY (MEMSET) +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx +# endif vpxor %xmm0, %xmm0, %xmm0 vmovd %esi, %xmm1 lea (%rdi, %rdx), %rsi diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 8ed470283e..2023a8e108 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -75,8 +75,8 @@ .section SECTION(.text),"ax",@progbits #if VEC_SIZE == 16 && IS_IN (libc) ENTRY (__bzero) - movq %rdi, %rax /* Set return value. */ - movq %rsi, %rdx /* Set n. */ + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ pxor %xmm0, %xmm0 jmp L(entry_from_bzero) END (__bzero) @@ -86,13 +86,13 @@ weak_alias (__bzero, bzero) #if IS_IN (libc) # if defined SHARED ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) # endif ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) - shlq $2, %rdx + shl $2, %RDX_LP WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) jmp L(entry_from_bzero) END (WMEMSET_SYMBOL (__wmemset, unaligned)) @@ -100,13 +100,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned)) #if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) @@ -122,7 +126,7 @@ END (MEMSET_SYMBOL (__memset, unaligned)) # if VEC_SIZE == 16 ENTRY (__memset_chk_erms) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END (__memset_chk_erms) @@ -135,11 +139,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms)) L(stosb): /* Issue vzeroupper before rep stosb. */ VZEROUPPER - movq %rdx, %rcx + mov %RDX_LP, %RCX_LP movzbl %sil, %eax - movq %rdi, %rdx + mov %RDI_LP, %RDX_LP rep stosb - movq %rdx, %rax + mov %RDX_LP, %RAX_LP ret # if VEC_SIZE == 16 END (__memset_erms) @@ -149,16 +153,20 @@ END (MEMSET_SYMBOL (__memset, erms)) # if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) - cmpq %rdx, %rcx + cmp %RDX_LP, %RCX_LP jb HIDDEN_JUMPTARGET (__chk_fail) END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) - cmpq $VEC_SIZE, %rdx +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +# endif + cmp $VEC_SIZE, %RDX_LP jb L(less_vec) - cmpq $(VEC_SIZE * 2), %rdx + cmp $(VEC_SIZE * 2), %RDX_LP ja L(stosb_more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index e99dbd7c86..98bd9ae9cd 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -7,9 +7,9 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ - tst-size_t-memrchr + tst-size_t-memrchr tst-size_t-memset endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr tst-size_t-wmemcmp +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset endif diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c new file mode 100644 index 0000000000..2c367af6cd --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memset.c @@ -0,0 +1,73 @@ +/* Test memset with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wmemset" +#else +# define TEST_NAME "memset" +#endif /* WIDE */ + +#include "test-size_t.h" + +#ifdef WIDE +# include +# define MEMSET wmemset +# define CHAR wchar_t +#else +# define MEMSET memset +# define CHAR char +#endif /* WIDE */ + +IMPL (MEMSET, 1) + +typedef CHAR *(*proto_t) (CHAR *, int, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_memset (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, (uintptr_t) b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + CHAR ch = 0x23; + parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; + parameter_t c = { { 0 }, (void *) (uintptr_t) ch }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + c.fn = impl->fn; + CHAR *p = (CHAR *) do_memset (src, c); + size_t i; + for (i = 0; i < src.len; i++) + if (p[i] != ch) + { + error (0, 0, "Wrong result in function %s", impl->name); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c new file mode 100644 index 0000000000..955eb488c2 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c @@ -0,0 +1,20 @@ +/* Test wmemset with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-memset.c" -- cgit 1.4.1 From 15ce2f62f6bc04b87152e583549147073d71b1d2 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:22:33 -0800 Subject: x86-64 strncmp family: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes the strncmp family for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strcmp-sse42.S: Use RDX_LP for length. * sysdeps/x86_64/strcmp.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, tst-size_t-strncmp and tst-size_t-wcsncmp. * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. (cherry picked from commit ee915088a0231cd421054dbd8abab7aadf331153) --- ChangeLog | 12 +++++ sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-- sysdeps/x86_64/strcmp.S | 6 +-- sysdeps/x86_64/x32/Makefile | 6 ++- sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++++ 7 files changed, 179 insertions(+), 8 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c diff --git a/ChangeLog b/ChangeLog index c9a5db6f8e..84813b6d9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcmp-sse42.S: Use RDX_LP for length. + * sysdeps/x86_64/strcmp.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, + tst-size_t-strncmp and tst-size_t-wcsncmp. + * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 4aeb14e175..9e5f93cb86 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -155,11 +155,11 @@ STRCMP_SSE42: #endif #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %rdx, %rdx + test %RDX_LP, %RDX_LP je LABEL(strcmp_exitz) - cmp $1, %rdx + cmp $1, %RDX_LP je LABEL(Byte0) - mov %rdx, %r11 + mov %RDX_LP, %R11_LP #endif mov %esi, %ecx mov %edi, %eax diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 076be04df5..2aa301997f 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -135,11 +135,11 @@ ENTRY (STRCMP) * This implementation uses SSE to compare up to 16 bytes at a time. 
*/ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %rdx, %rdx + test %RDX_LP, %RDX_LP je LABEL(strcmp_exitz) - cmp $1, %rdx + cmp $1, %RDX_LP je LABEL(Byte0) - mov %rdx, %r11 + mov %RDX_LP, %R11_LP #endif mov %esi, %ecx mov %edi, %eax diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 98bd9ae9cd..db30283977 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -7,9 +7,11 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ - tst-size_t-memrchr tst-size_t-memset + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ + tst-size_t-strncmp endif ifeq ($(subdir),wcsmbs) -tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ + tst-size_t-wcsncmp endif diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c new file mode 100644 index 0000000000..862335937b --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c @@ -0,0 +1,59 @@ +/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define TEST_NAME "strncasecmp" +#include "test-size_t.h" + +IMPL (strncasecmp, 1) + +typedef int (*proto_t) (const char *, const char *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_strncasecmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + strncpy ((char *) buf1, (const char *) buf2, page_size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_strncasecmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c new file mode 100644 index 0000000000..54e6bd83ef --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c @@ -0,0 +1,78 @@ +/* Test strncmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wcsncmp" +#else +# define TEST_NAME "strncmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include + +# define STRNCMP wcsncmp +# define STRNCPY wcsncpy +# define CHAR wchar_t +#else +# define STRNCMP strncmp +# define STRNCPY strncpy +# define CHAR char +#endif + +IMPL (STRNCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + + +static int +__attribute__ ((noinline, noclone)) +do_strncmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR); + parameter_t dest = { { size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size); + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_strncmp (dest, src); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c new file mode 100644 index 0000000000..4829647c19 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c @@ -0,0 +1,20 @@ +/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-strncmp.c" -- cgit 1.4.1 From 40575878cd2cb00c7fe6de1da9811e1c5ab2c0c6 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:23:23 -0800 Subject: x86-64 strncpy: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP for length. * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. 
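A short aside on how the tst-size_t-* cases in this series (including tst-size_t-strncpy below) provoke the bug at all: the parameter_t type quoted earlier from test-size_t.h packs the 32-bit length and a 32-bit pointer into one 8-byte struct, which the x32 psABI passes in a single 64-bit register. When the noinline/noclone helpers then forward a.len as the size_t argument, the neighbouring pointer bits can ride along in the upper half of the length register; whether they actually survive depends on the code the compiler emits, which is presumably why the helpers are kept out of line. A standalone sketch of the layout assumption (illustrative only; the real definition is the one in test-size_t.h above):

/* On x32, size_t and void * are both 4 bytes, so len and p share one
   64-bit register slot when the struct is passed by value.  */
#include <assert.h>
#include <stddef.h>

typedef struct
{
  union { size_t len; void (*fn) (void); };  /* low half on x32 */
  void *p;                                   /* high half on x32 */
} parameter_t;

int
main (void)
{
#ifdef __ILP32__
  assert (sizeof (parameter_t) == 8);
#endif
  return 0;
}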
(cherry picked from commit c7c54f65b080affb87a1513dee449c8ad6143c8b) --- ChangeLog | 10 ++++ sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +- sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-- sysdeps/x86_64/x32/Makefile | 2 +- sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 ++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c diff --git a/ChangeLog b/ChangeLog index 84813b6d9e..77f977c36d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Use RDX_LP + for length. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. + * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index 6a5ab7ab26..b46f6f8aed 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -40,8 +40,8 @@ .text ENTRY (STRCPY) # ifdef USE_AS_STRNCPY - mov %rdx, %r8 - test %r8, %r8 + mov %RDX_LP, %R8_LP + test %R8_LP, %R8_LP jz L(ExitZero) # endif mov %rsi, %rcx diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index 47aaeae671..83134f3b2c 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -31,13 +31,13 @@ ENTRY (STRCPY) mov %rsi, %rcx # ifdef USE_AS_STRNCPY - mov %rdx, %r8 + mov %RDX_LP, %R8_LP # endif mov %rdi, %rdx # ifdef USE_AS_STRNCPY - test %r8, %r8 + test %R8_LP, %R8_LP jz L(Exit0) - cmp $8, %r8 + cmp $8, %R8_LP jbe L(StrncpyExit8Bytes) # endif cmpb $0, (%rcx) diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index db30283977..2a9e20a90b 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,7 +8,7 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp + tst-size_t-strncmp tst-size_t-strncpy endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c new file mode 100644 index 0000000000..4dec71e6b3 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c @@ -0,0 +1,58 @@ +/* Test strncpy with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_NAME "strncpy" +#include "test-size_t.h" + +IMPL (strncpy, 1) + +typedef char *(*proto_t) (char *, const char*, size_t); + +static void * +__attribute__ ((noinline, noclone)) +do_strncpy (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + do_strncpy (dest, src); + int res = strncmp (dest.p, src.p, dest.len); + if (res) + { + error (0, 0, "Wrong result in function %s: %i != 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From dc968f557398565d416dab69224f722c1d2beb9b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 1 Feb 2019 12:24:08 -0800 Subject: x86-64 strnlen/wcsnlen: Properly handle the length parameter [BZ #24097] On x32, the size_t parameter may be passed in the lower 32 bits of a 64-bit register with the non-zero upper 32 bits. The string/memory functions written in assembly can only use the lower 32 bits of a 64-bit register as length or must clear the upper 32 bits before using the full 64-bit register for length. This pach fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On x86-64, libc.so is the same with and withou the fix. [BZ #24097] CVE-2019-6488 * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. Clear the upper 32 bits of RSI register. * sysdeps/x86_64/strlen.S: Use RSI_LP for length. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen and tst-size_t-wcsnlen. * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. (cherry picked from commit 5165de69c0908e28a380cbd4bb054e55ea4abc95) --- ChangeLog | 12 ++++++ sysdeps/x86_64/multiarch/strlen-avx2.S | 9 +++-- sysdeps/x86_64/strlen.S | 12 +++--- sysdeps/x86_64/x32/Makefile | 4 +- sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++++++++++ sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++++ 6 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c diff --git a/ChangeLog b/ChangeLog index 77f977c36d..bc1f461a48 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2019-02-01 H.J. Lu + + [BZ #24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. + Clear the upper 32 bits of RSI register. + * sysdeps/x86_64/strlen.S: Use RSI_LP for length. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen + and tst-size_t-wcsnlen. + * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 1dc823af0a..2ce10f59e6 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -42,12 +42,15 @@ ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check for zero length. */ - testq %rsi, %rsi + test %RSI_LP, %RSI_LP jz L(zero) # ifdef USE_AS_WCSLEN - shl $2, %rsi + shl $2, %RSI_LP +# elif defined __ILP32__ + /* Clear the upper 32 bits. 
*/ + movl %esi, %esi # endif - movq %rsi, %r8 + mov %RSI_LP, %R8_LP # endif movl %edi, %ecx movq %rdi, %rdx diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index b5ab117c79..30cec693c1 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -59,21 +59,21 @@ ENTRY(strlen) #ifdef AS_STRNLEN /* Do not read anything when n==0. */ - test %rsi, %rsi + test %RSI_LP, %RSI_LP jne L(n_nonzero) xor %rax, %rax ret L(n_nonzero): # ifdef AS_WCSLEN - shlq $2, %rsi + shl $2, %RSI_LP # endif /* Initialize long lived registers. */ - add %rdi, %rsi - mov %rsi, %r10 - and $-64, %r10 - mov %rsi, %r11 + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP #endif pxor %xmm0, %xmm0 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 2a9e20a90b..1557724b0c 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,10 +8,10 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp tst-size_t-strncpy + tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen endif ifeq ($(subdir),wcsmbs) tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ - tst-size_t-wcsncmp + tst-size_t-wcsncmp tst-size_t-wcsnlen endif diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c new file mode 100644 index 0000000000..690a4a8a31 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c @@ -0,0 +1,72 @@ +/* Test strnlen with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef WIDE +# define TEST_NAME "wcsnlen" +#else +# define TEST_NAME "strnlen" +#endif /* WIDE */ + +#include "test-size_t.h" + +#ifdef WIDE +# include +# define STRNLEN wcsnlen +# define CHAR wchar_t +#else +# define STRNLEN strnlen +# define CHAR char +#endif /* WIDE */ + +IMPL (STRNLEN, 1) + +typedef size_t (*proto_t) (const CHAR *, size_t); + +static size_t +__attribute__ ((noinline, noclone)) +do_strnlen (parameter_t a, parameter_t b) +{ + return CALL (&a, a.p, b.len); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR); + parameter_t src = { { 0 }, buf2 }; + parameter_t c = { { size }, (void *) (uintptr_t) 'a' }; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + size_t res = do_strnlen (src, c); + if (res != size) + { + error (0, 0, "Wrong result in function %s: 0x%x != 0x%x", + impl->name, res, size); + ret = 1; + } + } + + return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; +} + +#include diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c new file mode 100644 index 0000000000..093b4bbe1b --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c @@ -0,0 +1,20 @@ +/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include "tst-size_t-strnlen.c" -- cgit 1.4.1 From 04e767b59b16befce93c6086362acbc1fae63f4e Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 4 Feb 2019 08:55:52 -0800 Subject: x86-64 memcmp: Use unsigned Jcc instructions on size [BZ #24155] Since the size argument is unsigned. we should use unsigned Jcc instructions, instead of signed, to check size. Tested on x86-64 and x32, with and without --disable-multi-arch. [BZ #24155] CVE-2019-7309 * NEWS: Updated for CVE-2019-7309. * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the upper 32 bits of RDX register for x32. Use unsigned Jcc instructions, instead of signed. * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. (cherry picked from commit 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d) --- ChangeLog | 11 +++++ NEWS | 10 ++++ sysdeps/x86_64/memcmp.S | 20 ++++---- sysdeps/x86_64/x32/Makefile | 3 +- sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 9 deletions(-) create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c diff --git a/ChangeLog b/ChangeLog index bc1f461a48..6570bc66bd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-02-04 H.J. Lu + + [BZ #24155] + CVE-2019-7309 + * NEWS: Updated for CVE-2019-7309. + * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the + upper 32 bits of RDX register for x32. Use unsigned Jcc + instructions, instead of signed. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. + * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. + 2019-02-01 H.J. Lu [BZ #24097] diff --git a/NEWS b/NEWS index 1bea44cc8d..735c578726 100644 --- a/NEWS +++ b/NEWS @@ -92,6 +92,14 @@ Security related changes: in the assembly string/memory functions would cause a buffer overflow. Reported by H.J. Lu. + CVE-2019-7309: x86-64 memcmp used signed Jcc instructions to check + size. For x86-64, memcmp on an object size larger than SSIZE_MAX + has undefined behavior. On x32, the size_t argument may be passed + in the lower 32 bits of the 64-bit RDX register with non-zero upper + 32 bits. When it happened with the sign bit of RDX register set, + memcmp gave the wrong result since it treated the size argument as + zero. Reported by H.J. Lu. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. 
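
To make the failure mode described above concrete, here is a stand-alone C sketch (illustrative only, not glibc code; the constant is arbitrary) of why a signed check mishandles a length register whose sign bit is set:

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* The 64-bit value memcmp saw in RDX when an x32 caller left the
       upper 32 bits, including the sign bit, uncleared; the caller's
       real size_t is only the low 32 bits (0x100 here).  */
    uint64_t size = UINT64_C (0x8000000000000100);

    /* Unsigned check, as with jbe/jae: the value is huge, the
       small-size path is not taken.  */
    printf ("unsigned: size <= 1 -> %d\n", size <= 1);

    /* Signed check, as with the old jle/jge: the value looks negative,
       so the small-size path is taken and the length is mishandled.  */
    printf ("signed:   size <= 1 -> %d\n", (int64_t) size <= 1);

    return 0;
  }
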
@@ -172,6 +180,8 @@ The following bugs are resolved with this release: [24018] gettext may return NULL [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) + [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + Version 2.26 diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 0828a22534..9ae90661c8 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -21,14 +21,18 @@ .text ENTRY (memcmp) - test %rdx, %rdx +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +#endif + test %RDX_LP, %RDX_LP jz L(finz) cmpq $1, %rdx - jle L(finr1b) + jbe L(finr1b) subq %rdi, %rsi movq %rdx, %r10 cmpq $32, %r10 - jge L(gt32) + jae L(gt32) /* Handle small chunks and last block of less than 32 bytes. */ L(small): testq $1, %r10 @@ -156,7 +160,7 @@ L(A32): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) /* Pre-unroll to be ready for unrolled 64B loop. */ testq $32, %rdi jz L(A64) @@ -178,7 +182,7 @@ L(A64): movq %r11, %r10 andq $-64, %r10 cmpq %r10, %rdi - jge L(mt32) + jae L(mt32) L(A64main): movdqu (%rdi,%rsi), %xmm0 @@ -216,7 +220,7 @@ L(mt32): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) L(A32main): movdqu (%rdi,%rsi), %xmm0 @@ -254,7 +258,7 @@ L(ATR): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) testq $16, %rdi jz L(ATR32) @@ -325,7 +329,7 @@ L(ATR64main): movq %r11, %r10 andq $-32, %r10 cmpq %r10, %rdi - jge L(mt16) + jae L(mt16) L(ATR32res): movdqa (%rdi,%rsi), %xmm0 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile index 1557724b0c..8748956563 100644 --- a/sysdeps/x86_64/x32/Makefile +++ b/sysdeps/x86_64/x32/Makefile @@ -8,7 +8,8 @@ endif ifeq ($(subdir),string) tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ - tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen + tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \ + tst-size_t-memcmp-2 endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c new file mode 100644 index 0000000000..d8ae1a0813 --- /dev/null +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c @@ -0,0 +1,79 @@ +/* Test memcmp with size_t in the lower 32 bits of 64-bit register. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define TEST_MAIN +#ifdef WIDE +# define TEST_NAME "wmemcmp" +#else +# define TEST_NAME "memcmp" +#endif + +#include "test-size_t.h" + +#ifdef WIDE +# include +# include + +# define MEMCMP wmemcmp +# define CHAR wchar_t +#else +# define MEMCMP memcmp +# define CHAR char +#endif + +IMPL (MEMCMP, 1) + +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); + +static int +__attribute__ ((noinline, noclone)) +do_memcmp (parameter_t a, parameter_t b) +{ + return CALL (&b, a.p, b.p, a.len); +} + +static int +test_main (void) +{ + test_init (); + + parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; + parameter_t src = { { 0 }, buf2 }; + + memcpy (buf1, buf2, page_size); + + CHAR *p = (CHAR *) buf1; + p[page_size / sizeof (CHAR) - 1] = (CHAR) 1; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + src.fn = impl->fn; + int res = do_memcmp (dest, src); + if (res >= 0) + { + error (0, 0, "Wrong result in function %s: %i >= 0", + impl->name, res); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include -- cgit 1.4.1 From c165427d55d4eefca2dd324073c52f54db0db15a Mon Sep 17 00:00:00 2001 From: Stefan Liebler Date: Thu, 7 Feb 2019 15:18:36 +0100 Subject: Add compiler barriers around modifications of the robust mutex list for pthread_mutex_trylock. [BZ #24180] While debugging a kernel warning, Thomas Gleixner, Sebastian Sewior and Heiko Carstens found a bug in pthread_mutex_trylock due to misordered instructions: 140: a5 1b 00 01 oill %r1,1 144: e5 48 a0 f0 00 00 mvghi 240(%r10),0 <--- THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); 14a: e3 10 a0 e0 00 24 stg %r1,224(%r10) <--- last THREAD_SETMEM of ENQUEUE_MUTEX_PI vs (with compiler barriers): 140: a5 1b 00 01 oill %r1,1 144: e3 10 a0 e0 00 24 stg %r1,224(%r10) 14a: e5 48 a0 f0 00 00 mvghi 240(%r10),0 Please have a look at the discussion: "Re: WARN_ON_ONCE(!new_owner) within wake_futex_pi() triggerede" (https://lore.kernel.org/lkml/20190202112006.GB3381@osiris/) This patch is introducing the same compiler barriers and comments for pthread_mutex_trylock as introduced for pthread_mutex_lock and pthread_mutex_timedlock by commit 8f9450a0b7a9e78267e8ae1ab1000ebca08e473e "Add compiler barriers around modifications of the robust mutex list." ChangeLog: [BZ #24180] * nptl/pthread_mutex_trylock.c (__pthread_mutex_trylock): Add compiler barriers and comments. (cherry picked from commit 823624bdc47f1f80109c9c52dee7939b9386d708) --- ChangeLog | 6 +++++ nptl/pthread_mutex_trylock.c | 57 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6570bc66bd..b0afd11b08 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2019-02-07 Stefan Liebler + + [BZ #24180] + * nptl/pthread_mutex_trylock.c (__pthread_mutex_trylock): + Add compiler barriers and comments. + 2019-02-04 H.J. Lu [BZ #24155] diff --git a/nptl/pthread_mutex_trylock.c b/nptl/pthread_mutex_trylock.c index ec7da61c73..d478eca21b 100644 --- a/nptl/pthread_mutex_trylock.c +++ b/nptl/pthread_mutex_trylock.c @@ -92,6 +92,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP: THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, &mutex->__data.__list.__next); + /* We need to set op_pending before starting the operation. Also + see comments at ENQUEUE_MUTEX. 
*/ + __asm ("" ::: "memory"); oldval = mutex->__data.__lock; do @@ -117,7 +120,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) /* But it is inconsistent unless marked otherwise. */ mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT; + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Note that we deliberately exist here. If we fall @@ -133,6 +141,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) int kind = PTHREAD_MUTEX_TYPE (mutex); if (kind == PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP) { + /* We do not need to ensure ordering wrt another memory + access. Also see comments at ENQUEUE_MUTEX. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EDEADLK; @@ -140,6 +150,8 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (kind == PTHREAD_MUTEX_ROBUST_RECURSIVE_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); @@ -158,6 +170,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) id, 0); if (oldval != 0 && (oldval & FUTEX_OWNER_DIED) == 0) { + /* We haven't acquired the lock as it is already acquired by + another owner. We do not need to ensure ordering wrt another + memory access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -171,13 +186,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (oldval == id) lll_unlock (mutex->__data.__lock, PTHREAD_ROBUST_MUTEX_PSHARED (mutex)); + /* FIXME This violates the mutex destruction requirements. See + __pthread_mutex_unlock_full. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return ENOTRECOVERABLE; } } while ((oldval & FUTEX_OWNER_DIED) != 0); + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); mutex->__data.__owner = id; @@ -203,10 +225,15 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP; if (robust) - /* Note: robust PI futexes are signaled by setting bit 0. */ - THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, - (void *) (((uintptr_t) &mutex->__data.__list.__next) - | 1)); + { + /* Note: robust PI futexes are signaled by setting bit 0. */ + THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, + (void *) (((uintptr_t) &mutex->__data.__list.__next) + | 1)); + /* We need to set op_pending before starting the operation. Also + see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); + } oldval = mutex->__data.__lock; @@ -215,12 +242,16 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) { if (kind == PTHREAD_MUTEX_ERRORCHECK_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EDEADLK; } if (kind == PTHREAD_MUTEX_RECURSIVE_NP) { + /* We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Just bump the counter. 
*/ @@ -242,6 +273,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) { if ((oldval & FUTEX_OWNER_DIED) == 0) { + /* We haven't acquired the lock as it is already acquired by + another owner. We do not need to ensure ordering wrt another + memory access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -262,6 +296,9 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) if (INTERNAL_SYSCALL_ERROR_P (e, __err) && INTERNAL_SYSCALL_ERRNO (e, __err) == EWOULDBLOCK) { + /* The kernel has not yet finished the mutex owner death. + We do not need to ensure ordering wrt another memory + access. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return EBUSY; @@ -279,7 +316,12 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) /* But it is inconsistent unless marked otherwise. */ mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT; + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); /* Note that we deliberately exit here. If we fall @@ -302,13 +344,20 @@ __pthread_mutex_trylock (pthread_mutex_t *mutex) PTHREAD_ROBUST_MUTEX_PSHARED (mutex)), 0, 0); + /* To the kernel, this will be visible after the kernel has + acquired the mutex in the syscall. */ THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); return ENOTRECOVERABLE; } if (robust) { + /* We must not enqueue the mutex before we have acquired it. + Also see comments at ENQUEUE_MUTEX. */ + __asm ("" ::: "memory"); ENQUEUE_MUTEX_PI (mutex); + /* We need to clear op_pending after we enqueue the mutex. */ + __asm ("" ::: "memory"); THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL); } -- cgit 1.4.1 From 4385ec1d8af4203b23dce8c9dc2f1aff5acaf094 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 14 May 2019 17:14:59 +0200 Subject: Fix crash in _IO_wfile_sync (bug 20568) When computing the length of the converted part of the stdio buffer, use the number of consumed wide characters, not the (negative) distance to the end of the wide buffer. (cherry picked from commit 32ff397533715988c19cbf3675dcbd727ec13e18) --- ChangeLog | 10 ++++++++++ NEWS | 1 + libio/Makefile | 3 ++- libio/tst-wfile-sync.c | 39 +++++++++++++++++++++++++++++++++++++++ libio/tst-wfile-sync.input | 1 + libio/wfileops.c | 5 +++-- 6 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 libio/tst-wfile-sync.c create mode 100644 libio/tst-wfile-sync.input diff --git a/ChangeLog b/ChangeLog index b0afd11b08..78924ef13e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-05-15 Andreas Schwab + + [BZ #20568] + * libio/wfileops.c (_IO_wfile_sync): Correct last argument to + __codecvt_do_length. + * libio/Makefile (tests): Add tst-wfile-sync. + ($(objpfx)tst-wfile-sync.out): Depend on $(gen-locales). + * libio/tst-wfile-sync.c: New file. + * libio/tst-wfile-sync.input: New file. 
+ 2019-02-07 Stefan Liebler [BZ #24180] diff --git a/NEWS b/NEWS index 735c578726..c6c44a3914 100644 --- a/NEWS +++ b/NEWS @@ -107,6 +107,7 @@ The following bugs are resolved with this release: [17956] crypt: Use NSPR header files in addition to NSS header files [20419] elf: Fix stack overflow with huge PT_NOTE segment [20532] getaddrinfo: More robust handling of dlopen failures + [20568] Fix crash in _IO_wfile_sync [21242] assert: Suppress pedantic warning caused by statement expression [21265] x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [21269] i386 sigaction sa_restorer handling is wrong diff --git a/libio/Makefile b/libio/Makefile index 74bf5279f1..79158f3ebd 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -62,7 +62,7 @@ tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \ bug-memstream1 bug-wmemstream1 \ tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \ tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \ - tst-ftell-append tst-fputws + tst-ftell-append tst-fputws tst-wfile-sync tests-internal = tst-vtables tst-vtables-interposed @@ -202,6 +202,7 @@ $(objpfx)tst-ungetwc1.out: $(gen-locales) $(objpfx)tst-ungetwc2.out: $(gen-locales) $(objpfx)tst-widetext.out: $(gen-locales) $(objpfx)tst_wprintf2.out: $(gen-locales) +$(objpfx)tst-wfile-sync.out: $(gen-locales) endif $(objpfx)test-freopen.out: test-freopen.sh $(objpfx)test-freopen diff --git a/libio/tst-wfile-sync.c b/libio/tst-wfile-sync.c new file mode 100644 index 0000000000..618682064d --- /dev/null +++ b/libio/tst-wfile-sync.c @@ -0,0 +1,39 @@ +/* Test that _IO_wfile_sync does not crash (bug 20568). + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +static int +do_test (void) +{ + TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL); + /* Fill the stdio buffer and advance the read pointer. */ + TEST_VERIFY_EXIT (fgetwc (stdin) != WEOF); + /* This calls _IO_wfile_sync, it should not crash. */ + TEST_VERIFY_EXIT (setvbuf (stdin, NULL, _IONBF, 0) == 0); + /* Verify that the external file offset has been synchronized. */ + TEST_COMPARE (xlseek (0, 0, SEEK_CUR), 1); + + return 0; +} + +#include diff --git a/libio/tst-wfile-sync.input b/libio/tst-wfile-sync.input new file mode 100644 index 0000000000..12d0958f7a --- /dev/null +++ b/libio/tst-wfile-sync.input @@ -0,0 +1 @@ +This is a test of _IO_wfile_sync. diff --git a/libio/wfileops.c b/libio/wfileops.c index fb94f45040..727e1b23b9 100644 --- a/libio/wfileops.c +++ b/libio/wfileops.c @@ -526,11 +526,12 @@ _IO_wfile_sync (_IO_FILE *fp) generate the wide characters up to the current reading position. 
*/ int nread; - + size_t wnread = (fp->_wide_data->_IO_read_ptr + - fp->_wide_data->_IO_read_base); fp->_wide_data->_IO_state = fp->_wide_data->_IO_last_state; nread = (*cv->__codecvt_do_length) (cv, &fp->_wide_data->_IO_state, fp->_IO_read_base, - fp->_IO_read_end, delta); + fp->_IO_read_end, wnread); fp->_IO_read_ptr = fp->_IO_read_base + nread; delta = -(fp->_IO_read_end - fp->_IO_read_base - nread); } -- cgit 1.4.1 From ac92c66821fae2ae6587114acc454f97a6bc7859 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Fri, 10 May 2019 16:38:21 +0100 Subject: Fix tcache count maximum (BZ #24531) The tcache counts[] array is a char, which has a very small range and thus may overflow. When setting tcache_count tunable, there is no overflow check. However the tunable must not be larger than the maximum value of the tcache counts[] array, otherwise it can overflow when filling the tcache. [BZ #24531] * malloc/malloc.c (MAX_TCACHE_COUNT): New define. (do_set_tcache_count): Only update if count is small enough. * manual/tunables.texi (glibc.malloc.tcache_count): Document max value. (cherry picked from commit 5ad533e8e65092be962e414e0417112c65d154fb) --- ChangeLog | 7 +++++++ malloc/malloc.c | 9 +++++++-- manual/tunables.texi | 4 ++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 78924ef13e..fff671fd43 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2019-05-22 Wilco Dijkstra + + [BZ #24531] + * malloc/malloc.c (MAX_TCACHE_COUNT): New define. + (do_set_tcache_count): Only update if count is small enough. + * manual/tunables.texi (glibc.malloc.tcache_count): Document max value. + 2019-05-15 Andreas Schwab [BZ #20568] diff --git a/malloc/malloc.c b/malloc/malloc.c index 49e8ed69c2..9896230b21 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -2924,6 +2924,8 @@ typedef struct tcache_perthread_struct tcache_entry *entries[TCACHE_MAX_BINS]; } tcache_perthread_struct; +#define MAX_TCACHE_COUNT 127 /* Maximum value of counts[] entries. */ + static __thread bool tcache_shutting_down = false; static __thread tcache_perthread_struct *tcache = NULL; @@ -5097,8 +5099,11 @@ static inline int __always_inline do_set_tcache_count (size_t value) { - LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count); - mp_.tcache_count = value; + if (value <= MAX_TCACHE_COUNT) + { + LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count); + mp_.tcache_count = value; + } return 1; } diff --git a/manual/tunables.texi b/manual/tunables.texi index b09e3fe791..a23c8d076a 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -187,8 +187,8 @@ per-thread cache. The default (and maximum) value is 1032 bytes on @deftp Tunable glibc.malloc.tcache_count The maximum number of chunks of each size to cache. The default is 7. -There is no upper limit, other than available system memory. If set -to zero, the per-thread cache is effectively disabled. +The upper limit is 127. If set to zero, the per-thread cache is effectively +disabled. 
The approximate maximum overhead of the per-thread cache is thus equal to the number of bins times the chunk count in each bin times the size -- cgit 1.4.1 From 71c2578a9b0ea0e5a37c88ee01885213f40eaeea Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 25 Apr 2019 15:35:35 +0100 Subject: aarch64: add STO_AARCH64_VARIANT_PCS and DT_AARCH64_VARIANT_PCS STO_AARCH64_VARIANT_PCS is a non-visibility st_other flag for marking symbols that reference functions that may follow a variant PCS with different register usage convention from the base PCS. DT_AARCH64_VARIANT_PCS is a dynamic tag that marks ELF modules that have R_*_JUMP_SLOT relocations for symbols marked with STO_AARCH64_VARIANT_PCS (i.e. have variant PCS calls via a PLT). * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. (DT_AARCH64_VARIANT_PCS): Define. --- ChangeLog | 5 +++++ elf/elf.h | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index fff671fd43..55dde7df0c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-06-13 Szabolcs Nagy + + * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. + (DT_AARCH64_VARIANT_PCS): Define. + 2019-05-22 Wilco Dijkstra [BZ #24531] diff --git a/elf/elf.h b/elf/elf.h index 3900b4c9f0..f80506c562 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -2759,6 +2759,13 @@ enum #define R_AARCH64_TLSDESC 1031 /* TLS Descriptor. */ #define R_AARCH64_IRELATIVE 1032 /* STT_GNU_IFUNC relocation. */ +/* AArch64 specific values for the Dyn d_tag field. */ +#define DT_AARCH64_VARIANT_PCS (DT_LOPROC + 5) +#define DT_AARCH64_NUM 6 + +/* AArch64 specific values for the st_other field. */ +#define STO_AARCH64_VARIANT_PCS 0x80 + /* ARM relocs. */ #define R_ARM_NONE 0 /* No reloc */ -- cgit 1.4.1 From f2f501ff397953d711ce249bc705fb6139ab1f7b Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 25 Apr 2019 15:35:35 +0100 Subject: aarch64: handle STO_AARCH64_VARIANT_PCS Backport of commit 82bc69c012838a381c4167c156a06f4598f34227 and commit 30ba0375464f34e4bf8129f3d3dc14d0c09add17 without using DT_AARCH64_VARIANT_PCS for optimizing the symbol table check. This is needed so the internal abi between ld.so and libc.so is unchanged. Avoid lazy binding of symbols that may follow a variant PCS with different register usage convention from the base PCS. Currently the lazy binding entry code does not preserve all the registers required for AdvSIMD and SVE vector calls. Saving and restoring all registers unconditionally may break existing binaries, even if they never use vector calls, because of the larger stack requirement for lazy resolution, which can be significant on an SVE system. The solution is to mark all symbols in the symbol table that may follow a variant PCS so the dynamic linker can handle them specially. In this patch such symbols are always resolved at load time, not lazily. So currently LD_AUDIT for variant PCS symbols are not supported, for that the _dl_runtime_profile entry needs to be changed e.g. to unconditionally save/restore all registers (but pass down arg and retval registers to pltentry/exit callbacks according to the base PCS). This patch also removes a __builtin_expect from the modified code because the branch prediction hint did not seem useful. * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check STO_AARCH64_VARIANT_PCS and bind such symbols at load time. 
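
To make the new constants concrete, the sketch below shows the kind of per-symbol test a loader or ELF tool can apply; it is illustrative only and assumes an elf.h that already carries the definitions added here. The next patch wires an equivalent check into ld.so's lazy-relocation path so that marked symbols are bound at load time.

  #include <elf.h>
  #include <stdbool.h>

  /* Return true if an AArch64 dynamic symbol is marked as possibly
     following a variant procedure-call standard (for example AdvSIMD
     or SVE vector calls), in which case lazy binding through the
     ordinary PLT resolver is unsafe because not all argument registers
     are preserved there.  */
  static bool
  aarch64_sym_is_variant_pcs (const Elf64_Sym *sym)
  {
    return (sym->st_other & STO_AARCH64_VARIANT_PCS) != 0;
  }
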
--- ChangeLog | 5 +++++ sysdeps/aarch64/dl-machine.h | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 55dde7df0c..8a2fad8626 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-07-12 Szabolcs Nagy + + * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check + STO_AARCH64_VARIANT_PCS and bind such symbols at load time. + 2019-06-13 Szabolcs Nagy * elf/elf.h (STO_AARCH64_VARIANT_PCS): Define. diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 3fb00e6e2d..9ffc2e4c9d 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -391,10 +391,37 @@ elf_machine_lazy_rel (struct link_map *map, /* Check for unexpected PLT reloc type. */ if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { - if (__builtin_expect (map->l_mach.plt, 0) == 0) - *reloc_addr += l_addr; - else - *reloc_addr = map->l_mach.plt; + if (map->l_mach.plt == 0) + { + /* Prelinking. */ + *reloc_addr += l_addr; + return; + } + + if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check. */ + { + /* Check the symbol table for variant PCS symbols. */ + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info); + const ElfW (Sym) *symtab = + (const void *)D_PTR (map, l_info[DT_SYMTAB]); + const ElfW (Sym) *sym = &symtab[symndx]; + if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS)) + { + /* Avoid lazy resolution of variant PCS symbols. */ + const struct r_found_version *version = NULL; + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW (Half) *vernum = + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]); + version = &map->l_versions[vernum[symndx] & 0x7fff]; + } + elf_machine_rela (map, reloc, sym, version, reloc_addr, + skip_ifunc); + return; + } + } + + *reloc_addr = map->l_mach.plt; } else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { -- cgit 1.4.1 From 54194d8b4d097fd3a9cb9528a0ad686622d6c182 Mon Sep 17 00:00:00 2001 From: Adhemerval Zanella Date: Tue, 18 Jun 2019 14:09:09 -0300 Subject: posix: Fix large mmap64 offset for mips64n32 (BZ#24699) The fix for BZ#21270 (commit 158d5fa0e19) added a mask to avoid offset larger than 1^44 to be used along __NR_mmap2. However mips64n32 users __NR_mmap, as mips64n64, but still defines off_t as old non-LFS type (other ILP32, such x32, defines off_t being equal to off64_t). This leads to use the same mask meant only for __NR_mmap2 call for __NR_mmap, thus limiting the maximum offset it can use with mmap64. This patch fixes by setting the high mask only for __NR_mmap2 usage. The posix/tst-mmap-offset.c already tests it and also fails for mips64n32. The patch also change the test to check for an arch-specific header that defines the maximum supported offset. Checked on x86_64-linux-gnu, i686-linux-gnu, and I also tests tst-mmap-offset on qemu simulated mips64 with kernel 3.2.0 kernel for both mips-linux-gnu and mips64-n32-linux-gnu. [BZ #24699] * posix/tst-mmap-offset.c: Mention BZ #24699. (do_test_bz21270): Rename to do_test_large_offset and use mmap64_maximum_offset to check for maximum expected offset value. * sysdeps/generic/mmap_info.h: New file. * sysdeps/unix/sysv/linux/mips/mmap_info.h: Likewise. * sysdeps/unix/sysv/linux/mmap64.c (MMAP_OFF_HIGH_MASK): Define iff __NR_mmap2 is used. 
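
For reference, the limit being enforced can be spelled out as a small stand-alone program. This mirrors the generic mmap_info.h logic added below; it is illustrative only, uses sizeof (off_t) < 8 as a stand-in for the off_t versus off64_t comparison, and derives the page shift with __builtin_ctzl.

  #include <stdint.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <unistd.h>

  int
  main (void)
  {
    long page_size = sysconf (_SC_PAGESIZE);
    int page_shift = __builtin_ctzl ((unsigned long) page_size);
    uint64_t max_offset;

    if (sizeof (off_t) < 8)
      /* Generic rule for ABIs that go through __NR_mmap2: the offset is
         passed in page-size units, so offsets at or above
         1 << (page_shift + 8 * sizeof (off_t)) cannot be represented
         and glibc's mmap64 rejects them with EINVAL.  With 4096-byte
         pages and a 32-bit off_t that is 1 << 44.  mips64n32 is the
         exception this patch adds: it has a 32-bit off_t but uses
         __NR_mmap, so its own mmap_info.h reports the full 64-bit
         range instead.  */
      max_offset = (UINT64_C (1) << (page_shift + 8 * sizeof (off_t))) - 1;
    else
      max_offset = UINT64_MAX;

    printf ("largest usable mmap64 offset here: %#llx\n",
            (unsigned long long) max_offset);
    return 0;
  }
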
(cherry picked from commit a008c76b56e4f958cf5a0d6f67d29fade89421b7) --- ChangeLog | 11 +++++++++++ posix/tst-mmap-offset.c | 9 +++++---- sysdeps/generic/mmap_info.h | 16 ++++++++++++++++ sysdeps/unix/sysv/linux/mips/mmap_info.h | 13 +++++++++++++ sysdeps/unix/sysv/linux/mmap64.c | 9 ++++++++- 5 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 sysdeps/generic/mmap_info.h create mode 100644 sysdeps/unix/sysv/linux/mips/mmap_info.h diff --git a/ChangeLog b/ChangeLog index 8a2fad8626..4cd747a6f0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2019-07-12 Adhemerval Zanella + + [BZ #24699] + * posix/tst-mmap-offset.c: Mention BZ #24699. + (do_test_bz21270): Rename to do_test_large_offset and use + mmap64_maximum_offset to check for maximum expected offset value. + * sysdeps/generic/mmap_info.h: New file. + * sysdeps/unix/sysv/linux/mips/mmap_info.h: Likewise. + * sysdeps/unix/sysv/linux/mmap64.c (MMAP_OFF_HIGH_MASK): Define iff + __NR_mmap2 is used. + 2019-07-12 Szabolcs Nagy * sysdeps/aarch64/dl-machine.h (elf_machine_lazy_rel): Check diff --git a/posix/tst-mmap-offset.c b/posix/tst-mmap-offset.c index 5bb88aab10..cfd82484f6 100644 --- a/posix/tst-mmap-offset.c +++ b/posix/tst-mmap-offset.c @@ -1,4 +1,4 @@ -/* BZ #18877 and #21270 mmap offset test. +/* BZ #18877, BZ #21270, and BZ #24699 mmap offset test. Copyright (C) 2015-2017 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -76,7 +77,7 @@ do_test_bz18877 (void) /* Check if invalid offset are handled correctly by mmap. */ static int -do_test_bz21270 (void) +do_test_large_offset (void) { /* For architectures with sizeof (off_t) < sizeof (off64_t) mmap is implemented with __SYS_mmap2 syscall and the offset is represented in @@ -90,7 +91,7 @@ do_test_bz21270 (void) const size_t length = 4096; void *addr = mmap64 (NULL, length, prot, flags, fd, offset); - if (sizeof (off_t) < sizeof (off64_t)) + if (mmap64_maximum_offset (page_shift) < UINT64_MAX) { if ((addr != MAP_FAILED) && (errno != EINVAL)) FAIL_RET ("mmap succeed"); @@ -110,7 +111,7 @@ do_test (void) int ret = 0; ret += do_test_bz18877 (); - ret += do_test_bz21270 (); + ret += do_test_large_offset (); return ret; } diff --git a/sysdeps/generic/mmap_info.h b/sysdeps/generic/mmap_info.h new file mode 100644 index 0000000000..b3087df2d3 --- /dev/null +++ b/sysdeps/generic/mmap_info.h @@ -0,0 +1,16 @@ +/* As default architectures with sizeof (off_t) < sizeof (off64_t) the mmap is + implemented with __SYS_mmap2 syscall and the offset is represented in + multiples of page size. For offset larger than + '1 << (page_shift + 8 * sizeof (off_t))' (that is, 1<<44 on system with + page size of 4096 bytes) the system call silently truncates the offset. + For this case, glibc mmap implementation returns EINVAL. */ + +/* Return the maximum value expected as offset argument in mmap64 call. */ +static inline uint64_t +mmap64_maximum_offset (long int page_shift) +{ + if (sizeof (off_t) < sizeof (off64_t)) + return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1; + else + return UINT64_MAX; +} diff --git a/sysdeps/unix/sysv/linux/mips/mmap_info.h b/sysdeps/unix/sysv/linux/mips/mmap_info.h new file mode 100644 index 0000000000..07c9e3a044 --- /dev/null +++ b/sysdeps/unix/sysv/linux/mips/mmap_info.h @@ -0,0 +1,13 @@ +/* mips64n32 uses __NR_mmap for mmap64 while still having sizeof (off_t) + smaller than sizeof (off64_t). 
So it allows mapping large offsets + using mmap64 than 32-bit archs which uses __NR_mmap2. */ + +static inline uint64_t +mmap64_maximum_offset (long int page_shift) +{ +#if _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64 + return UINT64_MAX; +#else + return (UINT64_C(1) << (page_shift + (8 * sizeof (off_t)))) - 1; +#endif +} diff --git a/sysdeps/unix/sysv/linux/mmap64.c b/sysdeps/unix/sysv/linux/mmap64.c index e8d519b17a..8441a9caa7 100644 --- a/sysdeps/unix/sysv/linux/mmap64.c +++ b/sysdeps/unix/sysv/linux/mmap64.c @@ -23,11 +23,18 @@ #include #include +#ifdef __NR_mmap2 /* To avoid silent truncation of offset when using mmap2, do not accept offset larger than 1 << (page_shift + off_t bits). For archictures with 32 bits off_t and page size of 4096 it would be 1^44. */ -#define MMAP_OFF_HIGH_MASK \ +# define MMAP_OFF_HIGH_MASK \ ((-(MMAP2_PAGE_UNIT << 1) << (8 * sizeof (off_t) - 1))) +#else +/* Some ABIs might use __NR_mmap while having sizeof (off_t) smaller than + sizeof (off64_t) (currently only MIPS64n32). For this case just set + zero the higher bits so mmap with large offset does not fail. */ +# define MMAP_OFF_HIGH_MASK 0x0 +#endif #define MMAP_OFF_MASK (MMAP_OFF_HIGH_MASK | MMAP_OFF_LOW_MASK) -- cgit 1.4.1 From 1896de3d926d299a1ed5c9f0a4f03f5a81969200 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Aug 2017 17:00:38 +0100 Subject: [AArch64] Optimized memcmp. This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. * sysdeps/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. (cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5) --- ChangeLog | 5 ++ sysdeps/aarch64/memcmp.S | 176 +++++++++++++++++++---------------------------- 2 files changed, 76 insertions(+), 105 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4cd747a6f0..5783090424 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Wilco Dijkstra + + * sysdeps/aarch64/memcmp.S (memcmp): + Rewrite of optimized memcmp. + 2019-07-12 Adhemerval Zanella [BZ #24699] diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 4cfcb89297..b99c081bba 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -22,132 +22,98 @@ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. 
*/ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define tmp1 x5 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) DELOUSE (1) DELOUSE (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): -#ifndef __AARCH64EB__ - rev diff, diff + subs limit, limit, 8 + b.lo .Lless8 + + /* Limit >= 8, so check first 8 bytes using unaligned loads. */ + ldr data1, [src1], 8 + ldr data2, [src2], 8 + and tmp1, src1, 7 + add limit, limit, tmp1 + cmp data1, data2 + bne .Lreturn + + /* Align src1 and adjust src2 with bytes not yet done. */ + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + subs limit, limit, 8 + b.ls .Llast_bytes + + /* Loop performing 8 bytes per iteration using aligned src1. + Limit is pre-decremented by 8 and must be larger than zero. + Exit if <= 8 bytes left to do or if the data is not equal. */ + .p2align 4 +.Lloop8: + ldr data1, [src1], 8 + ldr data2, [src2], 8 + subs limit, limit, 8 + ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ + b.eq .Lloop8 + + cmp data1, data2 + bne .Lreturn + + /* Compare last 1-8 bytes using unaligned access. */ +.Llast_bytes: + ldr data1, [src1, limit] + ldr data2, [src2, limit] + + /* Compare data bytes and set return value to 0, -1 or 1. */ +.Lreturn: +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). 
*/ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +.Lret_eq: + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +.Lless8: + adds limit, limit, 4 + b.lo .Lless4 + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne .Lreturn + sub limit, limit, 4 +.Lless4: + adds limit, limit, 4 + beq .Lret_eq +.Lbyte_loop: + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq .Lbyte_loop + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp) -- cgit 1.4.1 From 600e4e866c4de0cc0b16aec482c65da732960367 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 2 Feb 2018 10:15:20 +0530 Subject: aarch64: Use the L() macro for labels in memcmp The L() macro makes the assembly a bit more readable. * sysdeps/aarch64/memcmp.S: Use L() macro for labels. (cherry picked from commit 84c94d2fd90d84ae7e67657ee8e22c2d1b796f63) --- ChangeLog | 4 ++++ sysdeps/aarch64/memcmp.S | 32 ++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5783090424..8674417ed7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Use L() macro for labels. + 2019-09-06 Wilco Dijkstra * sysdeps/aarch64/memcmp.S (memcmp): diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index b99c081bba..708c8278b7 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -44,7 +44,7 @@ ENTRY_ALIGN (memcmp, 6) DELOUSE (2) subs limit, limit, 8 - b.lo .Lless8 + b.lo L(less8) /* Limit >= 8, so check first 8 bytes using unaligned loads. */ ldr data1, [src1], 8 @@ -52,65 +52,65 @@ ENTRY_ALIGN (memcmp, 6) and tmp1, src1, 7 add limit, limit, tmp1 cmp data1, data2 - bne .Lreturn + bne L(return) /* Align src1 and adjust src2 with bytes not yet done. */ sub src1, src1, tmp1 sub src2, src2, tmp1 subs limit, limit, 8 - b.ls .Llast_bytes + b.ls L(last_bytes) /* Loop performing 8 bytes per iteration using aligned src1. Limit is pre-decremented by 8 and must be larger than zero. Exit if <= 8 bytes left to do or if the data is not equal. */ .p2align 4 -.Lloop8: +L(loop8): ldr data1, [src1], 8 ldr data2, [src2], 8 subs limit, limit, 8 ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ - b.eq .Lloop8 + b.eq L(loop8) cmp data1, data2 - bne .Lreturn + bne L(return) /* Compare last 1-8 bytes using unaligned access. */ -.Llast_bytes: +L(last_bytes): ldr data1, [src1, limit] ldr data2, [src2, limit] /* Compare data bytes and set return value to 0, -1 or 1. */ -.Lreturn: +L(return): #ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif cmp data1, data2 -.Lret_eq: +L(ret_eq): cset result, ne cneg result, result, lo ret .p2align 4 /* Compare up to 8 bytes. Limit is [-8..-1]. 
*/ -.Lless8: +L(less8): adds limit, limit, 4 - b.lo .Lless4 + b.lo L(less4) ldr data1w, [src1], 4 ldr data2w, [src2], 4 cmp data1w, data2w - b.ne .Lreturn + b.ne L(return) sub limit, limit, 4 -.Lless4: +L(less4): adds limit, limit, 4 - beq .Lret_eq -.Lbyte_loop: + beq L(ret_eq) +L(byte_loop): ldrb data1w, [src1], 1 ldrb data2w, [src2], 1 subs limit, limit, 1 ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq .Lbyte_loop + b.eq L(byte_loop) sub result, data1w, data2w ret -- cgit 1.4.1 From ec4512194f035856b8a231476c9139d72f47c58f Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 6 Mar 2018 19:22:39 +0530 Subject: aarch64: Optimized memcmp for medium to large sizes This improved memcmp provides a fast path for compares up to 16 bytes and then compares 16 bytes at a time, thus optimizing loads from both sources. The glibc memcmp microbenchmark retains performance (with an error of ~1ns) for smaller compare sizes and reduces up to 31% of execution time for compares up to 4K on the APM Mustang. On Qualcomm Falkor this improves to almost 48%, i.e. it is almost 2x improvement for sizes of 2K and above. * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time. (cherry picked from commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585) --- ChangeLog | 5 ++++ sysdeps/aarch64/memcmp.S | 76 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8674417ed7..204d047c0d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a + time. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/memcmp.S: Use L() macro for labels. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 708c8278b7..d074c98615 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -34,9 +34,12 @@ /* Internal variables. */ #define data1 x3 #define data1w w3 -#define data2 x4 -#define data2w w4 -#define tmp1 x5 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) @@ -46,39 +49,70 @@ ENTRY_ALIGN (memcmp, 6) subs limit, limit, 8 b.lo L(less8) - /* Limit >= 8, so check first 8 bytes using unaligned loads. */ ldr data1, [src1], 8 ldr data2, [src2], 8 - and tmp1, src1, 7 - add limit, limit, tmp1 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 cmp data1, data2 bne L(return) + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop8) + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 sub src1, src1, tmp1 sub src2, src2, tmp1 - subs limit, limit, 8 - b.ls L(last_bytes) - - /* Loop performing 8 bytes per iteration using aligned src1. - Limit is pre-decremented by 8 and must be larger than zero. - Exit if <= 8 bytes left to do or if the data is not equal. */ + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. 
*/ .p2align 4 -L(loop8): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - subs limit, limit, 8 - ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ - b.eq L(loop8) +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 bne L(return) - /* Compare last 1-8 bytes using unaligned access. */ + /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - ldr data1, [src1, limit] - ldr data2, [src2, limit] + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): -- cgit 1.4.1 From 8569357e11aa7b8e912142727eac1d106c785433 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 6 Mar 2018 22:56:35 +0530 Subject: aarch64: Fix branch target to loop16 I goofed up when changing the loop8 name to loop16 and missed on out the branch instance. Fixed and actually build tested this time. * sysdeps/aarch64/memcmp.S (more16): Fix branch target loop16. (cherry picked from commit 4e54d918630ea53e29dd70d3bdffcb00d29ed3d4) --- ChangeLog | 2 ++ sysdeps/aarch64/memcmp.S | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 204d047c0d..18a01ed12b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ 2019-09-06 Siddhesh Poyarekar + * sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target. + * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index d074c98615..a741e7b17f 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -75,7 +75,7 @@ L(more16): /* We overlap loads between 0-32 bytes at either side of SRC1 when we try to align, so limit it only to strings larger than 128 bytes. */ cmp limit, 96 - b.ls L(loop8) + b.ls L(loop16) /* Align src1 and adjust src2 with bytes not yet done. */ and tmp1, src1, 15 -- cgit 1.4.1 From 4e75091d6ce3f7ac8b1750ca6135bc37d6707caf Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 13 Dec 2017 18:50:27 +0530 Subject: aarch64: Improve strcmp unaligned performance Replace the simple byte-wise compare in the misaligned case with a dword compare with page boundary checks in place. For simplicity I've chosen a 4K page boundary so that we don't have to query the actual page size on the system. This results in up to 3x improvement in performance in the unaligned case on falkor and about 2.5x improvement on mustang as measured using bench-strcmp. * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a time whenever possible. (cherry picked from commit 2bce01ebbaf8db52ba4a5635eb5744f989cdbf69) --- ChangeLog | 5 +++++ sysdeps/aarch64/strcmp.S | 31 +++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 18a01ed12b..cd0c1db768 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a + time whenever possible. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target. 
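
The strcmp.S diff that follows is the real change; as a rough C rendering of the technique (illustrative only: the actual code first aligns src1 with a byte loop so that only src2 needs the page test, and it resolves mismatches through its syndrome/rev/clz path instead of a byte loop), the shape is:

  #include <stdint.h>
  #include <string.h>

  /* True when P lies in the last 8 bytes of a 4 KiB page, i.e. when an
     unaligned 8-byte load from P might cross into the next page; this
     is the "and ..., #0xff8; eor ..., #0xff8" test in the assembly.  */
  static int
  near_page_end (const unsigned char *p)
  {
    return ((uintptr_t) p & 0xff8) == 0xff8;
  }

  static int
  strcmp_misaligned_sketch (const unsigned char *s1, const unsigned char *s2)
  {
    for (;;)
      {
        if (near_page_end (s1) || near_page_end (s2))
          {
            /* Byte compares for the next (at most) 8 bytes so that the
               dword loads below can never fault past a page boundary.  */
            for (int i = 0; i < 8; i++)
              if (s1[i] != s2[i] || s1[i] == '\0')
                return s1[i] - s2[i];
            s1 += 8;
            s2 += 8;
            continue;
          }

        uint64_t d1, d2;
        memcpy (&d1, s1, 8);    /* unaligned-safe dword loads */
        memcpy (&d2, s2, 8);
        /* Nonzero iff some byte of d1 is zero: the zeroones/REP8_7f
           computation from the assembly.  */
        uint64_t has_nul = (d1 - UINT64_C (0x0101010101010101)) & ~d1
                           & UINT64_C (0x8080808080808080);
        if (d1 != d2 || has_nul != 0)
          break;
        s1 += 8;
        s2 += 8;
      }

    /* A difference or the terminator lies within the current 8 bytes.  */
    for (;; s1++, s2++)
      if (*s1 != *s2 || *s1 == '\0')
        return *s1 - *s2;
  }
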
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index e99d6625b7..c260e1d8ac 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -72,6 +72,7 @@ L(start_realigned): cbz syndrome, L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ +L(end): #ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 @@ -145,12 +146,38 @@ L(mutual_align): b L(start_realigned) L(misaligned8): - /* We can do better than this. */ + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(misaligned8) + b.ne L(done) + tst src1, #7 + b.ne L(misaligned8) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) + b L(end) + +L(done): sub result, data1, data2 RET END(strcmp) -- cgit 1.4.1 From 01de24dbca4374665fb2a439be39c05427c0a24a Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Thu, 22 Feb 2018 23:48:13 +0530 Subject: aarch64/strcmp: fix misaligned loop jump target I accidentally set the loop jump back label as misaligned8 instead of do_misaligned. The typo is harmless but it's always nice to not have to unnecessarily execute those two instructions. * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to do_misaligned, not misaligned8. (cherry picked from commit 6ca24c43481e2c93a6eec362b04c3e77a35b28e3) --- ChangeLog | 5 +++++ sysdeps/aarch64/strcmp.S | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index cd0c1db768..29f9e1bac5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to + do_misaligned, not misaligned8. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index c260e1d8ac..7eed82cee7 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -158,7 +158,7 @@ L(do_misaligned): ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.ne L(done) tst src1, #7 - b.ne L(misaligned8) + b.ne L(do_misaligned) L(loop_misaligned): /* Test if we are within the last dword of the end of a 4K page. If -- cgit 1.4.1 From af9381b734e2218f84235e000cd20d2824a6ed42 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 13 Mar 2018 23:57:03 +0530 Subject: aarch64: Improve strncmp for mutually misaligned inputs The mutually misaligned inputs on aarch64 are compared with a simple byte copy, which is not very efficient. Enhance the comparison similar to strcmp by loading a double-word at a time. The peak performance improvement (i.e. 
4k maxlen comparisons) due to this on the strncmp microbenchmark is as follows: falkor: 3.5x (up to 72% time reduction) cortex-a73: 3.5x (up to 71% time reduction) cortex-a53: 3.5x (up to 71% time reduction) All mutually misaligned inputs from 16 bytes maxlen onwards show upwards of 15% improvement and there is no measurable effect on the performance of aligned/mutually aligned inputs. * sysdeps/aarch64/strncmp.S (count): New macro. (strncmp): Store misaligned length in SRC1 in COUNT. (mutual_align): Adjust. (misaligned8): Load dword at a time when it is safe. (cherry picked from commit 7108f1f944792ac68332967015d5e6418c5ccc88) --- ChangeLog | 7 ++++ sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 29f9e1bac5..b1de9c05c8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (count): New macro. + (strncmp): Store misaligned length in SRC1 in COUNT. + (mutual_align): Adjust. + (misaligned8): Load dword at a time when it is safe. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strcmp.S (do_misaligned): Jump back to diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index 3e4d88a5d7..33c07befd8 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -49,6 +49,7 @@ #define limit_wd x13 #define mask x14 #define endloop x15 +#define count mask ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) DELOUSE (0) @@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 + and count, src1, #7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -165,43 +166,107 @@ L(mutual_align): bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ #ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ #else /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ #endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) -L(ret0): - mov result, #0 - RET - .p2align 6 + /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): - sub limit, limit, #1 -1: + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b + b.eq L(byte_loop) +L(done): sub result, data1, data2 RET + + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. 
*/ +L(try_misaligned_words): + mov limit_wd, limit, lsr #3 + cbz count, L(do_misaligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + mov limit_wd, limit, lsr #3 + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) + + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) + +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + +L(ret0): + mov result, #0 + RET + END (strncmp) libc_hidden_builtin_def (strncmp) -- cgit 1.4.1 From 00fd3acde1475e1bc488ee13ae124d0340da856e Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 14 Mar 2018 18:51:05 +0530 Subject: aarch64/strncmp: Unbreak builds with old binutils Binutils 2.26.* and older do not support moves with shifted registers, so use a separate shift instruction instead. (cherry picked from commit d46f84de745db8f3f06a37048261f4e5ceacf0a3) --- ChangeLog | 5 +++++ sysdeps/aarch64/strncmp.S | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index b1de9c05c8..2c388920dc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift + instruction to unbreak builds with binutils 2.26 and older. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (count): New macro. diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index 33c07befd8..d1bfa27ca4 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -208,13 +208,15 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - mov limit_wd, limit, lsr #3 + mov limit_wd, limit + lsr limit_wd, limit_wd, #3 cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count - mov limit_wd, limit, lsr #3 + mov limit_wd, limit + lsr limit_wd, limit_wd, #3 L(page_end_loop): ldrb data1w, [src1], #1 -- cgit 1.4.1 From e3c35100d32f83aa3c0ec57b83746fea9b98bc2f Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Thu, 15 Mar 2018 08:06:21 +0530 Subject: aarch64/strncmp: Use lsr instead of mov+lsr A lsr can do what the mov and lsr did. 
(cherry picked from commit b47c3e7637efb77818cbef55dcd0ed1f0ea0ddf1) --- ChangeLog | 5 +++++ sysdeps/aarch64/strncmp.S | 6 ++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2c388920dc..e9557b8c85 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of + mov + lsr. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (strncmp): Use a separate shift diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index d1bfa27ca4..fc1f633348 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -208,15 +208,13 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - mov limit_wd, limit - lsr limit_wd, limit_wd, #3 + lsr limit_wd, limit, #3 cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count - mov limit_wd, limit - lsr limit_wd, limit_wd, #3 + lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 -- cgit 1.4.1 From d3c05bfffa65b39ba64d62e24ed4a6c118fab3ef Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 11 May 2018 00:11:52 +0530 Subject: aarch64,falkor: Ignore prefetcher tagging for smaller copies For smaller and medium sized copies, the effect of hardware prefetching are not as dominant as instruction level parallelism. Hence it makes more sense to load data into multiple registers than to try and route them to the same prefetch unit. This is also the case for the loop exit where we are unable to latch on to the same prefetch unit anyway so it makes more sense to have data loaded in parallel. The performance results are a bit mixed with memcpy-random, with numbers jumping between -1% and +3%, i.e. the numbers don't seem repeatable. memcpy-walk sees a 70% improvement (i.e. > 2x) for 128 bytes and that improvement reduces down as the impact of the tail copy decreases in comparison to the loop. * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use multiple registers to copy data in loop tail. (cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06) --- ChangeLog | 5 +++ sysdeps/aarch64/multiarch/memcpy_falkor.S | 68 +++++++++++++++++++------------ 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index e9557b8c85..65b46ef409 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use multiple registers to copy data in loop tail. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S index dea4f225ee..3b8601f87e 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -35,6 +35,20 @@ #define A_hw w7 #define tmp1 x14 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l dst +#define E_h tmp1 +#define F_l src +#define F_h count +#define G_l srcend +#define G_h x15 + /* Copies are split into 3 main cases: 1. Small copies of up to 32 bytes @@ -74,21 +88,21 @@ ENTRY_ALIGN (__memcpy_falkor, 6) /* Medium copies: 33..128 bytes. 
*/ sub tmp1, count, 1 ldp A_l, A_h, [src, 16] - stp A_l, A_h, [dstin, 16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -16] tbz tmp1, 6, 1f - ldp A_l, A_h, [src, 32] - stp A_l, A_h, [dstin, 32] - ldp A_l, A_h, [src, 48] - stp A_l, A_h, [dstin, 48] - ldp A_l, A_h, [srcend, -64] - stp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stp A_l, A_h, [dstend, -48] + ldp D_l, D_h, [src, 32] + ldp E_l, E_h, [src, 48] + stp D_l, D_h, [dstin, 32] + stp E_l, E_h, [dstin, 48] + ldp F_l, F_h, [srcend, -64] + ldp G_l, G_h, [srcend, -48] + stp F_l, F_h, [dstend, -64] + stp G_l, G_h, [dstend, -48] 1: - ldp A_l, A_h, [srcend, -32] - stp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + stp A_l, A_h, [dstin, 16] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] ret .p2align 4 @@ -98,36 +112,36 @@ L(copy32): cmp count, 16 b.lo 1f ldp A_l, A_h, [src] + ldp B_l, B_h, [srcend, -16] stp A_l, A_h, [dstin] - ldp A_l, A_h, [srcend, -16] - stp A_l, A_h, [dstend, -16] + stp B_l, B_h, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f ldr A_l, [src] + ldr B_l, [srcend, -8] str A_l, [dstin] - ldr A_l, [srcend, -8] - str A_l, [dstend, -8] + str B_l, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f ldr A_lw, [src] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - ldr A_lw, [srcend, -4] - str A_lw, [dstend, -4] + str B_lw, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f ldrh A_lw, [src] + ldrh B_lw, [srcend, -2] strh A_lw, [dstin] - ldrh A_lw, [srcend, -2] - strh A_lw, [dstend, -2] + strh B_lw, [dstend, -2] ret .p2align 4 1: @@ -171,12 +185,12 @@ L(loop64): L(last64): ldp A_l, A_h, [srcend, -64] stnp A_l, A_h, [dstend, -64] - ldp A_l, A_h, [srcend, -48] - stnp A_l, A_h, [dstend, -48] - ldp A_l, A_h, [srcend, -32] - stnp A_l, A_h, [dstend, -32] - ldp A_l, A_h, [srcend, -16] - stnp A_l, A_h, [dstend, -16] + ldp B_l, B_h, [srcend, -48] + stnp B_l, B_h, [dstend, -48] + ldp C_l, C_h, [srcend, -32] + stnp C_l, C_h, [dstend, -32] + ldp D_l, D_h, [srcend, -16] + stnp D_l, D_h, [dstend, -16] ret END (__memcpy_falkor) -- cgit 1.4.1 From ad64510e5c74729108a02a6c22f03aa8ee07a8d3 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Fri, 29 Jun 2018 22:45:59 +0530 Subject: aarch64,falkor: Use vector registers for memcpy Vector registers perform better than scalar register pairs for copying data so prefer them instead. This results in a time reduction of over 50% (i.e. 2x speed improvemnet) for some smaller sizes for memcpy-walk. Larger sizes show improvements of around 1% to 2%. memcpy-random shows a very small improvement, in the range of 1-2%. * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): Use vector registers. (cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be) --- ChangeLog | 5 ++ sysdeps/aarch64/multiarch/memcpy_falkor.S | 137 ++++++++++++++---------------- 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/ChangeLog b/ChangeLog index 65b46ef409..0482b0c435 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-06 Siddhesh Poyarekar + + * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): + Use vector registers. 
+ 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S index 3b8601f87e..9cde8dcbd6 100644 --- a/sysdeps/aarch64/multiarch/memcpy_falkor.S +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -29,25 +29,19 @@ #define dst x3 #define srcend x4 #define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 #define tmp1 x14 - -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l dst -#define E_h tmp1 -#define F_l src -#define F_h count -#define G_l srcend -#define G_h x15 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 /* Copies are split into 3 main cases: @@ -67,9 +61,9 @@ bumping up the small copies up to 32 bytes allows us to do that without cost and also allows us to reduce the size of the prep code before loop64. - All copies are done only via two registers r6 and r7. This is to ensure - that all loads hit a single hardware prefetcher which can get correctly - trained to prefetch a single stream. + The copy loop uses only one register q0. This is to ensure that all loads + hit a single hardware prefetcher which can get correctly trained to prefetch + a single stream. The non-temporal stores help optimize cache utilization. */ @@ -80,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6) add srcend, src, count add dstend, dstin, count b.ls L(copy32) - ldp A_l, A_h, [src] + ldr A_q, [src] cmp count, 128 - stp A_l, A_h, [dstin] + str A_q, [dstin] b.hi L(copy_long) /* Medium copies: 33..128 bytes. */ sub tmp1, count, 1 - ldp A_l, A_h, [src, 16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -16] + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] tbz tmp1, 6, 1f - ldp D_l, D_h, [src, 32] - ldp E_l, E_h, [src, 48] - stp D_l, D_h, [dstin, 32] - stp E_l, E_h, [dstin, 48] - ldp F_l, F_h, [srcend, -64] - ldp G_l, G_h, [srcend, -48] - stp F_l, F_h, [dstend, -64] - stp G_l, G_h, [dstend, -48] + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] 1: - stp A_l, A_h, [dstin, 16] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] ret .p2align 4 @@ -111,44 +105,44 @@ L(copy32): /* 16-32 */ cmp count, 16 b.lo 1f - ldp A_l, A_h, [src] - ldp B_l, B_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstend, -16] + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret .p2align 4 1: /* 8-15 */ tbz count, 3, 1f - ldr A_l, [src] - ldr B_l, [srcend, -8] - str A_l, [dstin] - str B_l, [dstend, -8] + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] ret .p2align 4 1: /* 4-7 */ tbz count, 2, 1f - ldr A_lw, [src] - ldr B_lw, [srcend, -4] - str A_lw, [dstin] - str B_lw, [dstend, -4] + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] ret .p2align 4 1: /* 2-3 */ tbz count, 1, 1f - ldrh A_lw, [src] - ldrh B_lw, [srcend, -2] - strh A_lw, [dstin] - strh B_lw, [dstend, -2] + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] ret .p2align 4 1: /* 0-1 */ tbz count, 0, 1f - ldrb A_lw, [src] - strb 
A_lw, [dstin] + ldrb A_w, [src] + strb A_w, [dstin] 1: ret @@ -167,30 +161,29 @@ L(copy_long): add count, count, tmp1 L(loop64): - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16]! + ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! subs count, count, 64 - stnp A_l, A_h, [dst, 32] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 48] - ldp A_l, A_h, [src, 16]! - stnp A_l, A_h, [dst, 64] - add dst, dst, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! b.hi L(loop64) /* Write the last full set of 64 bytes. The remainder is at most 64 bytes, so it is safe to always copy 64 bytes from the end even if there is just 1 byte left. */ L(last64): - ldp A_l, A_h, [srcend, -64] - stnp A_l, A_h, [dstend, -64] - ldp B_l, B_h, [srcend, -48] - stnp B_l, B_h, [dstend, -48] - ldp C_l, C_h, [srcend, -32] - stnp C_l, C_h, [dstend, -32] - ldp D_l, D_h, [srcend, -16] - stnp D_l, D_h, [dstend, -16] + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] ret END (__memcpy_falkor) -- cgit 1.4.1 From d6613ad24f708706c24bffa38351e26e1dd5b5d1 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 19 Dec 2018 18:28:24 +0000 Subject: [AArch64] Add ifunc support for Ares Add Ares to the midr_el0 list and support ifunc dispatch. Since Ares supports 2 128-bit loads/stores, use Neon registers for memcpy by selecting __memcpy_falkor by default (we should rename this to __memcpy_simd or similar). * manual/tunables.texi (glibc.cpu.name): Add ares tunable. * sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use __memcpy_falkor for ares. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_ARES): Add new define. * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (cpu_list): Add ares cpu. (cherry picked from commit 02f440c1ef5d5d79552a524065aa3e2fabe469b9) --- ChangeLog | 10 ++++++++++ manual/tunables.texi | 2 +- sysdeps/aarch64/multiarch/memcpy.c | 2 +- sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 1 + sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 3 +++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0482b0c435..2a9b6ed7ef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-01-09 Wilco Dijkstra + + * manual/tunables.texi (glibc.cpu.name): Add ares tunable. + * sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use + __memcpy_falkor for ares. + * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_ARES): + Add new define. + * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (cpu_list): + Add ares cpu. + 2019-09-06 Siddhesh Poyarekar * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor): diff --git a/manual/tunables.texi b/manual/tunables.texi index a23c8d076a..b230cde556 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -253,7 +253,7 @@ This tunable is specific to i386 and x86-64. @deftp Tunable glibc.tune.cpu The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to assume that the CPU is @code{xxx} where xxx may have one of these values: -@code{generic}, @code{falkor}, @code{thunderxt88}. +@code{generic}, @code{falkor}, @code{thunderxt88}, @code{ares}. This tunable is specific to aarch64. 
@end deftp diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index b395df1c63..ee4d78ea1d 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -35,7 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; libc_ifunc (__libc_memcpy, (IS_THUNDERX (midr) ? __memcpy_thunderx - : (IS_FALKOR (midr) + : (IS_FALKOR (midr) || IS_ARES (midr) ? __memcpy_falkor : __memcpy_generic))); diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c index 0c7e13f4fa..50297bc409 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c @@ -30,6 +30,7 @@ struct cpu_list static struct cpu_list cpu_list[] = { {"falkor", 0x510FC000}, {"thunderxt88", 0x430F0A10}, + {"ares", 0x411FD0C0}, {"generic", 0x0} }; diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index 73cb53da9a..d2ad5c63b9 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -44,6 +44,9 @@ #define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q' \ && MIDR_PARTNUM(midr) == 0xc00) +#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd0c) + struct cpu_features { uint64_t midr_el1; -- cgit 1.4.1 From 55a280689e61cb8a7879ebbe0586d031559f1ba4 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Mon, 16 Jul 2018 17:50:09 +0100 Subject: Improve strstr performance Improve strstr performance. Strstr tends to be slow because it uses many calls to memchr and a slow byte loop to scan for the next match. Performance is significantly improved by using strnlen on larger blocks and using strchr to search for the next matching character. strcasestr can also use strnlen to scan ahead, and memmem can use memchr to check for the next match. On the GLIBC bench tests the performance gains on Cortex-A72 are: strstr: +25% strcasestr: +4.3% memmem: +18% On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%. Reviewd-by: Adhemerval Zanella (cherry picked from commit 3ae725dfb6d7f61447d27d00ed83e573bd5454f4) --- ChangeLog | 15 +++++++++++- benchtests/bench-strcasestr.c | 1 + benchtests/bench-strstr.c | 3 +++ string/memmem.c | 1 + string/str-two-way.h | 56 +++++++++++++++++++++---------------------- string/strcasestr.c | 4 ++-- string/strstr.c | 5 ++-- string/test-strcasestr.c | 1 + string/test-strstr.c | 1 + 9 files changed, 53 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2a9b6ed7ef..d21bdbcce8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,17 @@ -2019-01-09 Wilco Dijkstra +2019-09-13 Wilco Dijkstra + + * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. + * benchtests/bench-strstr.c: Likewise. + * string/memmem.c (FASTSEARCH): Define. + * string/str-two-way.h (two_way_short_needle): Minor cleanups. + Add support for FASTSEARCH. + * string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen. + * string/strstr.c (AVAILABLE): Use read-ahead __strnlen. + (FASTSEARCH): Define. + * string/test-strcasestr.c: Rename __strnlen to strnlen. + * string/test-strstr.c: Likewise. + +2019-09-06 Wilco Dijkstra * manual/tunables.texi (glibc.cpu.name): Add ares tunable. 
* sysdeps/aarch64/multiarch/memcpy.c (__libc_memcpy): Use diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c index 4e6f480c84..9a031b3064 100644 --- a/benchtests/bench-strcasestr.c +++ b/benchtests/bench-strcasestr.c @@ -24,6 +24,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "../string/strcasestr.c" diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c index e63659f136..2fa64118f4 100644 --- a/benchtests/bench-strstr.c +++ b/benchtests/bench-strstr.c @@ -22,6 +22,9 @@ #define STRSTR simple_strstr +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(X) +#define __strnlen strnlen #include "../string/strstr.c" diff --git a/string/memmem.c b/string/memmem.c index 54fca4966d..34299b8864 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -31,6 +31,7 @@ #define RETURN_TYPE void * #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) #include "str-two-way.h" #undef memmem diff --git a/string/str-two-way.h b/string/str-two-way.h index 599c867ffd..f433c76478 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else { - const unsigned char *phaystack = &haystack[suffix]; + const unsigned char *phaystack; /* The comparison always starts from needle[suffix], so cache it and use an optimized first-character loop. */ unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); -#if CHECK_EOL - /* We start matching from the SUFFIX'th element, so make sure we - don't hit '\0' before that. */ - if (haystack_len < suffix + 1 - && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) - return NULL; -#endif - /* The two halves of needle are distinct; no extra memory is required, and any mismatch results in a maximal shift. */ period = MAX (suffix, needle_len - suffix) + 1; j = 0; - while (1 -#if !CHECK_EOL - && AVAILABLE (haystack, haystack_len, j, needle_len) -#endif - ) + while (AVAILABLE (haystack, haystack_len, j, needle_len)) { unsigned char haystack_char; const unsigned char *pneedle; - /* TODO: The first-character loop can be sped up by adapting - longword-at-a-time implementation of memchr/strchr. */ - if (needle_suffix + phaystack = &haystack[suffix + j]; + +#ifdef FASTSEARCH + if (*phaystack++ != needle_suffix) + { + phaystack = FASTSEARCH (phaystack, needle_suffix, + haystack_len - needle_len - j); + if (phaystack == NULL) + goto ret0; + j = phaystack - &haystack[suffix]; + phaystack++; + } +#else + while (needle_suffix != (haystack_char = CANON_ELEMENT (*phaystack++))) { RET0_IF_0 (haystack_char); -#if !CHECK_EOL +# if !CHECK_EOL ++j; -#endif - continue; + if (!AVAILABLE (haystack, haystack_len, j, needle_len)) + goto ret0; +# endif } -#if CHECK_EOL +# if CHECK_EOL /* Calculate J if it wasn't kept up-to-date in the first-character loop. */ j = phaystack - &haystack[suffix] - 1; +# endif #endif - /* Scan for matches in right half. */ i = suffix + 1; pneedle = &needle[i]; @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } ++i; } +#if CHECK_EOL + /* Update minimal length of haystack. */ + if (phaystack > haystack + haystack_len) + haystack_len = phaystack - haystack; +#endif if (needle_len <= i) { /* Scan for matches in left half. 
*/ @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else j += i - suffix + 1; - -#if CHECK_EOL - if (!AVAILABLE (haystack, haystack_len, j, needle_len)) - break; -#endif - - phaystack = &haystack[suffix + j]; } } ret0: __attribute__ ((unused)) diff --git a/string/strcasestr.c b/string/strcasestr.c index 2acf003155..7caaade5d2 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,8 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) diff --git a/string/strstr.c b/string/strstr.c index 88f1d5de36..63facae4a5 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -33,10 +33,11 @@ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c index abb3916732..78e03da7c4 100644 --- a/string/test-strcasestr.c +++ b/string/test-strcasestr.c @@ -25,6 +25,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "strcasestr.c" diff --git a/string/test-strstr.c b/string/test-strstr.c index 33f221149a..8d04134e0b 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -24,6 +24,7 @@ #define STRSTR simple_strstr #define libc_hidden_builtin_def(arg) /* nothing */ +#define __strnlen strnlen #include "strstr.c" -- cgit 1.4.1 From c60bf879b21aefedaf632f585b9c39af8532bc71 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Fri, 3 Aug 2018 17:24:12 +0100 Subject: Simplify and speedup strstr/strcasestr first match Looking at the benchtests, both strstr and strcasestr spend a lot of time in a slow initialization loop handling one character per iteration. This can be simplified and use the much faster strlen/strnlen/strchr/memcmp. Read ahead a few cachelines to reduce the number of strnlen calls, which improves performance by ~3-4%. This patch improves the time taken for the full strstr benchtest by >40%. * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. * string/strstr.c (AVAILABLE): Likewise. (cherry picked from commit 284f42bc778e487dfd5dff5c01959f93b9e0c4f5) --- ChangeLog | 5 +++++ string/strcasestr.c | 37 ++++++++++++++----------------------- string/strstr.c | 43 +++++++++++++++++++++---------------------- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/ChangeLog b/ChangeLog index d21bdbcce8..22012fb458 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2019-09-13 Wilco Dijkstra + + * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. + * string/strstr.c (AVAILABLE): Likewise. + 2019-09-13 Wilco Dijkstra * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. diff --git a/string/strcasestr.c b/string/strcasestr.c index 7caaade5d2..772e513626 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -58,31 +58,22 @@ case-insensitive comparison. 
This function gives unspecified results in multibyte locales. */ char * -STRCASESTR (const char *haystack_start, const char *needle_start) +STRCASESTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - { - ok &= (TOLOWER ((unsigned char) *haystack) - == TOLOWER ((unsigned char) *needle)); - haystack++; - needle++; - } - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. */ + needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - needle_len = needle - needle_start; - haystack = haystack_start + 1; - haystack_len = needle_len - 1; /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See @@ -90,10 +81,10 @@ STRCASESTR (const char *haystack_start, const char *needle_start) if (needle_len < LONG_NEEDLE_THRESHOLD) return two_way_short_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle_start, + (const unsigned char *) needle, needle_len); } diff --git a/string/strstr.c b/string/strstr.c index 63facae4a5..1eb4dc11db 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -50,33 +50,32 @@ if NEEDLE is empty, otherwise NULL if NEEDLE is not found in HAYSTACK. */ char * -STRSTR (const char *haystack_start, const char *needle_start) +STRSTR (const char *haystack, const char *needle) { - const char *haystack = haystack_start; - const char *needle = needle_start; size_t needle_len; /* Length of NEEDLE. */ size_t haystack_len; /* Known minimum length of HAYSTACK. */ - bool ok = true; /* True if NEEDLE is prefix of HAYSTACK. */ - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - ok &= *haystack++ == *needle++; - if (*needle) + + /* Handle empty NEEDLE special case. */ + if (needle[0] == '\0') + return (char *) haystack; + + /* Skip until we find the first matching char from NEEDLE. */ + haystack = strchr (haystack, needle[0]); + if (haystack == NULL || needle[1] == '\0') + return (char *) haystack; + + /* Ensure HAYSTACK length is at least as long as NEEDLE length. + Since a match may occur early on in a huge HAYSTACK, use strnlen + and read ahead a few cachelines for improved performance. */ + needle_len = strlen (needle); + haystack_len = __strnlen (haystack, needle_len + 256); + if (haystack_len < needle_len) return NULL; - if (ok) - return (char *) haystack_start; - - /* Reduce the size of haystack using strchr, since it has a smaller - linear coefficient than the Two-Way algorithm. 
*/ - needle_len = needle - needle_start; - haystack = strchr (haystack_start + 1, *needle_start); - if (!haystack || __builtin_expect (needle_len == 1, 0)) + + /* Check whether we have a match. This improves performance since we avoid + the initialization overhead of the two-way algorithm. */ + if (memcmp (haystack, needle, needle_len) == 0) return (char *) haystack; - needle -= needle_len; - haystack_len = (haystack > haystack_start + needle_len ? 1 - : needle_len + haystack_start - haystack); /* Perform the search. Abstract memory is considered to be an array of 'unsigned char' values, not an array of 'char' values. See -- cgit 1.4.1 From ceeba1d73c84f1a551677149ce3b3ed3372fb3ec Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 28 Aug 2018 12:42:19 +0530 Subject: Speedup first memmem match As done in commit 284f42bc778e487dfd5dff5c01959f93b9e0c4f5, memcmp can be used after memchr to avoid the initialization overhead of the two-way algorithm for the first match. This has shown improvement >40% for first match. (cherry picked from commit c8dd67e7c958de04c3783cbea7c384431707b5f8) --- ChangeLog | 4 ++++ string/memmem.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22012fb458..12e7d27e49 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2019-09-13 Rajalakshmi Srinivasaraghavan + + * string/memmem.c: Use memcmp for first match. + 2019-09-13 Wilco Dijkstra * string/strcasestr.c (STRCASESTR): Simplify and speedup first match. diff --git a/string/memmem.c b/string/memmem.c index 34299b8864..645b6d1a26 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -70,6 +70,10 @@ __memmem (const void *haystack_start, size_t haystack_len, haystack_len -= haystack - (const unsigned char *) haystack_start; if (haystack_len < needle_len) return NULL; + /* Check whether we have a match. This improves performance since we + avoid the initialization overhead of the two-way algorithm. */ + if (memcmp (haystack, needle, needle_len) == 0) + return (void *) haystack; return two_way_short_needle (haystack, haystack_len, needle, needle_len); } else -- cgit 1.4.1 From cd3487afa276f817749d3a418e81849130e2dbce Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 19 Sep 2018 16:50:18 +0100 Subject: Fix strstr bug with huge needles (bug 23637) The generic strstr in GLIBC 2.28 fails to match huge needles. The optimized AVAILABLE macro reads ahead a large fixed amount to reduce the overhead of repeatedly checking for the end of the string. However if the needle length is larger than this, two_way_long_needle may confuse this as meaning the end of the string and return NULL. This is fixed by adding the needle length to the amount to read ahead. [BZ #23637] * string/test-strstr.c (pr23637): New function. (test_main): Add tests with longer needles. * string/strcasestr.c (AVAILABLE): Fix readahead distance. * string/strstr.c (AVAILABLE): Likewise. (cherry picked from commit 83a552b0bb9fc2a5e80a0ab3723c0a80ce1db9f2) --- ChangeLog | 8 ++++++++ string/strcasestr.c | 5 +++-- string/strstr.c | 5 +++-- string/test-strstr.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 12e7d27e49..a759cf936c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2019-09-13 Wilco Dijkstra + + [BZ #23637] + * string/test-strstr.c (pr23637): New function. + (test_main): Add tests with longer needles. + * string/strcasestr.c (AVAILABLE): Fix readahead distance. + * string/strstr.c (AVAILABLE): Likewise. 
+ 2019-09-13 Rajalakshmi Srinivasaraghavan * string/memmem.c: Use memcmp for first match. diff --git a/string/strcasestr.c b/string/strcasestr.c index 772e513626..19ea1d4bbf 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,9 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ - (j) + (n_l) <= (h_l))) + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) diff --git a/string/strstr.c b/string/strstr.c index 1eb4dc11db..9cfcfdf77d 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -33,8 +33,9 @@ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ - (j) + (n_l) <= (h_l))) + (((j) + (n_l) <= (h_l)) \ + || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) diff --git a/string/test-strstr.c b/string/test-strstr.c index 8d04134e0b..5bce73b0bd 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -151,6 +151,32 @@ check2 (void) } } +#define N 1024 + +static void +pr23637 (void) +{ + char *h = (char*) buf1; + char *n = (char*) buf2; + + for (int i = 0; i < N; i++) + { + n[i] = 'x'; + h[i] = ' '; + h[i + N] = 'x'; + } + + n[N] = '\0'; + h[N * 2] = '\0'; + + /* Ensure we don't match at the first 'x'. */ + h[0] = 'x'; + + char *exp_result = stupid_strstr (h, n); + FOR_EACH_IMPL (impl, 0) + check_result (impl, h, n, exp_result); +} + static int test_main (void) { @@ -158,6 +184,7 @@ test_main (void) check1 (); check2 (); + pr23637 (); printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) @@ -202,6 +229,9 @@ test_main (void) do_test (15, 9, hlen, klen, 1); do_test (15, 15, hlen, klen, 0); do_test (15, 15, hlen, klen, 1); + + do_test (15, 15, hlen + klen * 4, klen * 4, 0); + do_test (15, 15, hlen + klen * 4, klen * 4, 1); } do_test (0, 0, page_size - 1, 16, 0); -- cgit 1.4.1 From 796c5ee030deac07ed846d9531c1322d57c0a6c7 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 12 Jun 2019 11:38:52 +0100 Subject: Improve performance of strstr This patch significantly improves performance of strstr using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 3 use a dedicated linear search. Very long needles use the Two-Way algorithm. The performance gain using the improved bench-strstr on Cortex-A72 is 5.8 times basic_strstr and 3.7 times twoway_strstr. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). Reviewed-by: Szabolcs Nagy * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. * string/strstr.c (strstr2): Add new function. (strstr3): Likewise. 
(STRSTR): Completely rewrite strstr to improve performance. (cherry picked from commit 5e0a7ecb6629461b28adc1a5aabcc0ede122f201) --- ChangeLog | 9 +++ string/str-two-way.h | 9 ++- string/strstr.c | 165 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 132 insertions(+), 51 deletions(-) diff --git a/ChangeLog b/ChangeLog index a759cf936c..f0ce3a594a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2019-09-13 Wilco Dijkstra + + * string/str-two-way.h (two_way_short_needle): Add inline to avoid + warning. + (two_way_long_needle): Block inlining. + * string/strstr.c (strstr2): Add new function. + (strstr3): Likewise. + (STRSTR): Completely rewrite strstr to improve performance. + 2019-09-13 Wilco Dijkstra [BZ #23637] diff --git a/string/str-two-way.h b/string/str-two-way.h index f433c76478..30aca30c40 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ -static RETURN_TYPE +static inline RETURN_TYPE two_way_short_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { @@ -382,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and - sublinear performance is not possible. */ -static RETURN_TYPE + sublinear performance is not possible. + + Since this function is large and complex, block inlining to avoid + slowing down the common case of small needles. */ +__attribute__((noinline)) static RETURN_TYPE two_way_long_needle (const unsigned char *haystack, size_t haystack_len, const unsigned char *needle, size_t needle_len) { diff --git a/string/strstr.c b/string/strstr.c index 9cfcfdf77d..4d72ffbfc9 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -16,29 +16,17 @@ License along with the GNU C Library; if not, see . */ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of strstr. */ #include -#include - -#ifndef _LIBC -# define __builtin_expect(expr, val) (expr) -#endif - #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ (((j) + (n_l) <= (h_l)) \ || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ (j) + (n_l) <= (h_l))) -#define CHECK_EOL (1) -#define RET0_IF_0(a) if (!a) goto ret0 -#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr @@ -47,47 +35,128 @@ #define STRSTR strstr #endif -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ -char * -STRSTR (const char *haystack, const char *needle) +static inline char * +strstr2 (const unsigned char *hs, const unsigned char *ne) { - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - - /* Handle empty NEEDLE special case. */ - if (needle[0] == '\0') - return (char *) haystack; + uint32_t h1 = (ne[0] << 16) | ne[1]; + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 << 16) | c; + return h1 == h2 ? 
(char *)hs - 2 : NULL; +} - /* Skip until we find the first matching char from NEEDLE. */ - haystack = strchr (haystack, needle[0]); - if (haystack == NULL || needle[1] == '\0') - return (char *) haystack; +static inline char * +strstr3 (const unsigned char *hs, const unsigned char *ne) +{ + uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); + uint32_t h2 = 0; + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) + h2 = (h2 | c) << 8; + return h1 == h2 ? (char *)hs - 3 : NULL; +} - /* Ensure HAYSTACK length is at least as long as NEEDLE length. - Since a match may occur early on in a huge HAYSTACK, use strnlen +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 3 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ +char * +STRSTR (const char *haystack, const char *needle) +{ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + /* Handle short needle special cases first. */ + if (ne[0] == '\0') + return (char *)hs; + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); + if (hs == NULL || ne[1] == '\0') + return (char*)hs; + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + + /* Ensure haystack length is at least as long as needle length. + Since a match may occur early on in a huge haystack, use strnlen and read ahead a few cachelines for improved performance. */ - needle_len = strlen (needle); - haystack_len = __strnlen (haystack, needle_len + 256); - if (haystack_len < needle_len) + size_t ne_len = strlen ((const char*)ne); + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); + if (hs_len < ne_len) return NULL; - /* Check whether we have a match. This improves performance since we avoid - the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (char *) haystack; - - /* Perform the search. Abstract memory is considered to be an array - of 'unsigned char' values, not an array of 'char' values. See - ISO C 99 section 6.2.6.1. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) needle, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) needle, needle_len); + /* Check whether we have a match. This improves performance since we + avoid initialization overheads. 
*/ + if (memcmp (hs, ne, ne_len) == 0) + return (char *) hs; + + /* Use Two-Way algorithm for very long needles. */ + if (__glibc_unlikely (ne_len > 256)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + const unsigned char *end = hs + hs_len - ne_len; + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + /* Initialize bad character shift hash table. */ + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + while (1) + { + if (__glibc_unlikely (hs > end)) + { + end += __strnlen ((const char*)end + m1 + 1, 2048); + if (hs > end) + return NULL; + } + + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; + } } libc_hidden_builtin_def (strstr) - -#undef LONG_NEEDLE_THRESHOLD -- cgit 1.4.1 From 612fba2fe9036732c5ee08f691c52365e5bd32c0 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Wed, 12 Jun 2019 11:42:34 +0100 Subject: Improve performance of memmem This patch significantly improves performance of memmem using a novel modified Horspool algorithm. Needles up to size 256 use a bad-character table indexed by hashed pairs of characters to quickly skip past mismatches. Long needles use a self-adapting filtering step to avoid comparing the whole needle repeatedly. By limiting the needle length to 256, the shift table only requires 8 bits per entry, lowering preprocessing overhead and minimizing cache effects. This limit also implies worst-case performance is linear. Small needles up to size 2 use a dedicated linear search. Very long needles use the Two-Way algorithm (to avoid increasing stack size or slowing down the common case, inlining is disabled). The performance gain is 6.6 times on English text on AArch64 using random needles with average size 8. Tested against GLIBC testsuite and randomized tests. Reviewed-by: Szabolcs Nagy * string/memmem.c (__memmem): Rewrite to improve performance. (cherry picked from commit 680942b0167715e123d934b609060cd382f8e39f) --- ChangeLog | 4 ++ string/memmem.c | 127 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 89 insertions(+), 42 deletions(-) diff --git a/ChangeLog b/ChangeLog index f0ce3a594a..5f3df320bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2019-09-13 Wilco Dijkstra + * string/memmem.c (__memmem): Rewrite to improve performance. + +2019-06-12 Wilco Dijkstra + * string/str-two-way.h (two_way_short_needle): Add inline to avoid warning. (two_way_long_needle): Block inlining. diff --git a/string/memmem.c b/string/memmem.c index 645b6d1a26..fba7fe33f7 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -15,17 +15,13 @@ License along with the GNU C Library; if not, see . 
*/ -/* This particular implementation was written by Eric Blake, 2008. */ - #ifndef _LIBC # include #endif -/* Specification of memmem. */ #include #ifndef _LIBC -# define __builtin_expect(expr, val) (expr) # define __memmem memmem #endif @@ -36,51 +32,98 @@ #undef memmem -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK - if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in - HAYSTACK. */ +/* Hash character pairs so a small shift table can be used. All bits of + p[0] are included, but not all bits from p[-1]. So if two equal hashes + match on p[-1], p[0] matches too. Hash collisions are harmless and result + in smaller shifts. */ +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) + +/* Fast memmem algorithm with guaranteed linear-time performance. + Small needles up to size 2 use a dedicated linear search. Longer needles + up to size 256 use a novel modified Horspool algorithm. It hashes pairs + of characters to quickly skip past mismatches. The main search loop only + exits if the last 2 characters match, avoiding unnecessary calls to memcmp + and allowing for a larger skip if there is no match. A self-adapting + filtering check is used to quickly detect mismatches in long needles. + By limiting the needle length to 256, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies worst-case performance is linear. + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem (const void *haystack_start, size_t haystack_len, - const void *needle_start, size_t needle_len) +__memmem (const void *haystack, size_t hs_len, + const void *needle, size_t ne_len) { - /* Abstract memory is considered to be an array of 'unsigned char' values, - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ - const unsigned char *haystack = (const unsigned char *) haystack_start; - const unsigned char *needle = (const unsigned char *) needle_start; - - if (needle_len == 0) - /* The first occurrence of the empty string is deemed to occur at - the beginning of the string. */ - return (void *) haystack; - - /* Sanity check, otherwise the loop might search through the whole - memory. */ - if (__glibc_unlikely (haystack_len < needle_len)) + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; + + if (ne_len == 0) + return (void *) hs; + if (ne_len == 1) + return (void *) memchr (hs, ne[0], hs_len); + + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) return NULL; - /* Use optimizations in memchr when possible, to reduce the search - size of haystack using a linear algorithm with a smaller - coefficient. However, avoid memchr for long needles, since we - can often achieve sublinear performance. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) + const unsigned char *end = hs + hs_len - ne_len; + + if (ne_len == 2) + { + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1]; + for (hs++; hs <= end && hw != nw; ) + hw = hw << 16 | *++hs; + return hw == nw ? (void *)hs - 1 : NULL; + } + + /* Use Two-Way algorithm for very long needles. 
*/ + if (__builtin_expect (ne_len > 256, 0)) + return two_way_long_needle (hs, hs_len, ne, ne_len); + + uint8_t shift[256]; + size_t tmp, shift1; + size_t m1 = ne_len - 1; + size_t offset = 0; + + memset (shift, 0, sizeof (shift)); + for (int i = 1; i < m1; i++) + shift[hash2 (ne + i)] = i; + /* Shift1 is the amount we can skip after matching the hash of the + needle end but not the full needle. */ + shift1 = m1 - shift[hash2 (ne + m1)]; + shift[hash2 (ne + m1)] = m1; + + for ( ; hs <= end; ) { - haystack = memchr (haystack, *needle, haystack_len); - if (!haystack || __builtin_expect (needle_len == 1, 0)) - return (void *) haystack; - haystack_len -= haystack - (const unsigned char *) haystack_start; - if (haystack_len < needle_len) - return NULL; - /* Check whether we have a match. This improves performance since we - avoid the initialization overhead of the two-way algorithm. */ - if (memcmp (haystack, needle, needle_len) == 0) - return (void *) haystack; - return two_way_short_needle (haystack, haystack_len, needle, needle_len); + /* Skip past character pairs not in the needle. */ + do + { + hs += m1; + tmp = shift[hash2 (hs)]; + } + while (tmp == 0 && hs <= end); + + /* If the match is not at the end of the needle, shift to the end + and continue until we match the hash of the needle end. */ + hs -= tmp; + if (tmp < m1) + continue; + + /* Hash of the last 2 characters matches. If the needle is long, + try to quickly filter out mismatches. */ + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) + { + if (memcmp (hs, ne, m1) == 0) + return (void *) hs; + + /* Adjust filter offset when it doesn't find the mismatch. */ + offset = (offset >= 8 ? offset : m1) - 8; + } + + /* Skip based on matching the hash of the needle end. */ + hs += shift1; } - else - return two_way_long_needle (haystack, haystack_len, needle, needle_len); + return NULL; } libc_hidden_def (__memmem) weak_alias (__memmem, memmem) libc_hidden_weak (memmem) - -#undef LONG_NEEDLE_THRESHOLD -- cgit 1.4.1 From aaf2f25b61b570f0ae746488ab677f3a5159f038 Mon Sep 17 00:00:00 2001 From: Dragan Mladjenovic Date: Fri, 23 Aug 2019 16:38:04 +0000 Subject: mips: Force RWX stack for hard-float builds that can run on pre-4.8 kernels Linux/Mips kernels prior to 4.8 could potentially crash the user process when doing FPU emulation while running on non-executable user stack. Currently, gcc doesn't emit .note.GNU-stack for mips, but that will change in the future. To ensure that glibc can be used with such future gcc, without silently resulting in binaries that might crash in runtime, this patch forces RWX stack for all built objects if configured to run against minimum kernel version less than 4.8. * sysdeps/unix/sysv/linux/mips/Makefile (test-xfail-check-execstack): Move under mips-has-gnustack != yes. (CFLAGS-.o*, ASFLAGS-.o*): New rules. Apply -Wa,-execstack if mips-force-execstack == yes. * sysdeps/unix/sysv/linux/mips/configure: Regenerated. * sysdeps/unix/sysv/linux/mips/configure.ac (mips-force-execstack): New var. Set to yes for hard-float builds with minimum_kernel < 4.8.0 or minimum_kernel not set at all. (mips-has-gnustack): New var. Use value of libc_cv_as_noexecstack if mips-force-execstack != yes, otherwise set to no. 
(cherry picked from commit 33bc9efd91de1b14354291fc8ebd5bce96379f12) --- sysdeps/unix/sysv/linux/mips/Makefile | 21 ++++++++++++---- sysdeps/unix/sysv/linux/mips/configure | 41 +++++++++++++++++++++++++++++++ sysdeps/unix/sysv/linux/mips/configure.ac | 32 ++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 5 deletions(-) diff --git a/sysdeps/unix/sysv/linux/mips/Makefile b/sysdeps/unix/sysv/linux/mips/Makefile index bca11d39e0..a58e6954a7 100644 --- a/sysdeps/unix/sysv/linux/mips/Makefile +++ b/sysdeps/unix/sysv/linux/mips/Makefile @@ -99,14 +99,25 @@ sysdep-dl-routines += dl-static sysdep_routines += dl-vdso endif - -# Supporting non-executable stacks on MIPS requires changes to both -# the Linux kernel and glibc. See -# and -# . +# If the compiler doesn't use GNU.stack note, +# this test is expected to fail. +ifneq ($(mips-has-gnustack),yes) test-xfail-check-execstack = yes endif +endif ifeq ($(subdir),stdlib) gen-as-const-headers += ucontext_i.sym endif + +ifeq ($(mips-force-execstack),yes) +CFLAGS-.o += -Wa,-execstack +CFLAGS-.os += -Wa,-execstack +CFLAGS-.op += -Wa,-execstack +CFLAGS-.oS += -Wa,-execstack + +ASFLAGS-.o += -Wa,-execstack +ASFLAGS-.os += -Wa,-execstack +ASFLAGS-.op += -Wa,-execstack +ASFLAGS-.oS += -Wa,-execstack +endif diff --git a/sysdeps/unix/sysv/linux/mips/configure b/sysdeps/unix/sysv/linux/mips/configure index a5513fad48..2ec86a3121 100644 --- a/sysdeps/unix/sysv/linux/mips/configure +++ b/sysdeps/unix/sysv/linux/mips/configure @@ -475,3 +475,44 @@ if test -z "$arch_minimum_kernel"; then arch_minimum_kernel=4.5.0 fi fi + +# Check if we are supposed to run on kernels older than 4.8.0. If so, +# force executable stack to avoid potential runtime problems with fpu +# emulation. +# NOTE: The check below assumes that in absence of user-provided minumum_kernel +# we will default to arch_minimum_kernel which is currently less than 4.8.0 for +# all known configurations. If this changes, the check must be updated. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler must use executable stack" >&5 +$as_echo_n "checking whether the compiler must use executable stack... 
" >&6; } +if ${libc_cv_mips_force_execstack+:} false; then : + $as_echo_n "(cached) " >&6 +else + libc_cv_mips_force_execstack=no + if test $libc_mips_float = hard; then + if test -n "$minimum_kernel"; then + + min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`)) + + if test $min_version -lt 264192; then + libc_cv_mips_force_execstack=yes + fi + else + libc_cv_mips_force_execstack=yes + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mips_force_execstack" >&5 +$as_echo "$libc_cv_mips_force_execstack" >&6; } + +libc_mips_has_gnustack=$libc_cv_as_noexecstack + +if test $libc_cv_mips_force_execstack = yes; then + libc_mips_has_gnustack=no + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&5 +$as_echo "$as_me: WARNING: forcing executable stack for pre-4.8.0 Linux kernels" >&2;} +fi + +config_vars="$config_vars +mips-force-execstack = ${libc_cv_mips_force_execstack}" +config_vars="$config_vars +mips-has-gnustack = ${libc_mips_has_gnustack}" diff --git a/sysdeps/unix/sysv/linux/mips/configure.ac b/sysdeps/unix/sysv/linux/mips/configure.ac index 9147aa4582..3db1b32b08 100644 --- a/sysdeps/unix/sysv/linux/mips/configure.ac +++ b/sysdeps/unix/sysv/linux/mips/configure.ac @@ -134,3 +134,35 @@ if test -z "$arch_minimum_kernel"; then arch_minimum_kernel=4.5.0 fi fi + +# Check if we are supposed to run on kernels older than 4.8.0. If so, +# force executable stack to avoid potential runtime problems with fpu +# emulation. +# NOTE: The check below assumes that in absence of user-provided minumum_kernel +# we will default to arch_minimum_kernel which is currently less than 4.8.0 for +# all known configurations. If this changes, the check must be updated. +AC_CACHE_CHECK([whether the compiler must use executable stack], + libc_cv_mips_force_execstack, [dnl +libc_cv_mips_force_execstack=no + if test $libc_mips_float = hard; then + if test -n "$minimum_kernel"; then + changequote(,) + min_version=$((`echo "$minimum_kernel.0.0.0" | sed 's/\([0-9]*\)\.\([0-9]*\)\.\([0-9]*\).*/\1 \* 65536 + \2 \* 256 + \3/'`)) + changequote([,]) + if test $min_version -lt 264192; then + libc_cv_mips_force_execstack=yes + fi + else + libc_cv_mips_force_execstack=yes + fi + fi]) + +libc_mips_has_gnustack=$libc_cv_as_noexecstack + +if test $libc_cv_mips_force_execstack = yes; then + libc_mips_has_gnustack=no + AC_MSG_WARN([forcing executable stack for pre-4.8.0 Linux kernels]) +fi + +LIBC_CONFIG_VAR([mips-force-execstack],[${libc_cv_mips_force_execstack}]) +LIBC_CONFIG_VAR([mips-has-gnustack],[${libc_mips_has_gnustack}]) -- cgit 1.4.1 From bc42e3bd44e7e9f616162d4550f1d35e04bddb5b Mon Sep 17 00:00:00 2001 From: Marcin Koƛcielnicki Date: Thu, 21 Nov 2019 00:20:15 +0100 Subject: rtld: Check __libc_enable_secure before honoring LD_PREFER_MAP_32BIT_EXEC (CVE-2019-19126) [BZ #25204] The problem was introduced in glibc 2.23, in commit b9eb92ab05204df772eb4929eccd018637c9f3e9 ("Add Prefer_MAP_32BIT_EXEC to map executable pages with MAP_32BIT"). (cherry picked from commit d5dfad4326fc683c813df1e37bbf5cf920591c8e) --- NEWS | 7 +++++++ sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index c6c44a3914..89adc868bb 100644 --- a/NEWS +++ b/NEWS @@ -100,6 +100,12 @@ Security related changes: memcmp gave the wrong result since it treated the size argument as zero. Reported by H.J. Lu. 
+ CVE-2019-19126: ld.so failed to ignore the LD_PREFER_MAP_32BIT_EXEC + environment variable during program execution after a security + transition, allowing local attackers to restrict the possible mapping + addresses for loaded libraries and thus bypass ASLR for a setuid + program. Reported by Marcin Koƛcielnicki. + The following bugs are resolved with this release: [16750] ldd: Never run file directly. @@ -182,6 +188,7 @@ The following bugs are resolved with this release: [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + [25204] Ignore LD_PREFER_MAP_32BIT_EXEC for SUID programs Version 2.26 diff --git a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h index 8d474d0d04..37f0b14adb 100644 --- a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h +++ b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h @@ -31,7 +31,8 @@ environment variable, LD_PREFER_MAP_32BIT_EXEC. */ #define EXTRA_LD_ENVVARS \ case 21: \ - if (memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0) \ + if (!__libc_enable_secure \ + && memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0) \ GLRO(dl_x86_cpu_features).feature[index_arch_Prefer_MAP_32BIT_EXEC] \ |= bit_arch_Prefer_MAP_32BIT_EXEC; \ break; -- cgit 1.4.1 From 2dc2d678e91f3f093d0f4855ac086efb288a5e23 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 28 Nov 2019 14:44:48 +0100 Subject: libio: Disable vtable validation for pre-2.1 interposed handles [BZ #25203] Commit c402355dfa7807b8e0adb27c009135a7e2b9f1b0 ("libio: Disable vtable validation in case of interposition [BZ #23313]") only covered the interposable glibc 2.1 handles, in libio/stdfiles.c. The parallel code in libio/oldstdfiles.c needs similar detection logic. Fixes (again) commit db3476aff19b75c4fdefbe65fcd5f0a90588ba51 ("libio: Implement vtable verification [BZ #20191]"). Change-Id: Ief6f9f17e91d1f7263421c56a7dc018f4f595c21 (cherry picked from commit cb61630ed712d033f54295f776967532d3f4b46a) --- NEWS | 1 + libio/oldstdfiles.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/NEWS b/NEWS index 89adc868bb..3ccaae3968 100644 --- a/NEWS +++ b/NEWS @@ -188,6 +188,7 @@ The following bugs are resolved with this release: [24027] malloc: Integer overflow in realloc [24097] Can't use 64-bit register for size_t in assembly codes for x32 (CVE-2019-6488) [24155] x32 memcmp can treat positive length as 0 (if sign bit in RDX is set) (CVE-2019-7309) + [25203] libio: Disable vtable validation for pre-2.1 interposed handles [25204] Ignore LD_PREFER_MAP_32BIT_EXEC for SUID programs diff --git a/libio/oldstdfiles.c b/libio/oldstdfiles.c index bed7bceca2..82ba986367 100644 --- a/libio/oldstdfiles.c +++ b/libio/oldstdfiles.c @@ -87,6 +87,11 @@ _IO_check_libio (void) stdout->_vtable_offset = stderr->_vtable_offset = ((int) sizeof (struct _IO_FILE) - (int) sizeof (struct _IO_FILE_complete)); + + if (_IO_stdin_.vtable != &_IO_old_file_jumps + || _IO_stdout_.vtable != &_IO_old_file_jumps + || _IO_stderr_.vtable != &_IO_old_file_jumps) + IO_set_accept_foreign_vtables (&_IO_vtable_check); } } -- cgit 1.4.1
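
The strstr and memmem rewrites above (commits 796c5ee0 and 612fba2f) share one core idea: hash each pair of adjacent bytes into a 256-entry table of 8-bit shifts, so that most haystack positions are skipped after a single table lookup and the needle is only verified when the hash of its final pair matches. The standalone sketch below illustrates that idea under stated assumptions; it is not the glibc code. The names pair_hash and hashed_horspool are invented for this sketch, it handles only needles of 2 to 255 bytes, and it omits the strnlen read-ahead, the "shift1" end-pair refinement, and the Two-Way fallback used by the actual patches, using a conservative shift of 1 after a failed verification instead (correct, just slower).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hash two adjacent bytes into an index for a 256-entry shift table.
   All bits of p[0] are used, only some of p[-1]; hash collisions merely
   produce smaller (still safe) shifts.  */
static size_t
pair_hash (const unsigned char *p)
{
  return ((size_t) p[0] - ((size_t) p[-1] << 3)) % 256;
}

/* Illustrative sketch only, not the glibc implementation.  Find NE
   (length NE_LEN, 2..255 bytes) in HS (length HS_LEN); return a pointer
   to the first match or NULL.  */
static const char *
hashed_horspool (const char *hs_start, size_t hs_len,
                 const char *ne_start, size_t ne_len)
{
  const unsigned char *hs = (const unsigned char *) hs_start;
  const unsigned char *ne = (const unsigned char *) ne_start;
  uint8_t shift[256];
  size_t m1 = ne_len - 1;

  if (ne_len < 2 || ne_len > 255 || hs_len < ne_len)
    return NULL;   /* Shorter and longer needles are out of scope here.  */

  /* shift[h] = rightmost needle index i (1..m1) whose pair
     (ne[i-1], ne[i]) hashes to h, or 0 if no pair in the needle does.  */
  memset (shift, 0, sizeof shift);
  for (size_t i = 1; i <= m1; i++)
    shift[pair_hash (ne + i)] = i;

  for (size_t j = 0; j + ne_len <= hs_len; )
    {
      /* Look at the haystack pair aligned with the end of the needle.  */
      size_t skip = m1 - shift[pair_hash (hs + j + m1)];

      if (skip == 0)
        {
          /* The end-pair hash matches: verify the whole candidate.  */
          if (memcmp (hs + j, ne, ne_len) == 0)
            return (const char *) (hs + j);
          skip = 1;   /* The real code instead skips by the distance to
                         the next-rightmost occurrence of the end pair.  */
        }
      j += skip;
    }
  return NULL;
}

int
main (void)
{
  const char *hay = "performance of strstr and memmem is improved";
  const char *hit = hashed_horspool (hay, strlen (hay), "memmem", 6);
  printf ("%s\n", hit != NULL ? hit : "(not found)");
  return 0;
}

Because the table stores only the rightmost position of each pair hash, a shift of m1 - shift[h] can never step over a real match, which is why collisions and the 8-bit entries are harmless; limiting needles to 256 bytes is what lets the table stay one byte per entry, as the commit messages above explain.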