Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/aarch64/dl-machine.h                          |  12
-rw-r--r-- | sysdeps/aarch64/memcpy.S                              | 211
-rw-r--r-- | sysdeps/aarch64/multiarch/Makefile                    |   2
-rw-r--r-- | sysdeps/aarch64/multiarch/ifunc-impl-list.c           |   2
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy.c                    |   8
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy_advsimd.S            | 248
-rw-r--r-- | sysdeps/aarch64/multiarch/memmove.c                   |   6
-rw-r--r-- | sysdeps/aarch64/sysdep.h                              |   2
-rw-r--r-- | sysdeps/i386/dl-machine.h                             |  16
-rw-r--r-- | sysdeps/unix/sysv/linux/aarch64/cpu-features.h        |   8
-rw-r--r-- | sysdeps/x86/Makefile                                  |   6
-rw-r--r-- | sysdeps/x86/dl-cet.c                                  |   6
-rw-r--r-- | sysdeps/x86/tst-setjmp-cet.c                          |   1
-rw-r--r-- | sysdeps/x86_64/dl-machine.h                           |  16
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S |  21
15 files changed, 442 insertions, 123 deletions
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 9617cb754f..c7ae423417 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -388,13 +388,6 @@ elf_machine_lazy_rel (struct link_map *map, /* Check for unexpected PLT reloc type. */ if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { - if (map->l_mach.plt == 0) - { - /* Prelinking. */ - *reloc_addr += l_addr; - return; - } - if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check. */ { /* Check the symbol table for variant PCS symbols. */ @@ -418,7 +411,10 @@ elf_machine_lazy_rel (struct link_map *map, } } - *reloc_addr = map->l_mach.plt; + if (map->l_mach.plt == 0) + *reloc_addr += l_addr; + else + *reloc_addr = map->l_mach.plt; } else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 7e1163e6a0..919b28d7e1 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -33,32 +33,24 @@ #define A_l x6 #define A_lw w6 #define A_h x7 -#define A_hw w7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_l x10 +#define C_lw w10 #define C_h x11 #define D_l x12 #define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 #define G_l count #define G_h dst +#define H_l src +#define H_h srcend #define tmp1 x14 -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - In order to share code with memmove, small and medium copies read all - data before writing, allowing any kind of overlap. So small, medium - and large backwards memmoves are handled by falling through into memcpy. - Overlapping large forward memmoves use a loop that copies backwards. -*/ - #ifndef MEMMOVE # define MEMMOVE memmove #endif @@ -66,108 +58,115 @@ # define MEMCPY memcpy #endif -ENTRY_ALIGN (MEMMOVE, 6) - - DELOUSE (0) - DELOUSE (1) - DELOUSE (2) +/* This implementation supports both memcpy and memmove and shares most code. + It uses unaligned accesses and branchless sequences to keep the code small, + simple and improve performance. - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.lo L(move_long) + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check in memmove is negligible since it is only required for large copies. - /* Common case falls through into memcpy. */ -END (MEMMOVE) -libc_hidden_builtin_def (MEMMOVE) -ENTRY (MEMCPY) + Large copies use a software pipelined loop processing 64 bytes per + iteration. The destination pointer is 16-byte aligned to minimize + unaligned accesses. The loop tail is handled by always copying 64 bytes + from the end. +*/ +ENTRY_ALIGN (MEMCPY, 6) DELOUSE (0) DELOUSE (1) DELOUSE (2) - prfm PLDL1KEEP, [src] add srcend, src, count add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 + cmp count, 128 b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) - /* Medium copies: 17..96 bytes. */ - sub tmp1, count, 1 + /* Small copies: 0..32 bytes. 
*/ + cmp count, 16 + b.lo L(copy16) ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: stp A_l, A_h, [dstin] stp D_l, D_h, [dstend, -16] ret - .p2align 4 - /* Small copies: 0..16 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - cmp count, 8 - b.lo 1f + tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 4 -1: - tbz count, 2, 1f + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) ldr A_lw, [src] - ldr A_hw, [srcend, -4] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - str A_hw, [dstend, -4] + str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] + ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret + strb C_lw, [dstend, -1] +L(copy0): + ret .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) stp A_l, A_h, [dstin] stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret .p2align 4 + /* Copy more than 128 bytes. */ L(copy_long): + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + ldp D_l, D_h, [src] and tmp1, dstin, 15 bic dst, dstin, 15 - ldp D_l, D_h, [src] sub src, src, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] @@ -176,7 +175,8 @@ L(copy_long): ldp C_l, C_h, [src, 48] ldp D_l, D_h, [src, 64]! subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(last64) + b.ls L(copy64_from_end) + L(loop64): stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] @@ -189,10 +189,8 @@ L(loop64): subs count, count, 64 b.hi L(loop64) - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -L(last64): + /* Write the last iteration and copy 64 bytes from the end. 
*/ +L(copy64_from_end): ldp E_l, E_h, [srcend, -64] stp A_l, A_h, [dst, 16] ldp A_l, A_h, [srcend, -48] @@ -207,20 +205,42 @@ L(last64): stp C_l, C_h, [dstend, -16] ret - .p2align 4 -L(move_long): - cbz tmp1, 3f +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) + +ENTRY_ALIGN (MEMMOVE, 4) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) add srcend, src, count add dstend, dstin, count + cmp count, 128 + b.hi L(move_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret - /* Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ + .p2align 4 +L(move_long): + /* Only use backward copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.hs L(copy_long) - and tmp1, dstend, 15 + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 sub srcend, srcend, tmp1 sub count, count, tmp1 ldp A_l, A_h, [srcend, -16] @@ -230,10 +250,9 @@ L(move_long): ldp D_l, D_h, [srcend, -64]! sub dstend, dstend, tmp1 subs count, count, 128 - b.ls 2f + b.ls L(copy64_from_start) - nop -1: +L(loop64_backwards): stp A_l, A_h, [dstend, -16] ldp A_l, A_h, [srcend, -16] stp B_l, B_h, [dstend, -32] @@ -243,12 +262,10 @@ L(move_long): stp D_l, D_h, [dstend, -64]! ldp D_l, D_h, [srcend, -64]! subs count, count, 64 - b.hi 1b + b.hi L(loop64_backwards) - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -2: + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): ldp G_l, G_h, [src, 48] stp A_l, A_h, [dstend, -16] ldp A_l, A_h, [src, 32] @@ -261,7 +278,7 @@ L(move_long): stp A_l, A_h, [dstin, 32] stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] -3: ret + ret -END (MEMCPY) -libc_hidden_builtin_def (MEMCPY) +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 57ffdf7238..134fe463fd 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -1,4 +1,4 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ +sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor memset_generic memset_falkor endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index e55be80103..0ccd14118c 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -42,10 +42,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL (i, name, memset, /* Enable this on non-falkor processors too so that other cores diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 8f5d4e7df5..e69a1ae5af 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -29,6 +29,7 @@ extern __typeof (__redirect_memcpy) __libc_memcpy; extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; @@ -36,11 +37,14 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; libc_ifunc (__libc_memcpy, (IS_THUNDERX (midr) ? __memcpy_thunderx - : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr) + : (IS_FALKOR (midr) || IS_PHECDA (midr) ? __memcpy_falkor : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) ? __memcpy_thunderx2 - : __memcpy_generic)))); + : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr) + || IS_NEOVERSE_V1 (midr) + ? __memcpy_simd + : __memcpy_generic))))); # undef memcpy strong_alias (__libc_memcpy, memcpy); diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S new file mode 100644 index 0000000000..48bb6d7ca4 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S @@ -0,0 +1,248 @@ +/* Generic optimized memcpy using SIMD. + Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_lw w10 +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + + +/* This implementation supports both memcpy and memmove and shares most code. + It uses unaligned accesses and branchless sequences to keep the code small, + simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check in memmove is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per + iteration. The destination pointer is 16-byte aligned to minimize + unaligned accesses. The loop tail is handled by always copying 64 bytes + from the end. */ + +ENTRY (__memcpy_simd) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Align loop64 below to 16 bytes. */ + nop + + /* Copy more than 128 bytes. */ +L(copy_long): + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. 
*/ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + +END (__memcpy_simd) +libc_hidden_builtin_def (__memcpy_simd) + + +ENTRY (__memmove_simd) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(move_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small moves: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + +L(move_long): + /* Only use backward copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(move0) + cmp tmp1, count + b.hs L(copy_long) + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(move0): + ret + +END (__memmove_simd) +libc_hidden_builtin_def (__memmove_simd) diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c index e69d816291..b426dad834 100644 --- a/sysdeps/aarch64/multiarch/memmove.c +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -29,6 +29,7 @@ extern __typeof (__redirect_memmove) __libc_memmove; extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden; extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden; extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden; @@ -37,7 +38,10 @@ libc_ifunc (__libc_memmove, ? __memmove_thunderx : (IS_FALKOR (midr) || IS_PHECDA (midr) ? __memmove_falkor - : __memmove_generic))); + : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr) + || IS_NEOVERSE_V1 (midr) + ? 
__memmove_simd + : __memmove_generic)))); # undef memmove strong_alias (__libc_memmove, memmove); diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h index 5b30709436..509e3e14c5 100644 --- a/sysdeps/aarch64/sysdep.h +++ b/sysdeps/aarch64/sysdep.h @@ -45,7 +45,7 @@ #define ENTRY(name) \ .globl C_SYMBOL_NAME(name); \ .type C_SYMBOL_NAME(name),%function; \ - .align 4; \ + .p2align 6; \ C_LABEL(name) \ cfi_startproc; \ CALL_MCOUNT diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h index f6cfb90e21..1e3ef25498 100644 --- a/sysdeps/i386/dl-machine.h +++ b/sysdeps/i386/dl-machine.h @@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, { # ifndef RTLD_BOOTSTRAP if (sym_map != map - && sym_map->l_type != lt_executable && !sym_map->l_relocated) { const char *strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); - _dl_error_printf ("\ + if (sym_map->l_type == lt_executable) + _dl_fatal_printf ("\ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \ +and creates an unsatisfiable circular dependency.\n", + RTLD_PROGNAME, strtab + refsym->st_name, + map->l_name); + else + _dl_error_printf ("\ %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", - RTLD_PROGNAME, map->l_name, - sym_map->l_name, - strtab + refsym->st_name); + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); } # endif value = ((Elf32_Addr (*) (void)) value) (); diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index 153d258afe..fbe1148652 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -51,8 +51,12 @@ #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h' \ && MIDR_PARTNUM(midr) == 0x000) -#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ - && MIDR_PARTNUM(midr) == 0xd0c) +#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd0c) +#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd49) +#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \ + && MIDR_PARTNUM(midr) == 0xd40) struct cpu_features { diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index 43ad4a79ff..891a4fdbb0 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -12,6 +12,12 @@ endif ifeq ($(subdir),setjmp) gen-as-const-headers += jmp_buf-ssp.sym sysdep_routines += __longjmp_cancel +ifneq ($(enable-cet),no) +ifneq ($(have-tunables),no) +tests += tst-setjmp-cet +tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on +endif +endif endif ifeq ($(enable-cet),yes) diff --git a/sysdeps/x86/dl-cet.c b/sysdeps/x86/dl-cet.c index b82ba14e75..56c37bcd4f 100644 --- a/sysdeps/x86/dl-cet.c +++ b/sysdeps/x86/dl-cet.c @@ -105,7 +105,11 @@ dl_cet_check (struct link_map *m, const char *program) /* No legacy object check if both IBT and SHSTK are always on. */ if (enable_ibt_type == CET_ALWAYS_ON && enable_shstk_type == CET_ALWAYS_ON) - return; + { + THREAD_SETMEM (THREAD_SELF, header.feature_1, + GL(dl_x86_feature_1)[0]); + return; + } /* Check if IBT is enabled by kernel. 
*/ bool ibt_enabled diff --git a/sysdeps/x86/tst-setjmp-cet.c b/sysdeps/x86/tst-setjmp-cet.c new file mode 100644 index 0000000000..42c795d2a8 --- /dev/null +++ b/sysdeps/x86/tst-setjmp-cet.c @@ -0,0 +1 @@ +#include <setjmp/tst-setjmp.c> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index 1942ed5061..c5fdcd3d39 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -315,16 +315,22 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, { # ifndef RTLD_BOOTSTRAP if (sym_map != map - && sym_map->l_type != lt_executable && !sym_map->l_relocated) { const char *strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); - _dl_error_printf ("\ + if (sym_map->l_type == lt_executable) + _dl_fatal_printf ("\ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \ +and creates an unsatisfiable circular dependency.\n", + RTLD_PROGNAME, strtab + refsym->st_name, + map->l_name); + else + _dl_error_printf ("\ %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", - RTLD_PROGNAME, map->l_name, - sym_map->l_name, - strtab + refsym->st_name); + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); } # endif value = ((ElfW(Addr) (*) (void)) value) (); diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 9bab1147d5..5aaadc233f 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -67,6 +67,13 @@ # define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) #endif +/* Avoid short distance rep movsb only with non-SSE vector. */ +#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB +# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) +#else +# define AVOID_SHORT_DISTANCE_REP_MOVSB 0 +#endif + #ifndef PREFETCH # define PREFETCH(addr) prefetcht0 addr #endif @@ -257,7 +264,21 @@ L(movsb): # error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! # endif jb L(more_8x_vec_backward) +# if AVOID_SHORT_DISTANCE_REP_MOVSB + movq %rdi, %rcx + subq %rsi, %rcx + jmp 2f +# endif 1: +# if AVOID_SHORT_DISTANCE_REP_MOVSB + movq %rsi, %rcx + subq %rdi, %rcx +2: +/* Avoid "rep movsb" if RCX, the distance between source and destination, + is N*4GB + [1..63] with N >= 0. */ + cmpl $63, %ecx + jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ +# endif mov %RDX_LP, %RCX_LP rep movsb L(nop): |
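The strategy comment added to sysdeps/aarch64/memcpy.S (and repeated in memcpy_advsimd.S) describes three size classes and a loop tail that is handled by always copying 64 bytes from the end. As a rough illustration only, the C sketch below mirrors that structure under stated assumptions: copy_block and sketch_memcpy are hypothetical names, and memcpy stands in for the fixed-size unaligned LDP/STP (or Q-register) accesses the assembly actually uses.

#include <stdint.h>
#include <string.h>

/* Stand-in for one fixed-size unaligned load/store pair.  */
static inline void
copy_block (unsigned char *d, const unsigned char *s, size_t n)
{
  memcpy (d, s, n);
}

static void *
sketch_memcpy (void *dstin, const void *src, size_t count)
{
  unsigned char *dst = dstin, *dstend = dst + count;
  const unsigned char *s = src;
  const unsigned char *srcend = s + count;

  if (count <= 32)
    {
      /* Small copies: copy from both ends so two accesses of the same
         size cover any length in the class.  */
      if (count >= 16)
        {
          copy_block (dst, s, 16);
          copy_block (dstend - 16, srcend - 16, 16);
        }
      else if (count & 8)
        {
          copy_block (dst, s, 8);
          copy_block (dstend - 8, srcend - 8, 8);
        }
      else if (count & 4)
        {
          copy_block (dst, s, 4);
          copy_block (dstend - 4, srcend - 4, 4);
        }
      else if (count)
        {
          /* 1..3 bytes, branchless: byte 0, the byte at count/2 and the
             last byte; the duplicated stores are harmless.  */
          dst[0] = s[0];
          dst[count >> 1] = s[count >> 1];
          dstend[-1] = srcend[-1];
        }
      return dstin;
    }

  if (count <= 128)
    {
      /* Medium copies: 32 bytes from each end, plus the second 32 bytes
         from the start for counts above 64 and from the end above 96.  */
      copy_block (dst, s, 32);
      copy_block (dstend - 32, srcend - 32, 32);
      if (count > 64)
        copy_block (dst + 32, s + 32, 32);
      if (count > 96)
        copy_block (dstend - 64, srcend - 64, 32);
      return dstin;
    }

  /* Large copies: copy 16 bytes, align the destination to 16 bytes, loop
     64 bytes per iteration, then unconditionally copy the last 64 bytes
     from the end (possibly overlapping the loop's final stores).  */
  copy_block (dst, s, 16);
  size_t adjust = 16 - ((uintptr_t) dst & 15);
  dst += adjust;
  s += adjust;
  count -= adjust;
  while (count > 64)
    {
      copy_block (dst, s, 64);
      dst += 64;
      s += 64;
      count -= 64;
    }
  copy_block (dstend - 64, srcend - 64, 64);
  return dstin;
}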
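The new memmove entry points only take the backwards copy loop when the destination really overlaps the source from above; any other layout falls through to the shared forward-copy code, which is why the comment calls the overlap check negligible. A minimal sketch of that test, assuming the same unsigned-wraparound trick as the sub/cmp/b.hs sequence (must_copy_backwards is a hypothetical name):

#include <stddef.h>
#include <stdint.h>

static int
must_copy_backwards (const void *dstin, const void *src, size_t count)
{
  /* dst - src computed modulo 2^64: a forward copy is unsafe only when
     the destination starts strictly inside the source region.  A zero
     difference (dst == src) needs no copying at all.  */
  uintptr_t diff = (uintptr_t) dstin - (uintptr_t) src;
  return diff != 0 && diff < count;
}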
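The ifunc resolvers in multiarch/memcpy.c and memmove.c pick __memcpy_simd and __memmove_simd from the MIDR value using the IS_NEOVERSE_N1/N2/V1 predicates added to cpu-features.h. The sketch below restates that classification in plain C; the bit positions follow the architectural MIDR_EL1 layout (implementer in bits 31:24, part number in bits 15:4), the real MIDR_IMPLEMENTOR/MIDR_PARTNUM macros are defined elsewhere in cpu-features.h, and prefers_memcpy_simd is a hypothetical helper rather than a glibc function.

#include <stdint.h>

#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

static int
prefers_memcpy_simd (uint64_t midr)
{
  if (MIDR_IMPLEMENTOR (midr) != 'A')   /* 0x41: Arm Ltd.  */
    return 0;
  switch (MIDR_PARTNUM (midr))
    {
    case 0xd0c:   /* Neoverse N1 */
    case 0xd49:   /* Neoverse N2 */
    case 0xd40:   /* Neoverse V1 */
      return 1;
    default:
      return 0;
    }
}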
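The memmove-vec-unaligned-erms.S hunk skips the rep movsb path when the distance between source and destination, reduced to its low 32 bits, is 63 bytes or less, since very short forward distances can stall the enhanced REP MOVSB microcode on store forwarding. A hedged sketch of the condition (avoid_short_distance_rep_movsb is a hypothetical name; the assembly subtracts in whichever direction is non-negative for the path taken and then compares only ECX):

#include <stdint.h>

static int
avoid_short_distance_rep_movsb (uintptr_t dst, uintptr_t src)
{
  uint64_t distance = dst > src ? dst - src : src - dst;
  /* Only the low 32 bits are compared (cmpl $63, %ecx), which is why the
     comment in the diff describes the range as N*4GB + [1..63].  */
  return (uint32_t) distance <= 63;
}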