Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile           |   3
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c  |   4
-rw-r--r--  sysdeps/aarch64/multiarch/strlen.c           |  41
-rw-r--r--  sysdeps/aarch64/multiarch/strlen_asimd.S     | 168
-rw-r--r--  sysdeps/aarch64/multiarch/strlen_generic.S   |  39
-rw-r--r--  sysdeps/aarch64/strlen.S                     |  10
6 files changed, 261 insertions, 4 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf7238..b1a5f59fcd 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,5 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor memmove_falkor memset_generic memset_falkor
+		   memcpy_falkor memmove_falkor memset_generic memset_falkor \
+		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103..af446650a6 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
+
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
new file mode 100644
index 0000000000..6c62d684c8
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -0,0 +1,41 @@
+/* Multiple versions of strlen. AARCH64 version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine strlen so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef strlen
+# define strlen __redirect_strlen
+# include <string.h>
+# include <init-arch.h>
+
+#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
+
+extern __typeof (__redirect_strlen) __strlen;
+
+extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
+
+libc_ifunc (__strlen,
+	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+
+# undef strlen
+strong_alias (__strlen, strlen);
+#endif
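The strlen.c wrapper above uses glibc's libc_ifunc macro, which emits a GNU indirect function: the __strlen symbol is resolved once, at relocation time, to whichever implementation the resolver picks from the CPU's MIDR value. The following is a minimal, self-contained sketch of the same dispatch mechanism using GCC's ifunc attribute directly (GNU/ELF targets only); the my_strlen_* names and the cpu_is_falkor() stub are hypothetical stand-ins for the real __strlen_asimd/__strlen_generic symbols and the IS_FALKOR (midr) check, not code from this commit.

#include <stddef.h>

static size_t
my_strlen_generic (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

static size_t
my_strlen_asimd (const char *s)
{
  /* A real variant would use vector loads; reuse the scalar one here.  */
  return my_strlen_generic (s);
}

static int
cpu_is_falkor (void)
{
  /* Stand-in for glibc's IS_FALKOR (midr) CPU identification check.  */
  return 0;
}

/* The resolver runs once, during relocation, and returns the address of
   the implementation that every later call will use.  */
static size_t (*resolve_my_strlen (void)) (const char *)
{
  return cpu_is_falkor () ? my_strlen_asimd : my_strlen_generic;
}

size_t my_strlen (const char *s)
  __attribute__ ((ifunc ("resolve_my_strlen")));

After the dynamic loader has run the resolver, calls to my_strlen jump straight to the chosen implementation; no per-call branch is paid, which is why the commit can add a Falkor-specific variant without slowing down other cores.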
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
new file mode 100644
index 0000000000..ca37b931cb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -0,0 +1,168 @@
+/* Strlen implementation that uses ASIMD instructions for load and NULL
+   checks.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define len		x0
+
+/* Locals and temporaries.  */
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 16
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned
+	   load safely we have to do a page cross check first.  If there is
+	   a NUL byte we calculate the length from the two 8-byte words
+	   using conditional select to reduce branch mispredictions (it is
+	   unlikely strlen_asimd will be repeatedly called on strings with
+	   the same length).
+
+	   If the string is longer than 16 bytes, we align src so we don't
+	   need further page cross checks, and process 16 bytes per
+	   iteration.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be
+	   optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the
+	   correct page size is just not worth the extra gain from a small
+	   reduction in the cases taking the slow path.  Note that we only
+	   care about whether the first fetch, which may be misaligned,
+	   crosses a page boundary.  */
+
+ENTRY_ALIGN (__strlen_asimd, 6)
+	DELOUSE (0)
+	DELOUSE (1)
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldr	dataq, [srcin]
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+
+	/* Get the minimum value and keep going if it is not zero.  */
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	cbnz	tmp1, L(main_loop_entry)
+
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	mov	len, 8
+	rev	data1, data1
+	clz	tmp1, data1
+	csel	len, xzr, len, ne
+	add	len, len, tmp1, lsr 3
+	ret
+
+L(main_loop_entry):
+	bic	src, srcin, 15
+
+L(main_loop):
+	ldr	dataq, [src, 16]!
+L(page_cross_entry):
+	/* Get the minimum value and keep going if it is not zero.  */
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	cbnz	tmp1, L(main_loop)
+
+L(tail):
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NULL byte as 0xff and the rest as 0x00, move the data
+	   into a pair of scalars and then compute the length from the
+	   earliest NULL byte.  */
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne
+	add	len, len, tmp1, lsr 3
+	ret
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that
+	   precede srcin to 0xff, so we ignore any NUL bytes before the
+	   string.  Then continue in the aligned loop.  */
+L(page_cross):
+	mov	tmp3, 63
+	bic	src, srcin, 15
+	and	tmp1, srcin, 7
+	ands	tmp2, srcin, 8
+	ldr	dataq, [src]
+	lsl	tmp1, tmp1, 3
+	csel	tmp2, tmp2, tmp1, eq
+	csel	tmp1, tmp1, tmp3, eq
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsr	tmp1, tmp4, tmp1
+	lsr	tmp2, tmp4, tmp2
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsl	tmp1, tmp4, tmp1
+	lsl	tmp2, tmp4, tmp2
+#endif
+	mov	datav2.d[0], tmp1
+	mov	datav2.d[1], tmp2
+	orn	datav.16b, datav.16b, datav2.16b
+	b	L(page_cross_entry)
END (__strlen_asimd)
weak_alias (__strlen_asimd, strlen_asimd)
libc_hidden_builtin_def (strlen_asimd)
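The L(tail) sequence above is the branch-avoiding length computation the comment describes: cmeq turns every NUL byte into 0xff, csel picks the first 8-byte half that contains a match, and rev plus clz converts the match position into a byte index. Below is a C sketch of that computation, assuming little-endian byte order, a 16-byte chunk already known to contain a NUL (the uminv check has established that), and GCC/Clang builtins in place of the rev/clz instructions; tail_len is an invented name, not part of the commit.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Index of the first NUL in a 16-byte chunk that is known to hold one.  */
static size_t
tail_len (const unsigned char *chunk)
{
  uint64_t w[2], m[2] = { 0, 0 };
  memcpy (w, chunk, 16);

  /* cmeq datav.16b, datav.16b, #0: bytes equal to zero become 0xff,
     all others become 0x00.  */
  for (int i = 0; i < 2; i++)
    for (int b = 0; b < 8; b++)
      if (((w[i] >> (8 * b)) & 0xff) == 0)
        m[i] |= (uint64_t) 0xff << (8 * b);

  /* cmp/csel pair: select the first half with a match and the 0-or-8
     base offset without branching on which half it was.  */
  uint64_t sel = m[0] ? m[0] : m[1];
  size_t base = m[0] ? 0 : 8;

  /* rev + clz: byte-reverse so the earliest byte is most significant,
     then leading zeros divided by 8 give the byte index.  */
  return base + (size_t) (__builtin_clzll (__builtin_bswap64 (sel)) >> 3);
}

The conditional selects mirror the csel instructions: both the selected word and the base offset are computed unconditionally, which is what the comment means by reducing branch mispredictions on strings of varying length.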
diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S
new file mode 100644
index 0000000000..4234948eba
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strlen_generic.S
@@ -0,0 +1,39 @@
+/* A Generic Optimized strlen implementation for AARCH64.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual strlen code is in ../strlen.S.  If we are building libc
+   this file defines __strlen_generic.  Otherwise the include of
+   ../strlen.S will define the normal __strlen entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define STRLEN __strlen_generic
+
+/* Do not hide the generic version of strlen, we use it internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.  */
+	.globl __GI_strlen; __GI_strlen = __strlen_generic
+# endif
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index eb773ef532..521ebc3b75 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -23,6 +23,10 @@
 
  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.  */
 
+#ifndef STRLEN
+# define STRLEN __strlen
+#endif
+
 /* To test the page crossing code path more thoroughly, compile with
    -DTEST_PAGE_CROSS - this will force all calls through the slower
    entry path.  This option is not intended for production use.  */
@@ -84,7 +88,7 @@
    whether the first fetch, which may be misaligned, crosses a page
    boundary.  */
 
-ENTRY_ALIGN (__strlen, 6)
+ENTRY_ALIGN (STRLEN, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
@@ -215,6 +219,6 @@ L(page_cross):
 	csel	data1, data1, tmp4, eq
 	csel	data2, data2, tmp2, eq
 	b	L(page_cross_entry)
-END (__strlen)
-weak_alias (__strlen, strlen)
+END (STRLEN)
+weak_alias (STRLEN, strlen)
 libc_hidden_builtin_def (strlen)
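Both strlen.S and strlen_asimd.S share the page-cross strategy their comments describe: the unaligned 16-byte first load is issued only when it cannot straddle a page boundary, and otherwise the load is redone from the aligned address below srcin with the bytes preceding the string forced non-zero so they can never match as terminators. A C sketch of the two pieces follows, assuming the 4 KiB minimum page size and little-endian layout; the function names are invented for illustration, and the mask helper covers only the single-word, offset-below-8 case (the assembly handles the second word and the offset-of-8-or-more case with the csel/63 sequence).

#include <stdint.h>

#define MIN_PAGE_SIZE 4096

/* True when a 16-byte load at src could cross into the next page,
   i.e. fewer than 16 bytes remain before the page boundary.  This is
   the and/cmp/b.gt test at the top of both entry points.  */
static int
may_cross_page (uintptr_t src)
{
  return (src & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

/* Force the nbytes earliest (lowest, on little-endian) bytes of a word
   loaded from the aligned address below the string to 0xff, mirroring
   the mov tmp4, -1 / lsl / orn sequence; requires nbytes < 8.  */
static uint64_t
ignore_leading (uint64_t word, unsigned nbytes)
{
  uint64_t keep_mask = ~UINT64_C (0) << (8 * nbytes);
  return word | ~keep_mask;
}

Forcing the stray leading bytes to 0xff rather than masking them to zero is the point: a zero byte would read as a terminator, while 0xff can never match, so the aligned main loop can run unchanged from the first aligned chunk.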