diff options
author | Feng Xue <fxue@os.amperecomputing.com> | 2018-08-09 04:38:03 -0400 |
---|---|---|
committer | Feng Xue <fxue@os.amperecomputing.com> | 2019-02-01 08:14:21 -0500 |
commit | 83d1cc42d8e6b18a4b6ba53addfdae98c694ea36 (patch) | |
tree | 31baf4b5309964aadf357cb81b21f5e6614a983e /sysdeps/aarch64/multiarch/memchr_nosimd.S | |
parent | c7d3890ff51bceb38fac0947ce1f2bb0c34f6b15 (diff) | |
download | glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.gz glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.xz glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.zip |
aarch64: Optimized memchr specific to AmpereComputing emag
This version uses general-purpose-register-based memory instructions to load data, because vector-register-based loads are slightly slower on emag. Character matching is performed in parallel on one 16-byte (both size and alignment) memory block per iteration. * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. [!MEMCHR] (MEMCHR): Set to __memchr. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memchr_generic and memchr_nosimd. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memchr ifuncs. * sysdeps/aarch64/multiarch/memchr.c: New file. * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
Diffstat (limited to 'sysdeps/aarch64/multiarch/memchr_nosimd.S')
-rw-r--r-- | sysdeps/aarch64/multiarch/memchr_nosimd.S | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S
new file mode 100644
index 0000000000..5ce8eb7625
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S
@@ -0,0 +1,223 @@
+/* memchr - find a character in a memory zone using base integer registers
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library. If not, see
+   <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Use base integer registers only (no vector registers).
+ */
+
+#ifndef MEMCHR
+# define MEMCHR __memchr_nosimd
+#endif
+
+/* Arguments and results (result, repchr and tmp1 reuse argument registers). */
+#define srcin		x0	/* Start address of the range to search. */
+#define chrin		x1	/* Char to search for; only the low 8 bits are used. */
+#define cntin		x2	/* Number of bytes to search. */
+
+#define result		x0	/* Address of the match, or 0 if there is none. */
+
+#define repchr		x1	/* Search byte replicated into all 8 bytes. */
+
+#define tmp1		x2
+#define tmp2		x3
+#define tmp3		x4
+#define tmp4		x5
+
+#define src		x6	/* Address of the next 16-byte block to load. */
+#define srcend		x7	/* srcin + cntin. */
+#define srcend16	x8	/* srcend rounded up to a multiple of 16. */
+
+#define anymore		x9	/* Nonzero while blocks follow the current one. */
+
+#define zeroones	x10	/* Holds REP8_01. */
+
+#define data1		x11	/* Qword at the lower address of the block. */
+#define data2		x12	/* Qword at the higher address of the block. */
+
+#define has_chr1	x13	/* Zero-byte test result for data1. */
+#define has_chr2	x14	/* Zero-byte test result for data2. */
+
+#define REP8_01		0x0101010101010101
+#define REP8_7f		0x7f7f7f7f7f7f7f7f
+
+
+ENTRY_ALIGN (MEMCHR, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	/* Do not dereference srcin if there are no bytes to compare. */
+	cbz	cntin, L(none_chr)
+
+	/* Is the start address 16-byte aligned?  Consumed by b.eq below. */
+	tst	srcin, 15
+	bic	src, srcin, 15
+
+	mov	zeroones, REP8_01
+	and	repchr, chrin, 255
+	/* Replicate the search byte into a qword: |c|c|c|c|c|c|c|c|. */
+	mul	repchr, repchr, zeroones
+
+	add	srcend, srcin, cntin
+	/*
+	 * srcend16 is the address of the block following the last block.
+	 *
+	 * [A block is 16-byte aligned and sized.]
+	 */
+	add	srcend16, srcend, 15
+	bic	srcend16, srcend16, 15
+
+	b.eq	L(loop)		/* Aligned start: no partial first block. */
+
+	/* Load the first block, which contains the start address. */
+	ldp	data1, data2, [src], 16
+
+	lsl	tmp1, srcin, 3	/* Byte offset -> bit count; shifts use it modulo 64. */
+	mov	tmp2, ~0
+#ifdef __AARCH64EB__
+	lsr	tmp3, tmp2, tmp1
+#else
+	lsl	tmp3, tmp2, tmp1
+#endif
+	/* Is the start address in the first or the second qword? */
+	tst	srcin, 8
+
+	/*
+	 * Transform any byte in the block to zero using an XOR operation,
+	 * if that byte equals the char to search. In this way, searching
+	 * for the char becomes detecting zero in the resulting two qwords.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+	/*
+	 * Set the unused bytes (before the start address) to 0xff, so
+	 * that they cannot trigger the zero detection.
+	 */
+	orn	tmp1, data1, tmp3
+	orn	tmp2, data2, tmp3
+
+	csinv	data1, tmp1, xzr, eq	/* Start in 1st qword: masked; else all ones. */
+	csel	data2, data2, tmp2, eq	/* Start in 1st qword: as is; else masked. */
+
+	/*
+	 * When the first and last blocks are the same, there are two cases:
+	 * o. The memory range to search lies within one block.
+	 * (start address - end address) < 0
+	 *
+	 * o. The memory range is so large that the end address wraps around.
+	 * (start address - end address) > 0
+	 */
+	cmp	srcin, srcend
+	ccmp	src, srcend16, 0, mi	/* No wrap: last block?  Wrap: force NE. */
+	csetm	anymore, ne		/* -1 if more blocks follow, else 0. */
+	b	L(find_chr)
+
+	.p2align 4
+L(loop):
+	ldp	data1, data2, [src], 16
+
+	subs	anymore, src, srcend16	/* Zero (EQ) if this is the last block. */
+
+	/*
+	 * Transform any byte in the block to zero using an XOR operation,
+	 * if that byte equals the char to search.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+L(find_chr):
+	/*
+	 * Use the following integer test to find out if any byte in a
+	 * qword is zero. If the qword contains no zero-valued byte, the
+	 * test result is zero.
+	 *
+	 * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+	 * =
+	 * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
+	 *
+	 */
+	sub	tmp1, data1, zeroones
+	sub	tmp2, data2, zeroones
+
+	orr	tmp3, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+
+	bic	has_chr1, tmp1, tmp3
+	bic	has_chr2, tmp2, tmp4
+
+	orr	tmp1, has_chr1, has_chr2
+	ccmp	tmp1, 0, 0, ne		/* Last block (EQ on entry): force NE. */
+
+	b.eq	L(loop)			/* No match here and more blocks remain. */
+
+	cbz	has_chr1, 1f
+	sub	result, src, 16		/* Address of data1 (src is post-incremented). */
+#ifdef __AARCH64EB__
+	rev	data1, data1
+#else
+	rev	has_chr1, has_chr1
+#endif
+	b	L(done)
+
+1:	cbz	has_chr2, L(none_chr)	/* Last block without any match. */
+	sub	result, src, 8		/* Address of data2. */
+#ifdef __AARCH64EB__
+	rev	data1, data2
+#else
+	rev	has_chr1, has_chr2
+#endif
+
+L(done):
+#ifdef __AARCH64EB__
+	/*
+	 * For big-endian, has_chr1/has_chr2 cannot be used directly because
+	 * the two qwords are byte-reversed after loading from memory.
+	 * Thus, char detection has to be performed again on the matching
+	 * qword, which has been byte-swapped this time.
+	 */
+	sub	tmp1, data1, zeroones
+	orr	tmp3, data1, REP8_7f
+	bic	has_chr1, tmp1, tmp3
+	rev	has_chr1, has_chr1
+#endif
+
+	/*
+	 * If the specified char is found in a qword, the corresponding
+	 * byte in has_chr has its top bit (0x80) set; this is only exact
+	 * for the first occurrence, not for later occurrences.
+	 */
+	cmp	anymore, 0		/* EQ if the match is in the last block. */
+	clz	tmp1, has_chr1		/* Bit index of the first match. */
+	add	result, result, tmp1, lsr 3	/* Bits -> bytes. */
+	ccmp	result, srcend, 8, eq	/* Not last block: NZCV = 0b1000 (N set). */
+	csel	result, result, xzr, mi	/* NULL if the match is at or past srcend. */
+	ret
+
+L(none_chr):
+	mov	result, 0
+	ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR) |