author      Danila Kutenin <danilak@google.com>       2022-06-27 16:12:13 +0000
committer   Wilco Dijkstra <wilco.dijkstra@arm.com>   2024-04-08 17:25:25 +0100
commit      ea25fe5599068cbadb19e0b2d8640353c32758eb (patch)
tree        37ca9bfc3d43710e8dfebe1c472a313ae16e511e /sysdeps/aarch64/strcpy.S
parent      2c4ae9faa5dced399fe9185980056cbbfb016444 (diff)
aarch64: Optimize string functions with shrn instruction
We found that string functions were using AND+ADDP to find the nibble/syndrome mask, but there is an easier option: `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane right by 4 and narrow to 1 byte), which has the same latency as ADDP on all SIMD ARMv8 targets. There are also possible gains for memcmp, but that is for another patch.

We see 10-20% savings for small and mid-sized cases (<= 128 bytes), which are the primary cases for general workloads.

(cherry picked from commit 3c9980698988ef64072f1fac339b180f52792faf)
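For readers unfamiliar with the trick, here is a minimal C sketch of the nibble-mask computation using ACLE NEON intrinsics; the helper name `nibble_mask` is hypothetical and not part of the patch, which implements this directly in assembly:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical illustration of the SHRN trick: build a 64-bit mask with one
   nibble per input byte for a 16-byte chunk, replacing the old AND+ADDP
   reduction.  */
static inline uint64_t
nibble_mask (uint8x16_t data)
{
  /* 0xff in every byte equal to NUL, 0x00 elsewhere.  */
  uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (0));
  /* SHRN vend.8b, vhas_nul.8h, 4: treat the 16 comparison bytes as eight
     16-bit lanes, shift each right by 4 and narrow to 8 bits, so each input
     byte contributes exactly one nibble (0xf or 0x0) to the result.  */
  uint8x8_t mask = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (mask), 0);
}
```

The payoff is that one SHRN replaces the AND+ADDP pair (and removes the need to materialise the `vrepmask` constant), while producing the same 4-bits-per-byte syndrome.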
Diffstat (limited to 'sysdeps/aarch64/strcpy.S')
-rw-r--r--   sysdeps/aarch64/strcpy.S   32
1 file changed, 12 insertions(+), 20 deletions(-)
```diff
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
```
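As a complement, here is a minimal sketch of how such a syndrome is decoded on a little-endian target; the helper name `first_nul_index` is hypothetical, and the patch does the equivalent with `rbit`+`clz` in assembly (big-endian builds skip the bit reversal, hence the `#ifndef __AARCH64EB__` above):

```c
#include <stdint.h>

/* Hypothetical decode of the nibble mask, little-endian case.  Each input
   byte owns one nibble of the mask, lowest nibble first, so the first NUL
   byte sits at (trailing zero bits) / 4.  The caller must ensure synd != 0,
   which the assembly does with cbnz/cbz before falling into the tail code.  */
static inline unsigned
first_nul_index (uint64_t synd)
{
  return (unsigned) __builtin_ctzll (synd) >> 2;
}
```

The `lsr synd, synd, shift` in the entry path serves the same layout: `shift` is the low bits of `srcin` times 4, so nibbles belonging to bytes before the string start within the aligned 16-byte chunk are discarded before the syndrome is tested.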