| author | Xuelei Zhang <zhangxuelei4@huawei.com> | 2019-12-19 13:49:46 +0000 |
|---|---|---|
| committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2019-12-19 16:31:04 -0300 |
| commit | 2911cb68ed3d6c515ad1979237e74e1fefab3674 | |
| tree | e7a53b39337a460ef833279f1d0a3c2ab54177fb | |
| parent | 0237b61526e716fa9597f521643908a4fda3b46a | |
aarch64: Optimized implementation of strnlen
Optimize the strnlen implementation by using vector operations and loop unrolling in the main loop. Compared to aarch64/strnlen.S, it reduces the latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
-rw-r--r-- | sysdeps/aarch64/strnlen.S | 52 |
1 file changed, 51 insertions(+), 1 deletion(-)
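The core vector trick in this patch reads most easily in C. Below is a minimal sketch using the standard arm_neon.h intrinsics (the helper name `has_nul16` is illustrative, not part of the patch): the unsigned minimum across all 16 byte lanes of a quadword is zero exactly when at least one lane is a NUL byte, which is what the `uminv` instruction in the diff computes in a single step.

```c
#include <arm_neon.h>
#include <stdint.h>

/* Nonzero iff any of the 16 bytes at P is NUL.  C equivalent of the
   patch's "uminv datab2, datav.16b" step: the minimum over all byte
   lanes is 0 exactly when some lane is 0.  */
static inline int
has_nul16 (const uint8_t *p)
{
  uint8x16_t v = vld1q_u8 (p);	/* ldr dataq, [src], #16  */
  return vminvq_u8 (v) == 0;	/* uminv + test against 0  */
}
```

This replaces the scalar zero-byte test built from the REP8_* constants, which needs several ALU operations per 64-bit register, with one load and one horizontal reduction per 16 bytes.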
```diff
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c8074..a57753b0a2 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -45,6 +45,11 @@
 #define pos		x13
 #define limit_wd	x14
 
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
@@ -71,7 +76,7 @@ ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
 	   cycle, as we get much better parallelism out of the operations.  */
 
 	/* Start of critical section -- keep to one 64Byte cache line.  */
-L(loop):
+
 	ldp	data1, data2, [src], #16
 L(realigned):
 	sub	tmp1, data1, zeroones
@@ -119,6 +124,51 @@ L(nul_in_data2):
 	csel	len, len, limit, ls	/* Return the lower value.  */
 	RET
 
+L(loop):
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
+	b.eq	L(loop_end)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
+	b.ne	L(loop)
+L(loop_end):
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NULL byte as 0xff and the rest as 0x00, move the data
+	   into a pair of scalars and then compute the length from the
+	   earliest NULL byte.  */
+
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	sub	len, len, #16
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne
+	add	len, len, tmp1, lsr 3
+	cmp	len, limit
+	csel	len, len, limit, ls	/* Return the lower value.  */
+	RET
+
 L(misaligned):
 	/* Deal with a partial first word.
 	   We're doing two things in parallel here;
```
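Read back as C, the unrolled `L(loop)` above consumes one 16-byte quadword per half-iteration, counting `limit_wd` (the remaining length in quadwords) down, and leaves the loop when either the limit is exhausted or a quadword contains a NUL; each `subs`/`ccmp`/branch triple folds both tests into one conditional branch. A rough model under those assumptions (the function name and its contract are illustrative, not glibc API):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Illustrative model of the main loop: advance through whole 16-byte
   quadwords until limit_wd of them are consumed or one contains a NUL.
   *LAST receives the final quadword loaded, so the caller can locate
   the NUL; the return value points just past that quadword.  */
static const uint8_t *
scan_quadwords (const uint8_t *src, int64_t limit_wd, uint8x16_t *last)
{
  uint8x16_t v;
  do
    {
      v = vld1q_u8 (src);	/* ldr  dataq, [src], #16  */
      src += 16;
      limit_wd--;		/* subs limit_wd, limit_wd, #1  */
    }
  /* ccmp + b.eq/b.ne: continue only while the limit is not yet
     exhausted and no byte of the quadword was NUL.  */
  while (limit_wd >= 0 && vminvq_u8 (v) != 0);
  *last = v;
  return src;
}
```

The assembly writes this body out twice by hand so the whole loop stays inside the single 64-byte cache line that the critical-section comments fence off; after the loop, `cbnz tmp1, L(hit_limit)` distinguishes the limit-exhausted exit (minimum still nonzero) from the NUL-found exit.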
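The post-loop length computation is the densest part of the patch: `cmeq` rewrites the final quadword so NUL lanes become 0xff and everything else 0x00, the earlier of the two 64-bit halves holding a marker is selected with `cmp`/`csel`, and `rev` + `clz` convert the little-endian position of the lowest marker byte into a byte index. A hedged C equivalent of just that index calculation, for the little-endian case (the `rev64` under `__AARCH64EB__` is what makes the same sequence work big-endian):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Index (0..15) of the first NUL byte in V, which must contain one.
   Mirrors the cmeq/mov/cmp/csel/rev/clz sequence in the patch,
   little-endian only.  */
static unsigned int
first_nul_index (uint8x16_t v)
{
  uint8x16_t eq = vceqq_u8 (v, vdupq_n_u8 (0));	/* cmeq: NUL -> 0xff  */
  uint64_t lo = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 0);
  uint64_t hi = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 1);
  uint64_t data = lo != 0 ? lo : hi;	/* cmp data1, 0; csel  */
  unsigned int base = lo != 0 ? 0 : 8;	/* add tmp2, len, 8; csel  */
  /* Byte-reverse so the lowest-addressed byte becomes the most
     significant; then every 8 leading zero bits is one non-NUL byte
     before the first 0xff marker.  DATA is nonzero by precondition,
     so __builtin_clzll is well defined here.  */
  data = __builtin_bswap64 (data);	/* rev data1, data1  */
  return base + (unsigned int) (__builtin_clzll (data) >> 3);
}
```

The assembly then forms `(src - srcin - 16) + index` and clamps it to `limit` with the closing `cmp`/`csel` pair, matching the MIN (len, limit) behaviour of the scalar tail.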