From ea547d896decb172a5687c57292148809a79a6eb Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Thu, 2 Dec 2021 18:30:55 +0000
Subject: AArch64: Optimize memcmp

Rewrite memcmp to improve performance. On small and medium inputs performance
is 10-20% better. Large inputs use a SIMD loop processing 64 bytes per
iteration, which is 30-50% faster depending on the size.

Reviewed-by: Szabolcs Nagy
(cherry picked from commit b51eb35c572b015641f03e3682c303f7631279b7)
---
 sysdeps/aarch64/memcmp.S | 241 ++++++++++++++++++++++++++---------------------
 1 file changed, 134 insertions(+), 107 deletions(-)

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index c1937f6f5c..c7d56a8af0 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,105 +22,79 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  */
 
-/* Parameters and result.  */
-#define src1	x0
-#define src2	x1
-#define limit	x2
-#define result	w0
-
-/* Internal variables.  */
-#define data1	x3
-#define data1w	w3
-#define data1h	x4
-#define data2	x5
-#define data2w	w5
-#define data2h	x6
-#define tmp1	x7
-#define tmp2	x8
-
-ENTRY_ALIGN (memcmp, 6)
+#define src1	x0
+#define src2	x1
+#define limit	x2
+#define result	w0
+
+#define data1	x3
+#define data1w	w3
+#define data2	x4
+#define data2w	w4
+#define data3	x5
+#define data3w	w5
+#define data4	x6
+#define data4w	w6
+#define tmp	x6
+#define src1end	x7
+#define src2end	x8
+
+
+ENTRY (memcmp)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-	subs	limit, limit, 16
+	cmp	limit, 16
 	b.lo	L(less16)
-
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
 	ccmp	data1, data2, 0, ne
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
 
-	subs	limit, limit, 16
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
 	b.ls	L(last_bytes)
-	cmp	limit, 112
-	b.lo	L(loop16)
-
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-	subs	limit, limit, 48
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
 
-	/* Compare 128 up bytes using aligned access.  */
 	.p2align 4
-L(loop64):
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 16]
-	ldp	data2, data2h, [src2, 16]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 32]
-	ldp	data2, data2h, [src2, 32]
-	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
-
-	ldp	data1, data1h, [src1, 48]
-	ldp	data2, data2h, [src2, 48]
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
+	b.ls	L(last_bytes)
 
-	subs	limit, limit, 64
-	add	src1, src1, 64
-	add	src2, src2, 64
-	b.pl	L(loop64)
-	adds	limit, limit, 48
-	b.lo	L(last_bytes)
-
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	ccmp	data1h, data2h, 0, eq
-	b.ne	L(return64)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
 
-	subs	limit, limit, 16
-	b.hi	L(loop16)
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
+	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-L(return64):
-	cmp	data1, data2
-	csel	data1, data1, data1h, ne
-	csel	data2, data2, data2h, ne
 L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
@@ -133,45 +107,98 @@ L(return):
 
 	.p2align 4
 L(less16):
-	adds	limit, limit, 8
-	b.lo	L(less8)		//lo:<
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
 	ldr	data1, [src1]
 	ldr	data2, [src2]
-	/* equal 8 optimized */
-	ccmp	data1, data2, 0, ne
-	b.ne	L(return)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
 
 	.p2align 4
L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
+	tbz	limit, 2, L(less4)
 	ldr	data1w, [src1]
 	ldr	data2w, [src2]
-	ccmp	data1w, data2w, 0, ne
-	b.ne	L(return)
-	ldr	data1w, [src1, limit]
-	ldr	data2w, [src2, limit]
-	b	L(return)
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
 
-	.p2align 4
 L(less4):
-	adds	limit, limit, 4
-	b.eq	L(ret_0)
-
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
+	cmp	data1w, data2w
+	b.ne	L(return)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
 	sub	result, data1w, data2w
+L(return_zero):
 	ret
 
-L(ret_0):
-	mov	result, 0
+
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	/* Align src2 and adjust src1, src2 and limit.  */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
+
+	/* If equal, process last 1-64 bytes using scalar loop.  */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+
+	/* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
 	ret
 END (memcmp)
--
cgit 1.4.1
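
Editor's note (not part of the patch): the new L(loop64) detects a difference in a
64-byte block by XOR-ing the four pairs of 16-byte vectors and folding the results
with pairwise unsigned maximums (umaxp) until a single 64-bit value remains, which
is zero only when all 64 bytes match. The C sketch below shows the same idea with
Advanced SIMD intrinsics; the helper name blocks_differ_64 is made up, and the
sketch deliberately omits the alignment, loop-count and tail handling the assembly
performs.

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative helper, not from the patch: returns a nonzero value iff the
   two 64-byte blocks at p and q differ.  Mirrors the eor/umaxp/fmov sequence
   in L(loop64).  */
static inline uint64_t
blocks_differ_64 (const uint8_t *p, const uint8_t *q)
{
  /* XOR corresponding 16-byte vectors: a byte is nonzero iff it differs.  */
  uint8x16_t d0 = veorq_u8 (vld1q_u8 (p),      vld1q_u8 (q));
  uint8x16_t d1 = veorq_u8 (vld1q_u8 (p + 16), vld1q_u8 (q + 16));
  uint8x16_t d2 = veorq_u8 (vld1q_u8 (p + 32), vld1q_u8 (q + 32));
  uint8x16_t d3 = veorq_u8 (vld1q_u8 (p + 48), vld1q_u8 (q + 48));

  /* Pairwise unsigned max folds the four difference vectors into one, then
     folds that vector onto itself so the low 64 bits cover all 64 bytes.
     The extracted scalar is zero iff every byte matched.  */
  uint8x16_t m = vpmaxq_u8 (vpmaxq_u8 (d0, d1), vpmaxq_u8 (d2, d3));
  m = vpmaxq_u8 (m, m);
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (m), 0);
}

Once a block is known to differ, the assembly uses rev/clz on the folded value to
locate the first differing 8-byte chunk, reloads that double-word from both buffers
and compares the byte-reversed values with cmp/cneg to produce -1 or 1; when the
folded value is zero and the count runs out, it falls through to L(last64) for the
remaining 1-64 bytes, as the patch's own comments describe.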
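Likewise for illustration only: the small-size path (L(less16)/L(less8)) no longer
uses the old byte loop. It loads the first and last word of each buffer (the two
loads may overlap), uses csel to pick whichever pair differs, and byte-reverses
before an unsigned compare so the first differing byte in memory decides the
result. A rough little-endian C equivalent for sizes 8-16 follows; cmp_8_to_16 and
load64 are made-up names and __builtin_bswap64 is a GCC/Clang builtin, none of
which appear in the patch.

#include <stdint.h>
#include <string.h>

/* Unaligned 64-bit load expressed as memcpy (compiles to a single ldr).  */
static inline uint64_t
load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Hypothetical little-endian sketch of the L(less16) idea for 8 <= n <= 16:
   compare the first 8 and last 8 bytes (possibly overlapping), select the
   differing pair as the csel instructions do, and byte-reverse before an
   unsigned compare as L(return) does.  */
static int
cmp_8_to_16 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t a1 = load64 (s1),         a2 = load64 (s2);
  uint64_t b1 = load64 (s1 + n - 8), b2 = load64 (s2 + n - 8);

  /* csel data1, data1, data3, ne / csel data2, data2, data4, ne  */
  uint64_t d1 = (a1 != a2) ? a1 : b1;
  uint64_t d2 = (a1 != a2) ? a2 : b2;
  if (d1 == d2)
    return 0;

  /* rev data1; rev data2; cmp; cneg -- after the byte swap the first byte
     in memory is the most significant, so it decides the order.  */
  return __builtin_bswap64 (d1) < __builtin_bswap64 (d2) ? -1 : 1;
}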