From 922369032c604b4dcfd535e1bcddd4687e7126a5 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 10 Aug 2017 17:00:38 +0100 Subject: [AArch64] Optimized memcmp. This is an optimized memcmp for AArch64. This is a complete rewrite using a different algorithm. The previous version split into cases where both inputs were aligned, the inputs were mutually aligned and unaligned using a byte loop. The new version combines all these cases, while small inputs of less than 8 bytes are handled separately. This allows the main code to be sped up using unaligned loads since there are now at least 8 bytes to be compared. After the first 8 bytes, align the first input. This ensures each iteration does at most one unaligned access and mutually aligned inputs behave as aligned. After the main loop, process the last 8 bytes using unaligned accesses. This improves performance of (mutually) aligned cases by 25% and unaligned by >500% (yes >6 times faster) on large inputs. * sysdeps/aarch64/memcmp.S (memcmp): Rewrite of optimized memcmp. --- ChangeLog | 5 ++ sysdeps/aarch64/memcmp.S | 176 +++++++++++++++++++---------------------------- 2 files changed, 76 insertions(+), 105 deletions(-) diff --git a/ChangeLog b/ChangeLog index dcf86261ca..e5e36a40e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2017-08-10 Wilco Dijkstra + + * sysdeps/aarch64/memcmp.S (memcmp): + Rewrite of optimized memcmp. + 2017-08-10 Florian Weimer Introduce ld.so exceptions. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 4cfcb89297..b99c081bba 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -22,132 +22,98 @@ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. */ #define data1 x3 #define data1w w3 #define data2 x4 #define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define tmp1 x5 ENTRY_ALIGN (memcmp, 6) DELOUSE (0) DELOUSE (1) DELOUSE (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): -#ifndef __AARCH64EB__ - rev diff, diff + subs limit, limit, 8 + b.lo .Lless8 + + /* Limit >= 8, so check first 8 bytes using unaligned loads. */ + ldr data1, [src1], 8 + ldr data2, [src2], 8 + and tmp1, src1, 7 + add limit, limit, tmp1 + cmp data1, data2 + bne .Lreturn + + /* Align src1 and adjust src2 with bytes not yet done. */ + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + subs limit, limit, 8 + b.ls .Llast_bytes + + /* Loop performing 8 bytes per iteration using aligned src1. + Limit is pre-decremented by 8 and must be larger than zero. + Exit if <= 8 bytes left to do or if the data is not equal. */ + .p2align 4 +.Lloop8: + ldr data1, [src1], 8 + ldr data2, [src2], 8 + subs limit, limit, 8 + ccmp data1, data2, 0, hi /* NZCV = 0b0000. */ + b.eq .Lloop8 + + cmp data1, data2 + bne .Lreturn + + /* Compare last 1-8 bytes using unaligned access. */ +.Llast_bytes: + ldr data1, [src1, limit] + ldr data2, [src2, limit] + + /* Compare data bytes and set return value to 0, -1 or 1. */ +.Lreturn: +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +.Lret_eq: + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +.Lless8: + adds limit, limit, 4 + b.lo .Lless4 + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne .Lreturn + sub limit, limit, 4 +.Lless4: + adds limit, limit, 4 + beq .Lret_eq +.Lbyte_loop: + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq .Lbyte_loop + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp) -- cgit 1.4.1