author     Wilco Dijkstra <wdijkstr@arm.com>    2016-06-22 13:24:24 +0100
committer  Wilco Dijkstra <wdijkstr@arm.com>    2016-06-22 13:24:24 +0100
commit     a024b39a4e31a049391b459234f6b3575c9fc107
tree       b7d551e9d11222c5e74292eb0732b2ca1b5010e7 /sysdeps
parent     a3b473373ee43a292f5ec68a7fda6b9cfb26a9b0
This patch further tunes memcpy: avoid one branch for sizes 1-3,
add a prefetch, and improve small copies that are exact powers of 2.

	* sysdeps/aarch64/memcpy.S (memcpy): Further tuning for performance.
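The branchless 0..3-byte path deserves a word of explanation. Rather than testing bit 1 and bit 0 of count separately, the new code always performs three byte loads and three byte stores: the first byte, the last byte, and the byte at count/2. For count==3 these are three distinct bytes; for count==2 the middle store duplicates the last byte; for count==1 all three hit the same byte. A minimal C sketch of the idea (the name copy0to3 is illustrative, not from glibc), with each statement annotated with the corresponding instruction from the patch:

#include <stddef.h>

/* Illustrative sketch of the branchless 0..3-byte copy.  Only count==0
   is branched on; for count in 1..3 the three overlapping byte copies
   below cover every case.  */
static void
copy0to3 (unsigned char *dst, const unsigned char *src, size_t count)
{
  if (count == 0)                        /* cbz   count, 2f */
    return;
  size_t mid = count >> 1;               /* lsr   tmp1, count, 1 */
  unsigned char first = src[0];          /* ldrb  A_lw, [src] */
  unsigned char last = src[count - 1];   /* ldrb  A_hw, [srcend, -1] */
  unsigned char middle = src[mid];       /* ldrb  B_lw, [src, tmp1] */
  dst[0] = first;                        /* strb  A_lw, [dstin] */
  dst[mid] = middle;                     /* strb  B_lw, [dstin, tmp1] */
  dst[count - 1] = last;                 /* strb  A_hw, [dstend, -1] */
}

Writing a byte twice is harmless, and the redundant stores are cheaper than the mispredictable branches they replace.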
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/memcpy.S  56
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index c256828af7..de73f0fb95 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h	x7
 #define A_hw	w7
 #define B_l	x8
+#define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_h	x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 
 ENTRY (memcpy)
+	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
 	cmp	count, 96
 	b.hi	L(copy_long)
-	cmp	count, 16
-	b.hs	L(copy_medium)
 
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(copy16):
-	tbz	count, 3, 1f
+	cmp	count, 8
+	b.lo	1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
+	.p2align 4
 1:
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
 	str	A_lw, [dstin]
 	str	A_hw, [dstend, -4]
 	ret
-	.p2align 4
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
 	cbz	count, 2f
+	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	tbz	count, 1, 1f
-	ldrh	A_hw, [srcend, -2]
-	strh	A_hw, [dstend, -2]
-1:	strb	A_lw, [dstin]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
 2:	ret
 
 	.p2align 4
-	/* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-	ldp	A_l, A_h, [src]
-	tbnz	count, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	count, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and 32 bytes
 	   from the end.  */
 L(copy96):
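The medium-copy path that replaces L(copy_medium) handles 17..64 bytes inline (65..96 branch to L(copy96)) without any loop: it loads 16 bytes from each end of the buffer, conditionally loads two more 16-byte blocks when bit 5 of count-1 is set (i.e. count is 33..64), and then stores everything, letting the blocks overlap. A rough C sketch of the scheme, assuming the fixed-size memcpy calls compile to paired loads/stores; the names copy17to64, head, tail, mid1, mid2 are illustrative, not from glibc:

#include <string.h>
#include <stddef.h>

/* Illustrative sketch of the inline 17..64-byte copy: do all loads
   before the stores, and let the 16-byte blocks overlap so that every
   length in range is covered without branching on the exact size.  */
static void
copy17to64 (char *dst, const char *src, size_t count)
{
  char head[16], tail[16], mid1[16], mid2[16];
  memcpy (head, src, 16);                  /* ldp  A_l, A_h, [src] */
  memcpy (tail, src + count - 16, 16);     /* ldp  D_l, D_h, [srcend, -16] */
  if ((count - 1) & 32)                    /* tbz  tmp1, 5, 1f */
    {
      /* 33..64 bytes: two more blocks at offset 16 and end-32.  */
      memcpy (mid1, src + 16, 16);         /* ldp  B_l, B_h, [src, 16] */
      memcpy (mid2, src + count - 32, 16); /* ldp  C_l, C_h, [srcend, -32] */
      memcpy (dst + 16, mid1, 16);         /* stp  B_l, B_h, [dstin, 16] */
      memcpy (dst + count - 32, mid2, 16); /* stp  C_l, C_h, [dstend, -32] */
    }
  memcpy (dst, head, 16);                  /* stp  A_l, A_h, [dstin] */
  memcpy (dst + count - 16, tail, 16);     /* stp  D_l, D_h, [dstend, -16] */
}

As with the 0..3-byte case, the overlapping stores write a few bytes twice, which is cheaper than computing exact block boundaries or looping.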