author | Richard Henderson <rth@twiddle.net> | 2014-06-16 11:53:52 -0700
committer | Richard Henderson <rth@twiddle.net> | 2014-06-16 11:53:52 -0700
commit | 1268f9e6edbcbc0c3c32848d12ced16440bfa177
tree | 5ef7cc643651debd537e64e9ee7677c724ecd7ab
parent | 449b455a688c4cf01c05cd6c90f8f434c1af4862
neon for memset; higher minimums to enter loops (branch rth/aa-memset)
-rw-r--r-- | sysdeps/aarch64/memset.S | 221
1 file changed, 125 insertions, 96 deletions
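Read as C, the entry-point dispatch that the new code sets up looks roughly like the sketch below. It is only an illustration of the diff that follows, not code from glibc: the 256-byte minimum, the DCZID_EL0 line-size check and the fact that `dc zva` can only write zeros are taken from the new assembly, while `choose_path`, the enum names and the `zva_enabled`/`zva_len` parameters are invented for the example.

```c
#include <stddef.h>

/* Which of the three code paths in the patched memset.S would run.
   Names are illustrative only.  */
enum memset_path {
    PATH_SMALL,     /* count < 256: fixed pattern of overlapping NEON stores */
    PATH_STP_LOOP,  /* 128 bytes per iteration with 'stp q16, q16' */
    PATH_DC_ZVA     /* zeroing with 'dc zva', NEON stores for head and tail */
};

enum memset_path
choose_path(int c, size_t count, size_t zva_len, int zva_enabled)
{
    /* A non-zero fill byte can never use DC ZVA, which only writes zeros.  */
    if ((c & 0xff) != 0)
        return count < 256 ? PATH_SMALL : PATH_STP_LOOP;

    /* The "higher minimum": anything under 256 bytes is handled without
       entering either bulk loop.  */
    if (count < 256)
        return PATH_SMALL;

    /* DC ZVA is used only if it is enabled, the ZVA line is at least 64
       bytes, and the buffer spans at least one line; the assembly comment
       "disabled || len<64 || count<len" falls back to the stp loop.  */
    if (zva_enabled && zva_len >= 64 && count >= zva_len)
        return PATH_DC_ZVA;

    return PATH_STP_LOOP;
}
```

In the per-line-size entry points (memset_zva_64 and the bodies generated by do_zvas/do_zval) the ZVA line size is a constant, so only the value and count checks remain before falling into memset_zva_n.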
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 523406d3c8..2e15551006 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -26,7 +26,6 @@
 #define dstin	x0
 #define dstin_w	w0
-#define val	x1
 #define valw	w1
 #define count	x2
 #define tmp1	x3
@@ -87,28 +86,27 @@ memset:
 	.type memset_zva_64, %function
memset_zva_64:
 	CALL_MCOUNT
-	and	valw, valw, #255
-	cmp	count, #256
-	ccmp	valw, #0, #0, hs	/* hs ? cmp val,0 : !z */
+	tst	valw, #255
 	b.ne	L(nz_or_small)
 
-	stp	xzr, xzr, [dstin]	/* first 16 aligned 1.  */
+	cmp	count, #256
+	dup	v16.16b, valw
+	add	dstend, dstin, count
+	b.lo	L(le_255)
+
+	str	q16, [dstin]		/* first 16 aligned 1.  */
 	and	tmp2, dstin, #-16
 	and	dst, dstin, #-64
 
-	stp	xzr, xzr, [tmp2, #16]	/* first 64 aligned 16.  */
-	add	dstend, dstin, count
+	stp	q16, q16, [tmp2, #16]	/* first 64 aligned 16.  */
 	add	dst, dst, #64
 
-	stp	xzr, xzr, [tmp2, #32]
+	stp	q16, q16, [tmp2, #48]
 	sub	count, dstend, dst	/* recompute for misalign */
 	add	tmp1, dst, #64
 
-	stp	xzr, xzr, [tmp2, #48]
 	sub	count, count, #128	/* pre-bias */
 
-	stp	xzr, xzr, [tmp2, #64]
-
 	.p2align 6,,24
0:	dc	zva, dst
 	subs	count, count, #128
@@ -126,7 +124,26 @@ memset_zva_64:
 /* For larger zva sizes, a simple loop ought to suffice.  */
 /* ??? Needs performance testing, when such hardware becomes available.  */
 
-.macro do_zva len
+.macro do_zvas len
+	.p2align 4
+	.type memset_zva_\len, %function
+memset_zva_\len:
+	CALL_MCOUNT
+	tst	valw, #255
+	b.ne	L(nz_or_small)
+
+	cmp	count, #256
+	dup	v16.16b, valw
+	add	dstend, dstin, count
+	b.lo	L(le_255)
+
+	mov	zva_len, #\len
+	b	memset_zva_n
+
+	.size	memset_zva_\len, . - memset_zva_\len
+.endm
+
+.macro do_zval len
 	.p2align 4
 	.type memset_zva_\len, %function
memset_zva_\len:
@@ -138,23 +155,22 @@ memset_zva_\len:
 	add	dstend, dstin, count
 
 	mov	zva_len, #\len
-	mov	zva_mask, #\len-1
 	b	memset_zva_n
 
 	.size	memset_zva_\len, . - memset_zva_\len
 .endm
 
-	do_zva	128	// 5
-	do_zva	256	// 6
-	do_zva	512	// 7
-	do_zva	1024	// 8
-	do_zva	2048	// 9
-	do_zva	4096	// 10
-	do_zva	8192	// 11
-	do_zva	16384	// 12
-	do_zva	32768	// 13
-	do_zva	65536	// 14
-	do_zva	131072	// 15
+	do_zvas	128	// 5
+	do_zvas	256	// 6
+	do_zval	512	// 7
+	do_zval	1024	// 8
+	do_zval	2048	// 9
+	do_zval	4096	// 10
+	do_zval	8192	// 11
+	do_zval	16384	// 12
+	do_zval	32768	// 13
+	do_zval	65536	// 14
+	do_zval	131072	// 15
 
 	.p2align 6
 #else
@@ -163,21 +179,26 @@ memset_zva_\len:
 	.p2align 6
 	.type memset, %function
memset:
-	and	valw, valw, #255
-	cmp	count, #256
-	ccmp	valw, #0, #0, hs	/* hs ? cmp val,0 : !z */
+	tst	valw, #255
 	b.ne	L(nz_or_small)
 
+	cmp	count, #256
+	dup	v16.16b, valw
+	add	dstend, dstin, count
+	b.lo	L(le_255)
+
 	mrs	tmp1, dczid_el0
-	tbnz	tmp1, #4, L(nz_or_small)
+	mov	zva_len, #4
+	tst	tmp1w, #16		/* dc disabled? */
 	and	tmp1w, tmp1w, #15
-	mov	zva_len, #4
-	add	dstend, dstin, count
+
+	ccmp	tmp1w, #4, #0, eq	/* eq ? cmp len,64 : !c */
 	lsl	zva_len, zva_len, tmp1w
-	cmp	count, zva_len_x
-	sub	zva_mask, zva_len, #1
-	b.lo	L(ge_64)
+
+	ccmp	count, zva_len_x, #0, hs /* hs ? cmp count,len : !c */
+
+	b.lo	L(ge_256)		/* disabled || len<64 || count<len */
 
 	/* Fall through into memset_zva_n.  */
 	.size	memset, . - memset
@@ -188,8 +209,9 @@ memset:
 	.type memset_zva_n, %function
memset_zva_n:
-	stp	xzr, xzr, [dstin]	/* first 16 aligned 1.  */
+	stp	q16, q16, [dstin]	/* first 32 aligned 1.  */
 	neg	tmp1w, dstin_w
+	sub	zva_mask, zva_len, #1
 	sub	count, count, zva_len_x	/* pre-bias */
 	mov	dst, dstin
 	ands	tmp1w, tmp1w, zva_mask
@@ -206,16 +228,14 @@ memset_zva_n:
 	RET
 
 	.p2align 4
-3:	and	tmp2, dstin, #-16
+3:	and	tmp2, dstin, #-32
 	sub	count, count, tmp1	/* account for misalign */
 	add	dst, dstin, tmp1
 
 	.p2align 6,,24
-4:	stp	xzr, xzr, [tmp2, #16]
-	stp	xzr, xzr, [tmp2, #32]
+4:	stp	q16, q16, [tmp2, #32]
 	subs	tmp1w, tmp1w, #64
-	stp	xzr, xzr, [tmp2, #48]
-	stp	xzr, xzr, [tmp2, #64]!
+	stp	q16, q16, [tmp2, #64]!
 	b.hi	4b
 
 	b	2b
@@ -228,83 +248,92 @@ memset_zva_n:
 	.type memset_nozva, %function
memset_nozva:
 	CALL_MCOUNT
-	and	valw, valw, #255
 L(nz_or_small):
-	orr	valw, valw, valw, lsl #8	/* replicate the byte */
+	dup	v16.16b, valw
+	cmp	count, #256
+	add	dstend, dstin, count
+	b.hs	L(ge_256)
+
+	/* Small data -- original count is less than 256 bytes.  */
+L(le_255):
+	cmp	count, #32
+	b.lo	L(le_31)
+
+	stp	q16, q16, [dstin]
 	cmp	count, #64
-	orr	valw, valw, valw, lsl #16
-	add	dstend, dstin, count		/* remember end of buffer */
-	orr	val, val, val, lsl #32
-	b.hs	L(ge_64)
+	b.lo	L(le_63)
 
-	/* Small data -- original count is less than 64 bytes.  */
+	stp	q16, q16, [dstin, #0x20]
+	tbz	count, #7, L(le_127)
+
+	stp	q16, q16, [dstin, #0x40]
+	stp	q16, q16, [dstin, #0x60]
+	stp	q16, q16, [dstend, #-0x80]
+	stp	q16, q16, [dstend, #-0x60]
+L(le_127):
+	stp	q16, q16, [dstend, #-0x40]
 L(le_63):
-	cmp	count, #16
-	b.lo	L(le_15)
-	stp	val, val, [dstin]
-	tbz	count, #5, L(le_31)
-	stp	val, val, [dstin, #16]
-	stp	val, val, [dstend, #-32]
-L(le_31):
-	stp	val, val, [dstend, #-16]
-	RET
-	.p2align 6,,16
-L(le_15):
-	tbz	count, #3, L(le_7)
-	str	val, [dstin]
-	str	val, [dstend, #-8]
-	RET
-	.p2align 6,,16
-L(le_7):
-	tbz	count, #2, L(le_3)
-	str	valw, [dstin]
-	str	valw, [dstend, #-4]
-	RET
-	.p2align 6,,20
-L(le_3):
-	tbz	count, #1, L(le_1)
-	strh	valw, [dstend, #-2]
-L(le_1):
-	tbz	count, #0, L(le_0)
-	strb	valw, [dstin]
-L(le_0):
+	stp	q16, q16, [dstend, #-0x20]
 	RET
 
-	.p2align 6
-L(ge_64):
-	and	dst, dstin, #-16	/* align the pointer / pre-bias.  */
-	stp	val, val, [dstin]	/* first 16 align 1 */
+	.p2align 6,,16
+L(ge_256):
+	and	dst, dstin, #-32	/* align the pointer / pre-bias.  */
+	stp	q16, q16, [dstin]	/* first 32 align 1 */
 	sub	count, dstend, dst	/* begin misalign recompute */
-	subs	count, count, #16+64	/* finish recompute + pre-bias */
-	b.ls	L(loop_tail)
+	sub	count, count, #32+128	/* finish recompute + pre-bias */
 
 	.p2align 6,,24
L(loop):
-	stp	val, val, [dst, #16]
-	stp	val, val, [dst, #32]
-	subs	count, count, #64
-	stp	val, val, [dst, #48]
-	stp	val, val, [dst, #64]!
+	stp	q16, q16, [dst, #0x20]
+	stp	q16, q16, [dst, #0x40]
+	subs	count, count, #128
+	stp	q16, q16, [dst, #0x60]
+	stp	q16, q16, [dst, #0x80]!
 	b.hs	L(loop)
 
-	adds	count, count, #64	/* undo pre-bias */
+	adds	count, count, #128	/* undo pre-bias */
 	b.ne	L(loop_tail)
 	RET
 
 	/* Tail of the zva loop.  Less than ZVA bytes, but possibly lots
-	   more than 64.  Note that dst is aligned but unbiased.  */
+	   more than 128.  Note that dst is aligned but unbiased.  */
L(zva_tail):
-	subs	count, count, #64	/* pre-bias */
-	sub	dst, dst, #16		/* pre-bias */
+	subs	count, count, #128	/* pre-bias */
+	sub	dst, dst, #32		/* pre-bias */
 	b.hi	L(loop)
 
-	/* Tail of the stp loop; less than 64 bytes left.
-	   Note that dst is still aligned and biased by -16.  */
+	/* Tail of the stp loop; less than 128 bytes left.
+	   Note that dst is still aligned and biased by -32.  */
L(loop_tail):
-	stp	val, val, [dstend, #-64]
-	stp	val, val, [dstend, #-48]
-	stp	val, val, [dstend, #-32]
-	stp	val, val, [dstend, #-16]
+	stp	q16, q16, [dstend, #-0x80]
+	stp	q16, q16, [dstend, #-0x60]
+	stp	q16, q16, [dstend, #-0x40]
+	stp	q16, q16, [dstend, #-0x20]
+	RET
+
+L(le_31):
+	tbz	count, #4, L(le_15)
+	str	q16, [dstin]
+	str	q16, [dstend, #-0x10]
+	RET
+L(le_15):
+	tbz	count, #3, L(le_7)
+	str	d16, [dstin]
+	str	d16, [dstend, #-8]
+	RET
+L(le_7):
+	tbz	count, #2, L(le_3)
+	str	s16, [dstin]
+	str	s16, [dstend, #-4]
+	RET
+L(le_3):
+	tbz	count, #1, L(le_1)
+	str	h16, [dstend, #-2]
+L(le_1):
+	tbz	count, #0, L(le_0)
+	str	b16, [dstin]
+L(le_0):
 	RET
 
 	.size	memset_nozva, . - memset_nozva
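For the small case (count below 256) the new code replicates the fill byte into a vector register with `dup v16.16b, valw` and then issues a fixed pattern of overlapping stores from both ends of the buffer, so no byte loop or backward branch is needed. The C below is a sketch of that strategy under the same size classes as the assembly above; `memset_small` and the 16-byte `memcpy` calls (standing in for `str q16` / `stp q16, q16`) are illustrative, not the actual implementation.

```c
#include <stddef.h>
#include <string.h>

/* Sketch of the count < 256 path: q[] plays the role of v16 and each
   memcpy stands in for one vector-register store.  Regions written from
   the start and from the end may overlap; together they cover [dst, dst+n).  */
void memset_small(unsigned char *dst, int c, size_t n)   /* requires n < 256 */
{
    unsigned char q[16];
    memset(q, (unsigned char) c, sizeof q);               /* dup v16.16b, valw */
    unsigned char *end = dst + n;

    if (n < 32) {                                         /* L(le_31) .. L(le_0) */
        if (n & 16) { memcpy(dst, q, 16); memcpy(end - 16, q, 16); return; }
        if (n & 8)  { memcpy(dst, q, 8);  memcpy(end - 8, q, 8);   return; }
        if (n & 4)  { memcpy(dst, q, 4);  memcpy(end - 4, q, 4);   return; }
        if (n & 2)  memcpy(end - 2, q, 2);
        if (n & 1)  dst[0] = (unsigned char) c;
        return;
    }

    memcpy(dst, q, 16); memcpy(dst + 16, q, 16);          /* first 32 bytes */
    if (n < 64) {                                         /* L(le_63) */
        memcpy(end - 32, q, 16); memcpy(end - 16, q, 16);
        return;
    }

    memcpy(dst + 32, q, 16); memcpy(dst + 48, q, 16);     /* first 64 bytes */
    if (n & 128) {                                        /* 128 <= n < 256 */
        memcpy(dst + 64, q, 16);  memcpy(dst + 80, q, 16);
        memcpy(dst + 96, q, 16);  memcpy(dst + 112, q, 16);
        memcpy(end - 128, q, 16); memcpy(end - 112, q, 16);
        memcpy(end - 96, q, 16);  memcpy(end - 80, q, 16);
    }
    memcpy(end - 64, q, 16); memcpy(end - 48, q, 16);     /* last 64 bytes: */
    memcpy(end - 32, q, 16); memcpy(end - 16, q, 16);     /* L(le_127)/L(le_63) */
}
```

Because every size class is covered by at most sixteen such stores, the 128-byte `stp` loop and the `dc zva` loop only ever see counts of at least 256, which is the "higher minimums to enter loops" in the subject line.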