From 1268f9e6edbcbc0c3c32848d12ced16440bfa177 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 16 Jun 2014 11:53:52 -0700 Subject: neon for memset; higher minimums to enter loops --- sysdeps/aarch64/memset.S | 221 +++++++++++++++++++++++++++-------------------- 1 file changed, 125 insertions(+), 96 deletions(-) diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 523406d3c8..2e15551006 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -26,7 +26,6 @@ #define dstin x0 #define dstin_w w0 -#define val x1 #define valw w1 #define count x2 #define tmp1 x3 @@ -87,28 +86,27 @@ memset: .type memset_zva_64, %function memset_zva_64: CALL_MCOUNT - and valw, valw, #255 - cmp count, #256 - ccmp valw, #0, #0, hs /* hs ? cmp val,0 : !z */ + tst valw, #255 b.ne L(nz_or_small) - stp xzr, xzr, [dstin] /* first 16 aligned 1. */ + cmp count, #256 + dup v16.16b, valw + add dstend, dstin, count + b.lo L(le_255) + + str q16, [dstin] /* first 16 aligned 1. */ and tmp2, dstin, #-16 and dst, dstin, #-64 - stp xzr, xzr, [tmp2, #16] /* first 64 aligned 16. */ - add dstend, dstin, count + stp q16, q16, [tmp2, #16] /* first 64 aligned 16. */ add dst, dst, #64 - stp xzr, xzr, [tmp2, #32] + stp q16, q16, [tmp2, #48] sub count, dstend, dst /* recompute for misalign */ add tmp1, dst, #64 - stp xzr, xzr, [tmp2, #48] sub count, count, #128 /* pre-bias */ - stp xzr, xzr, [tmp2, #64] - .p2align 6,,24 0: dc zva, dst subs count, count, #128 @@ -126,7 +124,26 @@ memset_zva_64: /* For larger zva sizes, a simple loop ought to suffice. */ /* ??? Needs performance testing, when such hardware becomes available. */ -.macro do_zva len +.macro do_zvas len + .p2align 4 + .type memset_zva_\len, %function +memset_zva_\len: + CALL_MCOUNT + tst valw, #255 + b.ne L(nz_or_small) + + cmp count, #256 + dup v16.16b, valw + add dstend, dstin, count + b.lo L(le_255) + + mov zva_len, #\len + b memset_zva_n + + .size memset_zva_\len, . - memset_zva_\len +.endm + +.macro do_zval len .p2align 4 .type memset_zva_\len, %function memset_zva_\len: @@ -138,23 +155,22 @@ memset_zva_\len: add dstend, dstin, count mov zva_len, #\len - mov zva_mask, #\len-1 b memset_zva_n .size memset_zva_\len, . - memset_zva_\len .endm - do_zva 128 // 5 - do_zva 256 // 6 - do_zva 512 // 7 - do_zva 1024 // 8 - do_zva 2048 // 9 - do_zva 4096 // 10 - do_zva 8192 // 11 - do_zva 16384 // 12 - do_zva 32768 // 13 - do_zva 65536 // 14 - do_zva 131072 // 15 + do_zvas 128 // 5 + do_zvas 256 // 6 + do_zval 512 // 7 + do_zval 1024 // 8 + do_zval 2048 // 9 + do_zval 4096 // 10 + do_zval 8192 // 11 + do_zval 16384 // 12 + do_zval 32768 // 13 + do_zval 65536 // 14 + do_zval 131072 // 15 .p2align 6 #else @@ -163,21 +179,26 @@ memset_zva_\len: .p2align 6 .type memset, %function memset: - and valw, valw, #255 - cmp count, #256 - ccmp valw, #0, #0, hs /* hs ? cmp val,0 : !z */ + tst valw, #255 b.ne L(nz_or_small) + cmp count, #256 + dup v16.16b, valw + add dstend, dstin, count + b.lo L(le_255) + mrs tmp1, dczid_el0 - tbnz tmp1, #4, L(nz_or_small) + mov zva_len, #4 + tst tmp1w, #16 /* dc disabled? */ and tmp1w, tmp1w, #15 - mov zva_len, #4 - add dstend, dstin, count + + ccmp tmp1w, #4, #0, eq /* eq ? cmp len,64 : !c */ lsl zva_len, zva_len, tmp1w - cmp count, zva_len_x - sub zva_mask, zva_len, #1 - b.lo L(ge_64) + + ccmp count, zva_len_x, #0, hs /* hs ? cmp count,len : !c */ + + b.lo L(ge_256) /* disabled || len<64 || count