/* Set a block of memory to some byte value.  For SUN4V M7.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC	xcc
#endif
	.register	%g2, #scratch
	.register	%g3, #scratch

/* The algorithm is as follows :
 *
 *	For stores of 7 or fewer bytes, individual bytes are stored.
 *
 *	For stores of fewer than 32 bytes, align the address on a 4 byte
 *	boundary.  Then store as many 4-byte chunks as possible, followed
 *	by the trailing bytes.
 *
 *	For stores of 32 bytes or more, align the address on an 8 byte
 *	boundary.
 *	if (count >= 64) {
 *		store 8-byte chunks to align the address on a 64 byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *			Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *		}
 *		else if (count >= MIN_LOOP) {
 *			Using BIS stores, set the first long word of each of
 *			ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *			In the main loop, continue pre-setting the first long
 *			word of each cache line ST_CHUNK lines in advance while
 *			setting the other seven long words (56 bytes) of each
 *			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 *			line that has already had its first long word set.
 *		}
 *		store remaining data in 64-byte chunks until fewer than
 *		64 bytes remain.
 *	}
 *	Store as many 8-byte chunks as possible, followed by the trailing
 *	bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline.  That avoids various pipeline delays,
 *   such as filling the miss buffer.  The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS when at least MIN_LOOP bytes of whole cache lines are to
 * be set (MIN_ZERO for zero fills) because a sequence of BIS stores must be
 * followed by a membar #StoreStore.  The benefit of the BIS store must be
 * balanced against the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define ST_CHUNK	24	/* multiple of 4 due to loop unrolling */
#define MIN_LOOP	((ST_CHUNK)*64)	/* bytes covered by ST_CHUNK lines */
#define MIN_ZERO	256

#define EX_ST(x)	x
#define EX_RETVAL(x)	x

/* 8-byte store that marks the cache line "most recently used".  */
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
/* 8-byte block-init store that marks the line "least recently used".  */
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
	.align		32

/* void *__memset_niagara7 (void *dst, int c, size_t size)
 *
 * Register usage (this routine does not touch the stack and returns
 * with retl):
 *	%o0	dst; never modified, so it is also the return value
 *	%o1	fill pattern: (char) c, widened to 2, 4 and then 8 copies
 *	%o2	bytes still to set; reduced to the trailing byte count
 *	%o3	scratch: negative alignment counter, then the number of
 *		bytes left over once whole 64-byte blocks are removed
 *	%o4	bytes to set as whole 64-byte blocks
 *	%o5	running destination pointer ("sp1" in the comments below)
 *	%g1	cache-line counter for the ST_CHUNK loops
 *	%g3	index register for the zero-fill loop
 *
 * In this file an instruction indented by one extra space sits in the
 * delay slot of the branch above it.  None of the delay slots that do
 * real work are annulled, so they execute whether or not the branch is
 * taken.
 */
ENTRY(__memset_niagara7)
	/* memset (dst, c, size) */
	mov	%o0, %o5		/* copy sp1 before using it */
	cmp	%o2, 7			/* if small counts, just write bytes */
	bleu,pn	%XCC, .Lwrchar
	 and	%o1, 0xff, %o1		/* o1 is (char)c */

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		/* now o1 has 2 bytes of c */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign
	 or	%o1, %o3, %o1		/* now o1 has 4 bytes of c */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		/* now o1 has 8 bytes of c */

.Ldbalign:
	andcc	%o5, 7, %o3		/* is sp1 aligned on a 8 byte bound? */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned) */

	add	%o2, %o3, %o2		/* update o2 with new count */
	/* Set -(%o3) bytes till sp1 long word aligned */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set */
	inccc	%o3			/* byte clearing loop */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now sp1 is long word aligned (sp1 is found in %o5) */
.Lblkalign:
	cmp	%o2, 64			/* check if there are 64 bytes to set */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3		/* .Lwrshort expects the count in %o3 */

	andcc	%o5, 63, %o3		/* is sp1 block aligned? */
	bz,pt	%XCC, .Lblkwr		/* now block aligned */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned) */
	add	%o2, %o3, %o2		/* o2 is the remainder */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
	/* Use long word stores.  */
	/* Recall that dst is already long word aligned */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now sp1 is block aligned */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0 */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar */
	 nop				/* must be > pre-cleared lines */

	/* initial cache-clearing stores */
	/* get store pipeline moving */

	/* Primary memset loop for large memsets */
.Lwr_loop:
	mov	ST_CHUNK, %g1
	/* Store the first long word of each of the next ST_CHUNK cache
	   lines, four lines per iteration (%g1 counts down by 4).  */
.Lwr_loop_start:
	subcc	%g1, 4, %g1
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5 */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */

	/* For each of the ST_CHUNK lines just touched, %o5 enters one
	   iteration at (line start - 8).  Long words 1..6 are written
	   with plain stx and long word 7, the last store to the line,
	   with the block-init ASI in the delay slot.  */
.Lwr_loop_rest:
	stx	%o1,[%o5+8+8]
	sub	%o4, 64, %o4		/* one more 64-byte block done */
	stx	%o1,[%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1,[%o5+24+8]
	stx	%o1,[%o5+32+8]
	stx	%o1,[%o5+40+8]
	add	%o5, 64, %o5
	stx	%o1,[%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))

	add	%o5, 8, %o5		/* restore %o5 offset */

	/* If more than ST_CHUNK*64 bytes remain to set, continue */
	/* setting the first long word of each cache line in advance */
	/* to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done	/* no whole blocks left */
	 nop

	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */
	/* Fewer than ST_CHUNK lines remain: write each line completely,
	   long word 0 with the MRU ASI, 1..6 with stx and long word 7
	   with the block-init ASI.  */
.Lwr_loop_small:
	add	%o5, 8, %o5		/* adjust %o5 for ASI store */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1,[%o5+8]
	stx	%o1,[%o5+16]
	stx	%o1,[%o5+24]
	stx	%o1,[%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1,[%o5+40]
	add	%o5, 56, %o5
	stx	%o1,[%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* restore %o5 offset */

	/* Special case loop for zero fill memsets */
	/* For each 64 byte cache line, single STBI to first element */
	/* clears line */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set */
					/* to pay %asi + membar cost */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4		/* bias so the loop can test the sign */

	/* Four cache lines (256 bytes) per iteration.  */
.Lwrzero_loop:
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))		/* line 0 */
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 1 */
	add	%o5, 256, %o5
	sub	%g3, 192, %g3			/* %g3 = -128 */
	EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 2 */
	add	%g3, 64, %g3			/* %g3 = -64 */
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))		/* line 3 */
	add	%o4, 256, %o4		/* undo the bias */

	brz,pn	%o4, .Lbsi_done
	 nop
	/* One cache line per iteration for what is left.  */
.Lwrzero_small:
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BIS */

	/* Reached with %o4 == 0 after the BIS loops above, or directly
	   when there were too few block bytes to use BIS.  */
.Lshort_set:
	cmp	%o4, 64			/* check if 64 bytes to set */
	blu	%XCC, 5f
	 nop
4:					/* set final blocks of 64 bytes */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]

5:
	/* Set the remaining long words */
.Lwrshort:
	subcc	%o3, 8, %o3		/* Can we store any long words? */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words */
	bgeu,pt	%XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars */
	brnz	%o2, .Lwrfin
	 nop
	retl
	 nop

	/* Fewer than 32 bytes: store single bytes until sp1 is word
	   aligned.  %o3 is recomputed in the delay slot on every pass,
	   so on exit it holds the current count rounded down to a
	   multiple of 4.  */
.Lwdalign:
	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3 */

	dec	%o2			/* decrement count */
	stb	%o1, [%o5]		/* clear a byte */
	b	.Lwdalign
	 inc	%o5			/* next byte */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any */

.Lwrchar:
	/* Set the remaining bytes, if any */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved */
	 nop
END(__memset_niagara7)
#endif