diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2024-05-24 12:38:50 -0500 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2024-05-30 12:36:09 -0500 |
commit | 5bf0ab80573d66e4ae5d94b094659094336da90f (patch) | |
tree | 5221932dc8f91a0e79255f9224a33a0b6a309505 /sysdeps/x86_64/multiarch | |
parent | 53f9d74322c831c76bc6cf6ed8941267e8749604 (diff) | |
download | glibc-5bf0ab80573d66e4ae5d94b094659094336da90f.tar.gz glibc-5bf0ab80573d66e4ae5d94b094659094336da90f.tar.xz glibc-5bf0ab80573d66e4ae5d94b094659094336da90f.zip |
x86: Improve large memset perf with non-temporal stores [RHEL-29312]
Previously we use `rep stosb` for all medium/large memsets. This is notably worse than non-temporal stores for large (above a few MBs) memsets. See: https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing For data using different stategies for large memset on ICX and SKX. Using non-temporal stores can be up to 3x faster on ICX and 2x faster on SKX. Historically, these numbers would not have been so good because of the zero-over-zero writeback optimization that `rep stosb` is able to do. But, the zero-over-zero writeback optimization has been removed as a potential side-channel attack, so there is no longer any good reason to only rely on `rep stosb` for large memsets. On the flip size, non-temporal writes can avoid data in their RFO requests saving memory bandwidth. All of the other changes to the file are to re-organize the code-blocks to maintain "good" alignment given the new code added in the `L(stosb_local)` case. The results from running the GLIBC memset benchmarks on TGL-client for N=20 runs: Geometric Mean across the suite New / Old EXEX256: 0.979 Geometric Mean across the suite New / Old EXEX512: 0.979 Geometric Mean across the suite New / Old AVX2 : 0.986 Geometric Mean across the suite New / Old SSE2 : 0.979 Most of the cases are essentially unchanged, this is mostly to show that adding the non-temporal case didn't add any regressions to the other cases. The results on the memset-large benchmark suite on TGL-client for N=20 runs: Geometric Mean across the suite New / Old EXEX256: 0.926 Geometric Mean across the suite New / Old EXEX512: 0.925 Geometric Mean across the suite New / Old AVX2 : 0.928 Geometric Mean across the suite New / Old SSE2 : 0.924 So roughly a 7.5% speedup. This is lower than what we see on servers (likely because clients typically have faster single-core bandwidth so saving bandwidth on RFOs is less impactful), but still advantageous. Full test-suite passes on x86_64 w/ and w/o multiarch. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 149 |
1 files changed, 91 insertions, 58 deletions
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 97839a2248..637caadb40 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -21,10 +21,13 @@ 2. If size is less than VEC, use integer register stores. 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. - 5. On machines ERMS feature, if size is greater or equal than - __x86_rep_stosb_threshold then REP STOSB will be used. - 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with - 4 VEC stores and store 4 * VEC at a time until done. */ + 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. + 6. On machines ERMS feature, if size is range + [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) + then REP STOSB will be used. + 7. If size >= __x86_shared_non_temporal_threshold, use a + non-temporal stores. */ #include <sysdep.h> @@ -147,6 +150,41 @@ L(entry_from_wmemset): VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VMM(0), (%rdi) VZEROUPPER_RETURN + + /* If have AVX512 mask instructions put L(less_vec) close to + entry as it doesn't take much space and is likely a hot target. */ +#ifdef USE_LESS_VEC_MASK_STORE + /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */ + .p2align 6,, 47 + .p2align 4 +L(less_vec): +L(less_vec_from_wmemset): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi + /* Check if VEC_SIZE store cross page. Mask stores suffer + serious performance degradation when it has to fault suppress. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi + /* This is generally considered a cold target. */ + ja L(cross_page) +# if VEC_SIZE > 32 + movq $-1, %rcx + bzhiq %rdx, %rcx, %rcx + kmovq %rcx, %k1 +# else + movl $-1, %ecx + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 +# endif + vmovdqu8 %VMM(0), (%rax){%k1} + VZEROUPPER_RETURN +#endif + #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -185,54 +223,6 @@ L(last_2x_vec): #endif VZEROUPPER_RETURN - /* If have AVX512 mask instructions put L(less_vec) close to - entry as it doesn't take much space and is likely a hot target. - */ -#ifdef USE_LESS_VEC_MASK_STORE - .p2align 4,, 10 -L(less_vec): -L(less_vec_from_wmemset): - /* Less than 1 VEC. */ -# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 -# error Unsupported VEC_SIZE! -# endif - /* Clear high bits from edi. Only keeping bits relevant to page - cross check. Note that we are using rax which is set in - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ - andl $(PAGE_SIZE - 1), %edi - /* Check if VEC_SIZE store cross page. Mask stores suffer - serious performance degradation when it has to fault suppress. - */ - cmpl $(PAGE_SIZE - VEC_SIZE), %edi - /* This is generally considered a cold target. */ - ja L(cross_page) -# if VEC_SIZE > 32 - movq $-1, %rcx - bzhiq %rdx, %rcx, %rcx - kmovq %rcx, %k1 -# else - movl $-1, %ecx - bzhil %edx, %ecx, %ecx - kmovd %ecx, %k1 -# endif - vmovdqu8 %VMM(0), (%rax){%k1} - VZEROUPPER_RETURN - -# if defined USE_MULTIARCH && IS_IN (libc) - /* Include L(stosb_local) here if including L(less_vec) between - L(stosb_more_2x_vec) and ENTRY. This is to cache align the - L(stosb_more_2x_vec) target. */ - .p2align 4,, 10 -L(stosb_local): - movzbl %sil, %eax - mov %RDX_LP, %RCX_LP - mov %RDI_LP, %RDX_LP - rep stosb - mov %RDX_LP, %RAX_LP - VZEROUPPER_RETURN -# endif -#endif - #if defined USE_MULTIARCH && IS_IN (libc) .p2align 4 L(stosb_more_2x_vec): @@ -318,21 +308,33 @@ L(return_vzeroupper): ret #endif - .p2align 4,, 10 -#ifndef USE_LESS_VEC_MASK_STORE -# if defined USE_MULTIARCH && IS_IN (libc) +#ifdef USE_WITH_AVX2 + .p2align 4 +#else + .p2align 4,, 4 +#endif + +#if defined USE_MULTIARCH && IS_IN (libc) /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in range for 2-byte jump encoding. */ L(stosb_local): + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + jae L(nt_memset) movzbl %sil, %eax mov %RDX_LP, %RCX_LP mov %RDI_LP, %RDX_LP rep stosb +# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512) + /* Use xchg to save 1-byte (this helps align targets below). */ + xchg %RDX_LP, %RAX_LP +# else mov %RDX_LP, %RAX_LP - VZEROUPPER_RETURN # endif + VZEROUPPER_RETURN +#endif +#ifndef USE_LESS_VEC_MASK_STORE /* Define L(less_vec) only if not otherwise defined. */ - .p2align 4 + .p2align 4,, 12 L(less_vec): /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to xmm). This is only does anything for AVX2. */ @@ -423,4 +425,35 @@ L(between_2_3): movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) #endif ret -END (MEMSET_SYMBOL (__memset, unaligned_erms)) + +#if defined USE_MULTIARCH && IS_IN (libc) +# ifdef USE_WITH_AVX512 + /* Force align so the loop doesn't cross a cache-line. */ + .p2align 4 +# endif + .p2align 4,, 7 + /* Memset using non-temporal stores. */ +L(nt_memset): + VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi) + leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx + /* Align DST. */ + orq $(VEC_SIZE * 1 - 1), %rdi + incq %rdi + .p2align 4,, 7 +L(nt_loop): + VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi) + VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi) + subq $(VEC_SIZE * -4), %rdi + cmpq %rdx, %rdi + jb L(nt_loop) + sfence + VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx) + VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx) + VZEROUPPER_RETURN +#endif + +END(MEMSET_SYMBOL(__memset, unaligned_erms)) |