author:    H.J. Lu <hjl.tools@gmail.com>  2016-04-24 10:53:25 -0700
committer: H.J. Lu <hjl.tools@gmail.com>  2016-04-25 08:13:38 -0700
commit:    0d9d7bc875a16911be7a93acdf7999c0023761ea (patch)
tree:      b4aa0c86b1b6e25b335d4e915d19f3835c7741a8
parent:    8dd19b0b3ca334060eec990f0afa502700939ad3 (diff)
download:  glibc-0d9d7bc875a16911be7a93acdf7999c0023761ea.tar.gz
           glibc-0d9d7bc875a16911be7a93acdf7999c0023761ea.tar.xz
           glibc-0d9d7bc875a16911be7a93acdf7999c0023761ea.zip
Align to cacheline
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 93
1 file changed, 93 insertions, 0 deletions
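Before each 4 * VEC non-temporal copy loop, the patch peels off up to CACHELINE_SIZE / VEC_SIZE - 1 single-vector copies so that the streaming stores in the loop begin on a cacheline boundary; non-temporal stores write-combine best when each iteration fills whole cache lines. A minimal C sketch of the forward path follows. All names and constants here (copy_forward_nt, copy_one_vec, the VEC_SIZE and CACHELINE_SIZE values) are illustrative stand-ins, not glibc interfaces; the real routine is hand-written assembly.

/* Illustrative sketch only -- not glibc code.  Assumes dst is
   already VEC_SIZE-aligned, as the assembly guarantees at this
   point, and that the copy is large and non-overlapping.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VEC_SIZE       32   /* e.g. one AVX2 vector */
#define CACHELINE_SIZE 64   /* CACHELINE_SIZE == VEC_SIZE * 2 here */

static void
copy_one_vec (char *dst, const char *src)
{
  memcpy (dst, src, VEC_SIZE);   /* stands in for VMOVU/VMOVA */
}

void
copy_forward_nt (char *dst, const char *src, size_t n)
{
  /* Peel single vectors until dst reaches a cacheline boundary,
     mirroring the new code before L(loop_large_forward).  */
  while (((uintptr_t) dst & (CACHELINE_SIZE - 1)) != 0 && n >= VEC_SIZE)
    {
      copy_one_vec (dst, src);
      dst += VEC_SIZE;
      src += VEC_SIZE;
      n -= VEC_SIZE;
    }
  /* Main loop, 4 vectors per iteration; the real code issues
     non-temporal (MOVNT) stores here, so each iteration now
     writes whole cache lines.  */
  while (n >= 4 * VEC_SIZE)
    {
      copy_one_vec (dst, src);
      copy_one_vec (dst + VEC_SIZE, src + VEC_SIZE);
      copy_one_vec (dst + 2 * VEC_SIZE, src + 2 * VEC_SIZE);
      copy_one_vec (dst + 3 * VEC_SIZE, src + 3 * VEC_SIZE);
      dst += 4 * VEC_SIZE;
      src += 4 * VEC_SIZE;
      n -= 4 * VEC_SIZE;
    }
  if (n > 0)
    memcpy (dst, src, n);   /* unaligned tail */
}

The peel uses ordinary cached stores, which costs little: at most one cacheline is touched before the streaming loop takes over.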
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 0a2bf4108f..aaee527dca 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -466,6 +466,26 @@ L(large_forward):
 	leaq	(%rdi, %rdx), %r10
 	cmpq	%r10, %rsi
 	jb	L(loop_4x_vec_forward)
+# if CACHELINE_SIZE != VEC_SIZE
+	movl	%edi, %r8d
+	andl	$(CACHELINE_SIZE - 1), %r8d
+	je	L(loop_large_forward)
+#  if CACHELINE_SIZE == (VEC_SIZE * 4)
+	/* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+	   3 * VEC_SIZE.  */
+	cmpl	$(VEC_SIZE * 2), %r8d
+	je	L(misaligned_by_2x_vec_forward)
+	jb	L(misaligned_by_3x_vec_forward)
+#  elif CACHELINE_SIZE != (VEC_SIZE * 2)
+#   error Unsupported CACHELINE_SIZE!
+#  endif
+	/* Cacheline misaligned by VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	addq	$VEC_SIZE, %rsi
+	subq	$VEC_SIZE, %rdx
+	VMOVA	%VEC(0), (%rdi)
+	addq	$VEC_SIZE, %rdi
+# endif
 L(loop_large_forward):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
@@ -494,6 +514,32 @@ L(loop_large_forward):
 	VMOVU	%VEC(8), (%r11)
 	VZEROUPPER
 	ret
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_forward):
+	/* Cacheline misaligned by 2 * VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	addq	$(VEC_SIZE * 2), %rsi
+	subq	$(VEC_SIZE * 2), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	addq	$(VEC_SIZE * 2), %rdi
+	jmp	L(loop_large_forward)
+
+L(misaligned_by_3x_vec_forward):
+	/* Cacheline misaligned by 3 * VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	addq	$(VEC_SIZE * 3), %rsi
+	subq	$(VEC_SIZE * 3), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	addq	$(VEC_SIZE * 3), %rdi
+	jmp	L(loop_large_forward)
+# endif
+
 L(large_backward):
 	/* Don't use non-temporal store if there is overlap between
 	   destination and source since destination may be in cache
@@ -501,6 +547,26 @@ L(large_backward):
 	leaq	(%rcx, %rdx), %r10
 	cmpq	%r10, %r9
 	jb	L(loop_4x_vec_backward)
+# if CACHELINE_SIZE != VEC_SIZE
+	movl	%r9d, %r8d
+	andl	$(CACHELINE_SIZE - 1), %r8d
+	je	L(loop_large_backward)
+#  if CACHELINE_SIZE == (VEC_SIZE * 4)
+	/* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+	   3 * VEC_SIZE.  */
+	cmpl	$(VEC_SIZE * 2), %r8d
+	je	L(misaligned_by_2x_vec_backward)
+	jb	L(misaligned_by_3x_vec_backward)
+#  elif CACHELINE_SIZE != (VEC_SIZE * 2)
+#   error Unsupported CACHELINE_SIZE!
+#  endif
+	/* Cacheline misaligned by VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	subq	$VEC_SIZE, %rcx
+	subq	$VEC_SIZE, %rdx
+	VMOVA	%VEC(0), (%r9)
+	subq	$VEC_SIZE, %r9
+# endif
 L(loop_large_backward):
 	/* Copy 4 * VEC a time backward with non-temporal stores.  */
 	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
@@ -528,6 +594,33 @@ L(loop_large_backward):
 	VMOVU	%VEC(8), (%r11)
 	VZEROUPPER
 	ret
+
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_backward):
+	/* Cacheline misaligned by 2 * VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	subq	$(VEC_SIZE * 2), %rcx
+	subq	$(VEC_SIZE * 2), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	subq	$(VEC_SIZE * 2), %r9
+	jmp	L(loop_large_backward)
+
+L(misaligned_by_3x_vec_backward):
+	/* Cacheline misaligned by 3 * VEC_SIZE.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	subq	$(VEC_SIZE * 3), %rcx
+	subq	$(VEC_SIZE * 3), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	subq	$(VEC_SIZE * 3), %r9
+	jmp	L(loop_large_backward)
+# endif
+
 #endif
 
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
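The backward path mirrors this: it peels single vectors from the high end of both buffers until the destination end pointer is cacheline-aligned, then runs the 4 * VEC non-temporal loop downward. Continuing the sketch above, with the same stand-in helpers and the same caveat that this only approximates the hand-written assembly:

/* Backward analog of copy_forward_nt; reuses VEC_SIZE,
   CACHELINE_SIZE, and copy_one_vec from the sketch above.
   dst_end and src_end point one byte past the regions, and
   dst_end is assumed VEC_SIZE-aligned.  */
void
copy_backward_nt (char *dst_end, const char *src_end, size_t n)
{
  /* Peel single vectors until dst_end sits on a cacheline
     boundary, as the new code does before
     L(loop_large_backward).  */
  while (((uintptr_t) dst_end & (CACHELINE_SIZE - 1)) != 0
         && n >= VEC_SIZE)
    {
      dst_end -= VEC_SIZE;
      src_end -= VEC_SIZE;
      copy_one_vec (dst_end, src_end);
      n -= VEC_SIZE;
    }
  /* 4 vectors per iteration; non-temporal stores in the real
     code, now writing whole cache lines per iteration.  */
  while (n >= 4 * VEC_SIZE)
    {
      dst_end -= 4 * VEC_SIZE;
      src_end -= 4 * VEC_SIZE;
      copy_one_vec (dst_end, src_end);
      copy_one_vec (dst_end + VEC_SIZE, src_end + VEC_SIZE);
      copy_one_vec (dst_end + 2 * VEC_SIZE, src_end + 2 * VEC_SIZE);
      copy_one_vec (dst_end + 3 * VEC_SIZE, src_end + 3 * VEC_SIZE);
      n -= 4 * VEC_SIZE;
    }
  if (n > 0)
    memcpy (dst_end - n, src_end - n, n);   /* remaining head */
}

Note the compile-time dispatch in the patch: with CACHELINE_SIZE == VEC_SIZE * 2 at most one peel step is ever needed, CACHELINE_SIZE == VEC_SIZE * 4 adds the misaligned_by_2x/3x paths, and any other ratio fails the #error.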