Diffstat (limited to 'sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000000..cf645dd7ff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,462 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memmove/memcpy/mempcpy is implemented as:
+   1. Use overlapping load and store to avoid branch.
+   2. Use 8-bit or 32-bit displacements for branches and nop paddings
+      to avoid long nop between instructions.
+   3. Load all sources into registers and store them together to avoid
+      possible address overlap between source and destination.
+   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   5. If there is no address overlap, copy from both ends with
+      4 * VEC_SIZE at a time.
+   6. If size is 8 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   7. If address of destination > address of source, backward copy
+      8 * VEC_SIZE at a time.
+   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+#  if VEC_SIZE > 16
+#   define VZEROUPPER vzeroupper
+#  else
+#   define VZEROUPPER
+#  endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+# ifndef REP_MOVSB_THRESHOLD
+#  define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+# endif
+
+# ifndef SECTION
+#  error SECTION is not defined!
+# endif
+	.section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+	movq	%rdi, %rax
+L(start):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VZEROUPPER
+	ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB.  */
+#  ifdef SHARED
+ENTRY (__mempcpy_erms)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(movsb)
+END (__mempcpy_erms)
+#  endif
+
+ENTRY (__memmove_erms)
+	movq	%rdi, %rax
+	movq	%rdx, %rcx
+	cmpq	%rsi, %rdi
+	jbe	1f
+	leaq	(%rsi,%rcx), %rdx
+	cmpq	%rdx, %rdi
+	jb	L(movsb_backward)
+1:
+	rep movsb
+	ret
+L(movsb_backward):
+	leaq	-1(%rdi,%rcx), %rdi
+	leaq	-1(%rsi,%rcx), %rsi
+	std
+	rep movsb
+	cld
+	ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+	movq	%rdi, %rax
+L(start_erms):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(movsb_more_2x_vec)
+L(last_2x_vec):
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+L(movsb):
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	1f
+	leaq	(%rsi,%rdx), %r9
+	cmpq	%r9, %rdi
+	/* Avoid slow backward REP MOVSB.  */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+	jb	L(more_8x_vec_backward)
+1:
+	movq	%rdx, %rcx
+	rep movsb
+L(nop):
+	ret
+
+	.p2align 4
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	ja.d32	L(movsb)
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	L(copy_forward)
+	leaq	(%rsi,%rdx), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(more_2x_vec_overlap)
+L(copy_forward):
+	leaq	(%rdi,%rdx), %rcx
+	cmpq	%rcx, %rsi
+	jb	L(more_2x_vec_overlap)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe.d32	L(return)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(1), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 8), %rdx
+# if VEC_SIZE == 16
+	jbe	L(return)
+# else
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe	L(return_disp8)
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rcx
+	movq	%rcx, %r11
+	subq	%rdi, %r11
+	addq	%r11, %rsi
+	cmpq	%rdx, %rcx
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	je	L(return_disp8)
+	movq	%rsi, %r10
+	subq	%rcx, %r10
+	leaq	VEC_SIZE(%r10), %r9
+	leaq	(VEC_SIZE * 2)(%r10), %r8
+	leaq	(VEC_SIZE * 3)(%r10), %r11
+	.p2align 4
+L(loop):
+	VMOVU	(%rcx,%r10), %VEC(0)
+	VMOVU	(%rcx,%r9), %VEC(1)
+	VMOVU	(%rcx,%r8), %VEC(2)
+	VMOVU	(%rcx,%r11), %VEC(3)
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(1), VEC_SIZE(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+L(return_disp8):
+	VZEROUPPER
+	ret
+L(less_vec):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+1:
+	ret
+# if VEC_SIZE > 32
+L(between_32_63):
+	/* From 32 to 63.  No branch when size == 32.  */
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	-32(%rsi,%rdx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi,%rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	ret
+# endif
+L(between_8_15):
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	-8(%rsi,%rdx), %rcx
+	movq	(%rsi), %rsi
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rsi, (%rdi)
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi,%rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%esi, (%rdi)
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movzwl	-2(%rsi,%rdx), %ecx
+	movzwl	(%rsi), %esi
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	ret
+
+# if VEC_SIZE > 16
+	/* Align to 16 bytes to avoid long nop between instructions.  */
+	.p2align 4
+# endif
+L(more_2x_vec_overlap):
+	/* More than 2 * VEC and there is overlap between destination
+	   and source.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(last_4x_vec):
+	/* Copy from 2 * VEC to 4 * VEC.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(between_0_and_4x_vec):
+	/* Copy from 0 to 4 * VEC.  */
+	cmpl	$(VEC_SIZE * 2), %edx
+	jae	L(last_4x_vec)
+	/* Copy from 0 to 2 * VEC.  */
+	cmpl	$VEC_SIZE, %edx
+	jae	L(last_2x_vec)
+	/* Copy from 0 to VEC.  */
+	VZEROUPPER
+	jmp	L(less_vec)
+L(more_8x_vec):
+	cmpq	%rsi, %rdi
+	ja	L(more_8x_vec_backward)
+
+	.p2align 4
+L(loop_8x_vec_forward):
+	/* Copy 8 * VEC a time forward.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 4)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 5)(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 6)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 7)(%rsi), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VEC(5), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 7)(%rdi)
+	addq	$(VEC_SIZE * 8), %rdi
+	addq	$(VEC_SIZE * 8), %rsi
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_forward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+
+	.p2align 4
+L(more_8x_vec_backward):
+	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
+	leaq	-VEC_SIZE(%rdi, %rdx), %r9
+
+	.p2align 4
+L(loop_8x_vec_backward):
+	/* Copy 8 * VEC a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	VMOVU	-(VEC_SIZE * 4)(%rcx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 5)(%rcx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 6)(%rcx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 7)(%rcx), %VEC(7)
+	VMOVU	%VEC(0), (%r9)
+	VMOVU	%VEC(1), -VEC_SIZE(%r9)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVU	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%r9)
+	VMOVU	%VEC(5), -(VEC_SIZE * 5)(%r9)
+	VMOVU	%VEC(6), -(VEC_SIZE * 6)(%r9)
+	VMOVU	%VEC(7), -(VEC_SIZE * 7)(%r9)
+	subq	$(VEC_SIZE * 8), %rcx
+	subq	$(VEC_SIZE * 8), %r9
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_backward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif
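
The branch-free small/medium copy above works by loading the first and last vectors of the region into registers and then storing both, letting the accesses overlap in the middle.  A minimal C sketch of that idea for the VEC_SIZE-to-2*VEC_SIZE case follows; it models the strategy only and is not the glibc implementation, the names vec_t and copy_vec_to_2x_vec are hypothetical, and VEC_SIZE is assumed to be 16 as in the SSE2 variant.

#include <stddef.h>
#include <string.h>

/* One vector-sized chunk; stands in for %VEC(n) with VEC_SIZE == 16
   (an assumption for this sketch).  */
typedef struct { unsigned char b[16]; } vec_t;

/* Branch-free copy for VEC_SIZE <= n <= 2 * VEC_SIZE: load the first
   and the last vector of the source into locals (the "registers"),
   then store both.  The two stores may overlap in the middle, so all
   n bytes are written without a length-dependent branch, and loading
   both vectors before any store keeps overlapping src/dst correct.  */
static void
copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src, size_t n)
{
  vec_t first, last;
  memcpy (&first, src, sizeof (vec_t));              /* VMOVU (%rsi), %VEC(0) */
  memcpy (&last, src + n - sizeof (vec_t), sizeof (vec_t));
  memcpy (dst, &first, sizeof (vec_t));              /* VMOVU %VEC(0), (%rdi) */
  memcpy (dst + n - sizeof (vec_t), &last, sizeof (vec_t));
}

The same both-ends pattern scales to the 4 * VEC and 8 * VEC paths in the patch, while sizes above REP_MOVSB_THRESHOLD are handed to rep movsb instead.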