Diffstat (limited to 'sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 553
1 file changed, 0 insertions, 553 deletions
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
deleted file mode 100644
index dee3ec529c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ /dev/null
@@ -1,553 +0,0 @@
-/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
-   Copyright (C) 2016-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* memmove/memcpy/mempcpy is implemented as:
-   1. Use overlapping load and store to avoid branch.
-   2. Load all sources into registers and store them together to avoid
-      possible address overlap between source and destination.
-   3. If size is 8 * VEC_SIZE or less, load all sources into registers
-      and store them together.
-   4. If address of destination > address of source, backward copy
-      4 * VEC_SIZE at a time with unaligned load and aligned store.
-      Load the first 4 * VEC and last VEC before the loop and store
-      them after the loop to support overlapping addresses.
-   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
-      load and aligned store.  Load the last 4 * VEC and first VEC
-      before the loop and store them after the loop to support
-      overlapping addresses.
-   6. If size >= __x86_shared_non_temporal_threshold and there is no
-      overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
-
-#include <sysdep.h>
-
-#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-#  define VZEROUPPER vzeroupper
-# else
-#  define VZEROUPPER
-# endif
-#endif
-
-/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
-   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
-   memcpy micro benchmark in glibc shows that 2KB is the approximate
-   value above which REP MOVSB becomes faster than SSE2 optimization
-   on processors with Enhanced REP MOVSB.  Since larger register size
-   can move more data with a single load and store, the threshold is
-   higher with larger register size.  */
-#ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
-#endif
-
-#ifndef PREFETCH
-# define PREFETCH(addr) prefetcht0 addr
-#endif
-
-/* Assume 64-byte prefetch size.  */
-#ifndef PREFETCH_SIZE
-# define PREFETCH_SIZE 64
-#endif
-
-#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
-
-#if PREFETCH_SIZE == 64
-# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base)
-# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
-# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
-# else
-#  error Unsupported PREFETCHED_LOAD_SIZE!
-# endif
-#else
-# error Unsupported PREFETCH_SIZE!
-#endif
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
-	.section SECTION(.text),"ax",@progbits
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-#endif
-
-#if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-#endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
-	movq	%rdi, %rax
-L(start):
-	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-	VZEROUPPER
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-#endif
-	ret
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
-# if VEC_SIZE == 16
-#  if defined SHARED
-/* Only used to measure performance of REP MOVSB.  */
-ENTRY (__mempcpy_erms)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start_movsb)
-END (__mempcpy_erms)
-#  endif
-
-ENTRY (__memmove_erms)
-	movq	%rdi, %rax
-L(start_movsb):
-	movq	%rdx, %rcx
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	2f
-	leaq	(%rsi,%rcx), %rdx
-	cmpq	%rdx, %rdi
-	jb	L(movsb_backward)
-1:
-	rep movsb
-2:
-	ret
-L(movsb_backward):
-	leaq	-1(%rdi,%rcx), %rdi
-	leaq	-1(%rsi,%rcx), %rsi
-	std
-	rep movsb
-	cld
-	ret
-END (__memmove_erms)
-#  if defined SHARED
-strong_alias (__memmove_erms, __memcpy_erms)
-#  endif
-# endif
-
-# ifdef SHARED
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start_erms)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-	movq	%rdi, %rax
-L(start_erms):
-	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-L(return):
-	VZEROUPPER
-	ret
-
-L(movsb):
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
-#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
-# endif
-	jb	L(more_8x_vec_backward)
-1:
-	movq	%rdx, %rcx
-	rep movsb
-L(nop):
-	ret
-#endif
-
-L(less_vec):
-	/* Less than 1 VEC.  */
-#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-#endif
-#if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
-#endif
-#if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-#endif
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
-	movb	%cl, (%rdi)
-1:
-	ret
-#if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	-32(%rsi,%rdx), %ymm1
-	vmovdqu	%ymm0, (%rdi)
-	vmovdqu	%ymm1, -32(%rdi,%rdx)
-	VZEROUPPER
-	ret
-#endif
-#if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
-L(between_16_31):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	-16(%rsi,%rdx), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, -16(%rdi,%rdx)
-	ret
-#endif
-L(between_8_15):
-	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
-	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
-	ret
-
-#if defined USE_MULTIARCH && IS_IN (libc)
-L(movsb_more_2x_vec):
-	cmpq	$REP_MOVSB_THRESHOLD, %rdx
-	ja	L(movsb)
-#endif
-L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
-	cmpq	$(VEC_SIZE * 8), %rdx
-	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
-	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER
-	ret
-L(last_4x_vec):
-	/* Copy from 2 * VEC to 4 * VEC.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VZEROUPPER
-	ret
-
-L(more_8x_vec):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	ja	L(large_forward)
-#endif
-L(loop_4x_vec_forward):
-	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_forward)
-	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
-
-L(more_8x_vec_backward):
-	/* Load the first 4 * VEC and last VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
-	VMOVU	VEC_SIZE(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
-	ja	L(large_backward)
-#endif
-L(loop_4x_vec_backward):
-	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
-
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq	(%rdi, %rdx), %r10
-	cmpq	%r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
-	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
-
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq	(%rcx, %rdx), %r10
-	cmpq	%r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
-	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
-#endif
-END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-
-#ifdef SHARED
-# if IS_IN (libc)
-#  ifdef USE_MULTIARCH
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
-	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
-	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-#  endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
-	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
-# endif
-#endif
-#if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
-	      MEMCPY_SYMBOL (__memcpy, unaligned))
-#endif
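A few ideas from the deleted file's header comment are worth illustrating. First, the "overlapping load and store" trick (steps 1-2, used by every L(between_*) block and L(last_2x_vec)): loading both ends of the range before storing either makes the copy branch-free in the size and safe for any overlap, because the whole payload sits in registers before the first store. A minimal C sketch mirroring L(between_8_15); the name copy_8_15 is hypothetical, and the fixed-size memcpy calls stand in for the single movq loads and stores in the assembly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: copy n bytes, 8 <= n <= 15, with two possibly
   overlapping 8-byte accesses and no branch on n.  Both loads happen
   before either store, so the result is correct even when dst and src
   overlap (memmove semantics).  */
static void
copy_8_15 (void *dst, const void *src, size_t n)
{
  uint64_t head, tail;
  memcpy (&head, src, 8);                         /* first 8 bytes  */
  memcpy (&tail, (const char *) src + n - 8, 8);  /* last 8 bytes   */
  memcpy (dst, &head, 8);
  memcpy ((char *) dst + n - 8, &tail, 8);
}

When n == 8 the two accesses coincide exactly, which is why the assembly needs no branch for size == 8.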
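Second, the adjustment ahead of L(loop_4x_vec_forward) computes a *negative* misalignment offset in %r8 so that a single subtraction both aligns the destination and advances the source by the same amount; the up-to-VEC_SIZE leading bytes skipped over are safe to lose because they were already captured in %VEC(4) and are stored after the loop. A hedged C sketch of just that arithmetic; align_forward is a hypothetical name and VEC_SIZE = 32 (the AVX case) is an assumption for illustration.

#include <stddef.h>
#include <stdint.h>

enum { VEC_SIZE = 32 };  /* assumed vector width for this sketch */

/* Illustrative only: mirrors the andq/subq sequence before
   L(loop_4x_vec_forward).  r8 ends up in [-VEC_SIZE, -1], so
   subtracting it advances dst to its next VEC_SIZE boundary (or by a
   full VEC_SIZE if dst was already aligned).  */
static void
align_forward (unsigned char **dst, const unsigned char **src, size_t *len)
{
  intptr_t r8 = (intptr_t) ((uintptr_t) *dst & (VEC_SIZE - 1)) - VEC_SIZE;
  *src -= r8;  /* advance source by the same distance */
  *dst -= r8;  /* destination is now VEC_SIZE-aligned */
  *len += r8;  /* fewer bytes remain for the aligned loop */
}

For example, with dst = 0x1005 the misalignment is 5, r8 = -27, and dst lands on 0x1020 with len reduced by 27.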
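Finally, the overall dispatch in L(start_erms), L(movsb_more_2x_vec) and L(movsb) can be summarized roughly as below. This is a simplified sketch: choose is a hypothetical name, the two threshold parameters stand in for REP_MOVSB_THRESHOLD and __x86_shared_non_temporal_threshold, and the overlap checks that force the vector paths (e.g. avoiding backward REP MOVSB) are omitted.

#include <stddef.h>

enum strategy
{
  SMALL_COPY,    /* L(less_vec): branch ladder below one vector       */
  TWO_VEC,       /* L(last_2x_vec): two overlapping vectors           */
  BLOCK_COPY,    /* 2x-8x VEC: all loads first, then all stores       */
  ALIGNED_LOOP,  /* L(loop_4x_vec_forward/backward)                   */
  REP_MOVSB,     /* L(movsb): Enhanced REP MOVSB                      */
  NON_TEMPORAL   /* L(loop_large_forward/backward), ends with sfence  */
};

/* Illustrative only: rough size-based strategy selection of the
   unaligned_erms variant, ignoring overlap handling.  */
static enum strategy
choose (size_t n, size_t vec_size, size_t movsb_threshold,
        size_t nt_threshold)
{
  if (n < vec_size)
    return SMALL_COPY;
  if (n <= 2 * vec_size)
    return TWO_VEC;
  if (n <= movsb_threshold)
    return n <= 8 * vec_size ? BLOCK_COPY : ALIGNED_LOOP;
  if (n < nt_threshold)
    return REP_MOVSB;
  return NON_TEMPORAL;
}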