Diffstat (limited to 'sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S')

 -rw-r--r--  sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S | 681 -
 1 file changed, 0 insertions(+), 681 deletions(-)
diff --git a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
deleted file mode 100644
index 2fe2072cb1..0000000000
--- a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,681 +0,0 @@
-/* memcpy optimized with SSE2 unaligned memory access instructions.
-   Copyright (C) 2014-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc) \
-    && (defined SHARED \
-	|| defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-# ifndef MEMCPY
-#  define MEMCPY	__memcpy_sse2_unaligned
-#  define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
-# endif
-
-# ifdef USE_AS_BCOPY
-#  define SRC		PARMS
-#  define DEST		SRC+4
-#  define LEN		DEST+4
-# else
-#  define DEST		PARMS
-#  define SRC		DEST+4
-#  define LEN		SRC+4
-# endif
-
-# define CFI_PUSH(REG)	\
-	cfi_adjust_cfa_offset (4);	\
-	cfi_rel_offset (REG, 0)
-
-# define CFI_POP(REG)	\
-	cfi_adjust_cfa_offset (-4);	\
-	cfi_restore (REG)
-
-# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-# define POP(REG)	popl REG; CFI_POP (REG)
-
-# define PARMS	8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN	RETURN_END; CFI_PUSH (%ebx)
-
-	.section .text.sse2,"ax",@progbits
-# if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	movl	12(%esp), %eax
-	cmpl	%eax, 16(%esp)
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-# endif
-
-ENTRY (MEMCPY)
-	ENTRANCE
-	movl	LEN(%esp), %ecx
-	movl	SRC(%esp), %eax
-	movl	DEST(%esp), %edx
-	cmp	%edx, %eax
-
-# ifdef USE_AS_MEMMOVE
-	jg	L(check_forward)
-
-L(mm_len_0_or_more_backward):
-/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
-   separately.  */
-	cmp	$16, %ecx
-	jbe	L(mm_len_0_16_bytes_backward)
-
-	cmpl	$32, %ecx
-	jg	L(mm_len_32_or_more_backward)
-
-/* Copy [0..32] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	-16(%eax, %ecx), %xmm1
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, -16(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_32_or_more_backward):
-	cmpl	$64, %ecx
-	jg	L(mm_len_64_or_more_backward)
-
-/* Copy [0..64] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	-16(%eax, %ecx), %xmm2
-	movdqu	-32(%eax, %ecx), %xmm3
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, 16(%edx)
-	movdqu	%xmm2, -16(%edx, %ecx)
-	movdqu	%xmm3, -32(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_64_or_more_backward):
-	cmpl	$128, %ecx
-	jg	L(mm_len_128_or_more_backward)
-
-/* Copy [0..128] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	32(%eax), %xmm2
-	movdqu	48(%eax), %xmm3
-	movdqu	-64(%eax, %ecx), %xmm4
-	movdqu	-48(%eax, %ecx), %xmm5
-	movdqu	-32(%eax, %ecx), %xmm6
-	movdqu	-16(%eax, %ecx), %xmm7
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, 16(%edx)
-	movdqu	%xmm2, 32(%edx)
-	movdqu	%xmm3, 48(%edx)
-	movdqu	%xmm4, -64(%edx, %ecx)
-	movdqu	%xmm5, -48(%edx, %ecx)
-	movdqu	%xmm6, -32(%edx, %ecx)
-	movdqu	%xmm7, -16(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_128_or_more_backward):
-	add	%ecx, %eax
-	cmp	%edx, %eax
-	movl	SRC(%esp), %eax
-	jle	L(forward)
-	PUSH (%esi)
-	PUSH (%edi)
-	PUSH (%ebx)
-
-/* Aligning the address of destination.  */
-	movdqu	(%eax), %xmm4
-	movdqu	16(%eax), %xmm5
-	movdqu	32(%eax), %xmm6
-	movdqu	48(%eax), %xmm7
-	leal	(%edx, %ecx), %esi
-	movdqu	-16(%eax, %ecx), %xmm0
-	subl	$16, %esp
-	movdqu	%xmm0, (%esp)
-	mov	%ecx, %edi
-	movl	%esi, %ecx
-	andl	$-16, %ecx
-	leal	(%ecx), %ebx
-	subl	%edx, %ebx
-	leal	(%eax, %ebx), %eax
-	shrl	$6, %ebx
-
-# ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %edi
-# else
-#  ifdef SHARED
-	PUSH (%ebx)
-	SETUP_PIC_REG (bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
-	POP (%ebx)
-#  else
-	cmp	__x86_shared_cache_size_half, %edi
-#  endif
-# endif
-	jae	L(mm_large_page_loop_backward)
-
-	.p2align 4
-L(mm_main_loop_backward):
-
-	prefetcht0 -128(%eax)
-
-	movdqu	-64(%eax), %xmm0
-	movdqu	-48(%eax), %xmm1
-	movdqu	-32(%eax), %xmm2
-	movdqu	-16(%eax), %xmm3
-	movaps	%xmm0, -64(%ecx)
-	subl	$64, %eax
-	movaps	%xmm1, -48(%ecx)
-	movaps	%xmm2, -32(%ecx)
-	movaps	%xmm3, -16(%ecx)
-	subl	$64, %ecx
-	sub	$1, %ebx
-	jnz	L(mm_main_loop_backward)
-	movdqu	(%esp), %xmm0
-	addl	$16, %esp
-	movdqu	%xmm0, -16(%esi)
-	movdqu	%xmm4, (%edx)
-	movdqu	%xmm5, 16(%edx)
-	movdqu	%xmm6, 32(%edx)
-	movdqu	%xmm7, 48(%edx)
-	POP (%ebx)
-	jmp	L(mm_return_pop_all)
-
-/* Copy [0..16] and return.  */
-L(mm_len_0_16_bytes_backward):
-	testb	$24, %cl
-	jnz	L(mm_len_9_16_bytes_backward)
-	testb	$4, %cl
-	.p2align 4,,5
-	jnz	L(mm_len_5_8_bytes_backward)
-	testl	%ecx, %ecx
-	.p2align 4,,2
-	je	L(return)
-	testb	$2, %cl
-	.p2align 4,,1
-	jne	L(mm_len_3_4_bytes_backward)
-	movzbl	-1(%eax,%ecx), %ebx
-	movzbl	(%eax), %eax
-	movb	%bl, -1(%edx,%ecx)
-	movb	%al, (%edx)
-	jmp	L(return)
-
-L(mm_len_3_4_bytes_backward):
-	movzwl	-2(%eax,%ecx), %ebx
-	movzwl	(%eax), %eax
-	movw	%bx, -2(%edx,%ecx)
-	movw	%ax, (%edx)
-	jmp	L(return)
-
-L(mm_len_9_16_bytes_backward):
-	PUSH (%esi)
-	movl	-4(%eax,%ecx), %ebx
-	movl	-8(%eax,%ecx), %esi
-	movl	%ebx, -4(%edx,%ecx)
-	movl	%esi, -8(%edx,%ecx)
-	subl	$8, %ecx
-	POP (%esi)
-	jmp	L(mm_len_0_16_bytes_backward)
-
-L(mm_len_5_8_bytes_backward):
-	movl	(%eax), %ebx
-	movl	-4(%eax,%ecx), %eax
-	movl	%ebx, (%edx)
-	movl	%eax, -4(%edx,%ecx)
-	jmp	L(return)
-
-/* Big length copy backward part.  */
-	.p2align 4
-L(mm_large_page_loop_backward):
-	movdqu	-64(%eax), %xmm0
-	movdqu	-48(%eax), %xmm1
-	movdqu	-32(%eax), %xmm2
-	movdqu	-16(%eax), %xmm3
-	movntdq	%xmm0, -64(%ecx)
-	subl	$64, %eax
-	movntdq	%xmm1, -48(%ecx)
-	movntdq	%xmm2, -32(%ecx)
-	movntdq	%xmm3, -16(%ecx)
-	subl	$64, %ecx
-	sub	$1, %ebx
-	jnz	L(mm_large_page_loop_backward)
-	sfence
-	movdqu	(%esp), %xmm0
-	addl	$16, %esp
-	movdqu	%xmm0, -16(%esi)
-	movdqu	%xmm4, (%edx)
-	movdqu	%xmm5, 16(%edx)
-	movdqu	%xmm6, 32(%edx)
-	movdqu	%xmm7, 48(%edx)
-	POP (%ebx)
-	jmp	L(mm_return_pop_all)
-
-L(check_forward):
-	add	%edx, %ecx
-	cmp	%eax, %ecx
-	movl	LEN(%esp), %ecx
-	jle	L(forward)
-
-/* Now do checks for lengths.  We do [0..16], [0..32], [0..64], [0..128]
-   separately.  */
-	cmp	$16, %ecx
-	jbe	L(mm_len_0_16_bytes_forward)
-
-	cmpl	$32, %ecx
-	ja	L(mm_len_32_or_more_forward)
-
-/* Copy [0..32] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	-16(%eax, %ecx), %xmm1
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, -16(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_32_or_more_forward):
-	cmpl	$64, %ecx
-	ja	L(mm_len_64_or_more_forward)
-
-/* Copy [0..64] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	-16(%eax, %ecx), %xmm2
-	movdqu	-32(%eax, %ecx), %xmm3
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, 16(%edx)
-	movdqu	%xmm2, -16(%edx, %ecx)
-	movdqu	%xmm3, -32(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_64_or_more_forward):
-	cmpl	$128, %ecx
-	ja	L(mm_len_128_or_more_forward)
-
-/* Copy [0..128] and return.  */
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	32(%eax), %xmm2
-	movdqu	48(%eax), %xmm3
-	movdqu	-64(%eax, %ecx), %xmm4
-	movdqu	-48(%eax, %ecx), %xmm5
-	movdqu	-32(%eax, %ecx), %xmm6
-	movdqu	-16(%eax, %ecx), %xmm7
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, 16(%edx)
-	movdqu	%xmm2, 32(%edx)
-	movdqu	%xmm3, 48(%edx)
-	movdqu	%xmm4, -64(%edx, %ecx)
-	movdqu	%xmm5, -48(%edx, %ecx)
-	movdqu	%xmm6, -32(%edx, %ecx)
-	movdqu	%xmm7, -16(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_len_128_or_more_forward):
-	PUSH (%esi)
-	PUSH (%edi)
-	PUSH (%ebx)
-
-/* Aligning the address of destination.  */
-	movdqu	-16(%eax, %ecx), %xmm4
-	movdqu	-32(%eax, %ecx), %xmm5
-	movdqu	-48(%eax, %ecx), %xmm6
-	movdqu	-64(%eax, %ecx), %xmm7
-	leal	(%edx, %ecx), %esi
-	movdqu	(%eax), %xmm0
-	subl	$16, %esp
-	movdqu	%xmm0, (%esp)
-	mov	%ecx, %edi
-	leal	16(%edx), %ecx
-	andl	$-16, %ecx
-	movl	%ecx, %ebx
-	subl	%edx, %ebx
-	addl	%ebx, %eax
-	movl	%esi, %ebx
-	subl	%ecx, %ebx
-	shrl	$6, %ebx
-
-# ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %edi
-# else
-#  ifdef SHARED
-	PUSH (%ebx)
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %edi
-	POP (%ebx)
-#  else
-	cmp	__x86_shared_cache_size_half, %edi
-#  endif
-# endif
-	jae	L(mm_large_page_loop_forward)
-
-	.p2align 4
-L(mm_main_loop_forward):
-
-	prefetcht0 128(%eax)
-
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	32(%eax), %xmm2
-	movdqu	48(%eax), %xmm3
-	movdqa	%xmm0, (%ecx)
-	addl	$64, %eax
-	movaps	%xmm1, 16(%ecx)
-	movaps	%xmm2, 32(%ecx)
-	movaps	%xmm3, 48(%ecx)
-	addl	$64, %ecx
-	sub	$1, %ebx
-	jnz	L(mm_main_loop_forward)
-	movdqu	(%esp), %xmm0
-	addl	$16, %esp
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm4, -16(%esi)
-	movdqu	%xmm5, -32(%esi)
-	movdqu	%xmm6, -48(%esi)
-	movdqu	%xmm7, -64(%esi)
-	POP (%ebx)
-	jmp	L(mm_return_pop_all)
-
-L(mm_len_0_16_bytes_forward):
-	testb	$24, %cl
-	jne	L(mm_len_9_16_bytes_forward)
-	testb	$4, %cl
-	.p2align 4,,5
-	jne	L(mm_len_5_8_bytes_forward)
-	testl	%ecx, %ecx
-	.p2align 4,,2
-	je	L(return)
-	testb	$2, %cl
-	.p2align 4,,1
-	jne	L(mm_len_2_4_bytes_forward)
-	movzbl	-1(%eax,%ecx), %ebx
-	movzbl	(%eax), %eax
-	movb	%bl, -1(%edx,%ecx)
-	movb	%al, (%edx)
-	jmp	L(return)
-
-L(mm_len_2_4_bytes_forward):
-	movzwl	-2(%eax,%ecx), %ebx
-	movzwl	(%eax), %eax
-	movw	%bx, -2(%edx,%ecx)
-	movw	%ax, (%edx)
-	jmp	L(return)
-
-L(mm_len_5_8_bytes_forward):
-	movl	(%eax), %ebx
-	movl	-4(%eax,%ecx), %eax
-	movl	%ebx, (%edx)
-	movl	%eax, -4(%edx,%ecx)
-	jmp	L(return)
-
-L(mm_len_9_16_bytes_forward):
-	movq	(%eax), %xmm0
-	movq	-8(%eax, %ecx), %xmm1
-	movq	%xmm0, (%edx)
-	movq	%xmm1, -8(%edx, %ecx)
-	jmp	L(return)
-
-L(mm_return_pop_all):
-	movl	%edx, %eax
-	POP (%edi)
-	POP (%esi)
-	RETURN
-
-/* Big length copy forward part.  */
-	.p2align 4
-L(mm_large_page_loop_forward):
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	32(%eax), %xmm2
-	movdqu	48(%eax), %xmm3
-	movntdq	%xmm0, (%ecx)
-	addl	$64, %eax
-	movntdq	%xmm1, 16(%ecx)
-	movntdq	%xmm2, 32(%ecx)
-	movntdq	%xmm3, 48(%ecx)
-	addl	$64, %ecx
-	sub	$1, %ebx
-	jnz	L(mm_large_page_loop_forward)
-	sfence
-	movdqu	(%esp), %xmm0
-	addl	$16, %esp
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm4, -16(%esi)
-	movdqu	%xmm5, -32(%esi)
-	movdqu	%xmm6, -48(%esi)
-	movdqu	%xmm7, -64(%esi)
-	POP (%ebx)
-	jmp	L(mm_return_pop_all)
-# endif
-
-L(forward):
-	cmp	$16, %ecx
-	jbe	L(len_0_16_bytes)
-
-# ifdef SHARED_CACHE_SIZE_HALF
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-# else
-#  ifdef SHARED
-	SETUP_PIC_REG(bx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-#  else
-	cmp	__x86_shared_cache_size_half, %ecx
-#  endif
-# endif
-	jae	L(large_page)
-
-	movdqu	(%eax), %xmm0
-	movdqu	-16(%eax, %ecx), %xmm1
-	cmpl	$32, %ecx
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, -16(%edx, %ecx)
-	jbe	L(return)
-
-	movdqu	16(%eax), %xmm0
-	movdqu	-32(%eax, %ecx), %xmm1
-	cmpl	$64, %ecx
-	movdqu	%xmm0, 16(%edx)
-	movdqu	%xmm1, -32(%edx, %ecx)
-	jbe	L(return)
-
-	movdqu	32(%eax), %xmm0
-	movdqu	48(%eax), %xmm1
-	movdqu	-48(%eax, %ecx), %xmm2
-	movdqu	-64(%eax, %ecx), %xmm3
-	cmpl	$128, %ecx
-	movdqu	%xmm0, 32(%edx)
-	movdqu	%xmm1, 48(%edx)
-	movdqu	%xmm2, -48(%edx, %ecx)
-	movdqu	%xmm3, -64(%edx, %ecx)
-	jbe	L(return)
-
-/* Now the main loop: we align the address of the destination.  */
-	leal	64(%edx), %ebx
-	andl	$-64, %ebx
-
-	addl	%edx, %ecx
-	andl	$-64, %ecx
-
-	subl	%edx, %eax
-
-/* We should stop two iterations before the termination
-   (in order not to misprefetch).  */
-	subl	$64, %ecx
-	cmpl	%ebx, %ecx
-	je	L(main_loop_just_one_iteration)
-
-	subl	$64, %ecx
-	cmpl	%ebx, %ecx
-	je	L(main_loop_last_two_iterations)
-
-	.p2align 4
-L(main_loop_cache):
-
-	prefetcht0 128(%ebx, %eax)
-
-	movdqu	(%ebx, %eax), %xmm0
-	movdqu	16(%ebx, %eax), %xmm1
-	movdqu	32(%ebx, %eax), %xmm2
-	movdqu	48(%ebx, %eax), %xmm3
-	movdqa	%xmm0, (%ebx)
-	movaps	%xmm1, 16(%ebx)
-	movaps	%xmm2, 32(%ebx)
-	movaps	%xmm3, 48(%ebx)
-	lea	64(%ebx), %ebx
-	cmpl	%ebx, %ecx
-	jne	L(main_loop_cache)
-
-L(main_loop_last_two_iterations):
-	movdqu	(%ebx, %eax), %xmm0
-	movdqu	16(%ebx, %eax), %xmm1
-	movdqu	32(%ebx, %eax), %xmm2
-	movdqu	48(%ebx, %eax), %xmm3
-	movdqu	64(%ebx, %eax), %xmm4
-	movdqu	80(%ebx, %eax), %xmm5
-	movdqu	96(%ebx, %eax), %xmm6
-	movdqu	112(%ebx, %eax), %xmm7
-	movdqa	%xmm0, (%ebx)
-	movaps	%xmm1, 16(%ebx)
-	movaps	%xmm2, 32(%ebx)
-	movaps	%xmm3, 48(%ebx)
-	movaps	%xmm4, 64(%ebx)
-	movaps	%xmm5, 80(%ebx)
-	movaps	%xmm6, 96(%ebx)
-	movaps	%xmm7, 112(%ebx)
-	jmp	L(return)
-
-L(main_loop_just_one_iteration):
-	movdqu	(%ebx, %eax), %xmm0
-	movdqu	16(%ebx, %eax), %xmm1
-	movdqu	32(%ebx, %eax), %xmm2
-	movdqu	48(%ebx, %eax), %xmm3
-	movdqa	%xmm0, (%ebx)
-	movaps	%xmm1, 16(%ebx)
-	movaps	%xmm2, 32(%ebx)
-	movaps	%xmm3, 48(%ebx)
-	jmp	L(return)
-
-L(large_page):
-	movdqu	(%eax), %xmm0
-	movdqu	16(%eax), %xmm1
-	movdqu	32(%eax), %xmm2
-	movdqu	48(%eax), %xmm3
-	movdqu	-64(%eax, %ecx), %xmm4
-	movdqu	-48(%eax, %ecx), %xmm5
-	movdqu	-32(%eax, %ecx), %xmm6
-	movdqu	-16(%eax, %ecx), %xmm7
-	movdqu	%xmm0, (%edx)
-	movdqu	%xmm1, 16(%edx)
-	movdqu	%xmm2, 32(%edx)
-	movdqu	%xmm3, 48(%edx)
-	movdqu	%xmm4, -64(%edx, %ecx)
-	movdqu	%xmm5, -48(%edx, %ecx)
-	movdqu	%xmm6, -32(%edx, %ecx)
-	movdqu	%xmm7, -16(%edx, %ecx)
-
-	movdqu	64(%eax), %xmm0
-	movdqu	80(%eax), %xmm1
-	movdqu	96(%eax), %xmm2
-	movdqu	112(%eax), %xmm3
-	movdqu	-128(%eax, %ecx), %xmm4
-	movdqu	-112(%eax, %ecx), %xmm5
-	movdqu	-96(%eax, %ecx), %xmm6
-	movdqu	-80(%eax, %ecx), %xmm7
-	movdqu	%xmm0, 64(%edx)
-	movdqu	%xmm1, 80(%edx)
-	movdqu	%xmm2, 96(%edx)
-	movdqu	%xmm3, 112(%edx)
-	movdqu	%xmm4, -128(%edx, %ecx)
-	movdqu	%xmm5, -112(%edx, %ecx)
-	movdqu	%xmm6, -96(%edx, %ecx)
-	movdqu	%xmm7, -80(%edx, %ecx)
-
-/* Now the main loop with non temporal stores.  We align
-   the address of the destination.  */
-	leal	128(%edx), %ebx
-	andl	$-128, %ebx
-
-	addl	%edx, %ecx
-	andl	$-128, %ecx
-
-	subl	%edx, %eax
-
-	.p2align 4
-L(main_loop_large_page):
-	movdqu	(%ebx, %eax), %xmm0
-	movdqu	16(%ebx, %eax), %xmm1
-	movdqu	32(%ebx, %eax), %xmm2
-	movdqu	48(%ebx, %eax), %xmm3
-	movdqu	64(%ebx, %eax), %xmm4
-	movdqu	80(%ebx, %eax), %xmm5
-	movdqu	96(%ebx, %eax), %xmm6
-	movdqu	112(%ebx, %eax), %xmm7
-	movntdq	%xmm0, (%ebx)
-	movntdq	%xmm1, 16(%ebx)
-	movntdq	%xmm2, 32(%ebx)
-	movntdq	%xmm3, 48(%ebx)
-	movntdq	%xmm4, 64(%ebx)
-	movntdq	%xmm5, 80(%ebx)
-	movntdq	%xmm6, 96(%ebx)
-	movntdq	%xmm7, 112(%ebx)
-	lea	128(%ebx), %ebx
-	cmpl	%ebx, %ecx
-	jne	L(main_loop_large_page)
-	sfence
-	jmp	L(return)
-
-L(len_0_16_bytes):
-	testb	$24, %cl
-	jne	L(len_9_16_bytes)
-	testb	$4, %cl
-	.p2align 4,,5
-	jne	L(len_5_8_bytes)
-	testl	%ecx, %ecx
-	.p2align 4,,2
-	je	L(return)
-	movzbl	(%eax), %ebx
-	testb	$2, %cl
-	movb	%bl, (%edx)
-	je	L(return)
-	movzwl	-2(%eax,%ecx), %ebx
-	movw	%bx, -2(%edx,%ecx)
-	jmp	L(return)
-
-L(len_9_16_bytes):
-	movq	(%eax), %xmm0
-	movq	-8(%eax, %ecx), %xmm1
-	movq	%xmm0, (%edx)
-	movq	%xmm1, -8(%edx, %ecx)
-	jmp	L(return)
-
-L(len_5_8_bytes):
-	movl	(%eax), %ebx
-	movl	%ebx, (%edx)
-	movl	-4(%eax,%ecx), %ebx
-	movl	%ebx, -4(%edx,%ecx)
-
-L(return):
-	movl	%edx, %eax
-# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
-	movl	LEN(%esp), %ecx
-	add	%ecx, %eax
-# endif
-	RETURN
-
-END (MEMCPY)
-#endif
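
A few techniques in the deleted file are worth spelling out. The small-size buckets ([0..32], [0..64], [0..128], in both the forward and backward variants) avoid any byte-granular loop: they load 16-byte blocks from the front of the source and from its end (-16(%eax, %ecx) and so on), then store all of them, letting the front and back ranges overlap whenever the length is below the bucket maximum. A minimal C sketch of the [17..32] bucket, written with SSE2 intrinsics rather than assembly (the helper name copy_17_32 is hypothetical):

#include <emmintrin.h>	/* SSE2 intrinsics */
#include <stddef.h>

/* Copy n bytes, 16 < n <= 32, with one unaligned 16-byte block from each
   end of the source.  For n < 32 the two stores overlap in the middle;
   this is harmless because both loads complete before either store
   executes. */
static void
copy_17_32 (char *dst, const char *src, size_t n)
{
  __m128i head = _mm_loadu_si128 ((const __m128i *) src);
  __m128i tail = _mm_loadu_si128 ((const __m128i *) (src + n - 16));
  _mm_storeu_si128 ((__m128i *) dst, head);
  _mm_storeu_si128 ((__m128i *) (dst + n - 16), tail);
}

The [0..64] and [0..128] buckets are the same idea with two and four blocks per end.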
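
When built as memmove (USE_AS_MEMMOVE), the entry sequence compares the source and destination pointers, and the 128-byte-or-more paths additionally check whether the ranges overlap at all (add %ecx, %eax; cmp %edx, %eax; jle L(forward)). The rule this implements, sketched in C with plain byte loops standing in for the vectorized forward and backward paths:

#include <stddef.h>

/* Forward copying is safe unless the destination starts strictly inside
   the source range; only then would a forward pass overwrite source
   bytes that have not been read yet. */
static void
memmove_sketch (char *dst, const char *src, size_t n)
{
  if (dst <= src || dst >= src + n)
    for (size_t i = 0; i < n; i++)	/* L(forward) */
      dst[i] = src[i];
  else
    for (size_t i = n; i > 0; i--)	/* backward path */
      dst[i - 1] = src[i - 1];
}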
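
Both directions also pick between two main loops by comparing the length against half of the shared cache size (__x86_shared_cache_size_half, or the compile-time SHARED_CACHE_SIZE_HALF). Below the threshold the loop uses ordinary movdqa/movaps stores; at or above it, movntdq non-temporal stores followed by sfence, so a copy larger than the cache does not evict the working set on its way through. A sketch of the streaming path, assuming a 16-byte-aligned destination and a multiple-of-16 length (the real code peels the misaligned head and tail separately; stream_copy_aligned is a hypothetical name):

#include <emmintrin.h>	/* SSE2 _mm_stream_si128; _mm_sfence via xmmintrin.h */
#include <stddef.h>

/* Non-temporal stores bypass the cache on the way to memory. */
static void
stream_copy_aligned (char *dst, const char *src, size_t n)
{
  for (size_t i = 0; i < n; i += 16)
    {
      __m128i v = _mm_loadu_si128 ((const __m128i *) (src + i));
      _mm_stream_si128 ((__m128i *) (dst + i), v);	/* movntdq */
    }
  _mm_sfence ();	/* order the streaming stores before later accesses */
}

The cached-copy loops additionally prefetch 128 bytes ahead (prefetcht0) and, as the comment in L(forward) notes, stop two iterations before the end so the prefetch never reads past the source buffer.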
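
Finally, the main loops keep their stores aligned even though the loads stay unaligned: the destination is rounded up to a 16-byte (or 64/128-byte) boundary, and the bytes skipped by the rounding are covered by the unaligned head block saved on the stack beforehand. The rounding in "leal 16(%edx), %ecx; andl $-16, %ecx" corresponds to this sketch:

#include <stdint.h>

/* Round p up past the next 16-byte boundary (one full step even when p
   is already aligned, since the first 16 bytes are copied separately). */
static inline char *
align_up_16 (char *p)
{
  return (char *) (((uintptr_t) p + 16) & ~(uintptr_t) 15);
}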