diff options
Diffstat (limited to 'sysdeps/i386/multiarch/memcpy-ssse3.S')
-rw-r--r-- | sysdeps/i386/multiarch/memcpy-ssse3.S | 3162 |
1 files changed, 3162 insertions, 0 deletions
diff --git a/sysdeps/i386/multiarch/memcpy-ssse3.S b/sysdeps/i386/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..27ab6a2c3e --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-ssse3.S @@ -0,0 +1,3162 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif + /* Stack offsets of the three arguments, counted from PARMS. With USE_AS_BCOPY the first argument is SRC and the second is DEST; otherwise DEST comes first. LEN is always third. */ +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + /* CFI annotations that match a 4-byte push of REG. */ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + /* CFI annotations that match a 4-byte pop of REG. */ +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + /* Push or pop REG and emit the matching CFI annotations. */ +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define PARMS 8 /* Preserve EBX: the arguments sit past the return address and the EBX saved by ENTRANCE. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) /* CFI_PUSH emits directives only: it re-applies the EBX annotations that POP removed, for the code placed after the ret. */ +# define JMPTBL(I, B) I - B /* Entries are stored as offsets from the table base B. */ + +/* Load an entry in a jump table into EBX and branch to it. 
TABLE is a + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* EBX now holds the absolute address of the entry. Go. */ \ + jmp *%ebx +# else + +# define PARMS 4 /* ENTRANCE pushes nothing here, so only the return address precedes the arguments. */ +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I /* Entries are stored as absolute addresses. */ + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register that contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif + + .section .text.ssse3,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) /* Checked entry: jump to __chk_fail when the argument at 16(%esp) is below (unsigned) the one at 12(%esp) -- presumably the destination size and LEN, TODO confirm. No ret or jmp follows the jb, so the not-taken path runs on into MEMCPY below. */ + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif +ENTRY (MEMCPY) /* Register roles after the three loads: ECX = LEN, EAX = SRC, EDX = DEST. */ + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +# ifdef USE_AS_MEMMOVE /* DEST below SRC: take the forward path. DEST equal to SRC: branch to L(fwd_write_0bytes). Otherwise DEST is above SRC and lengths under 32 go to L(bk_write_less32bytes_2). */ + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jae L(memmove_bwd) + jmp L(bk_write_less32bytes_2) + + .p2align 4 +L(memmove_bwd): /* Compare DEST with SRC + LEN. The movl that reloads SRC does not touch the flags, so the jb still tests that compare. */ + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +# endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): /* Fewer than 48 bytes left. Without USE_AS_MEMMOVE, only the low bytes of SRC and DEST are compared to choose between the forward and backward tables. */ +# ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +# endif + add %ecx, %edx /* Forward path: move both pointers LEN bytes ahead, then dispatch on the byte count in ECX. */ + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +# ifndef USE_AS_MEMMOVE + .p2align 4 +L(bk_write): /* Dispatch on the byte count in ECX; the pointers are left unchanged. */ + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +# endif + + .p2align 4 +L(48bytesormore): +# ifndef USE_AS_MEMMOVE /* Copy the first 16 bytes right away as two 8-byte moves. */ + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) 
+# else + movdqu (%eax), %xmm0 +# endif + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + add $16, %edx + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + .p2align 4 +L(shl_0): +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx + + .p2align 4 +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + POP (%edi) + lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_loop) + + 
.p2align 4 +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%eax) + prefetcht0 0x280(%eax) + prefetcht0 0x1c0(%edx) + + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 
0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + .p2align 4 +L(shl_1): +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, 
%ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx + lea -1(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_1_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_1_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) + +L(sh_1_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_2): +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + 
lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx + lea -2(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_2_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_2_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) + +L(sh_2_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_3): +# ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) 
+ movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_3_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_3_no_prefetch_loop) + +L(sh_3_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_4): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_4_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_4_no_prefetch_loop) + +L(sh_4_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_5): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# 
else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_5_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_5_no_prefetch_loop) + +L(sh_5_end_no_prefetch_loop): + lea 
32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_6): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_6_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), 
%xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_6_no_prefetch_loop) + +L(sh_6_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_7): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, 
%xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_7_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) + +L(sh_7_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_8): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 
+L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_8_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) + +L(sh_8_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_9): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + 
palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_9_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) + +L(sh_9_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_10): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) 
+ movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_10_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) + +L(sh_10_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_11): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 
+ movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea -11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_11_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) + +L(sh_11_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_12): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp 
__x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_12_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) + +L(sh_12_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_13): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps 
-13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_13_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) + +L(sh_13_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi 
+ add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_14): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_14_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa 
%xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) + +L(sh_14_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_15): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr 
$15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_15_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) + +L(sh_15_end_no_prefetch_loop): /* Loop exit for the shift-15 path. Removes the bias of 32 from %ecx, folds %ecx and the running offset in %edi into %edx and %eax (adding 15 to %eax as well), restores %edi and dispatches through L(table_48bytes_fwd) indexed by %ecx. */ + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_end_0): /* Shared exit targeted by the jle in the L(ShlNLoopLeave) blocks. Adds 32 to %ecx, advances %edx and %eax by the result, restores %edi and dispatches through L(table_48bytes_fwd) indexed by %ecx. */ + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(fwd_write_44bytes): /* Tail copy for 44, 36, 28, 20, 12 and 4 bytes. All offsets are negative, so %eax and %edx address the end of the data. Each label copies the 8 bytes at its own offset through %xmm0 and falls through to the next smaller size, and the last 4 bytes go through %ecx. The return value placed in %eax is %edx under USE_AS_MEMPCPY and the DEST stack argument otherwise. Nothing is loaded under USE_AS_BCOPY. */ + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) +L(fwd_write_36bytes): + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) +L(fwd_write_28bytes): + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) +L(fwd_write_20bytes): + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) +L(fwd_write_12bytes): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): /* Same fall-through chain for 40, 32, 24, 16, 8 and 0 bytes. These sizes are multiples of 8, so only 8-byte moves are needed and L(fwd_write_0bytes) just sets the return value. */ + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl
%eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes): + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) +L(fwd_write_37bytes): + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) +L(fwd_write_29bytes): + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) +L(fwd_write_21bytes): + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) +L(fwd_write_13bytes): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes): + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) +L(fwd_write_38bytes): + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) +L(fwd_write_30bytes): + movq -30(%eax), %xmm0 + movq %xmm0, -30(%edx) +L(fwd_write_22bytes): + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) +L(fwd_write_14bytes): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + 
movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes): + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) +L(fwd_write_39bytes): + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) +L(fwd_write_31bytes): + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) +L(fwd_write_23bytes): + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) +L(fwd_write_15bytes): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +# ifndef 
USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): /* Tail for 32 and 16 bytes using 16-byte movdqa load and store pairs at negative offsets from the end pointers in %eax and %edx. NOTE(review): movdqa requires 16-byte-aligned addresses, so these _align tails presumably run only when both addresses are aligned. Confirm at the dispatch site. */ + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): /* 5 bytes via two overlapping 4-byte moves at offsets -5 and -4. Both loads are issued before either store, and %eax is reused as a data register before the return value is loaded into it. */ + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): /* Sizes 45, 29 and 13. Two movdqa pairs at -45 and -29 fall through into 8 bytes at -13, 4 bytes at -5 and the final byte at -1. */ + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): /* Sizes 37 and 21. movdqa pairs at -37 and -21, then 4 bytes at -5 and the final byte at -1. */ + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): /* Sizes 41, 25, 9 and 1. movdqa pairs at -41 and -25, 8 bytes at -9, then the final byte at -1 moved through %cl. */ + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4
+L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# 
endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) 
+L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): /* Tail for 36 and 20 bytes. movdqa load and store pairs at -36 and -20 relative to the end pointers in %eax and %edx, then the last 4 bytes through %ecx. NOTE(review): movdqa requires 16-byte-aligned addresses, presumably guaranteed by whoever dispatches to the _align tails. Confirm at the dispatch site. This is the last return in the forward tails and uses RETURN_END rather than RETURN. */ + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN_END + + CFI_PUSH (%edi) + + .p2align 4 +L(large_page): /* Copy path built on non-temporal stores. Copies the first 16 bytes (unaligned load, movntdq store), advances both pointers by 16, biases %ecx by -0x90 and restores %edi before the loop. Under USE_AS_MEMMOVE, %xmm0 is first stored to the address read from DEST+4(%esp). NOTE(review): the branch that selects this path is outside this chunk, presumably taken for very large copies. movntdq needs a 16-byte-aligned destination, so %edx is presumably aligned on entry. Confirm both. */ + movdqu (%eax), %xmm1 +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + lea 16(%eax), %eax + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + lea -0x90(%ecx), %ecx + POP (%edi) + + .p2align 4 +L(large_page_loop): /* Each pass loads 0x80 bytes with movdqu into %xmm0 to %xmm7 and writes them with movntdq. The jae tests the flags of the sub $0x80 issued before the stores, which the intervening movntdq and lea instructions do not modify, so the loop repeats while that subtraction does not borrow. After the loop %ecx gets 0x80 added back, and a further 0x40-byte chunk is copied unless %ecx was below -0x40 (signed) at the cmp. */ + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax),
%xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): /* Advances both pointers by the remaining %ecx bytes, issues sfence after the preceding movntdq stores, then dispatches through L(table_48bytes_fwd) indexed by %ecx to copy the tail. */ + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(bk_write_44bytes): /* Tail copy for 44, 36, 28, 20, 12, 4 and 0 bytes with positive offsets, so %eax and %edx address the start of the remaining bytes. Each label copies the 8 bytes at its own offset through %xmm0 and falls through to the next smaller size, and the last 4 bytes go through %ecx. The return value in %eax is the DEST stack argument, plus LEN under USE_AS_MEMPCPY. Nothing is loaded under USE_AS_BCOPY. */ + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) +L(bk_write_36bytes): + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) +L(bk_write_28bytes): + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) +L(bk_write_20bytes): + movq 12(%eax), %xmm0 + movq %xmm0, 12(%edx) +L(bk_write_12bytes): + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_40bytes): /* Same fall-through chain for 40, 32, 24, 16 and 8 bytes, using 8-byte moves only. */ + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_45bytes): /* Sizes 45, 37, 29, 21, 13, 5 and 1. 8-byte moves down to offset 5, then 4 bytes at offset 1 and the single byte at offset 0 moved through %cl. */ + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) +L(bk_write_37bytes): + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) +L(bk_write_29bytes): + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) +L(bk_write_21bytes): + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) +L(bk_write_13bytes): + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0,
33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) +L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_46bytes): + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) +L(bk_write_38bytes): + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) +L(bk_write_30bytes): + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) +L(bk_write_22bytes): + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) +L(bk_write_14bytes): + movq 6(%eax), %xmm0 + movq %xmm0, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_47bytes): + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) +L(bk_write_39bytes): + movq 31(%eax), %xmm0 + movq %xmm0, 31(%edx) +L(bk_write_31bytes): + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) +L(bk_write_23bytes): + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) +L(bk_write_15bytes): + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + 
movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + .p2align 2 +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + .p2align 2 +L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), 
L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), 
L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) /* Last entry of L(table_48bytes_fwd_align). */ + + .p2align 2 +L(shl_table): /* Jump table: entry N (0..15) targets L(shl_N). NOTE(review): presumably N is a byte misalignment that selects a shift-specific copy loop; the indexing code and the L(shl_N) bodies are outside this chunk, confirm there. */ + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 2 +L(table_48_bytes_bwd): /* Jump table: entry N (0..47) targets L(bk_write_Nbytes). The USE_AS_MEMMOVE backward path dispatches through it at L(bk_write_less32bytes_2) with BRANCH_TO_JMPTBL_ENTRY, using %ecx as the index and 4 as the scale. */ + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int 
JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int 
JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) /* Last entry of L(table_48_bytes_bwd). */ + + .popsection /* Leaves the section the tables above were emitted in; that section was opened before this chunk. */ + +# ifdef USE_AS_MEMMOVE /* Backward-copy path, assembled only when USE_AS_MEMMOVE is defined. */ + .p2align 4 +L(copy_backward): /* Copies from the highest address downwards. As used below, EDI is the source cursor, EDX the destination cursor and ECX the number of bytes still to copy. NOTE(review): on entry EAX/EDX/ECX appear to hold source start, destination start and length; the code that jumps here is outside this chunk, confirm there. */ + PUSH (%edi) /* Save EDI; restored by the POP at L(bk_write_less32bytes). */ + movl %eax, %edi + lea (%ecx,%edx,1),%edx + lea (%ecx,%edi,1),%edi /* EDX += ECX and EDI += ECX: both cursors now sit just past the end of their ECX-byte regions. */ + testl $0x3, %edx + jnz L(bk_align) /* Destination end is not 4-byte aligned: align it first. */ + +L(bk_aligned_4): /* On every path within this chunk EDX is a multiple of 4 here. */ + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): /* Fewer than 64 bytes remain. */ + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): /* A single 32-byte step (not a loop): four 8-byte moves through XMM0, highest address first; ECX and both cursors drop by 32. */ + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movq -8(%edi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%edi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%edi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%edi), %xmm0 + movq %xmm0, -32(%edx) + sub $32, %edx + sub $32, %edi + +L(bk_write_less32bytes): /* Tail: turn both cursors into the start of the remaining ECX bytes (EAX = EDI - ECX, EDX -= ECX), restore EDI, then dispatch on ECX. */ + movl %edi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%edi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* Index ECX, scale 4 (the table entries are 4-byte .int values); the L(bk_write_Nbytes) targets are outside this chunk. */ + + CFI_PUSH (%edi) /* CFI directives only, no instruction: the code below is entered with EDI still pushed, so the unwind state changed by the POP above is re-applied. */ + + .p2align 4 +L(bk_align): /* Destination end is not 4-byte aligned. With 8 or fewer bytes, skip alignment and go straight to the tail dispatch. */ + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. 
*/ + jz L(bk_got2) /* EDX is odd: copy one byte. EAX is only scratch here; it is recomputed at L(bk_write_less32bytes). */ + sub $1, %edi + sub $1, %ecx + sub $1, %edx + movzbl (%edi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) /* Otherwise fall through: two more bytes are needed to reach 4-byte alignment. */ + +L(bk_got2): /* Copy two bytes; EDX is 4-byte aligned afterwards. */ + sub $2, %edi + sub $2, %ecx + sub $2, %edx + movzwl (%edi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + .p2align 4 +L(bk_write_more64bytes): /* Reached from L(bk_aligned_4) with ECX >= 64. */ + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): /* Copy 4 bytes at a time, at most three times, until EDX is 16-byte aligned. */ + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + /* Third step needs no test afterwards: a 4-byte-aligned EDX reaches 16-byte alignment after at most three 4-byte steps. */ + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): /* The alignment steps consumed at most 12 bytes, so within this chunk at least 52 remain; if fewer than 64 remain, use the single 32-byte step instead of the loop. */ + cmp $64, %ecx + jb L(bk_write_more32bytes) + + .p2align 4 +L(bk_ssse3_cpy): /* Main loop: 64 bytes per iteration, unaligned loads (movdqu) from EDI and aligned stores (movdqa) to EDX, highest 16-byte chunk first. */ + sub $64, %edi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%edi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%edi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%edi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%edi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) /* Fewer than 64 bytes remain: finish with the 32-byte step and/or the tail dispatch. */ + +# endif /* USE_AS_MEMMOVE */ + +END (MEMCPY) + +#endif |