Diffstat (limited to 'REORG.TODO/sysdeps/x86_64/mul_1.S')
-rw-r--r-- | REORG.TODO/sysdeps/x86_64/mul_1.S | 128 |
1 file changed, 128 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/mul_1.S b/REORG.TODO/sysdeps/x86_64/mul_1.S
new file mode 100644
index 0000000000..5c1c4335bf
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/mul_1.S
@@ -0,0 +1,128 @@
+/* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define rp	%rdi
+#define up	%rsi
+#define n_param	%rdx
+#define vl	%rcx
+
+#define n	%r11
+
+	.text
+ENTRY (__mpn_mul_1)
+	push	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (%rbx, 0)
+	xor	%r10, %r10
+	mov	(up), %rax		/* read first u limb early */
+	mov	n_param, %rbx		/* move away n from rdx, mul uses it */
+	mul	vl
+	mov	%rbx, %r11
+
+	add	%r10, %rax
+	adc	$0, %rdx
+
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	dec	n
+	jne	L(gt1)
+	mov	%rax, (rp)
+	jmp	L(ret)
+L(gt1):	lea	8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	xor	%ebx, %ebx
+	mov	%rax, %r9
+	mov	(up,n,8), %rax
+	mov	%rdx, %r8
+	jmp	L(L1)
+
+L(b0):	lea	(up,n,8), up
+	lea	-16(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	jmp	L(L0)
+
+L(b3):	lea	-8(up,n,8), up
+	lea	-24(rp,n,8), rp
+	neg	n
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(L3)
+
+L(b2):	lea	-16(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	xor	%r8, %r8
+	xor	%ebx, %ebx
+	mov	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(L2)
+
+	.p2align 4
+L(top):	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	$0, %r10d
+L(L1):	mul	vl
+	mov	%r9, 8(rp,n,8)
+	add	%rax, %r8
+	adc	%rdx, %rbx
+L(L0):	mov	8(up,n,8), %rax
+	mul	vl
+	mov	%r8, 16(rp,n,8)
+	add	%rax, %rbx
+	adc	%rdx, %r10
+L(L3):	mov	16(up,n,8), %rax
+	mul	vl
+	mov	%rbx, 24(rp,n,8)
+	mov	$0, %r8d	# zero
+	mov	%r8, %rbx	# zero
+	add	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%r8, %r9	# zero
+	adc	%rdx, %r9
+L(L2):	mul	vl
+	add	$4, n
+	js	L(top)
+
+	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	adc	%r8, %rdx
+	mov	%r9, 8(rp,n,8)
+	add	%r8, %rdx
L(ret):	mov	%rdx, %rax
+
+	pop	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (%rbx)
+	ret
+END (__mpn_mul_1)
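For context, the mpn convention this file implements: __mpn_mul_1 multiplies the n-limb vector at up by the single limb vl, stores the low limbs of the products at rp, and returns the final carry-out limb. Below is a minimal C reference model of that contract, not the glibc implementation; the helper name ref_mul_1 is hypothetical, and the sketch assumes 64-bit limbs and a compiler providing unsigned __int128 (true for GCC/Clang on x86-64).

#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

/* Reference model of the __mpn_mul_1 contract (hypothetical helper, not
   the glibc entry point): rp[i] gets the low limb of up[i] * vl plus the
   carry from below; the final carry-out limb is returned.  Limbs are
   64 bits, stored least-significant first, as in the .S file.  */
static uint64_t
ref_mul_1 (uint64_t *rp, const uint64_t *up, size_t n, uint64_t vl)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      /* 64x64->128-bit multiply; the assembly's `mul vl' produces the
	 same double-width product in %rdx:%rax.  */
      unsigned __int128 p = (unsigned __int128) up[i] * vl + carry;
      rp[i] = (uint64_t) p;		/* low limb into the result */
      carry = (uint64_t) (p >> 64);	/* high limb feeds the next step */
    }
  return carry;
}

int
main (void)
{
  /* (2^128 - 1) * 2: every limb overflows, so the carry ripples up.  */
  uint64_t u[2] = { UINT64_MAX, UINT64_MAX };
  uint64_t r[2];
  uint64_t c = ref_mul_1 (r, u, 2, 2);
  /* Expect: carry 1, high ffffffffffffffff, low fffffffffffffffe.  */
  printf ("carry %" PRIu64 ", high %016" PRIx64 ", low %016" PRIx64 "\n",
	  c, r[1], r[0]);
  return 0;
}

The assembly runs this same loop unrolled four limbs per iteration: the `and $3, %ebx` dispatch into L(b0)..L(b3) handles n mod 4 before entering the main L(top) loop, and the rotating registers %r8, %r9, %r10, and %rbx play the role of the single `carry` variable above.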