From 8f5ca04bc7fd53741d80117df992995ace8f6d2d Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Mon, 16 Oct 1995 01:37:51 +0000
Subject: Sat Oct 14 02:52:36 1995  Ulrich Drepper

* malloc/malloc.c (_malloc_internal): Performance fix.  Move
  if statement out of loop.

* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
  implementation using GMP functions.  Contributed by
  Torbjorn Granlund and Ulrich Drepper.

* stdio/test_rdwr.c: Include .

* sysdeps/i386/i586/Implies: New file.

  New highly optimized string functions for i[345]86.
* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
* sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
* sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
* sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
* sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
* sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
* sysdeps/i386/i586/strlen.S: New file.

* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not
  correspond to used values.

* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
  around a kernel header file.
* sysdeps/unix/sysv/linux/Dist: Add it.
* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
  Likewise.

* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of defining
  ourself we use a kernel header file.

* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
  call handler for i586.

* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
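The diff below (limited to sysdeps/i386/i586) consists mostly of Pentium-tuned assembly for the GMP mpn primitives (__mpn_add_n, __mpn_sub_n, __mpn_mul_1, __mpn_addmul_1, __mpn_submul_1, __mpn_lshift, __mpn_rshift) plus strchr and strlen.  For reference, a rough portable C sketch of what __mpn_add_n computes, based only on the description in the add_n.S header ("add two limb vectors of the same length > 0 and store sum in a third limb vector", returning the final carry), could look like this; limb_t and ref_mpn_add_n are invented names for illustration, the real interface uses GMP's mp_limb_t:

/* Rough, portable sketch of the semantics of __mpn_add_n; illustration
   only, not code from this patch.  */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;	/* a limb is one 32-bit word on i386 */

/* Add the SIZE-limb numbers at S1 and S2, store the low SIZE limbs of
   the sum at RES, and return the carry out of the most significant limb.
   __mpn_sub_n has the same shape with borrow instead of carry.  */
static limb_t
ref_mpn_add_n (limb_t *res, const limb_t *s1, const limb_t *s2, size_t size)
{
  limb_t carry = 0;
  for (size_t i = 0; i < size; i++)
    {
      limb_t a = s1[i];
      limb_t sum = a + s2[i];		/* wraps modulo 2^32 on overflow */
      limb_t c = sum < a;		/* carry from this addition */
      res[i] = sum + carry;		/* fold in carry from previous limb */
      carry = c | (res[i] < sum);	/* at most one of the two can carry */
    }
  return carry;
}

The assembly gets its speed not from a different algorithm but from unrolling this loop eight limbs per iteration, touching the destination cache line ahead of the stores, and pairing instructions for the Pentium's two pipelines.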
---
 sysdeps/i386/i586/Implies    |   2 +
 sysdeps/i386/i586/add_n.S    | 136 ++++++++++++++++++
 sysdeps/i386/i586/addmul_1.S |  84 +++++++++++
 sysdeps/i386/i586/lshift.S   | 213 +++++++++++++++++++++++++++
 sysdeps/i386/i586/memcopy.h  |   6 +-
 sysdeps/i386/i586/mul_1.S    |  78 ++++++++++
 sysdeps/i386/i586/rshift.S   | 213 +++++++++++++++++++++++++++
 sysdeps/i386/i586/strchr.S   | 334 +++++++++++++++++++++++++++++++++++++++++++
 sysdeps/i386/i586/strlen.S   | 185 ++++++++++++++++++++++++
 sysdeps/i386/i586/sub_n.S    | 136 ++++++++++++++++++
 sysdeps/i386/i586/submul_1.S |  82 +++++++++++
 11 files changed, 1466 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/i386/i586/Implies
 create mode 100644 sysdeps/i386/i586/add_n.S
 create mode 100644 sysdeps/i386/i586/addmul_1.S
 create mode 100644 sysdeps/i386/i586/lshift.S
 create mode 100644 sysdeps/i386/i586/mul_1.S
 create mode 100644 sysdeps/i386/i586/rshift.S
 create mode 100644 sysdeps/i386/i586/strchr.S
 create mode 100644 sysdeps/i386/i586/strlen.S
 create mode 100644 sysdeps/i386/i586/sub_n.S
 create mode 100644 sysdeps/i386/i586/submul_1.S

(limited to 'sysdeps/i386/i586')

diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies
new file mode 100644
index 0000000000..477cd741ce
--- /dev/null
+++ b/sysdeps/i386/i586/Implies
@@ -0,0 +1,2 @@
+# Code optimized for i486 is better than simple i386 code.
+i386/i486
diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..9be45ed93b
--- /dev/null
+++ b/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + adcl x,r1 + movl 4(src2),x + adcl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + adcl x,r1 + movl 12(src2),x + adcl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + adcl x,r1 + movl 20(src2),x + adcl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + adcl x,r1 + movl 28(src2),x + adcl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + adcl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + adcl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000000..b222840591 --- /dev/null +++ b/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,84 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_addmul_1) + .type C_SYMBOL_NAME(__mpn_addmul_1),@function +C_SYMBOL_NAME(__mpn_addmul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(ecx),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4)) + INSN1(neg,l ,R(ecx)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,ecx,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,ecx,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(add,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,ecx,4),R(ebx)) + + INSN1(inc,l ,R(ecx)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret +Lfe1: + .size C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1) diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000000..b9f8131297 --- /dev/null +++ b/sysdeps/i386/i586/lshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_lshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_lshift) +C_SYMBOL_NAME(__mpn_lshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl %cl,%ebx,%edx + shldl %cl,%eax,%ebx + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl %cl,%edx,%eax + shldl %cl,%ebx,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
+*/ + +Lspecial: + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h index a9bb9e7a40..0a8768788e 100644 --- a/sysdeps/i386/i586/memcopy.h +++ b/sysdeps/i386/i586/memcopy.h @@ -1,5 +1,5 @@ /* memcopy.h -- definitions for memory copy functions. Pentium version. - Copyright (C) 1994 Free Software Foundation, Inc. + Copyright (C) 1994, 1995 Free Software Foundation, Inc. Contributed by Torbjorn Granlund (tege@sics.se). This file is part of the GNU C Library. @@ -88,7 +88,7 @@ Cambridge, MA 02139, USA. */ "subl $32,%2\n" \ "jns 1b\n" \ "2: addl $32,%2" : \ - "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ - "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ "ax", "dx"); \ } while (0) diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000000..2b7258e130 --- /dev/null +++ b/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,78 @@ +/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_mul_1) +C_SYMBOL_NAME(__mpn_mul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000000..51cde8f07f --- /dev/null +++ b/sysdeps/i386/i586/rshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_rshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_rshift) +C_SYMBOL_NAME(__mpn_rshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl %cl,%ebx,%edx + shrdl %cl,%eax,%ebx + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl %cl,%edx,%eax + shrdl %cl,%ebx,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shrl %cl,%edx /* compute most significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
+*/ + +Lspecial: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl $1,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebx,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl $1,%ebx + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebx,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + rcrl $1,%edx + movl %ebx,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + movl $0,%eax + rcrl $1,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000000..982c80ec9a --- /dev/null +++ b/sysdeps/i386/i586/strchr.S @@ -0,0 +1,334 @@ +/* strchr -- find character CH in a NUL terminated string. +Highly optimized version for ix85, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strchr) + pushl %edi /* Save callee-safe registers. 
*/ + pushl %esi + + pushl %ebx + pushl %ebp + + movl 20(%esp), %eax /* get string pointer */ + movl 24(%esp), %edx /* get character we are looking for */ + + movl %eax, %edi /* duplicate string pointer for later */ + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L11 /* alignment is 0 => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $3, %edi /* was alignment == 3? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $2, %edi /* was alignment == 2? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The following code is the preparation for the loop. The + four instruction up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L11: movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should he hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version. + The basic algorithm is the same, but here the code organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L1: xorl %ecx, %ebp /* (word^magic) */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L4 /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* (word^magic)|magic */ + + addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ + jne L4 /* yes => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear words which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L5 /* previous addl caused overflow? 
*/ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L5 /* yes => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L1 + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L5: subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? */ + + jz L2 /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C */ + je L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L2: popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + + /* We know there is a NUL byte in the word. But we have to test + whether there is an C byte before it in the word. */ +L4: subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? */ + + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L3 /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test four the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? */ + je L2 /* yes => return pointer */ + +L3: xorl %eax, %eax /* set return value = NULL */ + + popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + +#undef index +weak_alias (strchr, index) diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000000..b807ed4b4f --- /dev/null +++ b/sysdeps/i386/i586/strlen.S @@ -0,0 +1,185 @@ +/* strlen -- Compute length og NUL terminated string. +Highly optimized version for ix86, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . 
+ +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) +*/ + + .text +ENTRY(strlen) + movl 4(%esp), %eax /* get string pointer */ + + movl %eax, %ecx /* duplicate it */ + andl $3, %ecx /* mask alignment bits */ + + jz L11 /* aligned => start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $3, %ecx /* was alignment = 3? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $2, %ecx /* was alignment = 2? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. */ +L11: xorl %edx, %edx /* We need %edx == 0 for later */ + +L1: + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? 
*/ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* wcomplete negation of ord */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L1 /* no => start loop again */ + + +L3: subl $4, %eax /* correct too early pointer increment */ + testb %cl, %cl /* lowest byte NUL? */ + + jz L2 /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L2 /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? */ + jz L2 /* yes => return */ + + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + + ret diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000000..1382e665f6 --- /dev/null +++ b/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + sbbl x,r1 + movl 4(src2),x + sbbl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + sbbl x,r1 + movl 12(src2),x + sbbl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + sbbl x,r1 + movl 20(src2),x + sbbl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + sbbl x,r1 + movl 28(src2),x + sbbl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + sbbl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + sbbl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000000..14bfe54e24 --- /dev/null +++ b/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,82 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_submul_1) +C_SYMBOL_NAME(__mpn_submul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(sub,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret -- cgit 1.4.1
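A note on the magic constant 0xfefefeff used by strchr.S and strlen.S above: adding it to a 32-bit word is the same, modulo 2^32, as subtracting 0x01010101, and the long comment in strlen.S argues that the addition leaves some "hole" bit (a zero bit of the magic constant, or the carry flag) unchanged exactly when the word contains a zero byte.  A rough portable C rendering of that test, not the instruction sequence the assembly actually uses, might look like this; the helper names are invented for illustration:

/* Portable rendering of the zero-byte test behind the 0xfefefeff trick;
   illustration only, not code from this patch.  */
#include <stdint.h>

/* Nonzero iff some byte of W is zero.  Subtracting 0x01010101 borrows
   through a zero byte and newly sets its high bit, which was clear in W
   itself (hence the & ~w mask); a word with no zero byte generates no
   borrows at all, so no byte's high bit can newly appear.  */
static int
has_zero_byte (uint32_t w)
{
  return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
}

/* strchr.S first replicates the target character into all four bytes of a
   word and XORs it with the data, turning matching bytes into zero bytes,
   then applies the same zero-byte test.  */
static int
has_byte (uint32_t w, unsigned char c)
{
  return has_zero_byte (w ^ (0x01010101u * c));
}

strlen.S applies this test four bytes at a time, unrolled four words per loop iteration, and only falls out to the byte-by-byte checks at L3 once a word is known to contain the terminating NUL.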