diff options
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i586')
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/add_n.S     | 143
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/addmul_1.S  |  94
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/bzero.S     |   4
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/init-arch.h |  19
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/lshift.S    | 255
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/memcopy.h   |  95
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/memcpy.S    | 124
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/mempcpy.S   |   8
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/memset.S    | 121
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/memusage.h  |   1
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/mul_1.S     |  90
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/rshift.S    | 255
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/stpcpy.S    |   8
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/strchr.S    | 348
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/strcpy.S    | 169
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/strlen.S    | 182
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/sub_n.S     | 143
-rw-r--r--   REORG.TODO/sysdeps/i386/i586/submul_1.S  |  94
18 files changed, 2153 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S new file mode 100644 index 0000000000..f73df092f0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/add_n.S @@ -0,0 +1,143 @@ +/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_add_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S1(%esp),%esi + cfi_rel_offset (esi, 8) + movl S2(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl SIZE(%esp),%ecx + movl (%ebx),%ebp + cfi_rel_offset (ebp, 4) + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx /* zero carry flag */ + jz L(end) + pushl %edx + cfi_adjust_cfa_offset (4) + + ALIGN (3) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + adcl %ebp,%eax + movl 4(%ebx),%ebp + adcl %ebp,%edx + movl 8(%ebx),%ebp + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + adcl %ebp,%eax + movl 12(%ebx),%ebp + adcl %ebp,%edx + movl 16(%ebx),%ebp + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + adcl %ebp,%eax + movl 20(%ebx),%ebp + adcl %ebp,%edx + movl 24(%ebx),%ebp + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %ebp,%eax + movl 28(%ebx),%ebp + adcl %ebp,%edx + movl 32(%ebx),%ebp + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebx),%ebx + decl %ecx + jnz L(oop) + + popl %edx + cfi_adjust_cfa_offset (-4) +L(end): + decl %edx /* test %edx w/o clobbering carry */ + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + adcl %ebp,%eax + movl 4(%ebx),%ebp + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebx),%ebx + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + adcl %ebp,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebx + 
cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_add_n) diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000000..a713192982 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,94 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_addmul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %ebp, %eax + movl (%res_ptr,%size,4), %ebp + + adcl $0, %edx + addl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_addmul_1) diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S new file mode 100644 index 0000000000..2a106719a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/bzero.S @@ -0,0 +1,4 @@ +#define USE_AS_BZERO +#define memset __bzero +#include <sysdeps/i386/i586/memset.S> +weak_alias (__bzero, bzero) diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h new file mode 100644 index 0000000000..4711212e6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h @@ -0,0 +1,19 @@ +/* Copyright (C) 2015-2017 Free Software 
Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define MINIMUM_ISA 586 +#include <sysdeps/x86/init-arch.h> diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000000..7941c28d9d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/lshift.S @@ -0,0 +1,255 @@ +/* Pentium optimized __mpn_lshift -- + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_lshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebp, 0) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S(%esp),%esi + cfi_rel_offset (esi, 8) + movl SIZE(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl CNT(%esp),%ecx + +/* We can use faster code for shift-by-1 under certain conditions. */ + cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) /* jump if s_ptr + 1 >= res_ptr */ + leal (%esi,%ebx,4),%eax + cmpl %eax,%edi + jnc L(special) /* jump if res_ptr >= s_ptr + size */ + +L(normal): + leal -4(%edi,%ebx,4),%edi + leal -4(%esi,%ebx,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + jz L(end) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(oop): movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebp + shldl %cl,%edx,%eax + movl %ebp,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebp + movl -12(%esi),%eax + shldl %cl,%ebp,%edx + shldl %cl,%eax,%ebp + movl %edx,-8(%edi) + movl %ebp,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebp + shldl %cl,%edx,%eax + shldl %cl,%ebp,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebp + shldl %cl,%edx,%eax + movl %ebp,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebx + jnz L(oop) + +L(end): popl %ebx + cfi_adjust_cfa_offset (-4) + andl $7,%ebx + jz L(end2) 
+L(oop2): + movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebx + jnz L(oop2) + +L(end2): + shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + +/* We loop from least significant end of the arrays, which is only + permissible if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. +*/ + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebp, 4) + cfi_rel_offset (ebx, 0) +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + + addl %edx,%edx + incl %ebx + decl %ebx + jz L(Lend) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(Loop): + movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebp,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebp + movl 12(%esi),%eax + adcl %ebp,%ebp + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebp,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebp + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebp,%ebp + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebp,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebx + jnz L(Loop) + +L(Lend): + popl %ebx + cfi_adjust_cfa_offset (-4) + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebx + jz L(Lend2) + addl %eax,%eax /* restore carry from eax */ +L(Loop2): + 
movl %edx,%ebp + movl (%esi),%edx + adcl %edx,%edx + movl %ebp,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebx + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax /* restore carry from eax */ +L(L1): movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax + negl %eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_lshift) diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h new file mode 100644 index 0000000000..39f020a746 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h @@ -0,0 +1,95 @@ +/* memcopy.h -- definitions for memory copy functions. Pentium version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + Contributed by Torbjorn Granlund (tege@sics.se). + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Get the i386 definitions. We will override some of them below. */ +#include <sysdeps/i386/memcopy.h> + +/* Written like this, the Pentium pipeline can execute the loop at a + sustained rate of 2 instructions/clock, or asymptotically 480 + Mbytes/second at 60Mhz. 
*/ + +#undef WORD_COPY_FWD +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ + do \ + { \ + asm volatile ("subl $32,%2\n" \ + "js 2f\n" \ + "movl 0(%0),%%edx\n" /* alloc dest line */ \ + "1:\n" \ + "movl 28(%0),%%eax\n" /* alloc dest line */ \ + "subl $32,%2\n" /* decr loop count */ \ + "movl 0(%1),%%eax\n" /* U pipe */ \ + "movl 4(%1),%%edx\n" /* V pipe */ \ + "movl %%eax,0(%0)\n" /* U pipe */ \ + "movl %%edx,4(%0)\n" /* V pipe */ \ + "movl 8(%1),%%eax\n" \ + "movl 12(%1),%%edx\n" \ + "movl %%eax,8(%0)\n" \ + "movl %%edx,12(%0)\n" \ + "movl 16(%1),%%eax\n" \ + "movl 20(%1),%%edx\n" \ + "movl %%eax,16(%0)\n" \ + "movl %%edx,20(%0)\n" \ + "movl 24(%1),%%eax\n" \ + "movl 28(%1),%%edx\n" \ + "movl %%eax,24(%0)\n" \ + "movl %%edx,28(%0)\n" \ + "leal 32(%1),%1\n" /* update src ptr */ \ + "leal 32(%0),%0\n" /* update dst ptr */ \ + "jns 1b\n" \ + "2: addl $32,%2" : \ + "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ + "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "ax", "dx"); \ + } while (0) + +#undef WORD_COPY_BWD +#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \ + do \ + { \ + asm volatile ("subl $32,%2\n" \ + "js 2f\n" \ + "movl -4(%0),%%edx\n" \ + "1:\n" \ + "movl -32(%0),%%eax\n" \ + "subl $32,%2\n" \ + "movl -4(%1),%%eax\n" \ + "movl -8(%1),%%edx\n" \ + "movl %%eax,-4(%0)\n" \ + "movl %%edx,-8(%0)\n" \ + "movl -12(%1),%%eax\n" \ + "movl -16(%1),%%edx\n" \ + "movl %%eax,-12(%0)\n" \ + "movl %%edx,-16(%0)\n" \ + "movl -20(%1),%%eax\n" \ + "movl -24(%1),%%edx\n" \ + "movl %%eax,-20(%0)\n" \ + "movl %%edx,-24(%0)\n" \ + "movl -28(%1),%%eax\n" \ + "movl -32(%1),%%edx\n" \ + "movl %%eax,-28(%0)\n" \ + "movl %%edx,-32(%0)\n" \ + "leal -32(%1),%1\n" \ + "leal -32(%0),%0\n" \ + "jns 1b\n" \ + "2: addl $32,%2" : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ + "ax", "dx"); \ + } while (0) diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S new file 
mode 100644 index 0000000000..6474a3f653 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S @@ -0,0 +1,124 @@ +/* Highly optimized version for i586. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 +#define LEN SRC+4 + + .text +#if defined PIC && IS_IN (libc) +ENTRY (__memcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memcpy_chk) +#endif +ENTRY (memcpy) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 4) + movl SRC(%esp), %esi + cfi_rel_offset (esi, 0) + movl LEN(%esp), %ecx + movl %edi, %eax + + /* We need this in any case. */ + cld + + /* Cutoff for the big loop is a size of 32 bytes since otherwise + the loop will never be entered. 
*/ + cmpl $32, %ecx + jbe L(1) + + negl %eax + andl $3, %eax + subl %eax, %ecx + xchgl %eax, %ecx + + rep; movsb + + movl %eax, %ecx + subl $32, %ecx + js L(2) + + /* Read ahead to make sure we write in the cache since the stupid + i586 designers haven't implemented read-on-write-miss. */ + movl (%edi), %eax +L(3): movl 28(%edi), %edx + + /* Now correct the loop counter. Please note that in the following + code the flags are not changed anymore. */ + subl $32, %ecx + + movl (%esi), %eax + movl 4(%esi), %edx + movl %eax, (%edi) + movl %edx, 4(%edi) + movl 8(%esi), %eax + movl 12(%esi), %edx + movl %eax, 8(%edi) + movl %edx, 12(%edi) + movl 16(%esi), %eax + movl 20(%esi), %edx + movl %eax, 16(%edi) + movl %edx, 20(%edi) + movl 24(%esi), %eax + movl 28(%esi), %edx + movl %eax, 24(%edi) + movl %edx, 28(%edi) + + leal 32(%esi), %esi + leal 32(%edi), %edi + + jns L(3) + + /* Correct extra loop counter modification. */ +L(2): addl $32, %ecx +#ifndef USE_AS_MEMPCPY + movl DEST(%esp), %eax +#endif + +L(1): rep; movsb + +#ifdef USE_AS_MEMPCPY + movl %edi, %eax +#endif + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memcpy) +#ifndef USE_AS_MEMPCPY +libc_hidden_builtin_def (memcpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S new file mode 100644 index 0000000000..720a4c0923 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_MEMPCPY +#define memcpy __mempcpy +#define __memcpy_chk __mempcpy_chk +#include <sysdeps/i386/i586/memcpy.S> + +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S new file mode 100644 index 0000000000..4f8f1bcf94 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memset.S @@ -0,0 +1,121 @@ +/* memset/bzero -- set memory area to CH/0 + Highly 
optimized version for ix86, x>=5. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund, <tege@matematik.su.se> + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* space for 1 saved reg */ +#define RTN PARMS +#define DEST RTN +#ifdef USE_AS_BZERO +# define LEN DEST+4 +#else +# define CHR DEST+4 +# define LEN CHR+4 +#endif + + .text +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk) +#endif +ENTRY (memset) + + pushl %edi + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 0) + movl LEN(%esp), %edx +#ifdef USE_AS_BZERO + xorl %eax, %eax /* we fill with 0 */ +#else + movb CHR(%esp), %al + movb %al, %ah + movl %eax, %ecx + shll $16, %eax + movw %cx, %ax +#endif + cld + +/* If less than 36 bytes to write, skip tricky code (it wouldn't work). */ + cmpl $36, %edx + movl %edx, %ecx /* needed when branch is taken! */ + jl L(2) + +/* First write 0-3 bytes to make the pointer 32-bit aligned. */ + movl %edi, %ecx /* Copy ptr to ecx... */ + negl %ecx /* ...and negate that and... */ + andl $3, %ecx /* ...mask to get byte count. 
*/ + subl %ecx, %edx /* adjust global byte count */ + rep + stosb + + subl $32, %edx /* offset count for unrolled loop */ + movl (%edi), %ecx /* Fetch destination cache line */ + + .align 2, 0x90 /* supply 0x90 for broken assemblers */ +L(1): movl 28(%edi), %ecx /* allocate cache line for destination */ + subl $32, %edx /* decr loop count */ + movl %eax, 0(%edi) /* store words pairwise */ + movl %eax, 4(%edi) + movl %eax, 8(%edi) + movl %eax, 12(%edi) + movl %eax, 16(%edi) + movl %eax, 20(%edi) + movl %eax, 24(%edi) + movl %eax, 28(%edi) + leal 32(%edi), %edi /* update destination pointer */ + jge L(1) + + leal 32(%edx), %ecx /* reset offset count */ + +/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ +L(2): shrl $2, %ecx /* convert byte count to longword count */ + rep + stosl + +/* Finally write the last 0-3 bytes. */ + movl %edx, %ecx + andl $3, %ecx + rep + stosb + +#ifndef USE_AS_BZERO + /* Load result (only if used as memset). */ + movl DEST(%esp), %eax /* start address of destination is result */ +#endif + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memset) +libc_hidden_builtin_def (memset) + +#if defined SHARED && IS_IN (libc) && !defined __memset_chk \ + && !defined USE_AS_BZERO +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h new file mode 100644 index 0000000000..c8170874d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memusage.h @@ -0,0 +1 @@ +#include "../i686/memusage.h" diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000000..bd3a07de90 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,90 @@ +/* Pentium __mpn_mul_1 -- 
Multiply a limb vector with a limb and store + the result in a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_mul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + 
cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_mul_1) diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000000..24c76ee0bb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/rshift.S @@ -0,0 +1,255 @@ +/* Pentium optimized __mpn_rshift -- + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_rshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebp, 0) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S(%esp),%esi + cfi_rel_offset (esi, 8) + movl SIZE(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl CNT(%esp),%ecx + +/* We can use faster code for shift-by-1 under certain conditions. 
*/ + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) /* jump if res_ptr + 1 >= s_ptr */ + leal (%edi,%ebx,4),%eax + cmpl %eax,%esi + jnc L(special) /* jump if s_ptr >= res_ptr + size */ + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + jz L(end) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebp + shrdl %cl,%edx,%eax + movl %ebp,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebp + movl 12(%esi),%eax + shrdl %cl,%ebp,%edx + shrdl %cl,%eax,%ebp + movl %edx,8(%edi) + movl %ebp,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebp + shrdl %cl,%edx,%eax + shrdl %cl,%ebp,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebp + shrdl %cl,%edx,%eax + movl %ebp,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebx + jnz L(oop) + +L(end): popl %ebx + cfi_adjust_cfa_offset (-4) + andl $7,%ebx + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebx + jnz L(oop2) + +L(end2): + shrl %cl,%edx /* compute most significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + +/* We loop from least significant end of the arrays, which is only + permissible if the source and destination don't 
overlap, since the + function is documented to work for overlapping source and destination. +*/ + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebp, 4) + cfi_rel_offset (ebx, 0) +L(special): + leal -4(%edi,%ebx,4),%edi + leal -4(%esi,%ebx,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + + shrl $1,%edx + incl %ebx + decl %ebx + jz L(Lend) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(Loop): + movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebp,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebp + movl -12(%esi),%eax + rcrl $1,%ebp + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebp,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebp + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebp + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebp,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebx + jnz L(Loop) + +L(Lend): + popl %ebx + cfi_adjust_cfa_offset (-4) + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebx + jz L(Lend2) + addl %eax,%eax /* restore carry from eax */ +L(Loop2): + movl %edx,%ebp + movl (%esi),%edx + rcrl $1,%edx + movl %ebp,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebx + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax /* restore carry from eax */ +L(L1): movl %edx,(%edi) /* store last limb */ + + movl $0,%eax + rcrl $1,%eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_rshift) diff --git 
a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S new file mode 100644 index 0000000000..8691efd01c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include <sysdeps/i386/i586/strcpy.S> + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000000..02f66b8f72 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strchr.S @@ -0,0 +1,348 @@ +/* Find character CH in a NUL terminated string. + Highly optimized version for ix85, x>=5. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. 
+ If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RTN PARMS +#define STR RTN +#define CHR STR+4 + + .text +ENTRY (strchr) + + pushl %edi /* Save callee-safe registers. */ + cfi_adjust_cfa_offset (-4) + pushl %esi + cfi_adjust_cfa_offset (-4) + + pushl %ebx + cfi_adjust_cfa_offset (-4) + pushl %ebp + cfi_adjust_cfa_offset (-4) + + movl STR(%esp), %eax + movl CHR(%esp), %edx + + movl %eax, %edi /* duplicate string pointer for later */ + cfi_rel_offset (edi, 12) + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L(11) /* alignment is 0 => start loop */ + + movb %dl, %cl /* 0 is needed below */ + jp L(0) /* exactly two bits set */ + + xorb (%eax), %cl /* is byte the one we are looking for? */ + jz L(out) /* yes => return pointer */ + + xorb %dl, %cl /* load single byte and test for NUL */ + je L(3) /* yes => return NULL */ + + movb 1(%eax), %cl /* load single byte */ + incl %eax + + cmpb %cl, %dl /* is byte == C? */ + je L(out) /* aligned => return pointer */ + + cmpb $0, %cl /* is byte NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax + decl %edi + + jne L(11) + +L(0): movb (%eax), %cl /* load single byte */ + + cmpb %cl, %dl /* is byte == C? */ + je L(out) /* aligned => return pointer */ + + cmpb $0, %cl /* is byte NUL? 
*/ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebx, 4) + cfi_rel_offset (ebp, 0) + + /* The following code is the preparation for the loop. The + four instruction up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L(11): movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should he hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version. + The basic algorithm is the same, but here the code organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L(1): xorl %ecx, %ebp /* (word^magic) */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L(4) /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* (word^magic)|magic */ + + addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ + jne L(4) /* yes => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear words which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L(5) /* previous addl caused overflow? 
*/ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L(5) /* yes => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L(5) + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L(5) + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L(1) + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L(5): subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L(out) /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? 
*/ + + jz L(out) /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C */ + je L(out) /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L(out): popl %ebp /* restore saved registers */ + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebx, 4) + cfi_rel_offset (ebp, 0) + /* We know there is a NUL byte in the word. But we have to test + whether there is an C byte before it in the word. */ +L(4): subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? */ + + je L(out) /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L(out) /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L(3) /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L(out) /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test four the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? 
*/ + je L(out) /* yes => return pointer */ + +L(3): xorl %eax, %eax + jmp L(out) +END (strchr) + +#undef index +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S new file mode 100644 index 0000000000..a444604f4f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S @@ -0,0 +1,169 @@ +/* strcpy/stpcpy implementation for i586. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+12 /* space for 3 saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 + +#ifndef USE_AS_STPCPY +# define STRCPY strcpy +#endif + +#define magic 0xfefefeff + + .text +ENTRY (STRCPY) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 8) + movl SRC(%esp), %esi + cfi_rel_offset (esi, 4) + + xorl %eax, %eax + leal -1(%esi), %ecx + + movl $magic, %ebx + cfi_rel_offset (ebx, 0) + andl $3, %ecx + +#ifdef PIC + call 2f + cfi_adjust_cfa_offset (4) +2: popl %edx + cfi_adjust_cfa_offset (-4) + /* 0xb is the distance between 2: and 1: but we avoid writing + 1f-2b because the assembler generates worse code. */ + leal 0xb(%edx,%ecx,8), %ecx +#else + leal 1f(,%ecx,8), %ecx +#endif + + jmp *%ecx + + .align 8 +1: + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + +L(1): movl (%esi), %ecx + leal 4(%esi),%esi + + subl %ecx, %eax + addl %ebx, %ecx + + decl %eax + jnc L(3) + + movl %ecx, %edx + xorl %ecx, %eax + + subl %ebx, %edx + andl $~magic, %eax + + jne L(4) + + movl %edx, (%edi) + leal 4(%edi),%edi + + jmp L(1) + +L(3): movl %ecx, %edx + + subl %ebx, %edx + +L(4): movb %dl, (%edi) + testb %dl, %dl + + movl %edx, %eax + jz L(end2) + + shrl $16, %eax + movb %dh, 1(%edi) +#ifdef USE_AS_STPCPY + addl $1, %edi +#endif + + cmpb $0, %dh + jz L(end2) + +#ifdef USE_AS_STPCPY + movb %al, 1(%edi) + addl $1, %edi + + cmpb $0, %al + jz L(end2) + + addl $1, %edi +#else + movb %al, 2(%edi) + testb %al, %al + + leal 3(%edi), %edi + jz L(end2) +#endif + +L(end): movb %ah, (%edi) + +L(end2): +#ifdef USE_AS_STPCPY + movl %edi, %eax +#else + movl DEST(%esp), %eax +#endif + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + 
popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (STRCPY) +#ifndef USE_AS_STPCPY +libc_hidden_builtin_def (strcpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000000..cfea2e020f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strlen.S @@ -0,0 +1,182 @@ +/* strlen -- Compute length of NUL terminated string. + Highly optimized version for ix86, x>=5. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. 
*/ +#define magic 0xfefefeff + +#define PARMS 4 /* no space for saved regs */ +#define STR PARMS + + .text +ENTRY (strlen) + + movl STR(%esp), %eax + movl $3, %edx /* load mask (= 3) */ + + andl %eax, %edx /* separate last two bits of address */ + + jz L(1) /* aligned => start loop */ + jp L(0) /* exactly two bits set */ + + cmpb %dh, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + cmpb %dh, (%eax) /* is byte NUL? */ + + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + xorl $2, %edx + + jz L(1) + +L(0): cmpb %dh, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + xorl %edx, %edx /* We need %edx == 0 for later */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + Note: %edx == 0 in any case here. 
*/ + +L(1): + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L(1) /* no => start loop again */ + + +L(3): subl $4, %eax /* correct too early pointer increment */ + subl $magic, %ecx + + cmpb $0, %cl /* lowest byte NUL? 
*/ + jz L(2) /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L(2) /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? */ + jz L(2) /* yes => return */ + + incl %eax /* increment pointer */ + +L(2): subl STR(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000000..21b5a2742c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,143 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_sub_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S1(%esp),%esi + cfi_rel_offset (esi, 8) + movl S2(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl SIZE(%esp),%ecx + movl (%ebx),%ebp + cfi_rel_offset (ebp, 4) + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx /* zero carry flag */ + jz L(end) + pushl %edx + cfi_adjust_cfa_offset (4) + + ALIGN (3) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + sbbl %ebp,%eax + movl 4(%ebx),%ebp + sbbl %ebp,%edx + movl 8(%ebx),%ebp + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + sbbl %ebp,%eax + movl 12(%ebx),%ebp + sbbl %ebp,%edx + movl 16(%ebx),%ebp + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + sbbl %ebp,%eax + movl 20(%ebx),%ebp + sbbl %ebp,%edx + movl 24(%ebx),%ebp + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + sbbl %ebp,%eax + movl 28(%ebx),%ebp + sbbl %ebp,%edx + movl 32(%ebx),%ebp + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebx),%ebx + decl %ecx + jnz L(oop) + + popl %edx + cfi_adjust_cfa_offset (-4) +L(end): + decl %edx /* test %edx w/o clobbering carry */ + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + sbbl %ebp,%eax + movl 4(%ebx),%ebp + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebx),%ebx + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + sbbl %ebp,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebx + 
cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_sub_n) diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000000..5e5e121ca2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,94 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_submul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %ebp, %eax + movl (%res_ptr,%size,4), %ebp + + adcl $0, %edx + subl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_submul_1) |