From 8f5ca04bc7fd53741d80117df992995ace8f6d2d Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Mon, 16 Oct 1995 01:37:51 +0000
Subject: Sat Oct 14 02:52:36 1995  Ulrich Drepper

* malloc/malloc.c (_malloc_internal): Performance fix.  Move
  if statement out of loop.

* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
  implementation using GMP functions.  Contributed by
  Torbjorn Granlund and Ulrich Drepper.

* stdio/test_rdwr.c: Include .

* sysdeps/i386/i586/Implies: New file.

  New highly optimized string functions for i[345]86.
* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
* sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
* sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
* sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
* sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
* sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
* sysdeps/i386/i586/strlen.S: New file.

* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not
  correspond to used values.

* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
  around a kernel header file.
* sysdeps/unix/sysv/linux/Dist: Add it.
* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
  Likewise.

* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of defining
  ourself we use a kernel header file.

* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
  call handler for i586.

* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
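The diff below (limited to sysdeps/i386/i586) consists mostly of Pentium-tuned assembly for the GMP mpn primitives (__mpn_add_n, __mpn_sub_n, __mpn_mul_1, __mpn_addmul_1, __mpn_submul_1, __mpn_lshift, __mpn_rshift) plus strchr and strlen.  For reference, a rough portable C sketch of what __mpn_add_n computes, based only on the description in the add_n.S header ("add two limb vectors of the same length > 0 and store sum in a third limb vector", returning the final carry), could look like this; limb_t and ref_mpn_add_n are invented names for illustration, the real interface uses GMP's mp_limb_t:

/* Rough, portable sketch of the semantics of __mpn_add_n; illustration
   only, not code from this patch.  */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;	/* a limb is one 32-bit word on i386 */

/* Add the SIZE-limb numbers at S1 and S2, store the low SIZE limbs of
   the sum at RES, and return the carry out of the most significant limb.
   __mpn_sub_n has the same shape with borrow instead of carry.  */
static limb_t
ref_mpn_add_n (limb_t *res, const limb_t *s1, const limb_t *s2, size_t size)
{
  limb_t carry = 0;
  for (size_t i = 0; i < size; i++)
    {
      limb_t a = s1[i];
      limb_t sum = a + s2[i];		/* wraps modulo 2^32 on overflow */
      limb_t c = sum < a;		/* carry from this addition */
      res[i] = sum + carry;		/* fold in carry from previous limb */
      carry = c | (res[i] < sum);	/* at most one of the two can carry */
    }
  return carry;
}

The assembly gets its speed not from a different algorithm but from unrolling this loop eight limbs per iteration, touching the destination cache line ahead of the stores, and pairing instructions for the Pentium's two pipelines.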
---
 sysdeps/i386/i586/Implies    |   2 +
 sysdeps/i386/i586/add_n.S    | 136 ++++++++++++++++++
 sysdeps/i386/i586/addmul_1.S |  84 +++++++++++
 sysdeps/i386/i586/lshift.S   | 213 +++++++++++++++++++++++++++
 sysdeps/i386/i586/memcopy.h  |   6 +-
 sysdeps/i386/i586/mul_1.S    |  78 ++++++++++
 sysdeps/i386/i586/rshift.S   | 213 +++++++++++++++++++++++++++
 sysdeps/i386/i586/strchr.S   | 334 +++++++++++++++++++++++++++++++++++++++++++
 sysdeps/i386/i586/strlen.S   | 185 ++++++++++++++++++++++++
 sysdeps/i386/i586/sub_n.S    | 136 ++++++++++++++++++
 sysdeps/i386/i586/submul_1.S |  82 +++++++++++
 11 files changed, 1466 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/i386/i586/Implies
 create mode 100644 sysdeps/i386/i586/add_n.S
 create mode 100644 sysdeps/i386/i586/addmul_1.S
 create mode 100644 sysdeps/i386/i586/lshift.S
 create mode 100644 sysdeps/i386/i586/mul_1.S
 create mode 100644 sysdeps/i386/i586/rshift.S
 create mode 100644 sysdeps/i386/i586/strchr.S
 create mode 100644 sysdeps/i386/i586/strlen.S
 create mode 100644 sysdeps/i386/i586/sub_n.S
 create mode 100644 sysdeps/i386/i586/submul_1.S

(limited to 'sysdeps/i386/i586')

diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies
new file mode 100644
index 0000000000..477cd741ce
--- /dev/null
+++ b/sysdeps/i386/i586/Implies
@@ -0,0 +1,2 @@
+# Code optimized for i486 is better than simple i386 code.
+i386/i486
diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..9be45ed93b
--- /dev/null
+++ b/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + adcl x,r1 + movl 4(src2),x + adcl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + adcl x,r1 + movl 12(src2),x + adcl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + adcl x,r1 + movl 20(src2),x + adcl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + adcl x,r1 + movl 28(src2),x + adcl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + adcl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + adcl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000000..b222840591 --- /dev/null +++ b/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,84 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_addmul_1) + .type C_SYMBOL_NAME(__mpn_addmul_1),@function +C_SYMBOL_NAME(__mpn_addmul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(ecx),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4)) + INSN1(neg,l ,R(ecx)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,ecx,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,ecx,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(add,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,ecx,4),R(ebx)) + + INSN1(inc,l ,R(ecx)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret +Lfe1: + .size C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1) diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000000..b9f8131297 --- /dev/null +++ b/sysdeps/i386/i586/lshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_lshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_lshift) +C_SYMBOL_NAME(__mpn_lshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl %cl,%ebx,%edx + shldl %cl,%eax,%ebx + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl %cl,%edx,%eax + shldl %cl,%ebx,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
+*/ + +Lspecial: + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h index a9bb9e7a40..0a8768788e 100644 --- a/sysdeps/i386/i586/memcopy.h +++ b/sysdeps/i386/i586/memcopy.h @@ -1,5 +1,5 @@ /* memcopy.h -- definitions for memory copy functions. Pentium version. - Copyright (C) 1994 Free Software Foundation, Inc. + Copyright (C) 1994, 1995 Free Software Foundation, Inc. Contributed by Torbjorn Granlund (tege@sics.se). This file is part of the GNU C Library. @@ -88,7 +88,7 @@ Cambridge, MA 02139, USA. */ "subl $32,%2\n" \ "jns 1b\n" \ "2: addl $32,%2" : \ - "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ - "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ "ax", "dx"); \ } while (0) diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000000..2b7258e130 --- /dev/null +++ b/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,78 @@ +/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_mul_1) +C_SYMBOL_NAME(__mpn_mul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000000..51cde8f07f --- /dev/null +++ b/sysdeps/i386/i586/rshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_rshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_rshift) +C_SYMBOL_NAME(__mpn_rshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl %cl,%ebx,%edx + shrdl %cl,%eax,%ebx + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl %cl,%edx,%eax + shrdl %cl,%ebx,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shrl %cl,%edx /* compute most significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
+*/ + +Lspecial: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl $1,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebx,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl $1,%ebx + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebx,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + rcrl $1,%edx + movl %ebx,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + movl $0,%eax + rcrl $1,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000000..982c80ec9a --- /dev/null +++ b/sysdeps/i386/i586/strchr.S @@ -0,0 +1,334 @@ +/* strchr -- find character CH in a NUL terminated string. +Highly optimized version for ix85, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strchr) + pushl %edi /* Save callee-safe registers. 
*/ + pushl %esi + + pushl %ebx + pushl %ebp + + movl 20(%esp), %eax /* get string pointer */ + movl 24(%esp), %edx /* get character we are looking for */ + + movl %eax, %edi /* duplicate string pointer for later */ + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L11 /* alignment is 0 => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $3, %edi /* was alignment == 3? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $2, %edi /* was alignment == 2? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The following code is the preparation for the loop. The + four instruction up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L11: movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should he hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version. + The basic algorithm is the same, but here the code organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L1: xorl %ecx, %ebp /* (word^magic) */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L4 /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* (word^magic)|magic */ + + addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ + jne L4 /* yes => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear words which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L5 /* previous addl caused overflow? 
*/ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L5 /* yes => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L1 + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L5: subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? */ + + jz L2 /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C */ + je L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L2: popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + + /* We know there is a NUL byte in the word. But we have to test + whether there is an C byte before it in the word. */ +L4: subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? */ + + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L3 /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test four the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? */ + je L2 /* yes => return pointer */ + +L3: xorl %eax, %eax /* set return value = NULL */ + + popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + +#undef index +weak_alias (strchr, index) diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000000..b807ed4b4f --- /dev/null +++ b/sysdeps/i386/i586/strlen.S @@ -0,0 +1,185 @@ +/* strlen -- Compute length og NUL terminated string. +Highly optimized version for ix86, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . 
+ +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) +*/ + + .text +ENTRY(strlen) + movl 4(%esp), %eax /* get string pointer */ + + movl %eax, %ecx /* duplicate it */ + andl $3, %ecx /* mask alignment bits */ + + jz L11 /* aligned => start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $3, %ecx /* was alignment = 3? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $2, %ecx /* was alignment = 2? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. */ +L11: xorl %edx, %edx /* We need %edx == 0 for later */ + +L1: + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? 
*/ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* wcomplete negation of ord */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L1 /* no => start loop again */ + + +L3: subl $4, %eax /* correct too early pointer increment */ + testb %cl, %cl /* lowest byte NUL? */ + + jz L2 /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L2 /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? */ + jz L2 /* yes => return */ + + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + + ret diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000000..1382e665f6 --- /dev/null +++ b/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + sbbl x,r1 + movl 4(src2),x + sbbl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + sbbl x,r1 + movl 12(src2),x + sbbl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + sbbl x,r1 + movl 20(src2),x + sbbl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + sbbl x,r1 + movl 28(src2),x + sbbl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + sbbl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + sbbl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000000..14bfe54e24 --- /dev/null +++ b/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,82 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_submul_1) +C_SYMBOL_NAME(__mpn_submul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(sub,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret -- cgit 1.4.1
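A note on the magic constant 0xfefefeff used by strchr.S and strlen.S above: adding it to a 32-bit word is the same, modulo 2^32, as subtracting 0x01010101, and the long comment in strlen.S argues that the addition leaves some "hole" bit (a zero bit of the magic constant, or the carry flag) unchanged exactly when the word contains a zero byte.  A rough portable C rendering of that test, not the instruction sequence the assembly actually uses, might look like this; the helper names are invented for illustration:

/* Portable rendering of the zero-byte test behind the 0xfefefeff trick;
   illustration only, not code from this patch.  */
#include <stdint.h>

/* Nonzero iff some byte of W is zero.  Subtracting 0x01010101 borrows
   through a zero byte and newly sets its high bit, which was clear in W
   itself (hence the & ~w mask); a word with no zero byte generates no
   borrows at all, so no byte's high bit can newly appear.  */
static int
has_zero_byte (uint32_t w)
{
  return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
}

/* strchr.S first replicates the target character into all four bytes of a
   word and XORs it with the data, turning matching bytes into zero bytes,
   then applies the same zero-byte test.  */
static int
has_byte (uint32_t w, unsigned char c)
{
  return has_zero_byte (w ^ (0x01010101u * c));
}

strlen.S applies this test four bytes at a time, unrolled four words per loop iteration, and only falls out to the byte-by-byte checks at L3 once a word is known to contain the terminating NUL.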