From 8f5ca04bc7fd53741d80117df992995ace8f6d2d Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@gnu.org>
Date: Mon, 16 Oct 1995 01:37:51 +0000
Subject: Sat Oct 14 02:52:36 1995  Ulrich Drepper 
 <drepper@ipd.info.uni-karlsruhe.de>

	* malloc/malloc.c (_malloc_internal): Performance fix.  Move
	if statement out of loop.

	* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
	implementation using GMP functions.  Contributed by
	Torbjorn Granlund and Ulrich Drepper.

	* stdio/test_rdwr.c: Include <errno.h>.

	* sysdeps/i386/i586/Implies: New file.

	New highly optimized string functions for i[345]86.
	* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
        * sysdeps/i386/i586/strlen.S: New file.
	* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

	* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
	not correspond to used values.

	* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
        around a kernel header file.
	* sysdeps/unix/sysv/linux/Dist: Add it.
	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
	Likewise.

	* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
        defining ourself we use a kernel header file.

	* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
        call handler for i586.

	* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
Sat Oct 14 02:52:36 1995  Ulrich Drepper  <drepper@ipd.info.uni-karlsruhe.de>

	* malloc/malloc.c (_malloc_internal): Performance fix.  Move
	if statement out of loop.

	* stdio/_itoa.c, stdio/_itoa.h: Complete rewrite.  Much faster
	implementation using GMP functions.  Contributed by
	Torbjorn Granlund and Ulrich Drepper.

	* stdio/test_rdwr.c: Include <errno.h>.

	* sysdeps/i386/i586/Implies: New file.

	New highly optimized string functions for i[345]86.
	* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files.
        * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files.
        * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files.
        * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files.
        * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files.
        * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files.
        * sysdeps/i386/i586/strlen.S: New file.
	* sysdeps/i386/memchr.c: Removed.  There is now an assembler version.

	* sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did
	not correspond to used values.

	* sysdeps/unix/sysv/linux/nfs/nfs.h: New file.  Simply a wrapper
        around a kernel header file.
	* sysdeps/unix/sysv/linux/Dist: Add it.
	* sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers):
	Likewise.

	* sysdeps/unix/sysv/linux/local_lim.h: Rewrite.  Instead of
        defining ourself we use a kernel header file.

	* sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system
        call handler for i586.

	* sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
---
 sysdeps/rs6000/add_n.s    |  54 ++++++++++++++++++++
 sysdeps/rs6000/addmul_1.s | 122 ++++++++++++++++++++++++++++++++++++++++++++
 sysdeps/rs6000/lshift.s   |  58 +++++++++++++++++++++
 sysdeps/rs6000/mul_1.s    | 109 +++++++++++++++++++++++++++++++++++++++
 sysdeps/rs6000/rshift.s   |  56 ++++++++++++++++++++
 sysdeps/rs6000/sub_n.s    |  55 ++++++++++++++++++++
 sysdeps/rs6000/submul_1.s | 127 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 581 insertions(+)
 create mode 100644 sysdeps/rs6000/add_n.s
 create mode 100644 sysdeps/rs6000/addmul_1.s
 create mode 100644 sysdeps/rs6000/lshift.s
 create mode 100644 sysdeps/rs6000/mul_1.s
 create mode 100644 sysdeps/rs6000/rshift.s
 create mode 100644 sysdeps/rs6000/sub_n.s
 create mode 100644 sysdeps/rs6000/submul_1.s

(limited to 'sysdeps/rs6000')

diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s
new file mode 100644
index 0000000000..34ad9e1d2d
--- /dev/null
+++ b/sysdeps/rs6000/add_n.s
@@ -0,0 +1,54 @@
+# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_add_n[DS]
+	.extern .__mpn_add_n
+.csect [PR]
+	.align 2
+	.globl __mpn_add_n
+	.globl .__mpn_add_n
+	.csect __mpn_add_n[DS]
+__mpn_add_n:
+	.long .__mpn_add_n, TOC[tc0], 0
+	.csect [PR]
+.__mpn_add_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	a	7,0,8		# add least significant limbs, set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latecny slot
+	ae	7,0,8		# add new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	lil	3,0		# load cy into ...
+	aze	3,3		# ... return value register
+	br
diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s
new file mode 100644
index 0000000000..862b6139fe
--- /dev/null
+++ b/sysdeps/rs6000/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_addmul_1[PR]
+	.align 2
+	.globl __mpn_addmul_1
+	.globl .__mpn_addmul_1
+	.csect __mpn_addmul_1[DS]
+__mpn_addmul_1:
+	.long .__mpn_addmul_1[PR], TOC[tc0], 0
+	.csect .__mpn_addmul_1[PR]
+.__mpn_addmul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	8
+	cax	9,9,7
+	l	7,4(3)
+	a	8,8,7		# add res_limb
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	l	7,4(3)
+	aze	9,9
+	a	8,8,7
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	8,7,9
+	l	7,4(3)
+	ae	10,10,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s
new file mode 100644
index 0000000000..69c7502061
--- /dev/null
+++ b/sysdeps/rs6000/lshift.s
@@ -0,0 +1,58 @@
+# IBM POWER __mpn_lshift -- 
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_lshift[DS]
+	.extern .__mpn_lshift
+.csect [PR]
+	.align 2
+	.globl __mpn_lshift
+	.globl .__mpn_lshift
+	.csect __mpn_lshift[DS]
+__mpn_lshift:
+	.long .__mpn_lshift, TOC[tc0], 0
+	.csect [PR]
+.__mpn_lshift:
+	sli	0,5,2
+	cax	9,3,0
+	cax	4,4,0
+	sfi	8,6,32
+	mtctr	5		# put limb count in CTR loop register
+	lu	0,-4(4)		# read most significant limb
+	sre	3,0,8		# compute carry out limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,-4(4)		# read 2:nd most significant limb
+	sreq	7,0,8		# compute most significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,-4(4)		# load next lower limb
+	stu	7,-4(9)		# store previous result during read latency
+	sreq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,-4(9)		# store 2:nd least significant limb
+Lend2:	sle	7,0,6		# compute least significant limb
+	st      7,-4(9)		# store it"				\
+	br
diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s
new file mode 100644
index 0000000000..f4fa894339
--- /dev/null
+++ b/sysdeps/rs6000/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_mul_1[PR]
+	.align 2
+	.globl __mpn_mul_1
+	.globl .__mpn_mul_1
+	.csect __mpn_mul_1[DS]
+__mpn_mul_1:
+	.long .__mpn_mul_1[PR], TOC[tc0], 0
+	.csect .__mpn_mul_1[PR]
+.__mpn_mul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	8
+	ai	0,0,0		# reset carry
+	cax	9,9,7
+	blt	Lneg
+Lpos:	bdz	Lend
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	cax	10,10,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,9
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s
new file mode 100644
index 0000000000..6056acc753
--- /dev/null
+++ b/sysdeps/rs6000/rshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __mpn_rshift -- 
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_rshift[DS]
+	.extern .__mpn_rshift
+.csect [PR]
+	.align 2
+	.globl __mpn_rshift
+	.globl .__mpn_rshift
+	.csect __mpn_rshift[DS]
+__mpn_rshift:
+	.long .__mpn_rshift, TOC[tc0], 0
+	.csect [PR]
+.__mpn_rshift:
+	sfi	8,6,32
+	mtctr	5		# put limb count in CTR loop register
+	l	0,0(4)		# read least significant limb
+	ai	9,3,-4		# adjust res_ptr since it's offset in the stu:s
+	sle	3,0,8		# compute carry limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,4(4)		# read 2:nd least significant limb
+	sleq	7,0,8		# compute least significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,4(4)		# load next higher limb
+	stu	7,4(9)		# store previous result during read latency
+	sleq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,4(9)		# store 2:nd most significant limb
+Lend2:	sre	7,0,6		# compute most significant limb
+	st      7,4(9)		# store it"				\
+	br
diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s
new file mode 100644
index 0000000000..402fdcefc4
--- /dev/null
+++ b/sysdeps/rs6000/sub_n.s
@@ -0,0 +1,55 @@
+# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_sub_n[DS]
+	.extern .__mpn_sub_n
+.csect [PR]
+	.align 2
+	.globl __mpn_sub_n
+	.globl .__mpn_sub_n
+	.csect __mpn_sub_n[DS]
+__mpn_sub_n:
+	.long .__mpn_sub_n, TOC[tc0], 0
+	.csect [PR]
+.__mpn_sub_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	sf	7,0,8		# add least significant limbs, set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latecny slot
+	sfe	7,0,8		# add new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	sfe	3,0,0		# load !cy into ...
+	sfi	3,3,0		# ... return value register
+	br
diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s
new file mode 100644
index 0000000000..252633261d
--- /dev/null
+++ b/sysdeps/rs6000/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_submul_1[PR]
+	.align 2
+	.globl __mpn_submul_1
+	.globl .__mpn_submul_1
+	.csect __mpn_submul_1[DS]
+__mpn_submul_1:
+	.long .__mpn_submul_1[PR], TOC[tc0], 0
+	.csect .__mpn_submul_1[PR]
+.__mpn_submul_1:
+
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	11
+	cax	9,9,7
+	l	7,4(3)
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	11,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	11,0,10
+	l	7,4(3)
+	aze	9,9
+	sf	8,11,7
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	11,7,9
+	l	7,4(3)
+	ae	10,10,0		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	11,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	sf	8,11,7		# add res_limb
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
-- 
cgit 1.4.1