67 files changed, 2329 insertions, 757 deletions
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s
index 46d277df6e..8b168cb46d 100644
--- a/sysdeps/alpha/addmul_1.s
+++ b/sysdeps/alpha/addmul_1.s
@@ -26,16 +26,7 @@
  # size		r18
  # s2_limb	r19
 
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture.  2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 
 	.set	noreorder
 	.set	noat
@@ -52,7 +43,7 @@ __mpn_addmul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@@ -60,10 +51,10 @@ __mpn_addmul_1:
 	cmpult	$3,$5,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
 
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	addq	$5,$3,$3
+.Lend1:	addq	$5,$3,$3
 	cmpult	$3,$5,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s
index 2aaf041774..66cf82b3c3 100644
--- a/sysdeps/alpha/alphaev5/add_n.s
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -35,84 +35,113 @@
 __mpn_add_n:
 	.frame	$30,0,$26,0
 
-	ldq	$3,0($17)
-	ldq	$4,0($18)
-
-	subq	$19,1,$19
-	and	$19,4-1,$2	# number of limbs in first loop
-	bis	$31,$31,$0
-	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
-
-	subq	$19,$2,$19
-
-.Loop0:	subq	$2,1,$2
+	or	$31,$31,$25		# clear cy
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+	ldq	$0,0($18)
+	ldq	$1,8($18)
+	ldq	$4,0($17)
 	ldq	$5,8($17)
-	addq	$4,$0,$4
-	ldq	$6,8($18)
-	cmpult	$4,$0,$1
-	addq	$3,$4,$4
-	cmpult	$4,$3,$0
-	stq	$4,0($16)
-	or	$0,$1,$0
-
-	addq	$17,8,$17
-	addq	$18,8,$18
-	bis	$5,$5,$3
-	bis	$6,$6,$4
-	addq	$16,8,$16
-	bne	$2,.Loop0
-
-.L0:	beq	$19,.Lend
-
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)
+	addq	$0,$4,$20		# 1st main add
+	ldq	$3,24($18)
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)
+	cmpult	$20,$0,$25		# compute cy from last add
+	ldq	$7,-8($17)
+	addq	$1,$25,$28		# cy add
+	addq	$18,32,$18		# update s2_ptr
+	addq	$5,$28,$21		# 2nd main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
 	.align	4
-.Loop:	subq	$19,4,$19
-	unop
-
-	ldq	$6,8($18)
-	addq	$4,$0,$0
+.Loop:	cmpult	$21,$28,$25		# compute cy from last add
+	ldq	$0,0($18)
+	or	$8,$25,$25		# combine cy from the two adds
+	ldq	$1,8($18)
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)
+	addq	$28,$6,$22		# 3rd main add
 	ldq	$5,8($17)
-	cmpult	$0,$4,$1
-	ldq	$4,16($18)
-	addq	$3,$0,$20
-	cmpult	$20,$3,$0
-	ldq	$3,16($17)
-	or	$0,$1,$0
-	addq	$6,$0,$0
-	cmpult	$0,$6,$1
-	ldq	$6,24($18)
-	addq	$5,$0,$21
-	cmpult	$21,$5,$0
-	ldq	$5,24($17)
-	or	$0,$1,$0
-	addq	$4,$0,$0
-	cmpult	$0,$4,$1
-	ldq	$4,32($18)
-	addq	$3,$0,$22
-	cmpult	$22,$3,$0
-	ldq	$3,32($17)
-	or	$0,$1,$0
-	addq	$6,$0,$0
-	cmpult	$0,$6,$1
-	addq	$5,$0,$23
-	cmpult	$23,$5,$0
-	or	$0,$1,$0
-
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$22,$28,$25		# compute cy from last add
 	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
 	stq	$21,8($16)
-	stq	$22,16($16)
-	stq	$23,24($16)
-
-	addq	$17,32,$17
-	addq	$18,32,$18
-	addq	$16,32,$16
-	bne	$19,.Loop
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$23,$28,$25		# compute cy from last add
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)
+	addq	$4,$28,$20		# 1st main add
+	ldq	$3,24($18)
+	cmpult	$28,$25,$8		# compute cy from last add
+	ldq	$6,-16($17)
+	cmpult	$20,$28,$25		# compute cy from last add
+	ldq	$7,-8($17)
+	or	$8,$25,$25		# combine cy from the two adds
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)
+	addq	$5,$28,$21		# 2nd main add
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# compute cy from last add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1:	cmpult	$21,$28,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$2,$25,$28		# cy add
+	addq	$28,$6,$22		# 3rd main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$22,$28,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$23,$28,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)
+	stq	$23,-8($16)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)
+	ldq	$4,0($17)
+	subq	$19,1,$19
+	beq	$19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)
+	addq	$4,$28,$20		# main add
+	ldq	$4,8($17)
+	addq	$18,8,$18
+	cmpult	$28,$25,$8		# compute cy from last add
+	addq	$17,8,$17
+	stq	$20,0($16)
+	cmpult	$20,$28,$25		# compute cy from last add
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,8,$16
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	addq	$4,$28,$20		# main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$20,$28,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
 
-.Lend:	addq	$4,$0,$4
-	cmpult	$4,$0,$1
-	addq	$3,$4,$4
-	cmpult	$4,$3,$0
-	stq	$4,0($16)
-	or	$0,$1,$0
+.Lret:	or	$25,$31,$0		# return cy
 	ret	$31,($26),1
-
 	.end	__mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s
index fdb089550f..392b4249b8 100644
--- a/sysdeps/alpha/alphaev5/lshift.s
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -25,7 +25,7 @@
  # size		r18
  # cnt		r19
 
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 
 	.set	noreorder
 	.set	noat
@@ -44,11 +44,11 @@ __mpn_lshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	srl	$4,$20,$0	# compute function result
 
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 
 	.align	3
-Loop0:	ldq	$3,-16($17)
+.Loop0:	ldq	$3,-16($17)
 	subq	$16,8,$16
 	sll	$4,$19,$5
 	subq	$17,8,$17
@@ -57,17 +57,17 @@ Loop0:	ldq	$3,-16($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,0($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
 
-L0:	sll	$4,$19,$24
-	beq	$18,Lend
+.L0:	sll	$4,$19,$24
+	beq	$18,.Lend
  # warm up phase 1
 	ldq	$1,-16($17)
 	subq	$18,4,$18
 	ldq	$2,-24($17)
 	ldq	$3,-32($17)
 	ldq	$4,-40($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
  # warm up phase 2
 	srl	$1,$20,$7
 	sll	$1,$19,$21
@@ -84,10 +84,10 @@ L0:	sll	$4,$19,$24
 	sll	$4,$19,$24
 	ldq	$4,-72($17)
 	subq	$18,4,$18
-	beq	$18,Lcool1
+	beq	$18,.Lend2
 	.align  4
  # main loop
-Loop:	stq	$7,-8($16)
+.Loop:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@@ -113,16 +113,14 @@ Loop:	stq	$7,-8($16)
 	subq	$16,32,$16
 
 	srl	$4,$20,$6
-	ldq	$3,-96($17
+	ldq	$3,-96($17)
 	sll	$4,$19,$24
 	ldq	$4,-104($17)
 
 	subq	$17,32,$17
-	bne	$18,Loop
-	unop
-	unop
+	bne	$18,.Loop
  # cool down phase 2/1
-Lcool1:	stq	$7,-8($16)
+.Lend2:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@@ -150,7 +148,7 @@ Lcool1:	stq	$7,-8($16)
 	ret	$31,($26),1
 
  # cool down phase 1/1
-Lcool1:	srl	$1,$20,$7
+.Lend1:	srl	$1,$20,$7
 	sll	$1,$19,$21
 	srl	$2,$20,$8
 	sll	$2,$19,$22
@@ -170,6 +168,6 @@ Lcool1:	srl	$1,$20,$7
 	stq	$24,-40($16)
 	ret	$31,($26),1
 
-Lend	stq	$24,-8($16)
+.Lend:	stq	$24,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s
index 1da9960b46..d20dde35b7 100644
--- a/sysdeps/alpha/alphaev5/rshift.s
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -25,7 +25,7 @@
  # size		r18
  # cnt		r19
 
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 
 	.set	noreorder
 	.set	noat
@@ -42,11 +42,11 @@ __mpn_rshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	sll	$4,$20,$0	# compute function result
 
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 
 	.align	3
-Loop0:	ldq	$3,8($17)
+.Loop0:	ldq	$3,8($17)
 	addq	$16,8,$16
 	srl	$4,$19,$5
 	addq	$17,8,$17
@@ -55,17 +55,17 @@ Loop0:	ldq	$3,8($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
 
-L0:	srl	$4,$19,$24
-	beq	$18,Lend
+.L0:	srl	$4,$19,$24
+	beq	$18,.Lend
  # warm up phase 1
 	ldq	$1,8($17)
 	subq	$18,4,$18
 	ldq	$2,16($17)
 	ldq	$3,24($17)
 	ldq	$4,32($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
  # warm up phase 2
 	sll	$1,$20,$7
 	srl	$1,$19,$21
@@ -82,10 +82,10 @@ L0:	srl	$4,$19,$24
 	srl	$4,$19,$24
 	ldq	$4,64($17)
 	subq	$18,4,$18
-	beq	$18,Lcool2
+	beq	$18,.Lend2
 	.align  4
  # main loop
-Loop:	stq	$7,0($16)
+.Loop:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@@ -116,11 +116,9 @@ Loop:	stq	$7,0($16)
 	ldq	$4,96($17)
 
 	addq	$17,32,$17
-	bne	$18,Loop
-	unop
-	unop
+	bne	$18,.Loop
  # cool down phase 2/1
-Lcool2:	stq	$7,0($16)
+.Lend2:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@@ -148,7 +146,7 @@ Lcool2:	stq	$7,0($16)
 	ret	$31,($26),1
 
  # cool down phase 1/1
-Lcool1:	sll	$1,$20,$7
+.Lend1:	sll	$1,$20,$7
 	srl	$1,$19,$21
 	sll	$2,$20,$8
 	srl	$2,$19,$22
@@ -168,6 +166,6 @@ Lcool1:	sll	$1,$20,$7
 	stq	$24,32($16)
 	ret	$31,($26),1
 
-Lend:	stq	$24,0($16)
+.Lend:	stq	$24,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
diff --git a/sysdeps/alpha/alphaev5/sub_n.s b/sysdeps/alpha/alphaev5/sub_n.s
new file mode 100644
index 0000000000..c9f3a4ec3a
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/sub_n.s
@@ -0,0 +1,148 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.frame	$30,0,$26,0
+
+	or	$31,$31,$25		# clear cy
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+	ldq	$0,0($18)
+	ldq	$1,8($18)
+	ldq	$4,0($17)
+	ldq	$5,8($17)
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)
+	subq	$4,$0,$20		# 1st main sub
+	ldq	$3,24($18)
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)
+	cmpult	$4,$20,$25		# compute cy from last sub
+	ldq	$7,-8($17)
+	addq	$1,$25,$28		# cy add
+	addq	$18,32,$18		# update s2_ptr
+	subq	$5,$28,$21		# 2nd main sub
+	cmpult	$28,$25,$8		# compute cy from last add
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+	.align	4
+.Loop:	cmpult	$5,$21,$25		# compute cy from last add
+	ldq	$0,0($18)
+	or	$8,$25,$25		# combine cy from the two adds
+	ldq	$1,8($18)
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)
+	subq	$6,$28,$22		# 3rd main sub
+	ldq	$5,8($17)
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$6,$22,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$7,$23,$25		# compute cy from last add
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)
+	subq	$4,$28,$20		# 1st main sub
+	ldq	$3,24($18)
+	cmpult	$28,$25,$8		# compute cy from last add
+	ldq	$6,-16($17)
+	cmpult	$4,$20,$25		# compute cy from last add
+	ldq	$7,-8($17)
+	or	$8,$25,$25		# combine cy from the two adds
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)
+	subq	$5,$28,$21		# 2nd main sub
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# compute cy from last add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1:	cmpult	$5,$21,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$2,$25,$28		# cy add
+	subq	$6,$28,$22		# 3rd main sub
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$6,$22,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$7,$23,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)
+	stq	$23,-8($16)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)
+	ldq	$4,0($17)
+	subq	$19,1,$19
+	beq	$19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)
+	subq	$4,$28,$20		# main sub
+	ldq	$1,8($17)
+	addq	$18,8,$18
+	cmpult	$28,$25,$8		# compute cy from last add
+	addq	$17,8,$17
+	stq	$20,0($16)
+	cmpult	$4,$20,$25		# compute cy from last add
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,8,$16
+	or	$1,$31,$4
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	subq	$4,$28,$20		# main sub
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$4,$20,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
+
+.Lret:	or	$25,$31,$0		# return cy
+	ret	$31,($26),1
+	.end	__mpn_sub_n
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s
index c28434926b..aa8417bb6a 100644
--- a/sysdeps/alpha/lshift.s
+++ b/sysdeps/alpha/lshift.s
@@ -53,11 +53,11 @@ __mpn_lshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	srl	$4,$7,$0	# compute function result
 
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,-8($17)
 	subq	$16,8,$16
 	subq	$17,8,$17
@@ -67,12 +67,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,0($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
 
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 
 	.align	3
-Loop:	ldq	$3,-8($17)
+.Loop:	ldq	$3,-8($17)
 	subq	$16,32,$16
 	subq	$18,4,$18
 	sll	$4,$19,$5
@@ -100,9 +100,9 @@ Loop:	ldq	$3,-8($17)
 	bis	$1,$2,$8
 	stq	$8,0($16)
 
-	bgt	$18,Loop
+	bgt	$18,.Loop
 
-Lend:	sll	$4,$19,$8
+.Lend:	sll	$4,$19,$8
 	stq	$8,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s
index 3ef194d7e6..58a63dfa5d 100644
--- a/sysdeps/alpha/mul_1.s
+++ b/sysdeps/alpha/mul_1.s
@@ -1,7 +1,7 @@
  # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
  # the result in a second limb vector.
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s
index 74eab0434a..037b776017 100644
--- a/sysdeps/alpha/rshift.s
+++ b/sysdeps/alpha/rshift.s
@@ -34,7 +34,7 @@
  # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
  # 2. Only aligned instruction pairs can be paired.
  # 3. The store buffer or silo might not be able to deal with the bandwidth.
-      
+
 	.set	noreorder
 	.set	noat
 .text
@@ -51,11 +51,11 @@ __mpn_rshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	sll	$4,$7,$0	# compute function result
 
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,0($17)
 	addq	$16,8,$16
 	addq	$17,8,$17
@@ -65,12 +65,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
 
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 
 	.align	3
-Loop:	ldq	$3,0($17)
+.Loop:	ldq	$3,0($17)
 	addq	$16,32,$16
 	subq	$18,4,$18
 	srl	$4,$19,$5
@@ -98,9 +98,9 @@ Loop:	ldq	$3,0($17)
 	bis	$1,$2,$8
 	stq	$8,-8($16)
 
-	bgt	$18,Loop
+	bgt	$18,.Loop
 
-Lend:	srl	$4,$19,$8
+.Lend:	srl	$4,$19,$8
 	stq	$8,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s
index acaa11c545..292b2c18b6 100644
--- a/sysdeps/alpha/submul_1.s
+++ b/sysdeps/alpha/submul_1.s
@@ -26,16 +26,7 @@
  # size		r18
  # s2_limb	r19
 
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture.  2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 
 	.set	noreorder
 	.set	noat
@@ -52,7 +43,7 @@ __mpn_submul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@@ -60,10 +51,10 @@ __mpn_submul_1:
 	cmpult	$5,$3,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
 
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	subq	$5,$3,$3
+.Lend1:	subq	$5,$3,$3
 	cmpult	$5,$3,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S
index bafafd672e..ce590ede6c 100644
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -1,6 +1,6 @@
  # Alpha 21064 __udiv_qrnnd
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
@@ -21,13 +21,11 @@
 
         .set noreorder
         .set noat
-
 .text
-        .align 3
-        .globl __udiv_qrnnd
-        .ent __udiv_qrnnd 0
+        .align	3
+        .globl	__udiv_qrnnd
+        .ent	__udiv_qrnnd
 __udiv_qrnnd:
-__udiv_qrnnd..ng:
         .frame $30,0,$26,0
         .prologue 0
 #define cnt	$2
@@ -39,9 +37,9 @@ __udiv_qrnnd..ng:
 #define qb	$20
 
 	ldiq	cnt,16
-	blt	d,Largedivisor
+	blt	d,.Largedivisor
 
-Loop1:	cmplt	n0,0,tmp
+.Loop1:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -74,12 +72,12 @@ Loop1:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop1
+	bgt	cnt,.Loop1
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-Largedivisor:
+.Largedivisor:
 	and	n0,1,$4
 
 	srl	n0,1,n0
@@ -91,7 +89,7 @@ Largedivisor:
 	srl	d,1,$5
 	addq	$5,$6,$5
 
-Loop2:	cmplt	n0,0,tmp
+.Loop2:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -124,27 +122,27 @@ Loop2:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop2
+	bgt	cnt,.Loop2
 
 	addq	n1,n1,n1
 	addq	$4,n1,n1
-	bne	$6,Odd
+	bne	$6,.LOdd
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-Odd:
+.LOdd:
 	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
 	cmpult	n1,n0,tmp	# tmp := carry from addq
-	beq	tmp,LLp6
+	beq	tmp,.LLp6
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp6:	cmpult	n1,d,tmp
-	bne	tmp,LLp7
+.LLp6:	cmpult	n1,d,tmp
+	bne	tmp,.LLp7
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp7:
+.LLp7:
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
diff --git a/sysdeps/generic/add_n.c b/sysdeps/generic/add_n.c
index 6989ab0628..647548d4c1 100644
--- a/sysdeps/generic/add_n.c
+++ b/sysdeps/generic/add_n.c
@@ -1,6 +1,6 @@
-/* __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
 
-Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -23,9 +23,9 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
 #else
-__mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
+mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
      register mp_ptr res_ptr;
      register mp_srcptr s1_ptr;
      register mp_srcptr s2_ptr;
diff --git a/sysdeps/generic/addmul_1.c b/sysdeps/generic/addmul_1.c
index fdf3541561..6156cab82c 100644
--- a/sysdeps/generic/addmul_1.c
+++ b/sysdeps/generic/addmul_1.c
@@ -1,9 +1,9 @@
-/* __mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
    by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
    limb vector pointed to by RES_PTR.  Return the most significant limb of
    the product, adjusted for carry-out from the addition.
 
-Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -26,7 +26,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "longlong.h"
 
 mp_limb
-__mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
      register mp_ptr res_ptr;
      register mp_srcptr s1_ptr;
      mp_size_t s1_size;
diff --git a/sysdeps/generic/cmp.c b/sysdeps/generic/cmp.c
index 144c88588f..e499b1ec44 100644
--- a/sysdeps/generic/cmp.c
+++ b/sysdeps/generic/cmp.c
@@ -1,6 +1,6 @@
-/* __mpn_cmp -- Compare two low-level natural-number integers.
+/* mpn_cmp -- Compare two low-level natural-number integers.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -28,9 +28,9 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 int
 #if __STDC__
-__mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
+mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
 #else
-__mpn_cmp (op1_ptr, op2_ptr, size)
+mpn_cmp (op1_ptr, op2_ptr, size)
      mp_srcptr op1_ptr;
      mp_srcptr op2_ptr;
      mp_size_t size;
diff --git a/sysdeps/generic/divmod_1.c b/sysdeps/generic/divmod_1.c
index 2989d36708..c04032750c 100644
--- a/sysdeps/generic/divmod_1.c
+++ b/sysdeps/generic/divmod_1.c
@@ -1,4 +1,4 @@
-/* __mpn_divmod_1(quot_ptr, dividend_ptr, dividend_size, divisor_limb) --
+/* mpn_divmod_1(quot_ptr, dividend_ptr, dividend_size, divisor_limb) --
    Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
    Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
    Return the single-limb remainder.
@@ -6,7 +6,7 @@
 
    QUOT_PTR and DIVIDEND_PTR might point to the same limb.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -41,11 +41,11 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_divmod_1 (mp_ptr quot_ptr,
+mpn_divmod_1 (mp_ptr quot_ptr,
 	      mp_srcptr dividend_ptr, mp_size_t dividend_size,
 	      mp_limb divisor_limb)
 #else
-__mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
+mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
      mp_ptr quot_ptr;
      mp_srcptr dividend_ptr;
      mp_size_t dividend_size;
diff --git a/sysdeps/generic/lshift.c b/sysdeps/generic/lshift.c
index 1ba09038dd..35794e4ea8 100644
--- a/sysdeps/generic/lshift.c
+++ b/sysdeps/generic/lshift.c
@@ -1,6 +1,6 @@
-/* __mpn_lshift -- Shift left low level.
+/* mpn_lshift -- Shift left low level.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -32,11 +32,11 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_lshift (register mp_ptr wp,
+mpn_lshift (register mp_ptr wp,
 	    register mp_srcptr up, mp_size_t usize,
 	    register unsigned int cnt)
 #else
-__mpn_lshift (wp, up, usize, cnt)
+mpn_lshift (wp, up, usize, cnt)
      register mp_ptr wp;
      register mp_srcptr up;
      mp_size_t usize;
diff --git a/sysdeps/generic/mod_1.c b/sysdeps/generic/mod_1.c
index 8a49fb4be0..0842f6b1ee 100644
--- a/sysdeps/generic/mod_1.c
+++ b/sysdeps/generic/mod_1.c
@@ -1,4 +1,4 @@
-/* __mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
    Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
    Return the single-limb remainder.
    There are no constraints on the value of the divisor.
@@ -38,10 +38,10 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
-	     mp_limb divisor_limb)
+mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+	   mp_limb divisor_limb)
 #else
-__mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
+mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
      mp_srcptr dividend_ptr;
      mp_size_t dividend_size;
      mp_limb divisor_limb;
diff --git a/sysdeps/generic/mul.c b/sysdeps/generic/mul.c
index cd2acb5127..3f3f41e99f 100644
--- a/sysdeps/generic/mul.c
+++ b/sysdeps/generic/mul.c
@@ -1,6 +1,6 @@
-/* __mpn_mul -- Multiply two natural numbers.
+/* mpn_mul -- Multiply two natural numbers.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -43,11 +43,11 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_mul (mp_ptr prodp,
-	  mp_srcptr up, mp_size_t usize,
-	  mp_srcptr vp, mp_size_t vsize)
+mpn_mul (mp_ptr prodp,
+	 mp_srcptr up, mp_size_t usize,
+	 mp_srcptr vp, mp_size_t vsize)
 #else
-__mpn_mul (prodp, up, usize, vp, vsize)
+mpn_mul (prodp, up, usize, vp, vsize)
      mp_ptr prodp;
      mp_srcptr up;
      mp_size_t usize;
@@ -58,6 +58,7 @@ __mpn_mul (prodp, up, usize, vp, vsize)
   mp_ptr prod_endp = prodp + usize + vsize - 1;
   mp_limb cy;
   mp_ptr tspace;
+  TMP_DECL (marker);
 
   if (vsize < KARATSUBA_THRESHOLD)
     {
@@ -86,7 +87,7 @@ __mpn_mul (prodp, up, usize, vp, vsize)
 	  cy_limb = 0;
 	}
       else
-	cy_limb = __mpn_mul_1 (prodp, up, usize, v_limb);
+	cy_limb = mpn_mul_1 (prodp, up, usize, v_limb);
 
       prodp[usize] = cy_limb;
       prodp++;
@@ -100,10 +101,10 @@ __mpn_mul (prodp, up, usize, vp, vsize)
 	    {
 	      cy_limb = 0;
 	      if (v_limb == 1)
-		cy_limb = __mpn_add_n (prodp, prodp, up, usize);
+		cy_limb = mpn_add_n (prodp, prodp, up, usize);
 	    }
 	  else
-	    cy_limb = __mpn_addmul_1 (prodp, up, usize, v_limb);
+	    cy_limb = mpn_addmul_1 (prodp, up, usize, v_limb);
 
 	  prodp[usize] = cy_limb;
 	  prodp++;
@@ -111,7 +112,9 @@ __mpn_mul (prodp, up, usize, vp, vsize)
       return cy_limb;
     }
 
-  tspace = (mp_ptr) alloca (2 * vsize * BYTES_PER_MP_LIMB);
+  TMP_MARK (marker);
+
+  tspace = (mp_ptr) TMP_ALLOC (2 * vsize * BYTES_PER_MP_LIMB);
   MPN_MUL_N_RECURSE (prodp, up, vp, vsize, tspace);
 
   prodp += vsize;
@@ -119,12 +122,12 @@ __mpn_mul (prodp, up, usize, vp, vsize)
   usize -= vsize;
   if (usize >= vsize)
     {
-      mp_ptr tp = (mp_ptr) alloca (2 * vsize * BYTES_PER_MP_LIMB);
+      mp_ptr tp = (mp_ptr) TMP_ALLOC (2 * vsize * BYTES_PER_MP_LIMB);
       do
 	{
 	  MPN_MUL_N_RECURSE (tp, up, vp, vsize, tspace);
-	  cy = __mpn_add_n (prodp, prodp, tp, vsize);
-	  __mpn_add_1 (prodp + vsize, tp + vsize, vsize, cy);
+	  cy = mpn_add_n (prodp, prodp, tp, vsize);
+	  mpn_add_1 (prodp + vsize, tp + vsize, vsize, cy);
 	  prodp += vsize;
 	  up += vsize;
 	  usize -= vsize;
@@ -138,10 +141,11 @@ __mpn_mul (prodp, up, usize, vp, vsize)
 
   if (usize != 0)
     {
-      __mpn_mul (tspace, vp, vsize, up, usize);
-      cy = __mpn_add_n (prodp, prodp, tspace, vsize);
-      __mpn_add_1 (prodp + vsize, tspace + vsize, usize, cy);
+      mpn_mul (tspace, vp, vsize, up, usize);
+      cy = mpn_add_n (prodp, prodp, tspace, vsize);
+      mpn_add_1 (prodp + vsize, tspace + vsize, usize, cy);
     }
 
+  TMP_FREE (marker);
   return *prod_endp;
 }
diff --git a/sysdeps/generic/mul_1.c b/sysdeps/generic/mul_1.c
index 37dbc33031..01fdbbbc9b 100644
--- a/sysdeps/generic/mul_1.c
+++ b/sysdeps/generic/mul_1.c
@@ -1,7 +1,7 @@
-/* __mpn_mul_1 -- Multiply a limb vector with a single limb and
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and
    store the product in a second limb vector.
 
-Copyright (C) 1991, 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -24,7 +24,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "longlong.h"
 
 mp_limb
-__mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
      register mp_ptr res_ptr;
      register mp_srcptr s1_ptr;
      mp_size_t s1_size;
diff --git a/sysdeps/generic/mul_n.c b/sysdeps/generic/mul_n.c
index e37c5d8290..049f63dce8 100644
--- a/sysdeps/generic/mul_n.c
+++ b/sysdeps/generic/mul_n.c
@@ -1,6 +1,6 @@
-/* __mpn_mul_n -- Multiply two natural numbers of length n.
+/* mpn_mul_n -- Multiply two natural numbers of length n.
 
-Copyright (C) 1991, 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -41,13 +41,6 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #define KARATSUBA_THRESHOLD 2
 #endif
 
-void
-#if __STDC__
-____mpn_mul_n (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr);
-#else
-____mpn_mul_n ();
-#endif
-
 /* Handle simple cases with traditional multiplication.
 
    This is the most critical code of multiplication.  All multiplies rely
@@ -57,9 +50,9 @@ ____mpn_mul_n ();
 
 void
 #if __STDC__
-____mpn_mul_n_basecase (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
+impn_mul_n_basecase (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
 #else
-____mpn_mul_n_basecase (prodp, up, vp, size)
+impn_mul_n_basecase (prodp, up, vp, size)
      mp_ptr prodp;
      mp_srcptr up;
      mp_srcptr vp;
@@ -82,7 +75,7 @@ ____mpn_mul_n_basecase (prodp, up, vp, size)
       cy_limb = 0;
     }
   else
-    cy_limb = __mpn_mul_1 (prodp, up, size, v_limb);
+    cy_limb = mpn_mul_1 (prodp, up, size, v_limb);
 
   prodp[size] = cy_limb;
   prodp++;
@@ -96,10 +89,10 @@ ____mpn_mul_n_basecase (prodp, up, vp, size)
 	{
 	  cy_limb = 0;
 	  if (v_limb == 1)
-	    cy_limb = __mpn_add_n (prodp, prodp, up, size);
+	    cy_limb = mpn_add_n (prodp, prodp, up, size);
 	}
       else
-	cy_limb = __mpn_addmul_1 (prodp, up, size, v_limb);
+	cy_limb = mpn_addmul_1 (prodp, up, size, v_limb);
 
       prodp[size] = cy_limb;
       prodp++;
@@ -108,10 +101,10 @@ ____mpn_mul_n_basecase (prodp, up, vp, size)
 
 void
 #if __STDC__
-____mpn_mul_n (mp_ptr prodp,
+impn_mul_n (mp_ptr prodp,
 	     mp_srcptr up, mp_srcptr vp, mp_size_t size, mp_ptr tspace)
 #else
-____mpn_mul_n (prodp, up, vp, size, tspace)
+impn_mul_n (prodp, up, vp, size, tspace)
      mp_ptr prodp;
      mp_srcptr up;
      mp_srcptr vp;
@@ -135,9 +128,9 @@ ____mpn_mul_n (prodp, up, vp, size, tspace)
       mp_limb cy_limb;
 
       MPN_MUL_N_RECURSE (prodp, up, vp, esize, tspace);
-      cy_limb = __mpn_addmul_1 (prodp + esize, up, esize, vp[esize]);
+      cy_limb = mpn_addmul_1 (prodp + esize, up, esize, vp[esize]);
       prodp[esize + esize] = cy_limb;
-      cy_limb = __mpn_addmul_1 (prodp + esize, vp, size, up[esize]);
+      cy_limb = mpn_addmul_1 (prodp + esize, vp, size, up[esize]);
 
       prodp[esize + size] = cy_limb;
     }
@@ -170,24 +163,24 @@ ____mpn_mul_n (prodp, up, vp, size, tspace)
 
       /*** Product M.	 ________________
 			|_(U1-U0)(V0-V1)_|  */
-      if (__mpn_cmp (up + hsize, up, hsize) >= 0)
+      if (mpn_cmp (up + hsize, up, hsize) >= 0)
 	{
-	  __mpn_sub_n (prodp, up + hsize, up, hsize);
+	  mpn_sub_n (prodp, up + hsize, up, hsize);
 	  negflg = 0;
 	}
       else
 	{
-	  __mpn_sub_n (prodp, up, up + hsize, hsize);
+	  mpn_sub_n (prodp, up, up + hsize, hsize);
 	  negflg = 1;
 	}
-      if (__mpn_cmp (vp + hsize, vp, hsize) >= 0)
+      if (mpn_cmp (vp + hsize, vp, hsize) >= 0)
 	{
-	  __mpn_sub_n (prodp + hsize, vp + hsize, vp, hsize);
+	  mpn_sub_n (prodp + hsize, vp + hsize, vp, hsize);
 	  negflg ^= 1;
 	}
       else
 	{
-	  __mpn_sub_n (prodp + hsize, vp, vp + hsize, hsize);
+	  mpn_sub_n (prodp + hsize, vp, vp + hsize, hsize);
 	  /* No change of NEGFLG.  */
 	}
       /* Read temporary operands from low part of PROD.
@@ -197,13 +190,13 @@ ____mpn_mul_n (prodp, up, vp, size, tspace)
 
       /*** Add/copy product H.  */
       MPN_COPY (prodp + hsize, prodp + size, hsize);
-      cy = __mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize);
+      cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize);
 
       /*** Add product M (if NEGFLG M is a negative number).  */
       if (negflg)
-	cy -= __mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size);
+	cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size);
       else
-	cy += __mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
+	cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
 
       /*** Product L.	 ________________  ________________
 			|________________||____U0 x V0_____|  */
@@ -214,22 +207,22 @@ ____mpn_mul_n (prodp, up, vp, size, tspace)
 
       /*** Add/copy Product L (twice).  */
 
-      cy += __mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
+      cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
       if (cy)
-	__mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy);
+	mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy);
 
       MPN_COPY (prodp, tspace, hsize);
-      cy = __mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize);
+      cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize);
       if (cy)
-	__mpn_add_1 (prodp + size, prodp + size, size, 1);
+	mpn_add_1 (prodp + size, prodp + size, size, 1);
     }
 }
 
 void
 #if __STDC__
-____mpn_sqr_n_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t size)
+impn_sqr_n_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t size)
 #else
-____mpn_sqr_n_basecase (prodp, up, size)
+impn_sqr_n_basecase (prodp, up, size)
      mp_ptr prodp;
      mp_srcptr up;
      mp_size_t size;
@@ -251,7 +244,7 @@ ____mpn_sqr_n_basecase (prodp, up, size)
       cy_limb = 0;
     }
   else
-    cy_limb = __mpn_mul_1 (prodp, up, size, v_limb);
+    cy_limb = mpn_mul_1 (prodp, up, size, v_limb);
 
   prodp[size] = cy_limb;
   prodp++;
@@ -265,10 +258,10 @@ ____mpn_sqr_n_basecase (prodp, up, size)
 	{
 	  cy_limb = 0;
 	  if (v_limb == 1)
-	    cy_limb = __mpn_add_n (prodp, prodp, up, size);
+	    cy_limb = mpn_add_n (prodp, prodp, up, size);
 	}
       else
-	cy_limb = __mpn_addmul_1 (prodp, up, size, v_limb);
+	cy_limb = mpn_addmul_1 (prodp, up, size, v_limb);
 
       prodp[size] = cy_limb;
       prodp++;
@@ -277,10 +270,10 @@ ____mpn_sqr_n_basecase (prodp, up, size)
 
 void
 #if __STDC__
-____mpn_sqr_n (mp_ptr prodp,
+impn_sqr_n (mp_ptr prodp,
 	     mp_srcptr up, mp_size_t size, mp_ptr tspace)
 #else
-____mpn_sqr_n (prodp, up, size, tspace)
+impn_sqr_n (prodp, up, size, tspace)
      mp_ptr prodp;
      mp_srcptr up;
      mp_size_t size;
@@ -303,9 +296,9 @@ ____mpn_sqr_n (prodp, up, size, tspace)
       mp_limb cy_limb;
 
       MPN_SQR_N_RECURSE (prodp, up, esize, tspace);
-      cy_limb = __mpn_addmul_1 (prodp + esize, up, esize, up[esize]);
+      cy_limb = mpn_addmul_1 (prodp + esize, up, esize, up[esize]);
       prodp[esize + esize] = cy_limb;
-      cy_limb = __mpn_addmul_1 (prodp + esize, up, size, up[esize]);
+      cy_limb = mpn_addmul_1 (prodp + esize, up, size, up[esize]);
 
       prodp[esize + size] = cy_limb;
     }
@@ -322,13 +315,13 @@ ____mpn_sqr_n (prodp, up, size, tspace)
 
       /*** Product M.	 ________________
 			|_(U1-U0)(U0-U1)_|  */
-      if (__mpn_cmp (up + hsize, up, hsize) >= 0)
+      if (mpn_cmp (up + hsize, up, hsize) >= 0)
 	{
-	  __mpn_sub_n (prodp, up + hsize, up, hsize);
+	  mpn_sub_n (prodp, up + hsize, up, hsize);
 	}
       else
 	{
-	  __mpn_sub_n (prodp, up, up + hsize, hsize);
+	  mpn_sub_n (prodp, up, up + hsize, hsize);
 	}
 
       /* Read temporary operands from low part of PROD.
@@ -338,10 +331,10 @@ ____mpn_sqr_n (prodp, up, size, tspace)
 
       /*** Add/copy product H.  */
       MPN_COPY (prodp + hsize, prodp + size, hsize);
-      cy = __mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize);
+      cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize);
 
       /*** Add product M (if NEGFLG M is a negative number).  */
-      cy -= __mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size);
+      cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size);
 
       /*** Product L.	 ________________  ________________
 			|________________||____U0 x U0_____|  */
@@ -352,53 +345,56 @@ ____mpn_sqr_n (prodp, up, size, tspace)
 
       /*** Add/copy Product L (twice).  */
 
-      cy += __mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
+      cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size);
       if (cy)
-	__mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy);
+	mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy);
 
       MPN_COPY (prodp, tspace, hsize);
-      cy = __mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize);
+      cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize);
       if (cy)
-	__mpn_add_1 (prodp + size, prodp + size, size, 1);
+	mpn_add_1 (prodp + size, prodp + size, size, 1);
     }
 }
 
 /* This should be made into an inline function in gmp.h.  */
 inline void
 #if __STDC__
-__mpn_mul_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
+mpn_mul_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size)
 #else
-__mpn_mul_n (prodp, up, vp, size)
+mpn_mul_n (prodp, up, vp, size)
      mp_ptr prodp;
      mp_srcptr up;
      mp_srcptr vp;
      mp_size_t size;
 #endif
 {
+  TMP_DECL (marker);
+  TMP_MARK (marker);
   if (up == vp)
     {
       if (size < KARATSUBA_THRESHOLD)
 	{
-	  ____mpn_sqr_n_basecase (prodp, up, size);
+	  impn_sqr_n_basecase (prodp, up, size);
 	}
       else
 	{
 	  mp_ptr tspace;
-	  tspace = (mp_ptr) alloca (2 * size * BYTES_PER_MP_LIMB);
-	  ____mpn_sqr_n (prodp, up, size, tspace);
+	  tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB);
+	  impn_sqr_n (prodp, up, size, tspace);
 	}
     }
   else
     {
       if (size < KARATSUBA_THRESHOLD)
 	{
-	  ____mpn_mul_n_basecase (prodp, up, vp, size);
+	  impn_mul_n_basecase (prodp, up, vp, size);
 	}
       else
 	{
 	  mp_ptr tspace;
-	  tspace = (mp_ptr) alloca (2 * size * BYTES_PER_MP_LIMB);
-	  ____mpn_mul_n (prodp, up, vp, size, tspace);
+	  tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB);
+	  impn_mul_n (prodp, up, vp, size, tspace);
 	}
     }
+  TMP_FREE (marker);
 }
diff --git a/sysdeps/generic/rshift.c b/sysdeps/generic/rshift.c
index 966cc7bcad..7ce02e07f7 100644
--- a/sysdeps/generic/rshift.c
+++ b/sysdeps/generic/rshift.c
@@ -1,6 +1,6 @@
-/* __mpn_rshift -- Shift right a low-level natural-number integer.
+/* mpn_rshift -- Shift right a low-level natural-number integer.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -32,11 +32,11 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_rshift (register mp_ptr wp,
+mpn_rshift (register mp_ptr wp,
 	    register mp_srcptr up, mp_size_t usize,
 	    register unsigned int cnt)
 #else
-__mpn_rshift (wp, up, usize, cnt)
+mpn_rshift (wp, up, usize, cnt)
      register mp_ptr wp;
      register mp_srcptr up;
      mp_size_t usize;
diff --git a/sysdeps/generic/sub_n.c b/sysdeps/generic/sub_n.c
index 6b33e6696f..f3c83d1fd6 100644
--- a/sysdeps/generic/sub_n.c
+++ b/sysdeps/generic/sub_n.c
@@ -1,6 +1,6 @@
-/* __mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
 
-Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -23,9 +23,9 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 mp_limb
 #if __STDC__
-__mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
 #else
-__mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
+mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
      register mp_ptr res_ptr;
      register mp_srcptr s1_ptr;
      register mp_srcptr s2_ptr;
diff --git a/sysdeps/generic/submul_1.c b/sysdeps/generic/submul_1.c
index 855dd3feaf..57122a5784 100644
--- a/sysdeps/generic/submul_1.c
+++ b/sysdeps/generic/submul_1.c
@@ -1,9 +1,9 @@
-/* __mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
    by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
    from the limb vector pointed to by RES_PTR.  Return the most significant
    limb of the product, adjusted for carry-out from the subtraction.
 
-Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -26,7 +26,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "longlong.h"
 
 mp_limb
-__mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
      register mp_ptr res_ptr;
      register mp_srcptr s1_ptr;
      mp_size_t s1_size;
diff --git a/sysdeps/i386/gmp-mparam.h b/sysdeps/i386/gmp-mparam.h
index 687f12aa35..ddc308ae20 100644
--- a/sysdeps/i386/gmp-mparam.h
+++ b/sysdeps/i386/gmp-mparam.h
@@ -1,6 +1,6 @@
 /* gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S
index f52f9c60bc..f214c8cb36 100644
--- a/sysdeps/i386/i586/add_n.S
+++ b/sysdeps/i386/i586/add_n.S
@@ -1,7 +1,7 @@
 /* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
 
-Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -30,13 +30,6 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "sysdep.h"
 #include "asm-syntax.h"
 
-#define t1	%eax
-#define t2	%edx
-#define src1	%esi
-#define src2	%ebp
-#define dst	%edi
-#define x	%ebx
-
 .text
 	ALIGN (3)
 	.globl C_SYMBOL_NAME(__mpn_add_n)
@@ -46,85 +39,85 @@ C_SYMBOL_NAME(__mpn_add_n:)
 	pushl	%ebx
 	pushl	%ebp
 
-	movl	20(%esp),dst		/* res_ptr */
-	movl	24(%esp),src1		/* s1_ptr */
-	movl	28(%esp),src2		/* s2_ptr */
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s1_ptr */
+	movl	28(%esp),%ebp		/* s2_ptr */
 	movl	32(%esp),%ecx		/* size */
 
-	movl	(src2),x
+	movl	(%ebp),%ebx
 
 	decl	%ecx
-	movl	%ecx,t2
+	movl	%ecx,%edx
 	shrl	$3,%ecx
-	andl	$7,t2
+	andl	$7,%edx
 	testl	%ecx,%ecx		/* zero carry flag */
 	jz	Lend
-	pushl	t2
+	pushl	%edx
 
 	ALIGN (3)
-Loop:	movl	28(dst),%eax		/* fetch destination cache line */
-	leal	32(dst),dst
-
-L1:	movl	(src1),t1
-	movl	4(src1),t2
-	adcl	x,t1
-	movl	4(src2),x
-	adcl	x,t2
-	movl	8(src2),x
-	movl	t1,-32(dst)
-	movl	t2,-28(dst)
-
-L2:	movl	8(src1),t1
-	movl	12(src1),t2
-	adcl	x,t1
-	movl	12(src2),x
-	adcl	x,t2
-	movl	16(src2),x
-	movl	t1,-24(dst)
-	movl	t2,-20(dst)
-
-L3:	movl	16(src1),t1
-	movl	20(src1),t2
-	adcl	x,t1
-	movl	20(src2),x
-	adcl	x,t2
-	movl	24(src2),x
-	movl	t1,-16(dst)
-	movl	t2,-12(dst)
-
-L4:	movl	24(src1),t1
-	movl	28(src1),t2
-	adcl	x,t1
-	movl	28(src2),x
-	adcl	x,t2
-	movl	32(src2),x
-	movl	t1,-8(dst)
-	movl	t2,-4(dst)
-
-	leal	32(src1),src1
-	leal	32(src2),src2
+Loop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi
+
+L1:	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%ebx,%eax
+	movl	4(%ebp),%ebx
+	adcl	%ebx,%edx
+	movl	8(%ebp),%ebx
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L2:	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	adcl	%ebx,%eax
+	movl	12(%ebp),%ebx
+	adcl	%ebx,%edx
+	movl	16(%ebp),%ebx
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L3:	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	adcl	%ebx,%eax
+	movl	20(%ebp),%ebx
+	adcl	%ebx,%edx
+	movl	24(%ebp),%ebx
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L4:	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%ebx,%eax
+	movl	28(%ebp),%ebx
+	adcl	%ebx,%edx
+	movl	32(%ebp),%ebx
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebp),%ebp
 	decl	%ecx
 	jnz	Loop
 
-	popl	t2
+	popl	%edx
 Lend:
-	decl	t2			/* test t2 w/o clobbering carry */
+	decl	%edx			/* test %edx w/o clobbering carry */
 	js	Lend2
-	incl	t2
+	incl	%edx
 Loop2:
-	leal	4(dst),dst
-	movl	(src1),t1
-	adcl	x,t1
-	movl	4(src2),x
-	movl	t1,-4(dst)
-	leal	4(src1),src1
-	leal	4(src2),src2
-	decl	t2
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	adcl	%ebx,%eax
+	movl	4(%ebp),%ebx
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebp),%ebp
+	decl	%edx
 	jnz	Loop2
 Lend2:
-	movl	(src1),t1
-	adcl	x,t1
-	movl	t1,(dst)
+	movl	(%esi),%eax
+	adcl	%ebx,%eax
+	movl	%eax,(%edi)
 
 	sbbl	%eax,%eax
 	negl	%eax
diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S
index b222840591..5bf2603cab 100644
--- a/sysdeps/i386/i586/addmul_1.S
+++ b/sysdeps/i386/i586/addmul_1.S
@@ -1,7 +1,7 @@
 /* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
    the result to a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -32,12 +32,12 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 
 #define res_ptr edi
 #define s1_ptr esi
+#define size ecx
 #define s2_limb ebp
 
 	TEXT
 	ALIGN (3)
 	GLOBL	C_SYMBOL_NAME(__mpn_addmul_1)
-	.type	C_SYMBOL_NAME(__mpn_addmul_1),@function
 C_SYMBOL_NAME(__mpn_addmul_1:)
 
 	INSN1(push,l	,R(edi))
@@ -47,38 +47,36 @@ C_SYMBOL_NAME(__mpn_addmul_1:)
 
 	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))
 	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))
-	INSN2(mov,l	,R(ecx),MEM_DISP(esp,28))
+	INSN2(mov,l	,R(size),MEM_DISP(esp,28))
 	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))
 
-	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,ecx,4))
-	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4))
-	INSN1(neg,l	,R(ecx))
-	INSN2(xor,l	,R(edx),R(edx))
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+	INSN1(neg,l	,R(size))
+	INSN2(xor,l	,R(ebx),R(ebx))
 	ALIGN (3)
-Loop:
-	INSN2(mov,l	,R(ebx),R(edx))
-	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,ecx,4))
+
+Loop:	INSN2(adc,l	,R(ebx),$0)
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
 
 	INSN1(mul,l	,R(s2_limb))
 
 	INSN2(add,l	,R(eax),R(ebx))
-	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,ecx,4))
+	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,size,4))
 
 	INSN2(adc,l	,R(edx),$0)
 	INSN2(add,l	,R(ebx),R(eax))
 
-	INSN2(adc,l	,R(edx),$0)
-	INSN2(mov,l	,MEM_INDEX(res_ptr,ecx,4),R(ebx))
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(ebx))
+	INSN1(inc,l	,R(size))
 
-	INSN1(inc,l	,R(ecx))
+	INSN2(mov,l	,R(ebx),R(edx))
 	INSN1(jnz,	,Loop)
 
-
-	INSN2(mov,l	,R(eax),R(edx))
+	INSN2(adc,l	,R(ebx),$0)
+	INSN2(mov,l	,R(eax),R(ebx))
 	INSN1(pop,l	,R(ebp))
 	INSN1(pop,l	,R(ebx))
 	INSN1(pop,l	,R(esi))
 	INSN1(pop,l	,R(edi))
 	ret
-Lfe1:
-	.size	C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1)
diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S
index 2b7258e130..048c0601f2 100644
--- a/sysdeps/i386/i586/mul_1.S
+++ b/sysdeps/i386/i586/mul_1.S
@@ -1,7 +1,7 @@
 /* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
    the result in a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -53,24 +53,24 @@ C_SYMBOL_NAME(__mpn_mul_1:)
 	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))
 	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
 	INSN1(neg,l	,R(size))
-	INSN2(xor,l	,R(edx),R(edx))
+	INSN2(xor,l	,R(ebx),R(ebx))
 	ALIGN (3)
-Loop:
-	INSN2(mov,l	,R(ebx),R(edx))
+
+Loop:	INSN2(adc,l	,R(ebx),$0)
 	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
 
 	INSN1(mul,l	,R(s2_limb))
 
-	INSN2(add,l	,R(eax),R(ebx))
-
-	INSN2(adc,l	,R(edx),$0)
-	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(eax))
+	INSN2(add,l	,R(ebx),R(eax))
 
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(ebx))
 	INSN1(inc,l	,R(size))
-	INSN1(jnz,	,Loop)
 
+	INSN2(mov,l	,R(ebx),R(edx))
+	INSN1(jnz,	,Loop)
 
-	INSN2(mov,l	,R(eax),R(edx))
+	INSN2(adc,l	,R(ebx),$0)
+	INSN2(mov,l	,R(eax),R(ebx))
 	INSN1(pop,l	,R(ebp))
 	INSN1(pop,l	,R(ebx))
 	INSN1(pop,l	,R(esi))
diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S
index 9c964a82f3..cd158a5469 100644
--- a/sysdeps/i386/i586/sub_n.S
+++ b/sysdeps/i386/i586/sub_n.S
@@ -1,7 +1,7 @@
 /* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
    and store difference in a third limb vector.
 
-Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -30,13 +30,6 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
 #include "sysdep.h"
 #include "asm-syntax.h"
 
-#define t1	%eax
-#define t2	%edx
-#define src1	%esi
-#define src2	%ebp
-#define dst	%edi
-#define x	%ebx
-
 .text
 	ALIGN (3)
 	.globl C_SYMBOL_NAME(__mpn_sub_n)
@@ -46,85 +39,85 @@ C_SYMBOL_NAME(__mpn_sub_n:)
 	pushl	%ebx
 	pushl	%ebp
 
-	movl	20(%esp),dst		/* res_ptr */
-	movl	24(%esp),src1		/* s1_ptr */
-	movl	28(%esp),src2		/* s2_ptr */
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s1_ptr */
+	movl	28(%esp),%ebp		/* s2_ptr */
 	movl	32(%esp),%ecx		/* size */
 
-	movl	(src2),x
+	movl	(%ebp),%ebx
 
 	decl	%ecx
-	movl	%ecx,t2
+	movl	%ecx,%edx
 	shrl	$3,%ecx
-	andl	$7,t2
+	andl	$7,%edx
 	testl	%ecx,%ecx		/* zero carry flag */
 	jz	Lend
-	pushl	t2
+	pushl	%edx
 
 	ALIGN (3)
-Loop:	movl	28(dst),%eax		/* fetch destination cache line */
-	leal	32(dst),dst
-
-L1:	movl	(src1),t1
-	movl	4(src1),t2
-	sbbl	x,t1
-	movl	4(src2),x
-	sbbl	x,t2
-	movl	8(src2),x
-	movl	t1,-32(dst)
-	movl	t2,-28(dst)
-
-L2:	movl	8(src1),t1
-	movl	12(src1),t2
-	sbbl	x,t1
-	movl	12(src2),x
-	sbbl	x,t2
-	movl	16(src2),x
-	movl	t1,-24(dst)
-	movl	t2,-20(dst)
-
-L3:	movl	16(src1),t1
-	movl	20(src1),t2
-	sbbl	x,t1
-	movl	20(src2),x
-	sbbl	x,t2
-	movl	24(src2),x
-	movl	t1,-16(dst)
-	movl	t2,-12(dst)
-
-L4:	movl	24(src1),t1
-	movl	28(src1),t2
-	sbbl	x,t1
-	movl	28(src2),x
-	sbbl	x,t2
-	movl	32(src2),x
-	movl	t1,-8(dst)
-	movl	t2,-4(dst)
-
-	leal	32(src1),src1
-	leal	32(src2),src2
+Loop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	leal	32(%edi),%edi
+
+L1:	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	sbbl	%ebx,%eax
+	movl	4(%ebp),%ebx
+	sbbl	%ebx,%edx
+	movl	8(%ebp),%ebx
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L2:	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	sbbl	%ebx,%eax
+	movl	12(%ebp),%ebx
+	sbbl	%ebx,%edx
+	movl	16(%ebp),%ebx
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L3:	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	sbbl	%ebx,%eax
+	movl	20(%ebp),%ebx
+	sbbl	%ebx,%edx
+	movl	24(%ebp),%ebx
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L4:	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	sbbl	%ebx,%eax
+	movl	28(%ebp),%ebx
+	sbbl	%ebx,%edx
+	movl	32(%ebp),%ebx
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebp),%ebp
 	decl	%ecx
 	jnz	Loop
 
-	popl	t2
+	popl	%edx
 Lend:
-	decl	t2			/* test t2 w/o clobbering carry */
+	decl	%edx			/* test %edx w/o clobbering carry */
 	js	Lend2
-	incl	t2
+	incl	%edx
 Loop2:
-	leal	4(dst),dst
-	movl	(src1),t1
-	sbbl	x,t1
-	movl	4(src2),x
-	movl	t1,-4(dst)
-	leal	4(src1),src1
-	leal	4(src2),src2
-	decl	t2
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	sbbl	%ebx,%eax
+	movl	4(%ebp),%ebx
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebp),%ebp
+	decl	%edx
 	jnz	Loop2
 Lend2:
-	movl	(src1),t1
-	sbbl	x,t1
-	movl	t1,(dst)
+	movl	(%esi),%eax
+	sbbl	%ebx,%eax
+	movl	%eax,(%edi)
 
 	sbbl	%eax,%eax
 	negl	%eax
diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S
index 14bfe54e24..440f64f358 100644
--- a/sysdeps/i386/i586/submul_1.S
+++ b/sysdeps/i386/i586/submul_1.S
@@ -1,7 +1,7 @@
 /* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
    the result from a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -53,10 +53,10 @@ C_SYMBOL_NAME(__mpn_submul_1:)
 	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))
 	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
 	INSN1(neg,l	,R(size))
-	INSN2(xor,l	,R(edx),R(edx))
+	INSN2(xor,l	,R(ebx),R(ebx))
 	ALIGN (3)
-Loop:
-	INSN2(mov,l	,R(ebx),R(edx))
+
+Loop:	INSN2(adc,l	,R(ebx),$0)
 	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
 
 	INSN1(mul,l	,R(s2_limb))
@@ -67,14 +67,14 @@ Loop:
 	INSN2(adc,l	,R(edx),$0)
 	INSN2(sub,l	,R(ebx),R(eax))
 
-	INSN2(adc,l	,R(edx),$0)
 	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(ebx))
-
 	INSN1(inc,l	,R(size))
-	INSN1(jnz,	,Loop)
 
+	INSN2(mov,l	,R(ebx),R(edx))
+	INSN1(jnz,	,Loop)
 
-	INSN2(mov,l	,R(eax),R(edx))
+	INSN2(adc,l	,R(ebx),$0)
+	INSN2(mov,l	,R(eax),R(ebx))
 	INSN1(pop,l	,R(ebp))
 	INSN1(pop,l	,R(ebx))
 	INSN1(pop,l	,R(esi))
diff --git a/sysdeps/m68k/add_n.S b/sysdeps/m68k/add_n.S
index ea7a4458ea..754af9f469 100644
--- a/sysdeps/m68k/add_n.S
+++ b/sysdeps/m68k/add_n.S
@@ -1,7 +1,7 @@
 /* mc68020 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
   size		(sp + 12)
 */
 
+#include "sysdep.h"
 #include "asm-syntax.h"
 
 	TEXT
 	ALIGN
-	GLOBL	___mpn_add_n
+	GLOBL	C_SYMBOL_NAME(__mpn_add_n)
 
-LAB(___mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n:)
+PROLOG(__mpn_add_n)
 /* Save used registers on the stack.  */
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
-	INSN2(move,l	,MEM_PREDEC(sp),a2)
+	movel	R(d2),MEM_PREDEC(sp)
+	movel	R(a2),MEM_PREDEC(sp)
 
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,a2,MEM_DISP(sp,12))
-	INSN2(move,l	,a0,MEM_DISP(sp,16))
-	INSN2(move,l	,a1,MEM_DISP(sp,20))
-	INSN2(move,l	,d2,MEM_DISP(sp,24))
-
-	INSN2(eor,w	,d2,#1)
-	INSN2(lsr,l	,d2,#1)
-	bcc L1
-	INSN2(subq,l	,d2,#1)		/* clears cy as side effect */
-
-LAB(Loop)
-	INSN2(move,l	,d0,MEM_POSTINC(a0))
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
-	INSN2(addx,l	,d0,d1)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
-LAB(L1)	INSN2(move,l	,d0,MEM_POSTINC(a0))
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
-	INSN2(addx,l	,d0,d1)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
-
-	dbf d2,Loop			/* loop until 16 lsb of %4 == -1 */
-	INSN2(subx,l	,d0,d0)		/* d0 <= -cy; save cy as 0 or -1 in d0 */
-	INSN2(sub,l	,d2,#0x10000)
-	bcs L2
-	INSN2(add,l	,d0,d0)		/* restore cy */
-	bra Loop
-
-LAB(L2)
-	INSN1(neg,l	,d0)
+	movel	MEM_DISP(sp,12),R(a2)
+	movel	MEM_DISP(sp,16),R(a0)
+	movel	MEM_DISP(sp,20),R(a1)
+	movel	MEM_DISP(sp,24),R(d2)
+
+	eorw	#1,R(d2)
+	lsrl	#1,R(d2)
+	bcc	L(L1)
+	subql	#1,R(d2)	/* clears cy as side effect */
+
+L(Loop:)
+	movel	MEM_POSTINC(a0),R(d0)
+	movel	MEM_POSTINC(a1),R(d1)
+	addxl	R(d1),R(d0)
+	movel	R(d0),MEM_POSTINC(a2)
+L(L1:)	movel	MEM_POSTINC(a0),R(d0)
+	movel	MEM_POSTINC(a1),R(d1)
+	addxl	R(d1),R(d0)
+	movel	R(d0),MEM_POSTINC(a2)
+
+	dbf	R(d2),L(Loop)		/* loop until 16 lsb of %4 == -1 */
+	subxl	R(d0),R(d0)	/* d0 <= -cy; save cy as 0 or -1 in d0 */
+	subl	#0x10000,R(d2)
+	bcs	L(L2)
+	addl	R(d0),R(d0)	/* restore cy */
+	bra	L(Loop)
+
+L(L2:)
+	negl	R(d0)
 
 /* Restore used registers from stack frame.  */
-	INSN2(move,l	,a2,MEM_POSTINC(sp))
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(a2)
+	movel	MEM_POSTINC(sp),R(d2)
 
 	rts
+EPILOG(__mpn_add_n)
diff --git a/sysdeps/m68k/lshift.S b/sysdeps/m68k/lshift.S
new file mode 100644
index 0000000000..c58594a01b
--- /dev/null
+++ b/sysdeps/m68k/lshift.S
@@ -0,0 +1,150 @@
+/* mc68020 __mpn_lshift -- Shift left a low-level natural-number integer.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  s_size	(sp + 16)
+  cnt		(sp + 12)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+	TEXT
+	ALIGN
+	GLOBL	C_SYMBOL_NAME(__mpn_lshift)
+
+C_SYMBOL_NAME(__mpn_lshift:)
+PROLOG(__mpn_lshift)
+
+/* Save used registers on the stack.  */
+	moveml	R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers.  */
+	movel	MEM_DISP(sp,28),R(res_ptr)
+	movel	MEM_DISP(sp,32),R(s_ptr)
+	movel	MEM_DISP(sp,36),R(s_size)
+	movel	MEM_DISP(sp,40),R(cnt)
+
+	moveql	#1,R(d5)
+	cmpl	R(d5),R(cnt)
+	bne	L(Lnormal)
+	cmpl	R(s_ptr),R(res_ptr)
+	bls	L(Lspecial)		/* jump if s_ptr >= res_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+	lea	MEM_INDX1(s_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+	movel	R(s_size),R(d0)
+	asll	#2,R(d0)
+	lea	MEM_INDX(s_ptr,d0,l),R(a2)
+#endif
+	cmpl	R(res_ptr),R(a2)
+	bls	L(Lspecial)		/* jump if res_ptr >= s_ptr + s_size */
+
+L(Lnormal:)
+	moveql	#32,R(d5)
+	subl	R(cnt),R(d5)
+
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+	lea	MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+	lea	MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+	movel	R(s_size),R(d0)
+	asll	#2,R(d0)
+	addl	R(s_size),R(s_ptr)
+	addl	R(s_size),R(res_ptr)
+#endif
+	movel	MEM_PREDEC(s_ptr),R(d2)
+	movel	R(d2),R(d0)
+	lsrl	R(d5),R(d0)		/* compute carry limb */
+
+	lsll	R(cnt),R(d2)
+	movel	R(d2),R(d1)
+	subql	#1,R(s_size)
+	beq	L(Lend)
+	lsrl	#1,R(s_size)
+	bcs	L(L1)
+	subql	#1,R(s_size)
+
+L(Loop:)
+	movel	MEM_PREDEC(s_ptr),R(d2)
+	movel	R(d2),R(d3)
+	lsrl	R(d5),R(d3)
+	orl	R(d3),R(d1)
+	movel	R(d1),MEM_PREDEC(res_ptr)
+	lsll	R(cnt),R(d2)
+L(L1:)
+	movel	MEM_PREDEC(s_ptr),R(d1)
+	movel	R(d1),R(d3)
+	lsrl	R(d5),R(d3)
+	orl	R(d3),R(d2)
+	movel	R(d2),MEM_PREDEC(res_ptr)
+	lsll	R(cnt),R(d1)
+
+	dbf	R(s_size),L(Loop)
+	subl	#0x10000,R(s_size)
+	bcc	L(Loop)
+
+L(Lend:)
+	movel	R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */
+
+/* Restore used registers from stack frame.  */
+	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+	rts
+
+/* We loop from least significant end of the arrays, which is only
+   permissable if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.  */
+
+L(Lspecial:)
+	clrl	R(d0)			/* initialize carry */
+	eorw	#1,R(s_size)
+	lsrl	#1,R(s_size)
+	bcc	L(LL1)
+	subql	#1,R(s_size)
+
+L(LLoop:)
+	movel	MEM_POSTINC(s_ptr),R(d2)
+	addxl	R(d2),R(d2)
+	movel	R(d2),MEM_POSTINC(res_ptr)
+L(LL1:)
+	movel	MEM_POSTINC(s_ptr),R(d2)
+	addxl	R(d2),R(d2)
+	movel	R(d2),MEM_POSTINC(res_ptr)
+
+	dbf	R(s_size),L(LLoop)
+	addxl	R(d0),R(d0)		/* save cy in lsb */
+	subl	#0x10000,R(s_size)
+	bcs	L(LLend)
+	lsrl	#1,R(d0)		/* restore cy */
+	bra	L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame.  */
+	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+	rts
+EPILOG(__mpn_lshift)
diff --git a/sysdeps/m68k/m68020/addmul_1.S b/sysdeps/m68k/m68020/addmul_1.S
index 3f244c40b4..169f1135be 100644
--- a/sysdeps/m68k/m68020/addmul_1.S
+++ b/sysdeps/m68k/m68020/addmul_1.S
@@ -1,7 +1,7 @@
 /* mc68020 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
    the result to a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
   INPUT PARAMETERS
   res_ptr	(sp + 4)
   s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
   s2_limb	(sp + 16)
 */
 
+#include "sysdep.h"
 #include "asm-syntax.h"
 
 	TEXT
 	ALIGN
-	GLOBL	___mpn_addmul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_addmul_1)
 
-LAB(___mpn_addmul_1)
+C_SYMBOL_NAME(__mpn_addmul_1:)
+PROLOG(__mpn_addmul_1)
 
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d5)
+	moveml	R(d2)-R(d5),MEM_PREDEC(sp)
 
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,20))
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,24))
-	INSN2(move,l	,size,MEM_DISP(sp,28))
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,32))
-
-	INSN2(eor,w	,size,#1)
-	INSN1(clr,l	,d1)
-	INSN1(clr,l	,d5)
-	INSN2(lsr,l	,size,#1)
-	bcc	L1
-	INSN2(subq,l	,size,#1)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
-
-LAB(Loop)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d1:d3,s2_limb)
-	INSN2(addx,l	,d3,d0)
-	INSN2(addx,l	,d1,d5)
-	INSN2(add,l	,MEM_POSTINC(res_ptr),d3)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d0:d3,s2_limb)
-	INSN2(addx,l	,d3,d1)
-	INSN2(addx,l	,d0,d5)
-	INSN2(add,l	,MEM_POSTINC(res_ptr),d3)
-
-	dbf	size,Loop
-	INSN2(addx,l	,d0,d5)
-	INSN2(sub,l	,size,#0x10000)
-	bcc	Loop
+	movel	MEM_DISP(sp,20),R(res_ptr)
+	movel	MEM_DISP(sp,24),R(s1_ptr)
+	movel	MEM_DISP(sp,28),R(s1_size)
+	movel	MEM_DISP(sp,32),R(s2_limb)
+
+	eorw	#1,R(s1_size)
+	clrl	R(d1)
+	clrl	R(d5)
+	lsrl	#1,R(s1_size)
+	bcc	L(L1)
+	subql	#1,R(s1_size)
+	subl	R(d0),R(d0)		/* (d0,cy) <= (0,0) */
+
+L(Loop:)
+	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d1):R(d3)
+	addxl	R(d0),R(d3)
+	addxl	R(d5),R(d1)
+	addl	R(d3),MEM_POSTINC(res_ptr)
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d0):R(d3)
+	addxl	R(d1),R(d3)
+	addxl	R(d5),R(d0)
+	addl	R(d3),MEM_POSTINC(res_ptr)
+
+	dbf	R(s1_size),L(Loop)
+	addxl	R(d5),R(d0)
+	subl	#0x10000,R(s1_size)
+	bcc	L(Loop)
 
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d5,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d5)
 
 	rts
+EPILOG(__mpn_addmul_1)
diff --git a/sysdeps/m68k/m68020/mul_1.S b/sysdeps/m68k/m68020/mul_1.S
index 548ca0091b..4db1ccac25 100644
--- a/sysdeps/m68k/m68020/mul_1.S
+++ b/sysdeps/m68k/m68020/mul_1.S
@@ -1,7 +1,7 @@
 /* mc68020 __mpn_mul_1 -- Multiply a limb vector with a limb and store
    the result in a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -23,65 +23,68 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
   INPUT PARAMETERS
   res_ptr	(sp + 4)
   s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
   s2_limb	(sp + 16)
 */
 
+#include "sysdep.h"
 #include "asm-syntax.h"
 
 	TEXT
 	ALIGN
-	GLOBL	___mpn_mul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_mul_1)
 
-LAB(___mpn_mul_1)
+C_SYMBOL_NAME(__mpn_mul_1:)
+PROLOG(__mpn_mul_1)
 
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d4)
+	moveml	R(d2)-R(d4),MEM_PREDEC(sp)
 #if 0
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
-	INSN2(move,l	,MEM_PREDEC(sp),d3)
-	INSN2(move,l	,MEM_PREDEC(sp),d4)
+	movel	R(d2),MEM_PREDEC(sp)
+	movel	R(d3),MEM_PREDEC(sp)
+	movel	R(d4),MEM_PREDEC(sp)
 #endif
 
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,16))
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,20))
-	INSN2(move,l	,size,MEM_DISP(sp,24))
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,28))
-
-	INSN2(eor,w	,size,#1)
-	INSN1(clr,l	,d1)
-	INSN2(lsr,l	,size,#1)
-	bcc	L1
-	INSN2(subq,l	,size,#1)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
-
-LAB(Loop)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d1:d3,s2_limb)
-	INSN2(addx,l	,d3,d0)
-	INSN2(move,l	,MEM_POSTINC(res_ptr),d3)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d0:d3,s2_limb)
-	INSN2(addx,l	,d3,d1)
-	INSN2(move,l	,MEM_POSTINC(res_ptr),d3)
-
-	dbf	size,Loop
-	INSN1(clr,l	,d3)
-	INSN2(addx,l	,d0,d3)
-	INSN2(sub,l	,size,#0x10000)
-	bcc	Loop
+	movel	MEM_DISP(sp,16),R(res_ptr)
+	movel	MEM_DISP(sp,20),R(s1_ptr)
+	movel	MEM_DISP(sp,24),R(s1_size)
+	movel	MEM_DISP(sp,28),R(s2_limb)
+
+	eorw	#1,R(s1_size)
+	clrl	R(d1)
+	lsrl	#1,R(s1_size)
+	bcc	L(L1)
+	subql	#1,R(s1_size)
+	subl	R(d0),R(d0)	/* (d0,cy) <= (0,0) */
+
+L(Loop:)
+	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d1):R(d3)
+	addxl	R(d0),R(d3)
+	movel	R(d3),MEM_POSTINC(res_ptr)
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d0):R(d3)
+	addxl	R(d1),R(d3)
+	movel	R(d3),MEM_POSTINC(res_ptr)
+
+	dbf	R(s1_size),L(Loop)
+	clrl	R(d3)
+	addxl	R(d3),R(d0)
+	subl	#0x10000,R(s1_size)
+	bcc	L(Loop)
 
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d4,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d4)
 #if 0
-	INSN2(move,l	,d4,MEM_POSTINC(sp))
-	INSN2(move,l	,d3,MEM_POSTINC(sp))
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(d4)
+	movel	MEM_POSTINC(sp),R(d3)
+	movel	MEM_POSTINC(sp),R(d2)
 #endif
 	rts
+EPILOG(__mpn_mul_1)
diff --git a/sysdeps/m68k/m68020/submul_1.S b/sysdeps/m68k/m68020/submul_1.S
index ef7f39de7a..cf30029b2f 100644
--- a/sysdeps/m68k/m68020/submul_1.S
+++ b/sysdeps/m68k/m68020/submul_1.S
@@ -1,7 +1,7 @@
 /* mc68020 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
    the result from a second limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -23,58 +23,61 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
   INPUT PARAMETERS
   res_ptr	(sp + 4)
   s1_ptr	(sp + 8)
-  size		(sp + 12)
+  s1_size	(sp + 12)
   s2_limb	(sp + 16)
 */
 
+#include "sysdep.h"
 #include "asm-syntax.h"
 
 	TEXT
 	ALIGN
-	GLOBL	___mpn_submul_1
+	GLOBL	C_SYMBOL_NAME(__mpn_submul_1)
 
-LAB(___mpn_submul_1)
+C_SYMBOL_NAME(__mpn_submul_1:)
+PROLOG(__mpn_submul_1)
 
 #define res_ptr a0
 #define s1_ptr a1
-#define size d2
+#define s1_size d2
 #define s2_limb d4
 
 /* Save used registers on the stack.  */
-	INSN2(movem,l	,MEM_PREDEC(sp),d2-d5)
+	moveml	R(d2)-R(d5),MEM_PREDEC(sp)
 
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,res_ptr,MEM_DISP(sp,20))
-	INSN2(move,l	,s1_ptr,MEM_DISP(sp,24))
-	INSN2(move,l	,size,MEM_DISP(sp,28))
-	INSN2(move,l	,s2_limb,MEM_DISP(sp,32))
-
-	INSN2(eor,w	,size,#1)
-	INSN1(clr,l	,d1)
-	INSN1(clr,l	,d5)
-	INSN2(lsr,l	,size,#1)
-	bcc	L1
-	INSN2(subq,l	,size,#1)
-	INSN2(sub,l	,d0,d0)		/* (d0,cy) <= (0,0) */
-
-LAB(Loop)
-	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d1:d3,s2_limb)
-	INSN2(addx,l	,d3,d0)
-	INSN2(addx,l	,d1,d5)
-	INSN2(sub,l	,MEM_POSTINC(res_ptr),d3)
-LAB(L1)	INSN2(move,l	,d3,MEM_POSTINC(s1_ptr))
-	INSN2(mulu,l	,d0:d3,s2_limb)
-	INSN2(addx,l	,d3,d1)
-	INSN2(addx,l	,d0,d5)
-	INSN2(sub,l	,MEM_POSTINC(res_ptr),d3)
-
-	dbf	size,Loop
-	INSN2(addx,l	,d0,d5)
-	INSN2(sub,l	,size,#0x10000)
-	bcc	Loop
+	movel	MEM_DISP(sp,20),R(res_ptr)
+	movel	MEM_DISP(sp,24),R(s1_ptr)
+	movel	MEM_DISP(sp,28),R(s1_size)
+	movel	MEM_DISP(sp,32),R(s2_limb)
+
+	eorw	#1,R(s1_size)
+	clrl	R(d1)
+	clrl	R(d5)
+	lsrl	#1,R(s1_size)
+	bcc	L(L1)
+	subql	#1,R(s1_size)
+	subl	R(d0),R(d0)	/* (d0,cy) <= (0,0) */
+
+L(Loop:)
+	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d1):R(d3)
+	addxl	R(d0),R(d3)
+	addxl	R(d5),R(d1)
+	subl	R(d3),MEM_POSTINC(res_ptr)
+L(L1:)	movel	MEM_POSTINC(s1_ptr),R(d3)
+	mulul	R(s2_limb),R(d0):R(d3)
+	addxl	R(d1),R(d3)
+	addxl	R(d5),R(d0)
+	subl	R(d3),MEM_POSTINC(res_ptr)
+
+	dbf	R(s1_size),L(Loop)
+	addxl	R(d5),R(d0)
+	subl	#0x10000,R(s1_size)
+	bcc	L(Loop)
 
 /* Restore used registers from stack frame.  */
-	INSN2(movem,l	,d2-d5,MEM_POSTINC(sp))
+	moveml	MEM_POSTINC(sp),R(d2)-R(d5)
 
 	rts
+EPILOG(__mpn_submul_1)
diff --git a/sysdeps/m68k/rshift.S b/sysdeps/m68k/rshift.S
new file mode 100644
index 0000000000..494dfcbeab
--- /dev/null
+++ b/sysdeps/m68k/rshift.S
@@ -0,0 +1,149 @@
+/* mc68020 __mpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  s_size	(sp + 16)
+  cnt		(sp + 12)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+	TEXT
+	ALIGN
+	GLOBL	C_SYMBOL_NAME(__mpn_rshift)
+
+C_SYMBOL_NAME(__mpn_rshift:)
+PROLOG(__mpn_rshift)
+/* Save used registers on the stack.  */
+	moveml	R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers.  */
+	movel	MEM_DISP(sp,28),R(res_ptr)
+	movel	MEM_DISP(sp,32),R(s_ptr)
+	movel	MEM_DISP(sp,36),R(s_size)
+	movel	MEM_DISP(sp,40),R(cnt)
+
+	moveql	#1,R(d5)
+	cmpl	R(d5),R(cnt)
+	bne	L(Lnormal)
+	cmpl	R(res_ptr),R(s_ptr)
+	bls	L(Lspecial)		/* jump if res_ptr >= s_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+	lea	MEM_INDX1(res_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+	movel	R(s_size),R(d0)
+	asll	#2,R(d0)
+	lea	MEM_INDX(res_ptr,d0,l),R(a2)
+#endif
+	cmpl	R(s_ptr),R(a2)
+	bls	L(Lspecial)		/* jump if s_ptr >= res_ptr + s_size */
+
+L(Lnormal:)
+	moveql	#32,R(d5)
+	subl	R(cnt),R(d5)
+	movel	MEM_POSTINC(s_ptr),R(d2)
+	movel	R(d2),R(d0)
+	lsll	R(d5),R(d0)		/* compute carry limb */
+   
+	lsrl	R(cnt),R(d2)
+	movel	R(d2),R(d1)
+	subql	#1,R(s_size)
+	beq	L(Lend)
+	lsrl	#1,R(s_size)
+	bcs	L(L1)
+	subql	#1,R(s_size)
+
+L(Loop:)
+	movel	MEM_POSTINC(s_ptr),R(d2)
+	movel	R(d2),R(d3)
+	lsll	R(d5),R(d3)
+	orl	R(d3),R(d1)
+	movel	R(d1),MEM_POSTINC(res_ptr)
+	lsrl	R(cnt),R(d2)
+L(L1:)
+	movel	MEM_POSTINC(s_ptr),R(d1)
+	movel	R(d1),R(d3)
+	lsll	R(d5),R(d3)
+	orl	R(d3),R(d2)
+	movel	R(d2),MEM_POSTINC(res_ptr)
+	lsrl	R(cnt),R(d1)
+
+	dbf	R(s_size),L(Loop)
+	subl	#0x10000,R(s_size)
+	bcc	L(Loop)
+
+L(Lend:)
+	movel	R(d1),MEM(res_ptr) /* store most significant limb */
+
+/* Restore used registers from stack frame.  */
+	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+	rts
+
+/* We loop from most significant end of the arrays, which is only
+   permissable if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.  */
+
+L(Lspecial:)
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+	lea	MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+	lea	MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+	movel	R(s_size),R(d0)
+	asll	#2,R(d0)
+	addl	R(s_size),R(s_ptr)
+	addl	R(s_size),R(res_ptr)
+#endif
+
+	clrl	R(d0)			/* initialize carry */
+	eorw	#1,R(s_size)
+	lsrl	#1,R(s_size)
+	bcc	L(LL1)
+	subql	#1,R(s_size)
+
+L(LLoop:)
+	movel	MEM_PREDEC(s_ptr),R(d2)
+	roxrl	#1,R(d2)
+	movel	R(d2),MEM_PREDEC(res_ptr)
+L(LL1:)
+	movel	MEM_PREDEC(s_ptr),R(d2)
+	roxrl	#1,R(d2)
+	movel	R(d2),MEM_PREDEC(res_ptr)
+
+	dbf	R(s_size),L(LLoop)
+	roxrl	#1,R(d0)		/* save cy in msb */
+	subl	#0x10000,R(s_size)
+	bcs	L(LLend)
+	addl	R(d0),R(d0)		/* restore cy */
+	bra	L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame.  */
+	moveml	MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+	rts
+EPILOG(__mpn_rshift)
diff --git a/sysdeps/m68k/sub_n.S b/sysdeps/m68k/sub_n.S
index 19f0ec1568..39f5161176 100644
--- a/sysdeps/m68k/sub_n.S
+++ b/sysdeps/m68k/sub_n.S
@@ -1,7 +1,7 @@
 /* mc68020 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
    store difference in a third limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -27,50 +27,53 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
   size		(sp + 12)
 */
 
+#include "sysdep.h"
 #include "asm-syntax.h"
 
 	TEXT
 	ALIGN
-	GLOBL	___mpn_sub_n
+	GLOBL	C_SYMBOL_NAME(__mpn_sub_n)
 
-LAB(___mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n:)
+PROLOG(__mpn_sub_n)
 /* Save used registers on the stack.  */
-	INSN2(move,l	,MEM_PREDEC(sp),d2)
-	INSN2(move,l	,MEM_PREDEC(sp),a2)
+	movel	R(d2),MEM_PREDEC(sp)
+	movel	R(a2),MEM_PREDEC(sp)
 
 /* Copy the arguments to registers.  Better use movem?  */
-	INSN2(move,l	,a2,MEM_DISP(sp,12))
-	INSN2(move,l	,a0,MEM_DISP(sp,16))
-	INSN2(move,l	,a1,MEM_DISP(sp,20))
-	INSN2(move,l	,d2,MEM_DISP(sp,24))
-
-	INSN2(eor,w	,d2,#1)
-	INSN2(lsr,l	,d2,#1)
-	bcc L1
-	INSN2(subq,l	,d2,#1)		/* clears cy as side effect */
-
-LAB(Loop)
-	INSN2(move,l	,d0,MEM_POSTINC(a0))
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
-	INSN2(subx,l	,d0,d1)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
-LAB(L1)	INSN2(move,l	,d0,MEM_POSTINC(a0))
-	INSN2(move,l	,d1,MEM_POSTINC(a1))
-	INSN2(subx,l	,d0,d1)
-	INSN2(move,l	,MEM_POSTINC(a2),d0)
-
-	dbf d2,Loop			/* loop until 16 lsb of %4 == -1 */
-	INSN2(subx,l	,d0,d0)		/* d0 <= -cy; save cy as 0 or -1 in d0 */
-	INSN2(sub,l	,d2,#0x10000)
-	bcs L2
-	INSN2(add,l	,d0,d0)		/* restore cy */
-	bra Loop
-
-LAB(L2)
-	INSN1(neg,l	,d0)
+	movel	MEM_DISP(sp,12),R(a2)
+	movel	MEM_DISP(sp,16),R(a0)
+	movel	MEM_DISP(sp,20),R(a1)
+	movel	MEM_DISP(sp,24),R(d2)
+
+	eorw	#1,R(d2)
+	lsrl	#1,R(d2)
+	bcc	L(L1)
+	subql	#1,R(d2)	/* clears cy as side effect */
+
+L(Loop:)
+	movel	MEM_POSTINC(a0),R(d0)
+	movel	MEM_POSTINC(a1),R(d1)
+	subxl	R(d1),R(d0)
+	movel	R(d0),MEM_POSTINC(a2)
+L(L1:)	movel	MEM_POSTINC(a0),R(d0)
+	movel	MEM_POSTINC(a1),R(d1)
+	subxl	R(d1),R(d0)
+	movel	R(d0),MEM_POSTINC(a2)
+
+	dbf	R(d2),L(Loop)		/* loop until 16 lsb of %4 == -1 */
+	subxl	R(d0),R(d0)	/* d0 <= -cy; save cy as 0 or -1 in d0 */
+	subl	#0x10000,R(d2)
+	bcs	L(L2)
+	addl	R(d0),R(d0)	/* restore cy */
+	bra	L(Loop)
+
+L(L2:)
+	negl	R(d0)
 
 /* Restore used registers from stack frame.  */
-	INSN2(move,l	,a2,MEM_POSTINC(sp))
-	INSN2(move,l	,d2,MEM_POSTINC(sp))
+	movel	MEM_POSTINC(sp),R(a2)
+	movel	MEM_POSTINC(sp),R(d2)
 
 	rts
+EPILOG(__mpn_sub_n)
diff --git a/sysdeps/m88k/add_n.s b/sysdeps/m88k/add_n.s
index 7e4ccccb90..d56447904e 100644
--- a/sysdeps/m88k/add_n.s
+++ b/sysdeps/m88k/add_n.s
@@ -1,7 +1,7 @@
 ; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store
 ; sum in a third limb vector.
 
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ; This file is part of the GNU MP Library.
 
diff --git a/sysdeps/m88k/m88110/add_n.S b/sysdeps/m88k/m88110/add_n.S
new file mode 100644
index 0000000000..ab20630a5e
--- /dev/null
+++ b/sysdeps/m88k/m88110/add_n.S
@@ -0,0 +1,199 @@
+; mc88110 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 1996 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr	r2
+#define s1_ptr	r3
+#define s2_ptr	r4
+#define size	r5
+
+#include "sysdep.h"
+
+	text
+	align	16
+	global	C_SYMBOL_NAME(__mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n):
+	addu.co	 r0,r0,r0		; clear cy flag
+	xor	 r12,s2_ptr,res_ptr
+	bb1	 2,r12,L1
+; **  V1a  **
+L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned?
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	addu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s1_ptr,0
+	ld	 r12,s1_ptr,4
+	ld.d	 r8,s2_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1:	subu	 size,size,8
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,16
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,20
+	ld.d	 r8,s2_ptr,16
+	st.d	 r6,res_ptr,8
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,24
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,28
+	ld.d	 r8,s2_ptr,24
+	st.d	 r6,res_ptr,16
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,32
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,36
+	addu	 s1_ptr,s1_ptr,32
+	ld.d	 r8,s2_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1
+
+Lfin1:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1:	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1
+Lend1:	addu.cio r6,r10,r8
+	addu.cio r7,r12,r9
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1
+/* Add last limb */
+	ld	 r10,s1_ptr,8
+	ld	 r8,s2_ptr,8
+	addu.cio r6,r10,r8
+	st	 r6,res_ptr,8
+
+Lret1:	jmp.n	 r1
+	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
+
+L1:	xor	 r12,s1_ptr,res_ptr
+	bb1	 2,r12,L2
+; **  V1b  **
+	or	 r12,r0,s2_ptr
+	or	 s2_ptr,r0,s1_ptr
+	or	 s1_ptr,r0,r12
+	br	 L0
+
+; **  V2  **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+   alignment of s2_ptr and res_ptr differ.  Since there are only two ways
+   things can be aligned (that we care about) we now know that the alignment
+   of s1_ptr and s2_ptr are the same.  */
+
+L2:	cmp	 r12,size,1
+	bb1	 eq,r12,Ljone
+	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	addu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+
+L_v2:	subu	 size,size,8
+	bcnd	 lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop2:	subu	 size,size,8
+	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	ld.d	 r8,s1_ptr,8
+	ld.d	 r6,s2_ptr,8
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,8
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,12
+	ld.d	 r8,s1_ptr,16
+	ld.d	 r6,s2_ptr,16
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,16
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,20
+	ld.d	 r8,s1_ptr,24
+	ld.d	 r6,s2_ptr,24
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,24
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,28
+	addu	 s1_ptr,s1_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop2
+
+Lfin2:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend2
+Loope2:	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope2
+Lend2:	bb0	 0,size,Lret2
+/* Add last limb */
+Ljone:	ld	 r10,s1_ptr,0
+	ld	 r8,s2_ptr,0
+	addu.cio r6,r10,r8
+	st	 r6,res_ptr,0
+
+Lret2:	jmp.n	 r1
+	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
diff --git a/sysdeps/m88k/m88110/addmul_1.s b/sysdeps/m88k/m88110/addmul_1.s
new file mode 100644
index 0000000000..1a4dfa11ea
--- /dev/null
+++ b/sysdeps/m88k/m88110/addmul_1.s
@@ -0,0 +1,60 @@
+; mc88110 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1996 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+	text
+	align	16
+	global	___mpn_addmul_1
+___mpn_addmul_1:
+	lda	 r3,r3[r4]
+	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
+	subu	 r4,r0,r4
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+
+	ld	 r6,r3[r4]
+	addu	 r4,r4,1
+	subu	 r8,r8,4
+	bcnd.n	 eq0,r4,Lend
+	 mulu.d	 r10,r6,r5
+
+Loop:	ld	 r7,r8[r4]
+	ld	 r6,r3[r4]
+	addu.cio r9,r11,r2
+	addu.ci	 r2,r10,r0
+	addu.co	 r9,r9,r7
+	st	 r9,r8[r4]
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd	 ne0,r4,Loop
+
+Lend:	ld	 r7,r8,0
+	addu.cio r9,r11,r2
+	addu.ci	 r2,r10,r0
+	addu.co	 r9,r9,r7
+	st	 r9,r8,0
+	jmp.n	 r1
+	 addu.ci r2,r2,r0
diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s
index 08c3ca07ee..b1352cea87 100644
--- a/sysdeps/m88k/m88110/mul_1.s
+++ b/sysdeps/m88k/m88110/mul_1.s
@@ -1,7 +1,7 @@
 ; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
 ; store the product in a second limb vector.
 
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ; This file is part of the GNU MP Library.
 
@@ -56,29 +56,3 @@ Lend:	addu.cio r9,r11,r2
 	st	 r9,r8,4
 	jmp.n	 r1
 	 addu.ci r2,r10,r0
-
-; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
-;	ld.d	r10,
-;	mulu.d
-;	addu.cio
-;	addu.cio
-;	st.d
-;	mulu.d	,r11,r5
-;	ld.d	r12,
-;	mulu.d	,r10,r5
-;	addu.cio
-;	addu.cio
-;	st.d
-;	mulu.d
-;	ld.d	r10,
-;	mulu.d
-;	addu.cio
-;	addu.cio
-;	st.d
-;	mulu.d
-;	ld.d	r10,
-;	mulu.d
-;	addu.cio
-;	addu.cio
-;	st.d
-;	mulu.d
diff --git a/sysdeps/m88k/m88110/sub_n.S b/sysdeps/m88k/m88110/sub_n.S
new file mode 100644
index 0000000000..74ee0ae605
--- /dev/null
+++ b/sysdeps/m88k/m88110/sub_n.S
@@ -0,0 +1,275 @@
+; mc88110 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 1996 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr	r2
+#define s1_ptr	r3
+#define s2_ptr	r4
+#define size	r5
+
+#include "sysdep.h"
+
+	text
+	align	16
+	global	C_SYMBOL_NAME(__mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n):
+	subu.co	 r0,r0,r0		; set cy flag
+	xor	 r12,s2_ptr,res_ptr
+	bb1	 2,r12,L1
+; **  V1a  **
+L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s1_ptr,0
+	ld	 r12,s1_ptr,4
+	ld.d	 r8,s2_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1:	subu	 size,size,8
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,16
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,20
+	ld.d	 r8,s2_ptr,16
+	st.d	 r6,res_ptr,8
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,24
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,28
+	ld.d	 r8,s2_ptr,24
+	st.d	 r6,res_ptr,16
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,32
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,36
+	addu	 s1_ptr,s1_ptr,32
+	ld.d	 r8,s2_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1
+
+Lfin1:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1:	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1
+Lend1:	subu.cio r6,r10,r8
+	subu.cio r7,r12,r9
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1
+/* Add last limb */
+	ld	 r10,s1_ptr,8
+	ld	 r8,s2_ptr,8
+	subu.cio r6,r10,r8
+	st	 r6,res_ptr,8
+
+Lret1:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
+
+L1:	xor	 r12,s1_ptr,res_ptr
+	bb1	 2,r12,L2
+; **  V1b  **
+	bb0	 2,res_ptr,L_v1b	; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s1_ptr */
+	ld	 r10,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	ld	 r8,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r8,r10
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1b:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s2_ptr,0
+	ld	 r12,s2_ptr,4
+	ld.d	 r8,s1_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1b
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1b:	subu	 size,size,8
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,8
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,12
+	ld.d	 r8,s1_ptr,8
+	st.d	 r6,res_ptr,0
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,16
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,20
+	ld.d	 r8,s1_ptr,16
+	st.d	 r6,res_ptr,8
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,24
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,28
+	ld.d	 r8,s1_ptr,24
+	st.d	 r6,res_ptr,16
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,32
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,36
+	addu	 s2_ptr,s2_ptr,32
+	ld.d	 r8,s1_ptr,32
+	addu	 s1_ptr,s1_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1b
+
+Lfin1b:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1b
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1b:subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,8
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,12
+	ld.d	 r8,s1_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1b
+Lend1b:	subu.cio r6,r8,r10
+	subu.cio r7,r9,r12
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1b
+/* Add last limb */
+	ld	 r10,s2_ptr,8
+	ld	 r8,s1_ptr,8
+	subu.cio r6,r8,r10
+	st	 r6,res_ptr,8
+
+Lret1b:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
+
+; **  V2  **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+   alignment of s2_ptr and res_ptr differ.  Since there are only two ways
+   things can be aligned (that we care about) we now know that the alignment
+   of s1_ptr and s2_ptr are the same.  */
+
+L2:	cmp	 r12,size,1
+	bb1	 eq,r12,Ljone
+	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+
+L_v2:	subu	 size,size,8
+	bcnd	 lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop2:	subu	 size,size,8
+	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	ld.d	 r8,s1_ptr,8
+	ld.d	 r6,s2_ptr,8
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,8
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,12
+	ld.d	 r8,s1_ptr,16
+	ld.d	 r6,s2_ptr,16
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,16
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,20
+	ld.d	 r8,s1_ptr,24
+	ld.d	 r6,s2_ptr,24
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,24
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,28
+	addu	 s1_ptr,s1_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop2
+
+Lfin2:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend2
+Loope2:	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope2
+Lend2:	bb0	 0,size,Lret2
+/* Add last limb */
+Ljone:	ld	 r10,s1_ptr,0
+	ld	 r8,s2_ptr,0
+	subu.cio r6,r10,r8
+	st	 r6,res_ptr,0
+
+Lret2:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
diff --git a/sysdeps/m88k/mul_1.s b/sysdeps/m88k/mul_1.s
index 35c238d570..6b8492c4c6 100644
--- a/sysdeps/m88k/mul_1.s
+++ b/sysdeps/m88k/mul_1.s
@@ -1,7 +1,7 @@
 ; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
 ; store the product in a second limb vector.
 
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ; This file is part of the GNU MP Library.
 
@@ -55,14 +55,14 @@ ___mpn_mul_1:
 	; Make S1_PTR and RES_PTR point at the end of their blocks
 	; and negate SIZE.
 	lda	 r3,r3[r4]
-	lda	 r6,r2[r4]		; RES_PTR in r6 since r2 is retval
+	lda	 r6,r2[r4]	; RES_PTR in r6 since r2 is retval
 	subu	 r4,r0,r4
 
-	addu.co	 r2,r0,r0		; r2 = cy = 0
+	addu.co	 r2,r0,r0	; r2 = cy = 0
 	ld	 r9,r3[r4]
-	mask	 r7,r5,0xffff		; r7 = lo(S2_LIMB)
-	extu	 r8,r5,16		; r8 = hi(S2_LIMB)
-	bcnd.n	 eq0,r8,Lsmall		; jump if (hi(S2_LIMB) == 0)
+	mask	 r7,r5,0xffff	; r7 = lo(S2_LIMB)
+	extu	 r8,r5,16	; r8 = hi(S2_LIMB)
+	bcnd.n	 eq0,r8,Lsmall	; jump if (hi(S2_LIMB) == 0)
 	 subu	 r6,r6,4
 
 ; General code for any value of S2_LIMB.
@@ -75,28 +75,27 @@ ___mpn_mul_1:
 	br.n	L1
 	addu	 r4,r4,1
 
-Loop:
-	ld	 r9,r3[r4]
+Loop:	ld	 r9,r3[r4]
 	st	 r26,r6[r4]
-; bcnd	ne0,r0,0			; bubble
+; bcnd	ne0,r0,0		; bubble
 	addu	 r4,r4,1
-L1:	mul	 r26,r9,r5		; low word of product	mul_1	WB ld
-	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)	mask_1
-	mul	 r11,r12,r7		; r11 =  prod_0		mul_2	WB mask_1
-	mul	 r10,r12,r8		; r10 = prod_1a		mul_3
-	extu	 r13,r9,16		; r13 = hi(s1_limb)	extu_1	WB mul_1
-	mul	 r12,r13,r7		; r12 = prod_1b		mul_4	WB extu_1
-	mul	 r25,r13,r8		; r25  = prod_2		mul_5	WB mul_2
-	extu	 r11,r11,16		; r11 = hi(prod_0)	extu_2	WB mul_3
-	addu	 r10,r10,r11		;			addu_1	WB extu_2
-; bcnd	ne0,r0,0			; bubble			WB addu_1
-	addu.co	 r10,r10,r12		;				WB mul_4
-	mask.u	 r10,r10,0xffff		; move the 16 most significant bits...
-	addu.ci	 r10,r10,r0		; ...to the low half of the word...
-	rot	 r10,r10,16		; ...and put carry in pos 16.
-	addu.co	 r26,r26,r2		; add old carry limb
+L1:	mul	 r26,r9,r5	; low word of product	mul_1	WB ld
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)	mask_1
+	mul	 r11,r12,r7	; r11 =  prod_0		mul_2	WB mask_1
+	mul	 r10,r12,r8	; r10 = prod_1a		mul_3
+	extu	 r13,r9,16	; r13 = hi(s1_limb)	extu_1	WB mul_1
+	mul	 r12,r13,r7	; r12 = prod_1b		mul_4	WB extu_1
+	mul	 r25,r13,r8	; r25  = prod_2		mul_5	WB mul_2
+	extu	 r11,r11,16	; r11 = hi(prod_0)	extu_2	WB mul_3
+	addu	 r10,r10,r11	;			addu_1	WB extu_2
+; bcnd	ne0,r0,0		; bubble			WB addu_1
+	addu.co	 r10,r10,r12	;				WB mul_4
+	mask.u	 r10,r10,0xffff	; move the 16 most significant bits...
+	addu.ci	 r10,r10,r0	; ...to the low half of the word...
+	rot	 r10,r10,16	; ...and put carry in pos 16.
+	addu.co	 r26,r26,r2	; add old carry limb
 	bcnd.n	 ne0,r4,Loop
-	 addu.ci r2,r25,r10		; compute new carry limb
+	 addu.ci r2,r25,r10	; compute new carry limb
 
 	st	 r26,r6[r4]
 	ld.d	 r25,r31,8
@@ -109,20 +108,19 @@ Lsmall:
 	br.n	SL1
 	addu	 r4,r4,1
 
-SLoop:
-	ld	 r9,r3[r4]		;
-	st	 r8,r6[r4]		;
-	addu	 r4,r4,1		;
-SL1:	mul	 r8,r9,r5		; low word of product
-	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)
-	extu	 r13,r9,16		; r13 = hi(s1_limb)
-	mul	 r11,r12,r7		; r11 =  prod_0
-	mul	 r12,r13,r7		; r12 = prod_1b
-	addu.cio r8,r8,r2		; add old carry limb
-	extu	 r10,r11,16		; r11 = hi(prod_0)
-	addu	 r10,r10,r12		;
+SLoop:	ld	 r9,r3[r4]	;
+	st	 r8,r6[r4]	;
+	addu	 r4,r4,1	;
+SL1:	mul	 r8,r9,r5	; low word of product
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)
+	extu	 r13,r9,16	; r13 = hi(s1_limb)
+	mul	 r11,r12,r7	; r11 =  prod_0
+	mul	 r12,r13,r7	; r12 = prod_1b
+	addu.cio r8,r8,r2	; add old carry limb
+	extu	 r10,r11,16	; r11 = hi(prod_0)
+	addu	 r10,r10,r12	;
 	bcnd.n	 ne0,r4,SLoop
-	extu	 r2,r10,16		; r2 = new carry limb
+	extu	 r2,r10,16	; r2 = new carry limb
 
 	jmp.n	 r1
 	st	 r8,r6[r4]
diff --git a/sysdeps/m88k/sub_n.s b/sysdeps/m88k/sub_n.s
index 3963cd5479..cd0b791b79 100644
--- a/sysdeps/m88k/sub_n.s
+++ b/sysdeps/m88k/sub_n.s
@@ -1,7 +1,7 @@
 ; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and
 ; store difference in a third limb vector.
 
-; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+; Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
 ; This file is part of the GNU MP Library.
 
@@ -41,9 +41,10 @@ ___mpn_sub_n:
 	extu	r10,r5,3
 	ld	r7,r4,0			; read first limb from s2_ptr
 
-	subu.co	r5,r0,r5		; (clear carry as side effect)
+	subu	r5,r0,r5
 	mak	r5,r5,3<4>
-	bcnd	eq0,r5,Lzero
+	bcnd.n	eq0,r5,Lzero
+	subu.co	r0,r0,r0		; initialize carry
 
 	or	r12,r0,lo16(Lbase)
 	or.u	r12,r12,hi16(Lbase)
diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s
index abc2fb8dcf..917af1bac3 100644
--- a/sysdeps/mips/addmul_1.s
+++ b/sysdeps/mips/addmul_1.s
@@ -1,7 +1,7 @@
  # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
  # add the product to a second limb vector.
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
@@ -63,7 +63,7 @@ Loop:	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s
index 7af0172614..7dbc9ad41d 100644
--- a/sysdeps/mips/mips3/addmul_1.s
+++ b/sysdeps/mips/mips3/addmul_1.s
@@ -63,7 +63,7 @@ Loop:	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s
index 87954e5bc3..8376a02546 100644
--- a/sysdeps/mips/mips3/mul_1.s
+++ b/sysdeps/mips/mips3/mul_1.s
@@ -59,7 +59,7 @@ Loop:	mflo	$10
 	sltu	$2,$10,$2	# carry from previous addition -> $2
 	sd	$10,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s
index f28c6a5167..f041f6c0b4 100644
--- a/sysdeps/mips/mips3/submul_1.s
+++ b/sysdeps/mips/mips3/submul_1.s
@@ -63,7 +63,7 @@ Loop:	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 daddu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s
index 01327e22d8..6f5324cef4 100644
--- a/sysdeps/mips/mul_1.s
+++ b/sysdeps/mips/mul_1.s
@@ -1,7 +1,7 @@
  # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
  # store the product in a second limb vector.
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
@@ -59,7 +59,7 @@ Loop:	mflo	$10
 	sltu	$2,$10,$2	# carry from previous addition -> $2
 	sw	$10,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s
index 616dd1b47c..a78072a1e9 100644
--- a/sysdeps/mips/submul_1.s
+++ b/sysdeps/mips/submul_1.s
@@ -1,7 +1,7 @@
  # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
  # subtract the product from a second limb vector.
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
@@ -63,7 +63,7 @@ Loop:	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,Loop	# should be "bnel"
+	bne	$6,$0,Loop
 	 addu	$2,$9,$2	# add high product limb and carry from addition
 
  # cool down phase 1
diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s
index 7090cf1b00..e2536d5887 100644
--- a/sysdeps/rs6000/add_n.s
+++ b/sysdeps/rs6000/add_n.s
@@ -1,6 +1,6 @@
 # IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
 
-# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 # This file is part of the GNU MP Library.
 
diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s
index 40fe7d68bd..c57675b106 100644
--- a/sysdeps/rs6000/sub_n.s
+++ b/sysdeps/rs6000/sub_n.s
@@ -1,7 +1,7 @@
 # IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 # store difference in a third limb vector.
 
-# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+# Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 # This file is part of the GNU MP Library.
 
diff --git a/sysdeps/sparc/add_n.S b/sysdeps/sparc/add_n.S
index 80c3b99640..49b31fc660 100644
--- a/sysdeps/sparc/add_n.S
+++ b/sysdeps/sparc/add_n.S
@@ -1,7 +1,7 @@
-! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+! SPARC __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 ! sum in a third limb vector.
 
-! Copyright (C) 1995 Free Software Foundation, Inc.
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -32,18 +32,14 @@
 	.align	4
 	.global	C_SYMBOL_NAME(__mpn_add_n)
 C_SYMBOL_NAME(__mpn_add_n):
-	cmp	size,8
-	mov	0,%o4			! clear cy-save register
-	blt,a	Ltriv
-	addcc	size,-2,size
 	xor	s2_ptr,res_ptr,%g1
 	andcc	%g1,4,%g0
 	bne	L1			! branch if alignment differs
 	nop
+! **  V1a  **
 L0:	andcc	res_ptr,4,%g0		! res_ptr unaligned? Side effect: cy=0
-	beq	L_v1			! if no, branch
+	be	L_v1			! if no, branch
 	nop
-! **  V1a  **
 /* Add least significant limb separately to align res_ptr and s2_ptr */
 	ld	[s1_ptr],%g4
 	add	s1_ptr,4,s1_ptr
@@ -53,12 +49,15 @@ L0:	andcc	res_ptr,4,%g0		! res_ptr unaligned? Side effect: cy=0
 	addcc	%g4,%g2,%o4
 	st	%o4,[res_ptr]
 	add	res_ptr,4,res_ptr
+L_v1:	addx	%g0,%g0,%o4		! save cy in register
+	cmp	size,2			! if size < 2 ...
+	bl	Lend2			! ... branch to tail code
+	subcc	%g0,%o4,%g0		! restore cy
 
-L_v1:	ld	[s1_ptr+0],%g4
+	ld	[s1_ptr+0],%g4
+	addcc	size,-10,size
 	ld	[s1_ptr+4],%g1
 	ldd	[s2_ptr+0],%g2
-	addx	%g0,%g0,%o4		! save cy in register
-	addcc	size,-10,size
 	blt	Lfin1
 	subcc	%g0,%o4,%g0		! restore cy
 /* Add blocks of 8 limbs until less than 8 limbs remain */
@@ -98,7 +97,7 @@ Lfin1:	addcc	size,8-2,size
 	blt	Lend1
 	subcc	%g0,%o4,%g0		! restore cy
 /* Add blocks of 2 limbs until less than 2 limbs remain */
-Loop1b:	addxcc	%g4,%g2,%o4
+Loope1:	addxcc	%g4,%g2,%o4
 	ld	[s1_ptr+8],%g4
 	addxcc	%g1,%g3,%o5
 	ld	[s1_ptr+12],%g1
@@ -109,7 +108,7 @@ Loop1b:	addxcc	%g4,%g2,%o4
 	add	s1_ptr,8,s1_ptr
 	add	s2_ptr,8,s2_ptr
 	add	res_ptr,8,res_ptr
-	bge	Loop1b
+	bge	Loope1
 	subcc	%g0,%o4,%g0		! restore cy
 Lend1:	addxcc	%g4,%g2,%o4
 	addxcc	%g1,%g3,%o5
@@ -144,10 +143,13 @@ L1:	xor	s1_ptr,res_ptr,%g1
    things can be aligned (that we care about) we now know that the alignment
    of s1_ptr and s2_ptr are the same.  */
 
-L2:	andcc	s1_ptr,4,%g0		! s1_ptr unaligned? Side effect: cy=0
-	beq	L_v2			! if no, branch
+L2:	cmp	size,1
+	be	Ljone
 	nop
-/* Add least significant limb separately to align res_ptr and s2_ptr */
+	andcc	s1_ptr,4,%g0		! s1_ptr unaligned? Side effect: cy=0
+	be	L_v2			! if no, branch
+	nop
+/* Add least significant limb separately to align s1_ptr and s2_ptr */
 	ld	[s1_ptr],%g4
 	add	s1_ptr,4,s1_ptr
 	ld	[s2_ptr],%g2
@@ -195,9 +197,9 @@ Loop2:	ldd	[s1_ptr+0],%g2
 	subcc	%g0,%o4,%g0		! restore cy
 
 Lfin2:	addcc	size,8-2,size
-Ltriv:	blt	Lend2
+	blt	Lend2
 	subcc	%g0,%o4,%g0		! restore cy
-Loop2b:	ldd	[s1_ptr+0],%g2
+Loope2:	ldd	[s1_ptr+0],%g2
 	ldd	[s2_ptr+0],%o4
 	addxcc	%g2,%o4,%g2
 	st	%g2,[res_ptr+0]
@@ -208,13 +210,13 @@ Loop2b:	ldd	[s1_ptr+0],%g2
 	add	s1_ptr,8,s1_ptr
 	add	s2_ptr,8,s2_ptr
 	add	res_ptr,8,res_ptr
-	bge	Loop2b
+	bge	Loope2
 	subcc	%g0,%o4,%g0		! restore cy
 Lend2:	andcc	size,1,%g0
 	be	Lret2
 	subcc	%g0,%o4,%g0		! restore cy
 /* Add last limb */
-	ld	[s1_ptr],%g4
+Ljone:	ld	[s1_ptr],%g4
 	ld	[s2_ptr],%g2
 	addxcc	%g4,%g2,%o4
 	st	%o4,[res_ptr]
diff --git a/sysdeps/sparc/lshift.S b/sysdeps/sparc/lshift.S
index 497272af0f..6844fa2475 100644
--- a/sysdeps/sparc/lshift.S
+++ b/sysdeps/sparc/lshift.S
@@ -1,6 +1,6 @@
 ! sparc __mpn_lshift --
 
-! Copyright (C) 1995 Free Software Foundation, Inc.
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -39,7 +39,7 @@ C_SYMBOL_NAME(__mpn_lshift):
 	add	%o2,-1,%o2
 	andcc	%o2,4-1,%g4	! number of limbs in first loop
 	srl	%g2,%o5,%g1	! compute function result
-	beq	L0		! if multiple of 4 limbs, skip first loop
+	be	L0		! if multiple of 4 limbs, skip first loop
 	st	%g1,[%sp+80]
 
 	sub	%o2,%g4,%o2	! adjust count for main loop
@@ -56,7 +56,7 @@ Loop0:	ld	[%o1-8],%g3
 	 st	%o4,[%o0+0]
 
 L0:	tst	%o2
-	beq	Lend
+	be	Lend
 	 nop
 
 Loop:	ld	[%o1-8],%g3
diff --git a/sysdeps/sparc/rshift.S b/sysdeps/sparc/rshift.S
index 3428cfe9a4..5a47926302 100644
--- a/sysdeps/sparc/rshift.S
+++ b/sysdeps/sparc/rshift.S
@@ -1,6 +1,6 @@
 ! sparc __mpn_rshift --
 
-! Copyright (C) 1995 Free Software Foundation, Inc.
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -36,7 +36,7 @@ C_SYMBOL_NAME(__mpn_rshift):
 	add	%o2,-1,%o2
 	andcc	%o2,4-1,%g4	! number of limbs in first loop
 	sll	%g2,%o5,%g1	! compute function result
-	beq	L0		! if multiple of 4 limbs, skip first loop
+	be	L0		! if multiple of 4 limbs, skip first loop
 	st	%g1,[%sp+80]
 
 	sub	%o2,%g4,%o2	! adjust count for main loop
@@ -53,7 +53,7 @@ Loop0:	ld	[%o1+4],%g3
 	 st	%o4,[%o0-4]
 
 L0:	tst	%o2
-	beq	Lend
+	be	Lend
 	 nop
 
 Loop:	ld	[%o1+4],%g3
diff --git a/sysdeps/sparc/sparc64/add_n.s b/sysdeps/sparc/sparc64/add_n.s
new file mode 100644
index 0000000000..104a89e78c
--- /dev/null
+++ b/sysdeps/sparc/sparc64/add_n.s
@@ -0,0 +1,57 @@
+! SPARC v9 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	%o0
+! s1_ptr	%o1
+! s2_ptr	%o2
+! size		%o3
+
+.section	".text"
+	.align 4
+	.global __mpn_add_n
+	.type	 __mpn_add_n,#function
+	.proc	04
+__mpn_add_n:
+	sub %g0,%o3,%g3
+	sllx %o3,3,%g1
+	add %o1,%g1,%o1			! make s1_ptr point at end
+	add %o2,%g1,%o2			! make s2_ptr point at end
+	add %o0,%g1,%o0			! make res_ptr point at end
+	mov 0,%o4			! clear carry variable
+	sllx %g3,3,%o5			! compute initial address index
+
+.Loop:	ldx [%o2+%o5],%g1		! load s2 limb
+	add %g3,1,%g3			! increment loop count
+	ldx [%o1+%o5],%g2		! load s1 limb
+	addcc %g1,%o4,%g1		! add s2 limb and carry variable
+	movcc %xcc,0,%o4		! if carry-out, o4 was 1; clear it
+	addcc %g1,%g2,%g1		! add s1 limb to sum
+	stx %g1,[%o0+%o5]		! store result
+	add %o5,8,%o5			! increment address index
+	brnz,pt %g3,.Loop
+	movcs %xcc,1,%o4		! if s1 add gave carry, record it
+
+	retl
+	mov %o4,%o0
+.LLfe1:
+	.size	 __mpn_add_n,.LLfe1-__mpn_add_n
diff --git a/sysdeps/sparc/sparc64/addmul_1.s b/sysdeps/sparc/sparc64/addmul_1.s
new file mode 100644
index 0000000000..ef013eeb91
--- /dev/null
+++ b/sysdeps/sparc/sparc64/addmul_1.s
@@ -0,0 +1,88 @@
+! SPARC v9 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+! add the product to a second limb vector.
+
+! Copyright (C) 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	o0
+! s1_ptr	o1
+! size		o2
+! s2_limb	o3
+
+.section	".text"
+	.align 4
+	.global __mpn_addmul_1
+	.type	 __mpn_addmul_1,#function
+	.proc	016
+__mpn_addmul_1:
+	!#PROLOGUE#	0
+	save	%sp,-160,%sp
+	!#PROLOGUE#	1
+	sub	%g0,%i2,%o7
+	sllx	%o7,3,%g5
+	sub	%i1,%g5,%o3
+	sub	%i0,%g5,%o4
+	mov	0,%o0			! zero cy_limb
+
+	srl	%i3,0,%o1		! extract low 32 bits of s2_limb
+	srlx	%i3,32,%i3		! extract high 32 bits of s2_limb
+	mov	1,%o2
+	sllx	%o2,32,%o2		! o2 = 0x100000000
+
+	!   hi   !
+             !  mid-1 !
+             !  mid-2 !
+		 !   lo   !
+.Loop:
+	sllx	%o7,3,%g1
+	ldx	[%o3+%g1],%g5
+	srl	%g5,0,%i0		! zero hi bits
+	srlx	%g5,32,%g5
+	mulx	%o1,%i0,%i4		! lo product
+	mulx	%i3,%i0,%i1		! mid-1 product
+	mulx	%o1,%g5,%l2		! mid-2 product
+	mulx	%i3,%g5,%i5		! hi product
+	srlx	%i4,32,%i0		! extract high 32 bits of lo product...
+	add	%i1,%i0,%i1		! ...and add it to the mid-1 product
+	addcc	%i1,%l2,%i1		! add mid products
+	mov	0,%l0			! we need the carry from that add...
+	movcs	%xcc,%o2,%l0		! ...compute it and...
+	add	%i5,%l0,%i5		! ...add to bit 32 of the hi product
+	sllx	%i1,32,%i0		! align low bits of mid product
+	srl	%i4,0,%g5		! zero high 32 bits of lo product
+	add	%i0,%g5,%i0		! combine into low 64 bits of result
+	srlx	%i1,32,%i1		! extract high bits of mid product...
+	add	%i5,%i1,%i1		! ...and add them to the high result
+	addcc	%i0,%o0,%i0		! add cy_limb to low 64 bits of result
+	mov	0,%g5
+	movcs	%xcc,1,%g5
+	add	%o7,1,%o7
+	ldx	[%o4+%g1],%l1
+	addcc	%l1,%i0,%i0
+	movcs	%xcc,1,%g5
+	stx	%i0,[%o4+%g1]
+	brnz	%o7,.Loop
+	add	%i1,%g5,%o0		! compute new cy_limb
+
+	mov	%o0,%i0
+	ret
+	restore
+.LLfe1:
+	.size  __mpn_addmul_1,.LLfe1-__mpn_addmul_1
diff --git a/sysdeps/sparc/sparc64/gmp-mparam.h b/sysdeps/sparc/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..05c893f790
--- /dev/null
+++ b/sysdeps/sparc/sparc64/gmp-mparam.h
@@ -0,0 +1,26 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/sysdeps/sparc/sparc64/lshift.s b/sysdeps/sparc/sparc64/lshift.s
new file mode 100644
index 0000000000..bd7fa0111f
--- /dev/null
+++ b/sysdeps/sparc/sparc64/lshift.s
@@ -0,0 +1,95 @@
+! SPARC v9 __mpn_lshift --
+
+! Copyright (C) 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	%o0
+! src_ptr	%o1
+! size		%o2
+! cnt		%o3
+
+.section	".text"
+	.align 4
+	.global __mpn_lshift
+	.type	 __mpn_lshift,#function
+	.proc	04
+__mpn_lshift:
+	sllx	%o2,3,%g1
+	add	%o1,%g1,%o1	! make %o1 point at end of src
+	ldx	[%o1-8],%g2	! load first limb
+	sub	%g0,%o3,%o5	! negate shift count
+	add	%o0,%g1,%o0	! make %o0 point at end of res
+	add	%o2,-1,%o2
+	and	%o2,4-1,%g4	! number of limbs in first loop
+	srlx	%g2,%o5,%g1	! compute function result
+	brz,pn	%g4,.L0		! if multiple of 4 limbs, skip first loop
+	stx	%g1,[%sp+80]
+
+	sub	%o2,%g4,%o2	! adjust count for main loop
+
+.Loop0:	ldx	[%o1-16],%g3
+	add	%o0,-8,%o0
+	add	%o1,-8,%o1
+	add	%g4,-1,%g4
+	sllx	%g2,%o3,%o4
+	srlx	%g3,%o5,%g1
+	mov	%g3,%g2
+	or	%o4,%g1,%o4
+	brnz,pt	%g4,.Loop0
+	 stx	%o4,[%o0+0]
+
+.L0:	brz,pn	%o2,.Lend
+	 nop
+
+.Loop:	ldx	[%o1-16],%g3
+	add	%o0,-32,%o0
+	add	%o2,-4,%o2
+	sllx	%g2,%o3,%o4
+	srlx	%g3,%o5,%g1
+
+	ldx	[%o1-24],%g2
+	sllx	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	stx	%o4,[%o0+24]
+	srlx	%g2,%o5,%g1
+
+	ldx	[%o1-32],%g3
+	sllx	%g2,%o3,%o4
+	or	%g4,%g1,%g4
+	stx	%g4,[%o0+16]
+	srlx	%g3,%o5,%g1
+
+	ldx	[%o1-40],%g2
+	sllx	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	stx	%o4,[%o0+8]
+	srlx	%g2,%o5,%g1
+
+	add	%o1,-32,%o1
+	or	%g4,%g1,%g4
+	brnz,pt	%o2,.Loop
+	 stx	%g4,[%o0+0]
+
+.Lend:	sllx	%g2,%o3,%g2
+	stx	%g2,[%o0-8]
+	retl
+	ldx	[%sp+80],%o0
+.LLfe1:
+	.size	 __mpn_lshift,.LLfe1-__mpn_lshift
diff --git a/sysdeps/sparc/sparc64/mul_1.s b/sysdeps/sparc/sparc64/mul_1.s
new file mode 100644
index 0000000000..41be3705a8
--- /dev/null
+++ b/sysdeps/sparc/sparc64/mul_1.s
@@ -0,0 +1,85 @@
+! SPARC v9 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+! store the product in a second limb vector.
+
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	o0
+! s1_ptr	o1
+! size		o2
+! s2_limb	o3
+
+.section	".text"
+	.align 4
+	.global __mpn_mul_1
+	.type	 __mpn_mul_1,#function
+	.proc	016
+__mpn_mul_1:
+	!#PROLOGUE#	0
+	save	%sp,-160,%sp
+	!#PROLOGUE#	1
+	sub	%g0,%i2,%o7
+	sllx	%o7,3,%g5
+	sub	%i1,%g5,%o3
+	sub	%i0,%g5,%o4
+	mov	0,%o0			! zero cy_limb
+
+	srl	%i3,0,%o1		! extract low 32 bits of s2_limb
+	srlx	%i3,32,%i3		! extract high 32 bits of s2_limb
+	mov	1,%o2
+	sllx	%o2,32,%o2		! o2 = 0x100000000
+
+	!   hi   !
+             !  mid-1 !
+             !  mid-2 !
+		 !   lo   !
+.Loop:
+	sllx	%o7,3,%g1
+	ldx	[%o3+%g1],%g5
+	srl	%g5,0,%i0		! zero hi bits
+	srlx	%g5,32,%g5
+	mulx	%o1,%i0,%i4		! lo product
+	mulx	%i3,%i0,%i1		! mid-1 product
+	mulx	%o1,%g5,%l2		! mid-2 product
+	mulx	%i3,%g5,%i5		! hi product
+	srlx	%i4,32,%i0		! extract high 32 bits of lo product...
+	add	%i1,%i0,%i1		! ...and add it to the mid-1 product
+	addcc	%i1,%l2,%i1		! add mid products
+	mov	0,%l0			! we need the carry from that add...
+	movcs	%xcc,%o2,%l0		! ...compute it and...
+	add	%i5,%l0,%i5		! ...add to bit 32 of the hi product
+	sllx	%i1,32,%i0		! align low bits of mid product
+	srl	%i4,0,%g5		! zero high 32 bits of lo product
+	add	%i0,%g5,%i0		! combine into low 64 bits of result
+	srlx	%i1,32,%i1		! extract high bits of mid product...
+	add	%i5,%i1,%i1		! ...and add them to the high result
+	addcc	%i0,%o0,%i0		! add cy_limb to low 64 bits of result
+	mov	0,%g5
+	movcs	%xcc,1,%g5
+	add	%o7,1,%o7
+	stx	%i0,[%o4+%g1]
+	brnz	%o7,.Loop
+	add	%i1,%g5,%o0		! compute new cy_limb
+
+	mov	%o0,%i0
+	ret
+	restore
+.LLfe1:
+	.size  __mpn_mul_1,.LLfe1-__mpn_mul_1
diff --git a/sysdeps/sparc/sparc64/rshift.s b/sysdeps/sparc/sparc64/rshift.s
new file mode 100644
index 0000000000..971deec2a4
--- /dev/null
+++ b/sysdeps/sparc/sparc64/rshift.s
@@ -0,0 +1,92 @@
+! SPARC v9 __mpn_rshift --
+
+! Copyright (C) 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	%o0
+! src_ptr	%o1
+! size		%o2
+! cnt		%o3
+
+.section	".text"
+	.align	4
+	.global	__mpn_rshift
+	.type	__mpn_rshift,#function
+	.proc	04
+__mpn_rshift:
+	ldx	[%o1],%g2	! load first limb
+	sub	%g0,%o3,%o5	! negate shift count
+	add	%o2,-1,%o2
+	and	%o2,4-1,%g4	! number of limbs in first loop
+	sllx	%g2,%o5,%g1	! compute function result
+	brz,pn	%g4,.L0		! if multiple of 4 limbs, skip first loop
+	stx	%g1,[%sp+80]
+
+	sub	%o2,%g4,%o2	! adjust count for main loop
+
+.Loop0:	ldx	[%o1+8],%g3
+	add	%o0,8,%o0
+	add	%o1,8,%o1
+	add	%g4,-1,%g4
+	srlx	%g2,%o3,%o4
+	sllx	%g3,%o5,%g1
+	mov	%g3,%g2
+	or	%o4,%g1,%o4
+	brnz,pt	%g4,.Loop0
+	 stx	%o4,[%o0-8]
+
+.L0:	brz,pn	%o2,.Lend
+	 nop
+
+.Loop:	ldx	[%o1+8],%g3
+	add	%o0,32,%o0
+	add	%o2,-4,%o2
+	srlx	%g2,%o3,%o4
+	sllx	%g3,%o5,%g1
+
+	ldx	[%o1+16],%g2
+	srlx	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	stx	%o4,[%o0-32]
+	sllx	%g2,%o5,%g1
+
+	ldx	[%o1+24],%g3
+	srlx	%g2,%o3,%o4
+	or	%g4,%g1,%g4
+	stx	%g4,[%o0-24]
+	sllx	%g3,%o5,%g1
+
+	ldx	[%o1+32],%g2
+	srlx	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	stx	%o4,[%o0-16]
+	sllx	%g2,%o5,%g1
+
+	add	%o1,32,%o1
+	or	%g4,%g1,%g4
+	brnz	%o2,.Loop
+	 stx	%g4,[%o0-8]
+
+.Lend:	srlx	%g2,%o3,%g2
+	stx	%g2,[%o0-0]
+	retl
+	ldx	[%sp+80],%o0
+.LLfe1:
+	.size	__mpn_rshift,.LLfe1-__mpn_rshift
diff --git a/sysdeps/sparc/sparc64/sub_n.s b/sysdeps/sparc/sparc64/sub_n.s
new file mode 100644
index 0000000000..7099bf43e9
--- /dev/null
+++ b/sysdeps/sparc/sparc64/sub_n.s
@@ -0,0 +1,57 @@
+! SPARC v9 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	%o0
+! s1_ptr	%o1
+! s2_ptr	%o2
+! size		%o3
+
+.section	".text"
+	.align 4
+	.global __mpn_sub_n
+	.type	 __mpn_sub_n,#function
+	.proc	04
+__mpn_sub_n:
+	sub %g0,%o3,%g3
+	sllx %o3,3,%g1
+	add %o1,%g1,%o1			! make s1_ptr point at end
+	add %o2,%g1,%o2			! make s2_ptr point at end
+	add %o0,%g1,%o0			! make res_ptr point at end
+	mov 0,%o4			! clear carry variable
+	sllx %g3,3,%o5			! compute initial address index
+
+.Loop:	ldx [%o2+%o5],%g1		! load s2 limb
+	add %g3,1,%g3			! increment loop count
+	ldx [%o1+%o5],%g2		! load s1 limb
+	addcc %g1,%o4,%g1		! add s2 limb and carry variable
+	movcc %xcc,0,%o4		! if carry-out, o4 was 1; clear it
+	subcc %g1,%g2,%g1		! subtract s1 limb from sum
+	stx %g1,[%o0+%o5]		! store result
+	add %o5,8,%o5			! increment address index
+	brnz,pt %g3,.Loop
+	movcs %xcc,1,%o4		! if s1 subtract gave carry, record it
+
+	retl
+	mov %o4,%o0
+.LLfe1:
+	.size	 __mpn_sub_n,.LLfe1-__mpn_sub_n
diff --git a/sysdeps/sparc/sparc64/submul_1.s b/sysdeps/sparc/sparc64/submul_1.s
new file mode 100644
index 0000000000..f0df38c3bc
--- /dev/null
+++ b/sysdeps/sparc/sparc64/submul_1.s
@@ -0,0 +1,88 @@
+! SPARC v9 __mpn_submul_1 -- Multiply a limb vector with a single limb and
+! subtract the product from a second limb vector.
+
+! Copyright (C) 1996 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	o0
+! s1_ptr	o1
+! size		o2
+! s2_limb	o3
+
+.section	".text"
+	.align 4
+	.global __mpn_submul_1
+	.type	 __mpn_submul_1,#function
+	.proc	016
+__mpn_submul_1:
+	!#PROLOGUE#	0
+	save	%sp,-160,%sp
+	!#PROLOGUE#	1
+	sub	%g0,%i2,%o7
+	sllx	%o7,3,%g5
+	sub	%i1,%g5,%o3
+	sub	%i0,%g5,%o4
+	mov	0,%o0			! zero cy_limb
+
+	srl	%i3,0,%o1		! extract low 32 bits of s2_limb
+	srlx	%i3,32,%i3		! extract high 32 bits of s2_limb
+	mov	1,%o2
+	sllx	%o2,32,%o2		! o2 = 0x100000000
+
+	!   hi   !
+             !  mid-1 !
+             !  mid-2 !
+		 !   lo   !
+.Loop:
+	sllx	%o7,3,%g1
+	ldx	[%o3+%g1],%g5
+	srl	%g5,0,%i0		! zero hi bits
+	srlx	%g5,32,%g5
+	mulx	%o1,%i0,%i4		! lo product
+	mulx	%i3,%i0,%i1		! mid-1 product
+	mulx	%o1,%g5,%l2		! mid-2 product
+	mulx	%i3,%g5,%i5		! hi product
+	srlx	%i4,32,%i0		! extract high 32 bits of lo product...
+	add	%i1,%i0,%i1		! ...and add it to the mid-1 product
+	addcc	%i1,%l2,%i1		! add mid products
+	mov	0,%l0			! we need the carry from that add...
+	movcs	%xcc,%o2,%l0		! ...compute it and...
+	add	%i5,%l0,%i5		! ...add to bit 32 of the hi product
+	sllx	%i1,32,%i0		! align low bits of mid product
+	srl	%i4,0,%g5		! zero high 32 bits of lo product
+	add	%i0,%g5,%i0		! combine into low 64 bits of result
+	srlx	%i1,32,%i1		! extract high bits of mid product...
+	add	%i5,%i1,%i1		! ...and add them to the high result
+	addcc	%i0,%o0,%i0		! add cy_limb to low 64 bits of result
+	mov	0,%g5
+	movcs	%xcc,1,%g5
+	add	%o7,1,%o7
+	ldx	[%o4+%g1],%l1
+	subcc	%l1,%i0,%i0
+	movcs	%xcc,1,%g5
+	stx	%i0,[%o4+%g1]
+	brnz	%o7,.Loop
+	add	%i1,%g5,%o0		! compute new cy_limb
+
+	mov	%o0,%i0
+	ret
+	restore
+.LLfe1:
+	.size  __mpn_submul_1,.LLfe1-__mpn_submul_1
diff --git a/sysdeps/sparc/sparc8/addmul_1.S b/sysdeps/sparc/sparc8/addmul_1.S
index d1de0c3649..1cf5128ee5 100644
--- a/sysdeps/sparc/sparc8/addmul_1.S
+++ b/sysdeps/sparc/sparc8/addmul_1.S
@@ -1,7 +1,7 @@
 ! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and
 ! add the result to a second limb vector.
 
-! Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
diff --git a/sysdeps/sparc/sparc8/mul_1.S b/sysdeps/sparc/sparc8/mul_1.S
index 42717be33b..d56394e135 100644
--- a/sysdeps/sparc/sparc8/mul_1.S
+++ b/sysdeps/sparc/sparc8/mul_1.S
@@ -1,7 +1,7 @@
 ! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and
 ! store the product in a second limb vector.
 
-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
diff --git a/sysdeps/sparc/sub_n.S b/sysdeps/sparc/sub_n.S
index 2e217ed679..9ff5b7b863 100644
--- a/sysdeps/sparc/sub_n.S
+++ b/sysdeps/sparc/sub_n.S
@@ -1,7 +1,7 @@
-! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! SPARC __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 ! store difference in a third limb vector.
 
-! Copyright (C) 1995 Free Software Foundation, Inc.
+! Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -38,7 +38,7 @@ C_SYMBOL_NAME(__mpn_sub_n):
 	nop
 ! **  V1a  **
 	andcc	res_ptr,4,%g0		! res_ptr unaligned? Side effect: cy=0
-	beq	L_v1			! if no, branch
+	be	L_v1			! if no, branch
 	nop
 /* Add least significant limb separately to align res_ptr and s2_ptr */
 	ld	[s1_ptr],%g4
@@ -133,9 +133,9 @@ L1:	xor	s1_ptr,res_ptr,%g1
 	nop
 ! **  V1b  **
 	andcc	res_ptr,4,%g0		! res_ptr unaligned? Side effect: cy=0
-	beq	L_v1b			! if no, branch
+	be	L_v1b			! if no, branch
 	nop
-/* Add least significant limb separately to align res_ptr and s2_ptr */
+/* Add least significant limb separately to align res_ptr and s1_ptr */
 	ld	[s2_ptr],%g4
 	add	s2_ptr,4,s2_ptr
 	ld	[s1_ptr],%g2
@@ -232,7 +232,7 @@ L2:	cmp	size,1
 	be	Ljone
 	nop
 	andcc	s1_ptr,4,%g0		! s1_ptr unaligned? Side effect: cy=0
-	beq	L_v2			! if no, branch
+	be	L_v2			! if no, branch
 	nop
 /* Add least significant limb separately to align s1_ptr and s2_ptr */
 	ld	[s1_ptr],%g4
diff --git a/sysdeps/unix/sysv/linux/m68k/profil-counter.h b/sysdeps/unix/sysv/linux/m68k/profil-counter.h
new file mode 100644
index 0000000000..4e7b132dc9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/m68k/profil-counter.h
@@ -0,0 +1,24 @@
+/* Machine-dependent SIGPROF signal handler.  Linux/m68k version.
+Copyright (C) 1996 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA.  */
+  
+static void
+profil_counter (int signr, int code, struct sigcontext *scp)
+{
+  profil_count ((void *) scp->sc_pc);
+}
diff --git a/sysdeps/vax/gmp-mparam.h b/sysdeps/vax/gmp-mparam.h
index 687f12aa35..ddc308ae20 100644
--- a/sysdeps/vax/gmp-mparam.h
+++ b/sysdeps/vax/gmp-mparam.h
@@ -1,6 +1,6 @@
 /* gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s
index 2075225d11..0150e85e85 100644
--- a/sysdeps/z8000/mul_1.s
+++ b/sysdeps/z8000/mul_1.s
@@ -1,7 +1,7 @@
 ! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store
 ! the result in a second limb vector.
 
-! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.