10 files changed, 323 insertions, 170 deletions
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s
index 46d277df6e..8b168cb46d 100644
--- a/sysdeps/alpha/addmul_1.s
+++ b/sysdeps/alpha/addmul_1.s
@@ -26,16 +26,7 @@
  # size		r18
  # s2_limb	r19
 
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture.  2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 
 	.set	noreorder
 	.set	noat
@@ -52,7 +43,7 @@ __mpn_addmul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@@ -60,10 +51,10 @@ __mpn_addmul_1:
 	cmpult	$3,$5,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
 
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	addq	$5,$3,$3
+.Lend1:	addq	$5,$3,$3
 	cmpult	$3,$5,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s
index 2aaf041774..66cf82b3c3 100644
--- a/sysdeps/alpha/alphaev5/add_n.s
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -35,84 +35,113 @@
 __mpn_add_n:
 	.frame	$30,0,$26,0
 
-	ldq	$3,0($17)
-	ldq	$4,0($18)
-
-	subq	$19,1,$19
-	and	$19,4-1,$2	# number of limbs in first loop
-	bis	$31,$31,$0
-	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
-
-	subq	$19,$2,$19
-
-.Loop0:	subq	$2,1,$2
+	or	$31,$31,$25		# clear cy
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+	ldq	$0,0($18)
+	ldq	$1,8($18)
+	ldq	$4,0($17)
 	ldq	$5,8($17)
-	addq	$4,$0,$4
-	ldq	$6,8($18)
-	cmpult	$4,$0,$1
-	addq	$3,$4,$4
-	cmpult	$4,$3,$0
-	stq	$4,0($16)
-	or	$0,$1,$0
-
-	addq	$17,8,$17
-	addq	$18,8,$18
-	bis	$5,$5,$3
-	bis	$6,$6,$4
-	addq	$16,8,$16
-	bne	$2,.Loop0
-
-.L0:	beq	$19,.Lend
-
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)
+	addq	$0,$4,$20		# 1st main add
+	ldq	$3,24($18)
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)
+	cmpult	$20,$0,$25		# compute cy from last add
+	ldq	$7,-8($17)
+	addq	$1,$25,$28		# cy add
+	addq	$18,32,$18		# update s2_ptr
+	addq	$5,$28,$21		# 2nd main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
 	.align	4
-.Loop:	subq	$19,4,$19
-	unop
-
-	ldq	$6,8($18)
-	addq	$4,$0,$0
+.Loop:	cmpult	$21,$28,$25		# compute cy from last add
+	ldq	$0,0($18)
+	or	$8,$25,$25		# combine cy from the two adds
+	ldq	$1,8($18)
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)
+	addq	$28,$6,$22		# 3rd main add
 	ldq	$5,8($17)
-	cmpult	$0,$4,$1
-	ldq	$4,16($18)
-	addq	$3,$0,$20
-	cmpult	$20,$3,$0
-	ldq	$3,16($17)
-	or	$0,$1,$0
-	addq	$6,$0,$0
-	cmpult	$0,$6,$1
-	ldq	$6,24($18)
-	addq	$5,$0,$21
-	cmpult	$21,$5,$0
-	ldq	$5,24($17)
-	or	$0,$1,$0
-	addq	$4,$0,$0
-	cmpult	$0,$4,$1
-	ldq	$4,32($18)
-	addq	$3,$0,$22
-	cmpult	$22,$3,$0
-	ldq	$3,32($17)
-	or	$0,$1,$0
-	addq	$6,$0,$0
-	cmpult	$0,$6,$1
-	addq	$5,$0,$23
-	cmpult	$23,$5,$0
-	or	$0,$1,$0
-
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$22,$28,$25		# compute cy from last add
 	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
 	stq	$21,8($16)
-	stq	$22,16($16)
-	stq	$23,24($16)
-
-	addq	$17,32,$17
-	addq	$18,32,$18
-	addq	$16,32,$16
-	bne	$19,.Loop
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$23,$28,$25		# compute cy from last add
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)
+	addq	$4,$28,$20		# 1st main add
+	ldq	$3,24($18)
+	cmpult	$28,$25,$8		# compute cy from last add
+	ldq	$6,-16($17)
+	cmpult	$20,$28,$25		# compute cy from last add
+	ldq	$7,-8($17)
+	or	$8,$25,$25		# combine cy from the two adds
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)
+	addq	$5,$28,$21		# 2nd main add
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# compute cy from last add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1:	cmpult	$21,$28,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$2,$25,$28		# cy add
+	addq	$28,$6,$22		# 3rd main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$22,$28,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$23,$28,$25		# compute cy from last add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)
+	stq	$23,-8($16)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)
+	ldq	$4,0($17)
+	subq	$19,1,$19
+	beq	$19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)
+	addq	$4,$28,$20		# main add
+	ldq	$4,8($17)
+	addq	$18,8,$18
+	cmpult	$28,$25,$8		# compute cy from last add
+	addq	$17,8,$17
+	stq	$20,0($16)
+	cmpult	$20,$28,$25		# compute cy from last add
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,8,$16
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	addq	$4,$28,$20		# main add
+	cmpult	$28,$25,$8		# compute cy from last add
+	cmpult	$20,$28,$25		# compute cy from last add
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the two adds
 
-.Lend:	addq	$4,$0,$4
-	cmpult	$4,$0,$1
-	addq	$3,$4,$4
-	cmpult	$4,$3,$0
-	stq	$4,0($16)
-	or	$0,$1,$0
+.Lret:	or	$25,$31,$0		# return cy
 	ret	$31,($26),1
-
 	.end	__mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s
index fdb089550f..392b4249b8 100644
--- a/sysdeps/alpha/alphaev5/lshift.s
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -25,7 +25,7 @@
  # size		r18
  # cnt		r19
 
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 
 	.set	noreorder
 	.set	noat
@@ -44,11 +44,11 @@ __mpn_lshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	srl	$4,$20,$0	# compute function result
 
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 
 	.align	3
-Loop0:	ldq	$3,-16($17)
+.Loop0:	ldq	$3,-16($17)
 	subq	$16,8,$16
 	sll	$4,$19,$5
 	subq	$17,8,$17
@@ -57,17 +57,17 @@ Loop0:	ldq	$3,-16($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,0($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
 
-L0:	sll	$4,$19,$24
-	beq	$18,Lend
+.L0:	sll	$4,$19,$24
+	beq	$18,.Lend
  # warm up phase 1
 	ldq	$1,-16($17)
 	subq	$18,4,$18
 	ldq	$2,-24($17)
 	ldq	$3,-32($17)
 	ldq	$4,-40($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
  # warm up phase 2
 	srl	$1,$20,$7
 	sll	$1,$19,$21
@@ -84,10 +84,10 @@ L0:	sll	$4,$19,$24
 	sll	$4,$19,$24
 	ldq	$4,-72($17)
 	subq	$18,4,$18
-	beq	$18,Lcool1
+	beq	$18,.Lend2
 	.align  4
  # main loop
-Loop:	stq	$7,-8($16)
+.Loop:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@@ -113,16 +113,14 @@ Loop:	stq	$7,-8($16)
 	subq	$16,32,$16
 
 	srl	$4,$20,$6
-	ldq	$3,-96($17
+	ldq	$3,-96($17)
 	sll	$4,$19,$24
 	ldq	$4,-104($17)
 
 	subq	$17,32,$17
-	bne	$18,Loop
-	unop
-	unop
+	bne	$18,.Loop
  # cool down phase 2/1
-Lcool1:	stq	$7,-8($16)
+.Lend2:	stq	$7,-8($16)
 	or	$5,$22,$5
 	stq	$8,-16($16)
 	or	$6,$23,$6
@@ -150,7 +148,7 @@ Lcool1:	stq	$7,-8($16)
 	ret	$31,($26),1
 
  # cool down phase 1/1
-Lcool1:	srl	$1,$20,$7
+.Lend1:	srl	$1,$20,$7
 	sll	$1,$19,$21
 	srl	$2,$20,$8
 	sll	$2,$19,$22
@@ -170,6 +168,6 @@ Lcool1:	srl	$1,$20,$7
 	stq	$24,-40($16)
 	ret	$31,($26),1
 
-Lend	stq	$24,-8($16)
+.Lend:	stq	$24,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s
index 1da9960b46..d20dde35b7 100644
--- a/sysdeps/alpha/alphaev5/rshift.s
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -25,7 +25,7 @@
  # size		r18
  # cnt		r19
 
- # This code runs at 4.25 cycles/limb on the EV5.
+ # This code runs at 3.25 cycles/limb on the EV5.
 
 	.set	noreorder
 	.set	noat
@@ -42,11 +42,11 @@ __mpn_rshift:
 	and	$18,4-1,$28	# number of limbs in first loop
 	sll	$4,$20,$0	# compute function result
 
-	beq	$28,L0
+	beq	$28,.L0
 	subq	$18,$28,$18
 
 	.align	3
-Loop0:	ldq	$3,8($17)
+.Loop0:	ldq	$3,8($17)
 	addq	$16,8,$16
 	srl	$4,$19,$5
 	addq	$17,8,$17
@@ -55,17 +55,17 @@ Loop0:	ldq	$3,8($17)
 	or	$3,$3,$4
 	or	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$28,Loop0
+	bne	$28,.Loop0
 
-L0:	srl	$4,$19,$24
-	beq	$18,Lend
+.L0:	srl	$4,$19,$24
+	beq	$18,.Lend
  # warm up phase 1
 	ldq	$1,8($17)
 	subq	$18,4,$18
 	ldq	$2,16($17)
 	ldq	$3,24($17)
 	ldq	$4,32($17)
-	beq	$18,Lcool1
+	beq	$18,.Lend1
  # warm up phase 2
 	sll	$1,$20,$7
 	srl	$1,$19,$21
@@ -82,10 +82,10 @@ L0:	srl	$4,$19,$24
 	srl	$4,$19,$24
 	ldq	$4,64($17)
 	subq	$18,4,$18
-	beq	$18,Lcool2
+	beq	$18,.Lend2
 	.align  4
  # main loop
-Loop:	stq	$7,0($16)
+.Loop:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@@ -116,11 +116,9 @@ Loop:	stq	$7,0($16)
 	ldq	$4,96($17)
 
 	addq	$17,32,$17
-	bne	$18,Loop
-	unop
-	unop
+	bne	$18,.Loop
  # cool down phase 2/1
-Lcool2:	stq	$7,0($16)
+.Lend2:	stq	$7,0($16)
 	or	$5,$22,$5
 	stq	$8,8($16)
 	or	$6,$23,$6
@@ -148,7 +146,7 @@ Lcool2:	stq	$7,0($16)
 	ret	$31,($26),1
 
  # cool down phase 1/1
-Lcool1:	sll	$1,$20,$7
+.Lend1:	sll	$1,$20,$7
 	srl	$1,$19,$21
 	sll	$2,$20,$8
 	srl	$2,$19,$22
@@ -168,6 +166,6 @@ Lcool1:	sll	$1,$20,$7
 	stq	$24,32($16)
 	ret	$31,($26),1
 
-Lend:	stq	$24,0($16)
+.Lend:	stq	$24,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
diff --git a/sysdeps/alpha/alphaev5/sub_n.s b/sysdeps/alpha/alphaev5/sub_n.s
new file mode 100644
index 0000000000..c9f3a4ec3a
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/sub_n.s
@@ -0,0 +1,148 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.frame	$30,0,$26,0
+
+	or	$31,$31,$25		# clear cy
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+	ldq	$0,0($18)
+	ldq	$1,8($18)
+	ldq	$4,0($17)
+	ldq	$5,8($17)
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)
+	subq	$4,$0,$20		# 1st main sub
+	ldq	$3,24($18)
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)
+	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
+	ldq	$7,-8($17)
+	addq	$1,$25,$28		# cy add
+	addq	$18,32,$18		# update s2_ptr
+	subq	$5,$28,$21		# 2nd main sub
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+	.align	4
+.Loop:	cmpult	$5,$21,$25		# compute cy (borrow) from last sub
+	ldq	$0,0($18)
+	or	$8,$25,$25		# combine cy from the add and the sub
+	ldq	$1,8($18)
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)
+	subq	$6,$28,$22		# 3rd main sub
+	ldq	$5,8($17)
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	cmpult	$6,$22,$25		# compute cy (borrow) from last sub
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the add and the sub
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	cmpult	$7,$23,$25		# compute cy (borrow) from last sub
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the add and the sub
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)
+	subq	$4,$28,$20		# 1st main sub
+	ldq	$3,24($18)
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	ldq	$6,-16($17)
+	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
+	ldq	$7,-8($17)
+	or	$8,$25,$25		# combine cy from the add and the sub
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)
+	subq	$5,$28,$21		# 2nd main sub
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop
+.Lend1:	cmpult	$5,$21,$25		# compute cy (borrow) from last sub
+	or	$8,$25,$25		# combine cy from the add and the sub
+	addq	$2,$25,$28		# cy add
+	subq	$6,$28,$22		# 3rd main sub
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	cmpult	$6,$22,$25		# compute cy (borrow) from last sub
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the add and the sub
+	stq	$21,8($16)
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	cmpult	$7,$23,$25		# compute cy (borrow) from last sub
+	or	$8,$25,$25		# combine cy from the add and the sub
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)
+	stq	$23,-8($16)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret		# no limbs left, just return cy
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)
+	ldq	$4,0($17)
+	subq	$19,1,$19
+	beq	$19,.Lend0		# only one limb left, skip the loop
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)
+	subq	$4,$28,$20		# main sub
+	ldq	$1,8($17)		# next s1 limb into $1; $4 is still read below
+	addq	$18,8,$18
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	addq	$17,8,$17
+	stq	$20,0($16)
+	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the add and the sub
+	addq	$16,8,$16
+	or	$1,$31,$4		# move the next s1 limb into $4
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	subq	$4,$28,$20		# main sub
+	cmpult	$28,$25,$8		# compute cy from the cy add
+	cmpult	$4,$20,$25		# compute cy (borrow) from last sub
+	stq	$20,0($16)
+	or	$8,$25,$25		# combine cy from the add and the sub
+
+.Lret:	or	$25,$31,$0		# return cy
+	ret	$31,($26),1
+	.end	__mpn_sub_n
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s
index c28434926b..aa8417bb6a 100644
--- a/sysdeps/alpha/lshift.s
+++ b/sysdeps/alpha/lshift.s
@@ -53,11 +53,11 @@ __mpn_lshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	srl	$4,$7,$0	# compute function result
 
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,-8($17)
 	subq	$16,8,$16
 	subq	$17,8,$17
@@ -67,12 +67,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,0($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
 
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 
 	.align	3
-Loop:	ldq	$3,-8($17)
+.Loop:	ldq	$3,-8($17)
 	subq	$16,32,$16
 	subq	$18,4,$18
 	sll	$4,$19,$5
@@ -100,9 +100,9 @@ Loop:	ldq	$3,-8($17)
 	bis	$1,$2,$8
 	stq	$8,0($16)
 
-	bgt	$18,Loop
+	bgt	$18,.Loop
 
-Lend:	sll	$4,$19,$8
+.Lend:	sll	$4,$19,$8
 	stq	$8,-8($16)
 	ret	$31,($26),1
 	.end	__mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s
index 3ef194d7e6..58a63dfa5d 100644
--- a/sysdeps/alpha/mul_1.s
+++ b/sysdeps/alpha/mul_1.s
@@ -1,7 +1,7 @@
  # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
  # the result in a second limb vector.
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s
index 74eab0434a..037b776017 100644
--- a/sysdeps/alpha/rshift.s
+++ b/sysdeps/alpha/rshift.s
@@ -34,7 +34,7 @@
  # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
  # 2. Only aligned instruction pairs can be paired.
  # 3. The store buffer or silo might not be able to deal with the bandwidth.
-      
+
 	.set	noreorder
 	.set	noat
 .text
@@ -51,11 +51,11 @@ __mpn_rshift:
 	and	$18,4-1,$20	# number of limbs in first loop
 	sll	$4,$7,$0	# compute function result
 
-	beq	$20,L0
+	beq	$20,.L0
 	subq	$18,$20,$18
 
 	.align	3
-Loop0:
+.Loop0:
 	ldq	$3,0($17)
 	addq	$16,8,$16
 	addq	$17,8,$17
@@ -65,12 +65,12 @@ Loop0:
 	bis	$3,$3,$4
 	bis	$5,$6,$8
 	stq	$8,-8($16)
-	bne	$20,Loop0
+	bne	$20,.Loop0
 
-L0:	beq	$18,Lend
+.L0:	beq	$18,.Lend
 
 	.align	3
-Loop:	ldq	$3,0($17)
+.Loop:	ldq	$3,0($17)
 	addq	$16,32,$16
 	subq	$18,4,$18
 	srl	$4,$19,$5
@@ -98,9 +98,9 @@ Loop:	ldq	$3,0($17)
 	bis	$1,$2,$8
 	stq	$8,-8($16)
 
-	bgt	$18,Loop
+	bgt	$18,.Loop
 
-Lend:	srl	$4,$19,$8
+.Lend:	srl	$4,$19,$8
 	stq	$8,0($16)
 	ret	$31,($26),1
 	.end	__mpn_rshift
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s
index acaa11c545..292b2c18b6 100644
--- a/sysdeps/alpha/submul_1.s
+++ b/sysdeps/alpha/submul_1.s
@@ -26,16 +26,7 @@
  # size		r18
  # s2_limb	r19
 
- # This code runs at 42 cycles/limb on the 21064.
-
- # To improve performance for long multiplications, we would use
- # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
- # these instructions without slowing down the general code: 1. We can
- # only have two prefetches in operation at any time in the Alpha
- # architecture.  2. There will seldom be any special alignment
- # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
- # loop into an inner and outer loop, having the inner loop handle
- # exactly one prefetch block?
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
 
 	.set	noreorder
 	.set	noat
@@ -52,7 +43,7 @@ __mpn_submul_1:
 	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	umulh	$2,$19,$0	# $0 = prod_high
-	beq	$18,Lend1	# jump if size was == 1
+	beq	$18,.Lend1	# jump if size was == 1
 	ldq	$2,0($17)	# $2 = s1_limb
 	addq	$17,8,$17	# s1_ptr++
 	subq	$18,1,$18	# size--
@@ -60,10 +51,10 @@ __mpn_submul_1:
 	cmpult	$5,$3,$4
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
-	beq	$18,Lend2	# jump if size was == 2
+	beq	$18,.Lend2	# jump if size was == 2
 
 	.align	3
-Loop:	mulq	$2,$19,$3	# $3 = prod_low
+.Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	subq	$18,1,$18	# size--
@@ -77,9 +68,9 @@ Loop:	mulq	$2,$19,$3	# $3 = prod_low
 	stq	$3,0($16)
 	addq	$16,8,$16	# res_ptr++
 	addq	$5,$0,$0	# combine carries
-	bne	$18,Loop
+	bne	$18,.Loop
 
-Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+.Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	ldq	$5,0($16)	# $5 = *res_ptr
 	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
 	umulh	$2,$19,$4	# $4 = cy_limb
@@ -91,7 +82,7 @@ Lend2:	mulq	$2,$19,$3	# $3 = prod_low
 	addq	$5,$0,$0	# combine carries
 	addq	$4,$0,$0	# cy_limb = prod_high + cy
 	ret	$31,($26),1
-Lend1:	subq	$5,$3,$3
+.Lend1:	subq	$5,$3,$3
 	cmpult	$5,$3,$5
 	stq	$3,0($16)
 	addq	$0,$5,$0
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S
index bafafd672e..ce590ede6c 100644
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -1,6 +1,6 @@
  # Alpha 21064 __udiv_qrnnd
 
- # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
  # This file is part of the GNU MP Library.
 
@@ -21,13 +21,11 @@
 
         .set noreorder
         .set noat
-
 .text
-        .align 3
-        .globl __udiv_qrnnd
-        .ent __udiv_qrnnd 0
+        .align	3
+        .globl	__udiv_qrnnd
+        .ent	__udiv_qrnnd
 __udiv_qrnnd:
-__udiv_qrnnd..ng:
         .frame $30,0,$26,0
         .prologue 0
 #define cnt	$2
@@ -39,9 +37,9 @@ __udiv_qrnnd..ng:
 #define qb	$20
 
 	ldiq	cnt,16
-	blt	d,Largedivisor
+	blt	d,.Largedivisor
 
-Loop1:	cmplt	n0,0,tmp
+.Loop1:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -74,12 +72,12 @@ Loop1:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop1
+	bgt	cnt,.Loop1
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-Largedivisor:
+.Largedivisor:
 	and	n0,1,$4
 
 	srl	n0,1,n0
@@ -91,7 +89,7 @@ Largedivisor:
 	srl	d,1,$5
 	addq	$5,$6,$5
 
-Loop2:	cmplt	n0,0,tmp
+.Loop2:	cmplt	n0,0,tmp
 	addq	n1,n1,n1
 	bis	n1,tmp,n1
 	addq	n0,n0,n0
@@ -124,27 +122,27 @@ Loop2:	cmplt	n0,0,tmp
 	cmovne	qb,tmp,n1
 	bis	n0,qb,n0
 	subq	cnt,1,cnt
-	bgt	cnt,Loop2
+	bgt	cnt,.Loop2
 
 	addq	n1,n1,n1
 	addq	$4,n1,n1
-	bne	$6,Odd
+	bne	$6,.LOdd
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1
 
-Odd:
+.LOdd:
 	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
 	cmpult	n1,n0,tmp	# tmp := carry from addq
-	beq	tmp,LLp6
+	beq	tmp,.LLp6
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp6:	cmpult	n1,d,tmp
-	bne	tmp,LLp7
+.LLp6:	cmpult	n1,d,tmp
+	bne	tmp,.LLp7
 	addq	n0,1,n0
 	subq	n1,d,n1
-LLp7:
+.LLp7:
 	stq	n1,0(rem_ptr)
 	bis	$31,n0,$0
 	ret	$31,($26),1