about summary refs log tree commit diff
path: root/REORG.TODO/sysdeps/alpha/alphaev5
diff options
context:
space:
mode:
author Zack Weinberg <zackw@panix.com> 2017-06-08 15:39:03 -0400
committer Zack Weinberg <zackw@panix.com> 2017-06-08 15:39:03 -0400
commit 5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree 4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/alpha/alphaev5
parent 199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
download glibc-zack/build-layout-experiment.tar.gz
glibc-zack/build-layout-experiment.tar.xz
glibc-zack/build-layout-experiment.zip
Prepare for radical source tree reorganization. zack/build-layout-experiment
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/alpha/alphaev5')
-rw-r--r-- REORG.TODO/sysdeps/alpha/alphaev5/add_n.S 146
-rw-r--r-- REORG.TODO/sysdeps/alpha/alphaev5/lshift.S 172
-rw-r--r-- REORG.TODO/sysdeps/alpha/alphaev5/rshift.S 170
-rw-r--r-- REORG.TODO/sysdeps/alpha/alphaev5/sub_n.S 147
4 files changed, 635 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/alpha/alphaev5/add_n.S b/REORG.TODO/sysdeps/alpha/alphaev5/add_n.S
new file mode 100644
index 0000000000..d7db8f4672
--- /dev/null
+++ b/REORG.TODO/sysdeps/alpha/alphaev5/add_n.S
@@ -0,0 +1,146 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995-2017 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16	(destination limb vector)
+ # s1_ptr	$17	(first source limb vector)
+ # s2_ptr	$18	(second source limb vector)
+ # size		$19	(number of 64-bit limbs in each vector)
+ # RETURN VALUE: $0 = carry out of the last limb added (0 or 1)
+	.set	noreorder		# keep the hand-scheduled instruction order
+	.set	noat			# $28 is used explicitly as a scratch register
+.text
+	.align	3
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.frame	$30,0,$26,0		# frame size 0, return address in $26
+
+	or	$31,$31,$25		# clear cy ($25 holds the running carry)
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop ($0-$3 hold s2 limbs, $4-$7 s1 limbs)
+	ldq	$0,0($18)		# s2 limb 0
+	ldq	$1,8($18)		# s2 limb 1
+	ldq	$4,0($17)		# s1 limb 0
+	ldq	$5,8($17)		# s1 limb 1
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)		# s2 limb 2
+	addq	$0,$4,$20		# 1st main add
+	ldq	$3,24($18)		# s2 limb 3
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)		# s1 limb 2 (s1_ptr already advanced)
+	cmpult	$20,$0,$25		# cy out of the 1st main add
+	ldq	$7,-8($17)		# s1 limb 3 (s1_ptr already advanced)
+	addq	$1,$25,$28		# cy add
+	addq	$18,32,$18		# update s2_ptr
+	addq	$5,$28,$21		# 2nd main add
+	cmpult	$28,$25,$8		# cy out of the cy add (kept in $8)
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+	.align	4
+.Loop:	cmpult	$21,$28,$25		# cy out of the 2nd main add
+	ldq	$0,0($18)		# s2 limb 0 of the next group
+	or	$8,$25,$25		# combine cy from the two adds
+	ldq	$1,8($18)		# s2 limb 1 of the next group
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)		# s1 limb 0 of the next group
+	addq	$28,$6,$22		# 3rd main add
+	ldq	$5,8($17)		# s1 limb 1 of the next group
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$22,$28,$25		# cy out of the 3rd main add
+	stq	$20,0($16)		# store 1st sum
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)		# store 2nd sum
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$23,$28,$25		# cy out of the 4th main add
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)		# s2 limb 2 of the next group
+	addq	$4,$28,$20		# 1st main add
+	ldq	$3,24($18)		# s2 limb 3 of the next group
+	cmpult	$28,$25,$8		# cy out of the cy add
+	ldq	$6,-16($17)		# s1 limb 2 (s1_ptr already advanced)
+	cmpult	$20,$28,$25		# cy out of the 1st main add
+	ldq	$7,-8($17)		# s1 limb 3 (s1_ptr already advanced)
+	or	$8,$25,$25		# combine cy from the two adds
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)		# store 3rd sum (res_ptr already advanced)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)		# store 4th sum (res_ptr already advanced)
+	addq	$5,$28,$21		# 2nd main add
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# cy out of the cy add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop (no more loads, drain the 4 limbs in flight)
+.Lend1:	cmpult	$21,$28,$25		# cy out of the 2nd main add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$2,$25,$28		# cy add
+	addq	$28,$6,$22		# 3rd main add
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$22,$28,$25		# cy out of the 3rd main add
+	stq	$20,0($16)		# store 1st sum
+	or	$8,$25,$25		# combine cy from the two adds
+	stq	$21,8($16)		# store 2nd sum
+	addq	$3,$25,$28		# cy add
+	addq	$28,$7,$23		# 4th main add
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$23,$28,$25		# cy out of the 4th main add
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)		# store 3rd sum (res_ptr already advanced)
+	stq	$23,-8($16)		# store 4th sum (res_ptr already advanced)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret		# nothing left: return the current cy
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)		# s2 limb
+	ldq	$4,0($17)		# s1 limb
+	subq	$19,1,$19		# the last limb is handled at .Lend0
+	beq	$19,.Lend0		# only one limb left: skip the loop
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)		# next s2 limb
+	addq	$4,$28,$20		# main add
+	ldq	$4,8($17)		# next s1 limb
+	addq	$18,8,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# cy out of the cy add
+	addq	$17,8,$17		# update s1_ptr
+	stq	$20,0($16)		# store sum
+	cmpult	$20,$28,$25		# cy out of the main add
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the two adds
+	addq	$16,8,$16		# update res_ptr
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	addq	$4,$28,$20		# main add
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$20,$28,$25		# cy out of the main add
+	stq	$20,0($16)		# store the last sum
+	or	$8,$25,$25		# combine cy from the two adds
+
+.Lret:	or	$25,$31,$0		# return cy
+	ret	$31,($26),1
+	.end	__mpn_add_n
diff --git a/REORG.TODO/sysdeps/alpha/alphaev5/lshift.S b/REORG.TODO/sysdeps/alpha/alphaev5/lshift.S
new file mode 100644
index 0000000000..24ff8e2fc3
--- /dev/null
+++ b/REORG.TODO/sysdeps/alpha/alphaev5/lshift.S
@@ -0,0 +1,172 @@
+ # Alpha EV5 __mpn_lshift -- Shift a limb vector left by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1994-2017 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16	(destination limb vector)
+ # s1_ptr	r17	(source limb vector)
+ # size		r18	(number of 64-bit limbs)
+ # cnt		r19	(left shift count in bits, presumably 1..63 -- TODO confirm with callers)
+ # RETURN VALUE: r0 = highest-address source limb shifted right by -cnt (the bits shifted out)
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+	.set	noreorder	# keep the hand-scheduled instruction order
+	.set	noat		# r28 is used explicitly as a scratch register
+.text
+	.align	3
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.frame	$30,0,$26,0	# frame size 0, return address in r26
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb (the one at the highest address)
+	subq	$31,$19,$20	# r20 = -cnt, shift count for the opposite direction
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18	# r18 = limbs left after the first one
+	and	$18,4-1,$28	# number of limbs in first loop
+	srl	$4,$20,$0	# compute function result
+
+	beq	$28,.L0		# first loop not needed when (size-1) is a multiple of 4
+	subq	$18,$28,$18	# r18 = limbs left for the unrolled code (multiple of 4)
+
+	.align	3
+.Loop0:	ldq	$3,-16($17)	# next lower limb
+	subq	$16,8,$16	# step res_ptr down one limb
+	sll	$4,$19,$5	# high part, from the current limb
+	subq	$17,8,$17	# step s1_ptr down one limb
+	subq	$28,1,$28	# decr first-loop count
+	srl	$3,$20,$6	# low part, from the next lower limb
+	or	$3,$3,$4	# next lower limb becomes the current limb
+	or	$5,$6,$8	# combine the two parts
+	stq	$8,0($16)	# store result limb
+	bne	$28,.Loop0
+
+.L0:	sll	$4,$19,$24	# r24 = pending high part of the current limb
+	beq	$18,.Lend	# no limbs left: only r24 remains to be stored
+ # warm up phase 1 (load the next 4 limbs into r1-r4)
+	ldq	$1,-16($17)
+	subq	$18,4,$18
+	ldq	$2,-24($17)
+	ldq	$3,-32($17)
+	ldq	$4,-40($17)
+	beq	$18,.Lend1	# exactly 4 limbs were left
+ # warm up phase 2 (shift the loaded limbs while loading 4 more)
+	srl	$1,$20,$7	# r7, r8, r5, r6 = low parts of the 4 result limbs
+	sll	$1,$19,$21	# r21, r22, r23, r24 = high parts for the following limb
+	srl	$2,$20,$8
+	ldq	$1,-48($17)
+	sll	$2,$19,$22
+	ldq	$2,-56($17)
+	srl	$3,$20,$5
+	or	$7,$24,$7	# complete 1st result limb
+	sll	$3,$19,$23
+	or	$8,$21,$8	# complete 2nd result limb
+	srl	$4,$20,$6
+	ldq	$3,-64($17)
+	sll	$4,$19,$24
+	ldq	$4,-72($17)
+	subq	$18,4,$18
+	beq	$18,.Lend2
+	.align  4
+ # main loop (stores 4 result limbs and loads 4 new source limbs per iteration)
+.Loop:	stq	$7,-8($16)
+	or	$5,$22,$5	# complete 3rd result limb
+	stq	$8,-16($16)
+	or	$6,$23,$6	# complete 4th result limb
+
+	srl	$1,$20,$7
+	subq	$18,4,$18	# decr loop cnt
+	sll	$1,$19,$21
+	unop	# ldq	$31,-96($17)
+
+	srl	$2,$20,$8
+	ldq	$1,-80($17)
+	sll	$2,$19,$22
+	ldq	$2,-88($17)
+
+	stq	$5,-24($16)
+	or	$7,$24,$7	# complete 1st result limb of the next group
+	stq	$6,-32($16)
+	or	$8,$21,$8	# complete 2nd result limb of the next group
+
+	srl	$3,$20,$5
+	unop	# ldq	$31,-96($17)
+	sll	$3,$19,$23
+	subq	$16,32,$16	# update res_ptr
+
+	srl	$4,$20,$6
+	ldq	$3,-96($17)
+	sll	$4,$19,$24
+	ldq	$4,-104($17)
+
+	subq	$17,32,$17	# update s1_ptr
+	bne	$18,.Loop
+ # cool down phase 2/1 (no more loads, 8 limbs still in flight)
+.Lend2:	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	stq	$5,-24($16)
+	or	$7,$24,$7
+	stq	$6,-32($16)
+	or	$8,$21,$8
+	srl	$3,$20,$5
+	sll	$3,$19,$23
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,-40($16)
+	or	$5,$22,$5
+	stq	$8,-48($16)
+	or	$6,$23,$6
+	stq	$5,-56($16)
+	stq	$6,-64($16)
+ # cool down phase 2/3
+	stq	$24,-72($16)	# last result limb: only the left-shifted part, no low part is merged in
+	ret	$31,($26),1
+
+ # cool down phase 1/1 (exactly 4 limbs loaded, none left to load)
+.Lend1:	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	srl	$3,$20,$5
+	or	$7,$24,$7
+	sll	$3,$19,$23
+	or	$8,$21,$8
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	stq	$5,-24($16)
+	stq	$6,-32($16)
+	stq	$24,-40($16)	# last result limb: only the left-shifted part
+	ret	$31,($26),1
+
+.Lend:	stq	$24,-8($16)	# last result limb: only the left-shifted part
+	ret	$31,($26),1
+	.end	__mpn_lshift
diff --git a/REORG.TODO/sysdeps/alpha/alphaev5/rshift.S b/REORG.TODO/sysdeps/alpha/alphaev5/rshift.S
new file mode 100644
index 0000000000..0a44c77d0a
--- /dev/null
+++ b/REORG.TODO/sysdeps/alpha/alphaev5/rshift.S
@@ -0,0 +1,170 @@
+ # Alpha EV5 __mpn_rshift -- Shift a limb vector right by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1994-2017 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16	(destination limb vector)
+ # s1_ptr	r17	(source limb vector)
+ # size		r18	(number of 64-bit limbs)
+ # cnt		r19	(right shift count in bits, presumably 1..63 -- TODO confirm with callers)
+ # RETURN VALUE: r0 = lowest-address source limb shifted left by -cnt (the bits shifted out)
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+	.set	noreorder	# keep the hand-scheduled instruction order
+	.set	noat		# r28 is used explicitly as a scratch register
+.text
+	.align	3
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.frame	$30,0,$26,0	# frame size 0, return address in r26
+
+	ldq	$4,0($17)	# load first limb
+	subq	$31,$19,$20	# r20 = -cnt, shift count for the opposite direction
+	subq	$18,1,$18	# r18 = limbs left after the first one
+	and	$18,4-1,$28	# number of limbs in first loop
+	sll	$4,$20,$0	# compute function result
+
+	beq	$28,.L0		# first loop not needed when (size-1) is a multiple of 4
+	subq	$18,$28,$18	# r18 = limbs left for the unrolled code (multiple of 4)
+
+	.align	3
+.Loop0:	ldq	$3,8($17)	# next higher limb
+	addq	$16,8,$16	# step res_ptr up one limb
+	srl	$4,$19,$5	# low part, from the current limb
+	addq	$17,8,$17	# step s1_ptr up one limb
+	subq	$28,1,$28	# decr first-loop count
+	sll	$3,$20,$6	# high part, from the next higher limb
+	or	$3,$3,$4	# next higher limb becomes the current limb
+	or	$5,$6,$8	# combine the two parts
+	stq	$8,-8($16)	# store result limb (res_ptr already advanced)
+	bne	$28,.Loop0
+
+.L0:	srl	$4,$19,$24	# r24 = pending low part of the current limb
+	beq	$18,.Lend	# no limbs left: only r24 remains to be stored
+ # warm up phase 1 (load the next 4 limbs into r1-r4)
+	ldq	$1,8($17)
+	subq	$18,4,$18
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+	ldq	$4,32($17)
+	beq	$18,.Lend1	# exactly 4 limbs were left
+ # warm up phase 2 (shift the loaded limbs while loading 4 more)
+	sll	$1,$20,$7	# r7, r8, r5, r6 = high parts of the 4 result limbs
+	srl	$1,$19,$21	# r21, r22, r23, r24 = low parts for the following limb
+	sll	$2,$20,$8
+	ldq	$1,40($17)
+	srl	$2,$19,$22
+	ldq	$2,48($17)
+	sll	$3,$20,$5
+	or	$7,$24,$7	# complete 1st result limb
+	srl	$3,$19,$23
+	or	$8,$21,$8	# complete 2nd result limb
+	sll	$4,$20,$6
+	ldq	$3,56($17)
+	srl	$4,$19,$24
+	ldq	$4,64($17)
+	subq	$18,4,$18
+	beq	$18,.Lend2
+	.align  4
+ # main loop (stores 4 result limbs and loads 4 new source limbs per iteration)
+.Loop:	stq	$7,0($16)
+	or	$5,$22,$5	# complete 3rd result limb
+	stq	$8,8($16)
+	or	$6,$23,$6	# complete 4th result limb
+
+	sll	$1,$20,$7
+	subq	$18,4,$18	# decr loop cnt
+	srl	$1,$19,$21
+	unop	# ldq	$31,-96($17)  NOTE(review): negative offset looks copied from lshift
+
+	sll	$2,$20,$8
+	ldq	$1,72($17)
+	srl	$2,$19,$22
+	ldq	$2,80($17)
+
+	stq	$5,16($16)
+	or	$7,$24,$7	# complete 1st result limb of the next group
+	stq	$6,24($16)
+	or	$8,$21,$8	# complete 2nd result limb of the next group
+
+	sll	$3,$20,$5
+	unop	# ldq	$31,-96($17)  NOTE(review): negative offset looks copied from lshift
+	srl	$3,$19,$23
+	addq	$16,32,$16	# update res_ptr
+
+	sll	$4,$20,$6
+	ldq	$3,88($17)
+	srl	$4,$19,$24
+	ldq	$4,96($17)
+
+	addq	$17,32,$17	# update s1_ptr
+	bne	$18,.Loop
+ # cool down phase 2/1 (no more loads, 8 limbs still in flight)
+.Lend2:	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	stq	$5,16($16)
+	or	$7,$24,$7
+	stq	$6,24($16)
+	or	$8,$21,$8
+	sll	$3,$20,$5
+	srl	$3,$19,$23
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,32($16)
+	or	$5,$22,$5
+	stq	$8,40($16)
+	or	$6,$23,$6
+	stq	$5,48($16)
+	stq	$6,56($16)
+ # cool down phase 2/3
+	stq	$24,64($16)	# last result limb: only the right-shifted part, no high part is merged in
+	ret	$31,($26),1
+
+ # cool down phase 1/1 (exactly 4 limbs loaded, none left to load)
+.Lend1:	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	sll	$3,$20,$5
+	or	$7,$24,$7
+	srl	$3,$19,$23
+	or	$8,$21,$8
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	stq	$5,16($16)
+	stq	$6,24($16)
+	stq	$24,32($16)	# last result limb: only the right-shifted part
+	ret	$31,($26),1
+
+.Lend:	stq	$24,0($16)	# last result limb: only the right-shifted part
+	ret	$31,($26),1
+	.end	__mpn_rshift
diff --git a/REORG.TODO/sysdeps/alpha/alphaev5/sub_n.S b/REORG.TODO/sysdeps/alpha/alphaev5/sub_n.S
new file mode 100644
index 0000000000..032b0c616b
--- /dev/null
+++ b/REORG.TODO/sysdeps/alpha/alphaev5/sub_n.S
@@ -0,0 +1,147 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995-2017 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16	(destination limb vector)
+ # s1_ptr	$17	(minuend limb vector)
+ # s2_ptr	$18	(subtrahend limb vector)
+ # size		$19	(number of 64-bit limbs in each vector)
+ # RETURN VALUE: $0 = borrow out of the last limb subtracted (0 or 1)
+	.set	noreorder		# keep the hand-scheduled instruction order
+	.set	noat			# $28 is used explicitly as a scratch register
+.text
+	.align	3
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.frame	$30,0,$26,0		# frame size 0, return address in $26
+
+	or	$31,$31,$25		# clear cy ($25 holds the running borrow)
+	subq	$19,4,$19		# decr loop cnt
+	blt	$19,.Lend2		# if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop ($0-$3 hold s2 limbs, $4-$7 s1 limbs)
+	ldq	$0,0($18)		# s2 limb 0
+	ldq	$1,8($18)		# s2 limb 1
+	ldq	$4,0($17)		# s1 limb 0
+	ldq	$5,8($17)		# s1 limb 1
+	addq	$17,32,$17		# update s1_ptr
+	ldq	$2,16($18)		# s2 limb 2
+	subq	$4,$0,$20		# 1st main sub
+	ldq	$3,24($18)		# s2 limb 3
+	subq	$19,4,$19		# decr loop cnt
+	ldq	$6,-16($17)		# s1 limb 2 (s1_ptr already advanced)
+	cmpult	$4,$20,$25		# compute cy from last sub
+	ldq	$7,-8($17)		# s1 limb 3 (s1_ptr already advanced)
+	addq	$1,$25,$28		# cy add (s2 limb + incoming cy)
+	addq	$18,32,$18		# update s2_ptr
+	subq	$5,$28,$21		# 2nd main sub
+	cmpult	$28,$25,$8		# cy out of the cy add (kept in $8)
+	blt	$19,.Lend1		# if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+	.align	4
+.Loop:	cmpult	$5,$21,$25		# cy (borrow) out of the 2nd main sub
+	ldq	$0,0($18)		# s2 limb 0 of the next group
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	ldq	$1,8($18)		# s2 limb 1 of the next group
+	addq	$2,$25,$28		# cy add
+	ldq	$4,0($17)		# s1 limb 0 of the next group
+	subq	$6,$28,$22		# 3rd main sub
+	ldq	$5,8($17)		# s1 limb 1 of the next group
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$6,$22,$25		# cy (borrow) out of the 3rd main sub
+	stq	$20,0($16)		# store 1st difference
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	stq	$21,8($16)		# store 2nd difference
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$7,$23,$25		# cy (borrow) out of the 4th main sub
+	addq	$17,32,$17		# update s1_ptr
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	addq	$16,32,$16		# update res_ptr
+	addq	$0,$25,$28		# cy add
+	ldq	$2,16($18)		# s2 limb 2 of the next group
+	subq	$4,$28,$20		# 1st main sub
+	ldq	$3,24($18)		# s2 limb 3 of the next group
+	cmpult	$28,$25,$8		# cy out of the cy add
+	ldq	$6,-16($17)		# s1 limb 2 (s1_ptr already advanced)
+	cmpult	$4,$20,$25		# cy (borrow) out of the 1st main sub
+	ldq	$7,-8($17)		# s1 limb 3 (s1_ptr already advanced)
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	subq	$19,4,$19		# decr loop cnt
+	stq	$22,-16($16)		# store 3rd difference (res_ptr already advanced)
+	addq	$1,$25,$28		# cy add
+	stq	$23,-8($16)		# store 4th difference (res_ptr already advanced)
+	subq	$5,$28,$21		# 2nd main sub
+	addq	$18,32,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# cy out of the cy add
+	bge	$19,.Loop
+ # Finish software pipeline for 1st loop (no more loads, drain the 4 limbs in flight)
+.Lend1:	cmpult	$5,$21,$25		# cy (borrow) out of the 2nd main sub
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	addq	$2,$25,$28		# cy add
+	subq	$6,$28,$22		# 3rd main sub
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$6,$22,$25		# cy (borrow) out of the 3rd main sub
+	stq	$20,0($16)		# store 1st difference
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	stq	$21,8($16)		# store 2nd difference
+	addq	$3,$25,$28		# cy add
+	subq	$7,$28,$23		# 4th main sub
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$7,$23,$25		# cy (borrow) out of the 4th main sub
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	addq	$16,32,$16		# update res_ptr
+	stq	$22,-16($16)		# store 3rd difference (res_ptr already advanced)
+	stq	$23,-8($16)		# store 4th difference (res_ptr already advanced)
+.Lend2:	addq	$19,4,$19		# restore loop cnt
+	beq	$19,.Lret		# nothing left: return the current cy
+ # Start software pipeline for 2nd loop
+	ldq	$0,0($18)		# s2 limb
+	ldq	$4,0($17)		# s1 limb
+	subq	$19,1,$19		# the last limb is handled at .Lend0
+	beq	$19,.Lend0		# only one limb left: skip the loop
+ # 2nd loop handles remaining 1-3 limbs
+	.align	4
+.Loop0:	addq	$0,$25,$28		# cy add
+	ldq	$0,8($18)		# next s2 limb
+	subq	$4,$28,$20		# main sub
+	ldq	$1,8($17)		# next s1 limb, into $1 because $4 is still read below
+	addq	$18,8,$18		# update s2_ptr
+	cmpult	$28,$25,$8		# cy out of the cy add
+	addq	$17,8,$17		# update s1_ptr
+	stq	$20,0($16)		# store difference
+	cmpult	$4,$20,$25		# cy (borrow) out of the main sub
+	subq	$19,1,$19		# decr loop cnt
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+	addq	$16,8,$16		# update res_ptr
+	or	$1,$31,$4		# move the next s1 limb into $4
+	bne	$19,.Loop0
+.Lend0:	addq	$0,$25,$28		# cy add
+	subq	$4,$28,$20		# main sub
+	cmpult	$28,$25,$8		# cy out of the cy add
+	cmpult	$4,$20,$25		# cy (borrow) out of the main sub
+	stq	$20,0($16)		# store the last difference
+	or	$8,$25,$25		# combine cy from the cy add and the sub
+
+.Lret:	or	$25,$31,$0		# return cy
+	ret	$31,($26),1
+	.end	__mpn_sub_n