author    Ulrich Drepper <drepper@redhat.com>    2010-09-02 23:36:25 -0700
committer Ulrich Drepper <drepper@redhat.com>    2010-09-02 23:36:25 -0700
commit    0959ffc97b738c489087bcf45578c1580a87e66d (patch)
tree      ac76fbfa5e53376a579a3220a4a7873624e4a296 /sysdeps/x86_64/lshift.S
parent    ece298407076558531796450af39199aa0b34bef (diff)
Update x86-64 mpn routines from GMP 5.0.1.
Diffstat (limited to 'sysdeps/x86_64/lshift.S')
-rw-r--r--  sysdeps/x86_64/lshift.S  | 127
1 file changed, 92 insertions(+), 35 deletions(-)
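
For reference, a minimal C sketch of the operation the routine below implements, following the usual GMP mpn_lshift contract: shift the n-limb operand {up, n} left by cnt bits (1 <= cnt < 64), store the result at {rp, n}, and return the bits shifted out of the most significant limb. The function name, the uint64_t limb type, and the values in main are illustrative only, not glibc or GMP definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative reference for mpn_lshift semantics, assuming 64-bit limbs
   and 1 <= cnt <= 63.  Limbs are processed from the most significant end
   downwards, matching the direction the assembly walks the arrays, so an
   in-place shift (rp == up) also works.  */
static uint64_t
ref_lshift (uint64_t *rp, const uint64_t *up, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  uint64_t retval = up[n - 1] >> tnc;      /* bits shifted out at the top */
  for (long i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
  rp[0] = up[0] << cnt;
  return retval;
}

int
main (void)
{
  uint64_t a[3] = { 0x8000000000000001ull, 0x1ull, 0xf000000000000000ull };
  uint64_t r[3];
  uint64_t out = ref_lshift (r, a, 3, 4);
  printf ("carry-out: %#llx\n", (unsigned long long) out);
  printf ("limbs (high to low): %#llx %#llx %#llx\n",
          (unsigned long long) r[2], (unsigned long long) r[1],
          (unsigned long long) r[0]);
  return 0;
}
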
diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S
index 5ac66f0a36..f89d3e09b3 100644
--- a/sysdeps/x86_64/lshift.S
+++ b/sysdeps/x86_64/lshift.S
@@ -1,5 +1,5 @@
-/* AMD64 __mpn_lshift --
-   Copyright 2004, 2006 Free Software Foundation, Inc.
+/* x86-64 __mpn_lshift --
+   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -20,41 +20,98 @@
 #include "sysdep.h"
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n	%rdx
+#define cnt	%cl
 
 	.text
 ENTRY (__mpn_lshift)
-	movq	-8(%rsi,%rdx,8), %mm7
-	movd	%ecx, %mm1
-	movl	$64, %eax
-	subl	%ecx, %eax
-	movd	%eax, %mm0
-	movq	%mm7, %mm3
-	psrlq	%mm0, %mm7
-	movd	%mm7, %rax
-	subq	$2, %rdx
-	jl	L(endo)
-	.p2align 2
-L(loop):
-	movq	(%rsi,%rdx,8), %mm6
-	movq	%mm6, %mm2
-	psrlq	%mm0, %mm6
-	psllq	%mm1, %mm3
-	por	%mm6, %mm3
-	movq	%mm3, 8(%rdi,%rdx,8)
-	je	L(ende)
-	movq	-8(%rsi,%rdx,8), %mm7
-	movq	%mm7, %mm3
-	psrlq	%mm0, %mm7
-	psllq	%mm1, %mm2
-	por	%mm7, %mm2
-	movq	%mm2, (%rdi,%rdx,8)
-	subq	$2, %rdx
-	jge	L(loop)
-L(endo):
-	movq	%mm3, %mm2
-L(ende):
-	psllq	%mm1, %mm2
-	movq	%mm2, (%rdi)
-	emms
+	lea	-8(rp,n,8), rp
+	lea	-8(up,n,8), up
+
+	mov	%edx, %eax
+	and	$3, %eax
+	jne	L(nb00)
+L(b00):	/* n = 4, 8, 12, ... */
+	mov	(up), %r10
+	mov	-8(up), %r11
+	xor	%eax, %eax
+	shld	%cl, %r10, %rax
+	mov	-16(up), %r8
+	lea	24(rp), rp
+	sub	$4, n
+	jmp	L(00)
+
+L(nb00):/* n = 1, 5, 9, ... */
+	cmp	$2, %eax
+	jae	L(nb01)
+L(b01):	mov	(up), %r9
+	xor	%eax, %eax
+	shld	%cl, %r9, %rax
+	sub	$2, n
+	jb	L(le1)
+	mov	-8(up), %r10
+	mov	-16(up), %r11
+	lea	-8(up), up
+	lea	16(rp), rp
+	jmp	L(01)
+L(le1):	shl	%cl, %r9
+	mov	%r9, (rp)
+	ret
+
+L(nb01):/* n = 2, 6, 10, ... */
+	jne	L(b11)
+L(b10):	mov	(up), %r8
+	mov	-8(up), %r9
+	xor	%eax, %eax
+	shld	%cl, %r8, %rax
+	sub	$3, n
+	jb	L(le2)
+	mov	-16(up), %r10
+	lea	-16(up), up
+	lea	8(rp), rp
+	jmp	L(10)
+L(le2):	shld	%cl, %r9, %r8
+	mov	%r8, (rp)
+	shl	%cl, %r9
+	mov	%r9, -8(rp)
+	ret
+
+	.p2align 4		/* performance critical! */
+L(b11):	/* n = 3, 7, 11, ... */
+	mov	(up), %r11
+	mov	-8(up), %r8
+	xor	%eax, %eax
+	shld	%cl, %r11, %rax
+	mov	-16(up), %r9
+	lea	-24(up), up
+	sub	$4, n
+	jb	L(end)
+
+	.p2align 4
+L(top):	shld	%cl, %r8, %r11
+	mov	(up), %r10
+	mov	%r11, (rp)
+L(10):	shld	%cl, %r9, %r8
+	mov	-8(up), %r11
+	mov	%r8, -8(rp)
+L(01):	shld	%cl, %r10, %r9
+	mov	-16(up), %r8
+	mov	%r9, -16(rp)
+L(00):	shld	%cl, %r11, %r10
+	mov	-24(up), %r9
+	mov	%r10, -24(rp)
+	add	$-32, up
+	lea	-32(rp), rp
+	sub	$4, n
+	jnc	L(top)
+
+L(end):	shld	%cl, %r8, %r11
+	mov	%r11, (rp)
+	shld	%cl, %r9, %r8
+	mov	%r8, -8(rp)
+	shl	%cl, %r9
+	mov	%r9, -16(rp)
 	ret
 END (__mpn_lshift)
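
As the diff shows, the GMP 5.0.1 version drops the old MMX loop (psllq/psrlq/por on %mm registers, finished with emms) in favor of plain 64-bit integer code. Each output limb is produced by a single shld, which combines two adjacent source limbs exactly like the (up[i] << cnt) | (up[i-1] >> (64-cnt)) step in the sketch above, and the return value is formed by shld into a zeroed %rax. The main L(top) loop is unrolled four ways, with the n mod 4 dispatch at entry (L(b00), L(b01), L(b10), L(b11)) choosing where to fall into the loop and handling the short n <= 2 and n <= 3 cases directly.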