about summary refs log tree commit diff
path: root/src/math/i386/exp.s
diff options
context:
space:
mode:
author Rich Felker <dalias@aerifal.cx> 2012-03-19 09:00:30 -0400
committer Rich Felker <dalias@aerifal.cx> 2012-03-19 09:00:30 -0400
commit 02db27d9deaee71b244c91e720ec819c74dab150 (patch)
tree ef2543fd54a6fdbca8839cb14c71bb10ffdaa8f1 /src/math/i386/exp.s
parent da7458a602a6f0bdea25d6b9b613372048a974e6 (diff)
download musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.gz
musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.xz
musl-02db27d9deaee71b244c91e720ec819c74dab150.zip
optimize exponential asm for i386
up to 30% faster exp2 by avoiding slow frndint and fscale functions.
expm1 also takes a much more direct path for small arguments (the
expected usage case).
Diffstat (limited to 'src/math/i386/exp.s')
-rw-r--r-- src/math/i386/exp.s 87
1 files changed, 76 insertions, 11 deletions
diff --git a/src/math/i386/exp.s b/src/math/i386/exp.s
index f4769d59..76ab4d64 100644
--- a/src/math/i386/exp.s
+++ b/src/math/i386/exp.s
@@ -1,3 +1,37 @@
+.global expm1f	# float entry; expm1f, expm1l and expm1 all share the body at the 1: label under expm1
+.type expm1f,@function
+expm1f:
+	flds 4(%esp)	# push the 32-bit float at 4(%esp) onto the x87 stack
+	jmp 1f	# continue in the shared body
+
+.global expm1l	# long double entry
+.type expm1l,@function
+expm1l:
+	fldt 4(%esp)	# push the 80-bit extended value at 4(%esp)
+	jmp 1f	# continue in the shared body
+
+.global expm1	# double entry; falls straight through into the shared body
+.type expm1,@function
+expm1:
+	fldl 4(%esp)	# push the 64-bit double at 4(%esp)
+1:	fldl2e	# shared body: push log2(e)
+	fmulp	# st0 = y = x * log2(e), so that e^x = 2^y
+	fld1	# st0 = 1.0, st1 = y
+	fld %st(1)	# st0 = y, st1 = 1.0, st2 = y
+	fabs	# st0 = |y|
+	fucom %st(1)	# compare |y| with 1.0; outcome lands in the x87 condition codes
+	fnstsw %ax	# copy the x87 status word into ax
+	fstp %st(0)	# pop |y|
+	fstp %st(0)	# pop 1.0, leaving only y on the x87 stack
+	sahf	# move the condition codes from ah into the CPU flags
+	ja 1f	# |y| > 1.0: outside the input range of f2xm1, take the slow path
+	f2xm1	# st0 = 2^y - 1 computed directly; an unordered (NaN) compare also falls through to here
+	ret	# result is returned in st0
+1:	call 1f	# slow path: call the next 1: label further down, presumably the shared 2^x core under exp2 (lines in between are not shown in this patch)
+	fld1	# push 1.0
+	fsubrp	# remove the 1.0 again and pop. NOTE(review): intended as (call result) - 1.0 under the GAS AT&T reversed fsub/fsubr convention - confirm
+	ret	# result is returned in st0
+
 .global exp2f
 .type exp2f,@function
 exp2f:
@@ -34,22 +68,53 @@ exp:
 .type exp2,@function
 exp2:
 	fldl 4(%esp)
-1:	fxam
-	fnstsw %ax
+1:	mov $0x47000000,%eax	# shared 2^x core, x in st0; 0x47000000 is the float encoding of 32768.0
+	push %eax	# becomes 8(%esp) after the three pushes: the cutoff, later reused as scratch
+	flds (%esp)	# st0 = 32768.0, st1 = x
+	shl $7,%eax	# eax = 0x80000000
+	push %eax	# becomes 4(%esp): high mantissa dword with only the integer bit set
+	add %eax,%eax	# eax = 0
+	push %eax	# (%esp): low mantissa dword = 0
+	fld %st(1)	# st0 = x, st1 = 32768.0, st2 = x
+	fabs	# st0 = |x|
+	fucom %st(1)	# compare |x| with 32768.0
+	fnstsw	# store the x87 status word (operand-less form, ax) for the sahf that follows
 	sahf
-	jnp 1f
-	jnc 1f
-	fstps 4(%esp)
-	mov $0xfe,%al
-	and %al,7(%esp)
-	flds 4(%esp)
-1:	fld %st(0)
-	frndint
+	ja 2f	# |x| > 32768.0: go to the out-of-range handler at 2:
+	fstp %st(0)	# pop |x|
+	fstp %st(0)	# pop 32768.0, leaving x
+	fld %st(0)	# duplicate x
+	fistpl 8(%esp)	# n = x converted to a 32-bit integer (current x87 rounding mode), stored over the cutoff slot
+	fildl 8(%esp)	# st0 = n, st1 = x
 	fxch %st(1)
 	fsub %st(1)
+	mov $0x3fff,%eax	# 0x3fff is the exponent bias of the 80-bit extended format
+	add %eax,8(%esp)	# 8(%esp) = n + bias, so the 10 bytes at (%esp) encode 2^n. NOTE(review): n may reach +-32768 here, which does not fit the 15-bit exponent field - confirm the cutoff
 	f2xm1
 	fld1
 	faddp
-	fscale
+	fldt (%esp)	# push the hand-built 2^n
+	fmulp	# st0 = 2^(x-n) * 2^n
 	fstp %st(1)
+	add $12,%esp	# release the three pushed dwords
+	ret	# result is returned in st0
+
+2:	fstp %st(0)	# out-of-range handler: pop |x|
+	fstp %st(0)	# pop 32768.0, leaving x
+	fsts 8(%esp)	# store x as a float without popping it
+	mov 8(%esp),%eax	# eax = float bit pattern of x
+	lea (%eax,%eax),%ecx	# ecx = that pattern shifted left by one, dropping the sign bit
+	cmp $0xff000000,%ecx	# 0xff000000 here means exponent all ones, mantissa zero
+	ja 2f	# above that is the NaN pattern: skip to the exit with x still in st0
+	fstp %st(0)	# discard x
+	xor %ecx,%ecx	# ecx = 0
+	inc %ecx	# ecx = 1: smallest normal exponent field, kept when x is negative
+	add %eax,%eax	# carry flag = sign bit of x
+	jc 1f	# negative x: keep ecx = 1
+	mov $0x7ffe,%ecx	# non-negative x: largest finite exponent field
+1:	mov %ecx,8(%esp)	# overwrite the exponent word of the hand-built long double
+	fldt (%esp)	# load the resulting tiny or huge power of two
+	fld %st(0)	# duplicate it
+	fmulp	# square it so the product underflows or overflows
+2:	add $12,%esp	# release the three pushed dwords
 	ret