about summary refs log tree commit diff
path: root/src/math/i386/scalbnl.s
diff options
context:
space:
mode:
author Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
committer Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
commit baa43bca0a051e8deb0d6a9a8882ceeea5c27249 (patch)
tree f5fe7ae916d9039adfe82217716e2aafd08702fb /src/math/i386/scalbnl.s
parent 7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b (diff)
download musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.gz
musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.xz
musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.zip
optimize scalbn family
the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.

for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.

on an atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
Diffstat (limited to 'src/math/i386/scalbnl.s')
-rw-r--r-- src/math/i386/scalbnl.s 16
1 files changed, 15 insertions, 1 deletions
diff --git a/src/math/i386/scalbnl.s b/src/math/i386/scalbnl.s
index 224b1bef..54414c2e 100644
--- a/src/math/i386/scalbnl.s
+++ b/src/math/i386/scalbnl.s
@@ -11,7 +11,21 @@ scalblnl:
 .global scalbnl
 .type scalbnl,@function
 scalbnl:
-	fildl 16(%esp)
+	mov 16(%esp),%eax
+	add $0x3ffe,%eax
+	cmp $0x7ffd,%eax
+	jae 1f
+	inc %eax
+	fldt 4(%esp)
+	mov %eax,12(%esp)
+	mov $0x80000000,%eax
+	mov %eax,8(%esp)
+	xor %eax,%eax
+	mov %eax,4(%esp)
+	fldt 4(%esp)
+	fmulp
+	ret
+1:	fildl 16(%esp)
 	fldt 4(%esp)
 	fscale
 	fstp %st(1)