correctly rounded sqrt() asm for x86 (i387)

the fsqrt opcode is correctly rounded, but only in the fpu's selected precision mode, which is 80-bit extended precision. rounding that result a second time to double precision can differ from a single correct rounding in only one corner case: when the low 11 bits of the extended-precision mantissa are exactly 0x400, i.e. the extended result lies exactly halfway between two adjacent doubles. to get a correctly rounded double precision output, we check for that case and adjust the mantissa slightly in the opposite direction of the rounding which the fpu already did (reported in the c1 flag of the fpu status word). this should have near-zero cost in the non-corner cases and at worst very low cost. note that in order for sqrt() to get used when compiling with gcc, the broken, non-conformant builtin sqrt must be disabled.
author: Rich Felker <dalias@aerifal.cx> 2012-03-15 01:29:03 -0400
committer: Rich Felker <dalias@aerifal.cx> 2012-03-15 01:29:03 -0400
commit: 809556e60a3359f646946879dd94c4760e5b8e84 (patch)
tree: 23afab7c3cd180406757b99efda0363de99af703
parent: e0a54e6725eaa0b5aeb28e9815c310f70068d308 (diff)
download: musl-809556e60a3359f646946879dd94c4760e5b8e84.tar.gz
musl-809556e60a3359f646946879dd94c4760e5b8e84.tar.xz
musl-809556e60a3359f646946879dd94c4760e5b8e84.zip
1 files changed, 16 insertions, 0 deletions
diff --git a/src/math/i386/sqrt.s b/src/math/i386/sqrt.s
index c6e55303..8289d094 100644
--- a/src/math/i386/sqrt.s
+++ b/src/math/i386/sqrt.s
@@ -2,4 +2,20 @@
 .type sqrt,@function
 sqrt:	fldl 4(%esp)
 	fsqrt
+	fstsw %ax		# ax = x87 status word right after fsqrt; C1 (0x200) tells which way fsqrt rounded
+	sub $12,%esp		# reserve 12 bytes of stack scratch for an 80-bit copy of the result
+	fld %st(0)		# duplicate the result, because the store below pops st(0)
+	fstpt (%esp)		# write the extended-precision copy to the scratch area
+	mov (%esp),%ecx		# ecx = low 32 bits of the 64-bit mantissa
+	and $0x7ff,%ecx		# keep the 11 low bits that are dropped when narrowing to a 53-bit double mantissa
+	cmp $0x400,%ecx		# exactly 0x400 means the value sits halfway between two doubles
+	jnz 1f			# not a halfway case: the plain double store below rounds correctly
+	and $0x200,%eax		# eax = 0x200 if C1 set (fsqrt rounded up), else 0
+	sub $0x100,%eax		# eax = +0x100 if fsqrt rounded up, -0x100 if it rounded down
+	sub %eax,(%esp)		# nudge the stored mantissa against fsqrt's rounding so the tie is broken correctly
+	fstp %st(0)		# discard the unadjusted result still on the fpu stack
+	fldt (%esp)		# load the adjusted extended value in its place
+1:	add $12,%esp		# release the scratch area; 4(%esp) is the argument slot again
+	fstpl 4(%esp)		# round to double by storing over the argument slot
+	fldl 4(%esp)		# reload the rounded double into st(0) as the return value
 	ret
author	Rich Felker <dalias@aerifal.cx>	2012-03-15 01:29:03 -0400
committer	Rich Felker <dalias@aerifal.cx>	2012-03-15 01:29:03 -0400
commit	809556e60a3359f646946879dd94c4760e5b8e84 (patch)
tree	23afab7c3cd180406757b99efda0363de99af703
parent	e0a54e6725eaa0b5aeb28e9815c310f70068d308 (diff)
download	musl-809556e60a3359f646946879dd94c4760e5b8e84.tar.gz musl-809556e60a3359f646946879dd94c4760e5b8e84.tar.xz musl-809556e60a3359f646946879dd94c4760e5b8e84.zip