diff options
Diffstat (limited to 'ports/sysdeps/ia64/ia64libgcc.S')
-rw-r--r-- | ports/sysdeps/ia64/ia64libgcc.S | 350 |
1 files changed, 350 insertions, 0 deletions
diff --git a/ports/sysdeps/ia64/ia64libgcc.S b/ports/sysdeps/ia64/ia64libgcc.S new file mode 100644 index 0000000000..3f77b06a5a --- /dev/null +++ b/ports/sysdeps/ia64/ia64libgcc.S @@ -0,0 +1,350 @@ +/* From the Intel IA-64 Optimization Guide, choose the minimum latency + alternative. */ + +#include <sysdep.h> +#undef ret + +#include <shlib-compat.h> + +#if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6) + +/* __divtf3 + Compute a 80-bit IEEE double-extended quotient. + farg0 holds the dividend. farg1 holds the divisor. */ + +ENTRY(___divtf3) + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fnma.s1 f11 = farg1, f10, f1 +(p6) fma.s1 f12 = farg0, f10, f0 + ;; +(p6) fma.s1 f13 = f11, f11, f0 +(p6) fma.s1 f14 = f11, f11, f11 + ;; +(p6) fma.s1 f11 = f13, f13, f11 +(p6) fma.s1 f13 = f14, f10, f10 + ;; +(p6) fma.s1 f10 = f13, f11, f10 +(p6) fnma.s1 f11 = farg1, f12, farg0 + ;; +(p6) fma.s1 f11 = f11, f10, f12 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fnma.s1 f12 = farg1, f11, farg0 + ;; +(p6) fma.s0 fret0 = f12, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp +END(___divtf3) + .symver ___divtf3, __divtf3@GLIBC_2.2 + +/* __divdf3 + Compute a 64-bit IEEE double quotient. + farg0 holds the dividend. farg1 holds the divisor. */ + +ENTRY(___divdf3) + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f11 = farg0, f10 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f11 = f12, f11, f11 +(p6) fmpy.s1 f13 = f12, f12 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f11 = f13, f11, f11 + ;; +(p6) fmpy.s1 f12 = f13, f13 +(p6) fma.s1 f10 = f13, f10, f10 + ;; +(p6) fma.d.s1 f11 = f12, f11, f11 +(p6) fma.s1 f10 = f12, f10, f10 + ;; +(p6) fnma.d.s1 f8 = farg1, f11, farg0 + ;; +(p6) fma.d fret0 = f8, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; +END(___divdf3) + .symver ___divdf3, __divdf3@GLIBC_2.2 + +/* __divsf3 + Compute a 32-bit IEEE float quotient. + farg0 holds the dividend. farg1 holds the divisor. */ + +ENTRY(___divsf3) + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f8 = farg0, f10 +(p6) fnma.s1 f9 = farg1, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.d.s1 f10 = f9, f8, f8 + ;; +(p6) fnorm.s.s0 fret0 = f10 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; +END(___divsf3) + .symver ___divsf3, __divsf3@GLIBC_2.2 + +/* __divdi3 + Compute a 64-bit integer quotient. + in0 holds the dividend. in1 holds the divisor. */ + +ENTRY(___divdi3) + .regstk 2,0,0,0 + /* Transfer inputs to FP registers. */ + setf.sig f8 = in0 + setf.sig f9 = in1 + ;; + /* Convert the inputs to FP, so that they won't be treated as + unsigned. */ + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 + ;; + /* Compute the reciprocal approximation. */ + frcpa.s1 f10, p6 = f8, f9 + ;; + /* 3 Newton-Raphson iterations. */ +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + /* Round quotient to an integer. */ + fcvt.fx.trunc.s1 f10 = f10 + ;; + /* Transfer result to GP registers. */ + getf.sig ret0 = f10 + br.ret.sptk rp + ;; +END(___divdi3) + .symver ___divdi3, __divdi3@GLIBC_2.2 + +/* __moddi3 + Compute a 64-bit integer modulus. + in0 holds the dividend (a). in1 holds the divisor (b). */ + +ENTRY(___moddi3) + .regstk 2,0,0,0 + /* Transfer inputs to FP registers. */ + setf.sig f14 = in0 + setf.sig f9 = in1 + ;; + /* Convert the inputs to FP, so that they won't be treated as + unsigned. */ + fcvt.xf f8 = f14 + fcvt.xf f9 = f9 + ;; + /* Compute the reciprocal approximation. */ + frcpa.s1 f10, p6 = f8, f9 + ;; + /* 3 Newton-Raphson iterations. */ +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + /* r = q * (-b) + a */ + xma.l f10 = f10, f9, f14 + ;; + /* Transfer result to GP registers. */ + getf.sig ret0 = f10 + br.ret.sptk rp + ;; +END(___moddi3) + .symver ___moddi3, __moddi3@GLIBC_2.2 + +/* __udivdi3 + Compute a 64-bit unsigned integer quotient. + in0 holds the dividend. in1 holds the divisor. */ + +ENTRY(___udivdi3) + .regstk 2,0,0,0 + /* Transfer inputs to FP registers. */ + setf.sig f8 = in0 + setf.sig f9 = in1 + ;; + /* Convert the inputs to FP, to avoid FP software-assist faults. */ + fcvt.xuf.s1 f8 = f8 + fcvt.xuf.s1 f9 = f9 + ;; + /* Compute the reciprocal approximation. */ + frcpa.s1 f10, p6 = f8, f9 + ;; + /* 3 Newton-Raphson iterations. */ +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + /* Round quotient to an unsigned integer. */ + fcvt.fxu.trunc.s1 f10 = f10 + ;; + /* Transfer result to GP registers. */ + getf.sig ret0 = f10 + br.ret.sptk rp + ;; +END(___udivdi3) + .symver ___udivdi3, __udivdi3@GLIBC_2.2 + +/* __umoddi3 + Compute a 64-bit unsigned integer modulus. + in0 holds the dividend (a). in1 holds the divisor (b). */ + +ENTRY(___umoddi3) + .regstk 2,0,0,0 + /* Transfer inputs to FP registers. */ + setf.sig f14 = in0 + setf.sig f9 = in1 + ;; + /* Convert the inputs to FP, to avoid FP software assist faults. */ + fcvt.xuf.s1 f8 = f14 + fcvt.xuf.s1 f9 = f9 + ;; + /* Compute the reciprocal approximation. */ + frcpa.s1 f10, p6 = f8, f9 + ;; + /* 3 Newton-Raphson iterations. */ +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + /* Round quotient to an unsigned integer. */ + fcvt.fxu.trunc.s1 f10 = f10 + ;; + /* r = q * (-b) + a */ + xma.l f10 = f10, f9, f14 + ;; + /* Transfer result to GP registers. */ + getf.sig ret0 = f10 + br.ret.sptk rp + ;; +END(___umoddi3) + .symver ___umoddi3, __umoddi3@GLIBC_2.2 + +/* __multi3 + Compute a 128-bit multiply of 128-bit multiplicands. + in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b). */ + +ENTRY(___multi3) + .regstk 4,0,0,0 + setf.sig f6 = in1 + movl r19 = 0xffffffff + setf.sig f7 = in2 + ;; + and r14 = r19, in0 + ;; + setf.sig f10 = r14 + and r14 = r19, in2 + xmpy.l f9 = f6, f7 + ;; + setf.sig f6 = r14 + shr.u r14 = in0, 32 + ;; + setf.sig f7 = r14 + shr.u r14 = in2, 32 + ;; + setf.sig f8 = r14 + xmpy.l f11 = f10, f6 + xmpy.l f6 = f7, f6 + ;; + getf.sig r16 = f11 + xmpy.l f7 = f7, f8 + ;; + shr.u r14 = r16, 32 + and r16 = r19, r16 + getf.sig r17 = f6 + setf.sig f6 = in0 + ;; + setf.sig f11 = r14 + getf.sig r21 = f7 + setf.sig f7 = in3 + ;; + xma.l f11 = f10, f8, f11 + xma.l f6 = f6, f7, f9 + ;; + getf.sig r18 = f11 + ;; + add r18 = r18, r17 + ;; + and r15 = r19, r18 + cmp.ltu p7, p6 = r18, r17 + ;; + getf.sig r22 = f6 +(p7) adds r14 = 1, r19 + ;; +(p7) add r21 = r21, r14 + shr.u r14 = r18, 32 + shl r15 = r15, 32 + ;; + add r20 = r21, r14 + ;; + add ret0 = r15, r16 + add ret1 = r22, r20 + br.ret.sptk rp + ;; +END(___multi3) + .symver ___multi3, __multi3@GLIBC_2.2 + +#endif |