123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350 |
- /* From the Intel IA-64 Optimization Guide, choose the minimum latency
- alternative. */
- #include <sysdep.h>
- #undef ret
- #include <shlib-compat.h>
- #if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)
- /* __divtf3
- Compute an 80-bit IEEE double-extended quotient.
- farg0 holds the dividend (a). farg1 holds the divisor (b).
- The quotient is written to fret0.
- frcpa writes a starting value y0 to f10 and sets predicate p6. Every
- refinement step is predicated on p6; p7 is arranged to be the
- complement of p6, and on the p7 path the value frcpa left in f10 is
- returned unchanged.
- f0 and f1 are used as the constants 0.0 and 1.0.
- Intermediate steps use the .s1 completer; frcpa and the final fma
- use .s0.
- Registers written: f10-f14, p6, p7, fret0. */
- ENTRY(___divtf3)
- cmp.eq p7, p0 = r0, r0 /* p7 = 1 (r0 compared with itself) */
- frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = run the refinement */
- ;;
- (p6) cmp.ne p7, p0 = r0, r0 /* clear p7 when p6 is set, so p7 == !p6 */
- .pred.rel.mutex p6, p7 /* declare p6 and p7 mutually exclusive to the assembler */
- (p6) fnma.s1 f11 = farg1, f10, f1 /* e0 = 1 - b*y0 */
- (p6) fma.s1 f12 = farg0, f10, f0 /* q0 = a*y0 */
- ;;
- (p6) fma.s1 f13 = f11, f11, f0 /* f13 = e0^2 */
- (p6) fma.s1 f14 = f11, f11, f11 /* f14 = e0 + e0^2 */
- ;;
- (p6) fma.s1 f11 = f13, f13, f11 /* f11 = e0 + e0^4 */
- (p6) fma.s1 f13 = f14, f10, f10 /* f13 = y0 * (1 + e0 + e0^2) */
- ;;
- (p6) fma.s1 f10 = f13, f11, f10 /* y1 = y0 + f13 * f11 (reads the f11 from the previous group) */
- (p6) fnma.s1 f11 = farg1, f12, farg0 /* rem0 = a - b*q0 */
- ;;
- (p6) fma.s1 f11 = f11, f10, f12 /* q1 = q0 + rem0 * y1 */
- (p6) fnma.s1 f12 = farg1, f10, f1 /* e1 = 1 - b*y1 */
- ;;
- (p6) fma.s1 f10 = f12, f10, f10 /* y2 = y1 + e1 * y1 */
- (p6) fnma.s1 f12 = farg1, f11, farg0 /* rem1 = a - b*q1 */
- ;;
- (p6) fma.s0 fret0 = f12, f10, f11 /* result = q1 + rem1 * y2, final step with .s0 */
- (p7) mov fret0 = f10 /* refinement skipped: return the f10 written by frcpa */
- br.ret.sptk rp
- END(___divtf3)
- .symver ___divtf3, __divtf3@GLIBC_2.2 /* publish as versioned symbol __divtf3@GLIBC_2.2 */
- /* __divdf3
- Compute a 64-bit IEEE double quotient.
- farg0 holds the dividend (a). farg1 holds the divisor (b).
- The quotient is written to fret0.
- frcpa writes a starting value y0 to f10 and sets predicate p6. Every
- refinement step is predicated on p6; p7 is arranged to be the
- complement of p6, and on the p7 path the value frcpa left in f10 is
- returned unchanged.
- The quotient estimate (f11) and the reciprocal estimate (f10) are
- refined side by side with the same error terms e, e^2 and e^4; a
- final remainder step then corrects the quotient.
- Registers written: f8, f10-f13, p6, p7, fret0. */
- ENTRY(___divdf3)
- cmp.eq p7, p0 = r0, r0 /* p7 = 1 (r0 compared with itself) */
- frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = run the refinement */
- ;;
- (p6) cmp.ne p7, p0 = r0, r0 /* clear p7 when p6 is set, so p7 == !p6 */
- .pred.rel.mutex p6, p7 /* declare p6 and p7 mutually exclusive to the assembler */
- (p6) fmpy.s1 f11 = farg0, f10 /* q0 = a*y0 */
- (p6) fnma.s1 f12 = farg1, f10, f1 /* e = 1 - b*y0 */
- ;;
- (p6) fma.s1 f11 = f12, f11, f11 /* q1 = q0 + e * q0 */
- (p6) fmpy.s1 f13 = f12, f12 /* f13 = e^2 */
- ;;
- (p6) fma.s1 f10 = f12, f10, f10 /* y1 = y0 + e * y0 */
- (p6) fma.s1 f11 = f13, f11, f11 /* q2 = q1 + e^2 * q1 */
- ;;
- (p6) fmpy.s1 f12 = f13, f13 /* f12 = e^4 */
- (p6) fma.s1 f10 = f13, f10, f10 /* y2 = y1 + e^2 * y1 */
- ;;
- (p6) fma.d.s1 f11 = f12, f11, f11 /* q3 = q2 + e^4 * q2, with the .d completer */
- (p6) fma.s1 f10 = f12, f10, f10 /* y3 = y2 + e^4 * y2 */
- ;;
- (p6) fnma.d.s1 f8 = farg1, f11, farg0 /* rem = a - b*q3 */
- ;;
- (p6) fma.d fret0 = f8, f10, f11 /* result = q3 + rem * y3; no .s1 completer on this final step */
- (p7) mov fret0 = f10 /* refinement skipped: return the f10 written by frcpa */
- br.ret.sptk rp
- ;;
- END(___divdf3)
- .symver ___divdf3, __divdf3@GLIBC_2.2 /* publish as versioned symbol __divdf3@GLIBC_2.2 */
- /* __divsf3
- Compute a 32-bit IEEE float quotient.
- farg0 holds the dividend (a). farg1 holds the divisor (b).
- The quotient is written to fret0.
- frcpa writes a starting value y0 to f10 and sets predicate p6. Every
- refinement step is predicated on p6; p7 is arranged to be the
- complement of p6, and on the p7 path the value frcpa left in f10 is
- returned unchanged.
- Only the quotient estimate (f8) is refined here, using the error
- terms e, e^2 and e^4 held in f9. farg0 and farg1 are not read again
- after the first instruction group that writes f8 and f9.
- Registers written: f8-f10, p6, p7, fret0. */
- ENTRY(___divsf3)
- cmp.eq p7, p0 = r0, r0 /* p7 = 1 (r0 compared with itself) */
- frcpa.s0 f10, p6 = farg0, farg1 /* f10 = y0, p6 = run the refinement */
- ;;
- (p6) cmp.ne p7, p0 = r0, r0 /* clear p7 when p6 is set, so p7 == !p6 */
- .pred.rel.mutex p6, p7 /* declare p6 and p7 mutually exclusive to the assembler */
- (p6) fmpy.s1 f8 = farg0, f10 /* q0 = a*y0 */
- (p6) fnma.s1 f9 = farg1, f10, f1 /* e = 1 - b*y0 */
- ;;
- (p6) fma.s1 f8 = f9, f8, f8 /* q1 = q0 + e * q0 */
- (p6) fmpy.s1 f9 = f9, f9 /* f9 = e^2 */
- ;;
- (p6) fma.s1 f8 = f9, f8, f8 /* q2 = q1 + e^2 * q1 */
- (p6) fmpy.s1 f9 = f9, f9 /* f9 = e^4 */
- ;;
- (p6) fma.d.s1 f10 = f9, f8, f8 /* q3 = q2 + e^4 * q2, with the .d completer */
- ;;
- (p6) fnorm.s.s0 fret0 = f10 /* result = q3 normalized with the .s completer under .s0 */
- (p7) mov fret0 = f10 /* refinement skipped: return the f10 written by frcpa */
- br.ret.sptk rp
- ;;
- END(___divsf3)
- .symver ___divsf3, __divsf3@GLIBC_2.2 /* publish as versioned symbol __divsf3@GLIBC_2.2 */
- /* __divdi3
- Compute a 64-bit integer quotient.
- in0 holds the dividend (a). in1 holds the divisor (b).
- The quotient is returned in ret0.
- Both operands are moved to FP registers and converted with the signed
- conversion fcvt.xf, the quotient is formed by refining the frcpa
- starting value, and the result is converted back with
- fcvt.fx.trunc. The refinement steps are predicated on p6; the final
- conversion and the transfer to ret0 are not. There is no explicit
- check for a zero divisor in this routine.
- Registers written: f8-f13, p6, ret0. */
- ENTRY(___divdi3)
- .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */
- /* Transfer inputs to FP registers. */
- setf.sig f8 = in0
- setf.sig f9 = in1
- ;;
- /* Convert the inputs to FP, so that they won't be treated as
- unsigned. */
- fcvt.xf f8 = f8 /* f8 = a as a floating-point value */
- fcvt.xf f9 = f9 /* f9 = b as a floating-point value */
- ;;
- /* Compute the reciprocal approximation. */
- frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = run the refinement */
- ;;
- /* 3 Newton-Raphson iterations. */
- (p6) fnma.s1 f11 = f9, f10, f1 /* e = 1 - b*y0 */
- (p6) fmpy.s1 f12 = f8, f10 /* q0 = a*y0 */
- ;;
- (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */
- (p6) fma.s1 f12 = f11, f12, f12 /* q1 = q0 + e * q0 */
- ;;
- (p6) fma.s1 f10 = f11, f10, f10 /* y1 = y0 + e * y0 */
- (p6) fma.s1 f11 = f13, f12, f12 /* q2 = q1 + e^2 * q1 (f11 now holds a quotient, not e) */
- ;;
- (p6) fma.s1 f10 = f13, f10, f10 /* y2 = y1 + e^2 * y1 */
- (p6) fnma.s1 f12 = f9, f11, f8 /* rem = a - b*q2 */
- ;;
- (p6) fma.s1 f10 = f12, f10, f11 /* q3 = q2 + rem * y2 */
- ;;
- /* Truncate the quotient to a signed integer. Not predicated on p6:
- when the refinement was skipped this converts the f10 written by
- frcpa. */
- fcvt.fx.trunc.s1 f10 = f10
- ;;
- /* Transfer result to GP registers. */
- getf.sig ret0 = f10
- br.ret.sptk rp
- ;;
- END(___divdi3)
- .symver ___divdi3, __divdi3@GLIBC_2.2 /* publish as versioned symbol __divdi3@GLIBC_2.2 */
- /* __moddi3
- Compute a 64-bit integer modulus.
- in0 holds the dividend (a). in1 holds the divisor (b).
- The remainder is returned in ret0.
- The truncated quotient q is computed the same way as in the signed
- divide (fcvt.xf, frcpa, predicated refinement, fcvt.fx.trunc), then
- the remainder is formed in integer arithmetic as q * (-b) + a with
- xma.l. f14 keeps the original integer a for that last step, and in1
- is negated in place so that f9 can be reloaded with the integer -b
- once the floating-point b is no longer needed. There is no explicit
- check for a zero divisor in this routine.
- Registers written: f8-f14, p6, in1, ret0. */
- ENTRY(___moddi3)
- .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */
- /* Transfer inputs to FP registers. */
- setf.sig f14 = in0 /* f14 = integer a, kept for the final xma.l */
- setf.sig f9 = in1
- ;;
- /* Convert the inputs to FP, so that they won't be treated as
- unsigned. */
- fcvt.xf f8 = f14 /* f8 = a as a floating-point value */
- fcvt.xf f9 = f9 /* f9 = b as a floating-point value */
- ;;
- /* Compute the reciprocal approximation. */
- frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = run the refinement */
- ;;
- /* 3 Newton-Raphson iterations. */
- (p6) fmpy.s1 f12 = f8, f10 /* q0 = a*y0 */
- (p6) fnma.s1 f11 = f9, f10, f1 /* e = 1 - b*y0 */
- ;;
- (p6) fma.s1 f12 = f11, f12, f12 /* q1 = q0 + e * q0 */
- (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */
- ;;
- (p6) fma.s1 f10 = f11, f10, f10 /* y1 = y0 + e * y0 */
- (p6) fma.s1 f11 = f13, f12, f12 /* q2 = q1 + e^2 * q1 (f11 now holds a quotient, not e) */
- ;;
- sub in1 = r0, in1 /* in1 = -b (integer), prepared for the final xma.l */
- (p6) fma.s1 f10 = f13, f10, f10 /* y2 = y1 + e^2 * y1 */
- (p6) fnma.s1 f12 = f9, f11, f8 /* rem = a - b*q2; last read of the floating-point b in f9 */
- ;;
- setf.sig f9 = in1 /* f9 = integer -b */
- (p6) fma.s1 f10 = f12, f10, f11 /* q3 = q2 + rem * y2 */
- ;;
- fcvt.fx.trunc.s1 f10 = f10 /* q = truncated signed integer quotient; not predicated on p6 */
- ;;
- /* r = q * (-b) + a */
- xma.l f10 = f10, f9, f14
- ;;
- /* Transfer result to GP registers. */
- getf.sig ret0 = f10
- br.ret.sptk rp
- ;;
- END(___moddi3)
- .symver ___moddi3, __moddi3@GLIBC_2.2 /* publish as versioned symbol __moddi3@GLIBC_2.2 */
- /* __udivdi3
- Compute a 64-bit unsigned integer quotient.
- in0 holds the dividend (a). in1 holds the divisor (b).
- The quotient is returned in ret0.
- Same instruction sequence as the signed divide, except that the
- operands are converted with the unsigned conversion fcvt.xuf and the
- result is converted back with fcvt.fxu.trunc. The refinement steps
- are predicated on p6; the final conversion and the transfer to ret0
- are not. There is no explicit check for a zero divisor in this
- routine.
- Registers written: f8-f13, p6, ret0. */
- ENTRY(___udivdi3)
- .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */
- /* Transfer inputs to FP registers. */
- setf.sig f8 = in0
- setf.sig f9 = in1
- ;;
- /* Convert the inputs to FP, to avoid FP software-assist faults. */
- fcvt.xuf.s1 f8 = f8 /* f8 = a as a floating-point value (unsigned conversion) */
- fcvt.xuf.s1 f9 = f9 /* f9 = b as a floating-point value (unsigned conversion) */
- ;;
- /* Compute the reciprocal approximation. */
- frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = run the refinement */
- ;;
- /* 3 Newton-Raphson iterations. */
- (p6) fnma.s1 f11 = f9, f10, f1 /* e = 1 - b*y0 */
- (p6) fmpy.s1 f12 = f8, f10 /* q0 = a*y0 */
- ;;
- (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */
- (p6) fma.s1 f12 = f11, f12, f12 /* q1 = q0 + e * q0 */
- ;;
- (p6) fma.s1 f10 = f11, f10, f10 /* y1 = y0 + e * y0 */
- (p6) fma.s1 f11 = f13, f12, f12 /* q2 = q1 + e^2 * q1 (f11 now holds a quotient, not e) */
- ;;
- (p6) fma.s1 f10 = f13, f10, f10 /* y2 = y1 + e^2 * y1 */
- (p6) fnma.s1 f12 = f9, f11, f8 /* rem = a - b*q2 */
- ;;
- (p6) fma.s1 f10 = f12, f10, f11 /* q3 = q2 + rem * y2 */
- ;;
- /* Truncate the quotient to an unsigned integer. Not predicated on
- p6: when the refinement was skipped this converts the f10 written
- by frcpa. */
- fcvt.fxu.trunc.s1 f10 = f10
- ;;
- /* Transfer result to GP registers. */
- getf.sig ret0 = f10
- br.ret.sptk rp
- ;;
- END(___udivdi3)
- .symver ___udivdi3, __udivdi3@GLIBC_2.2 /* publish as versioned symbol __udivdi3@GLIBC_2.2 */
- /* __umoddi3
- Compute a 64-bit unsigned integer modulus.
- in0 holds the dividend (a). in1 holds the divisor (b).
- The remainder is returned in ret0.
- The truncated quotient q is computed the same way as in the unsigned
- divide (fcvt.xuf, frcpa, predicated refinement, fcvt.fxu.trunc),
- then the remainder is formed in integer arithmetic as q * (-b) + a
- with xma.l. f14 keeps the original integer a for that last step, and
- in1 is negated in place so that f9 can be reloaded with the integer
- -b once the floating-point b is no longer needed. There is no
- explicit check for a zero divisor in this routine.
- Registers written: f8-f14, p6, in1, ret0. */
- ENTRY(___umoddi3)
- .regstk 2,0,0,0 /* register frame: 2 inputs, nothing else */
- /* Transfer inputs to FP registers. */
- setf.sig f14 = in0 /* f14 = integer a, kept for the final xma.l */
- setf.sig f9 = in1
- ;;
- /* Convert the inputs to FP, to avoid FP software assist faults. */
- fcvt.xuf.s1 f8 = f14 /* f8 = a as a floating-point value (unsigned conversion) */
- fcvt.xuf.s1 f9 = f9 /* f9 = b as a floating-point value (unsigned conversion) */
- ;;
- /* Compute the reciprocal approximation. */
- frcpa.s1 f10, p6 = f8, f9 /* f10 = y0, p6 = run the refinement */
- ;;
- /* 3 Newton-Raphson iterations. */
- (p6) fmpy.s1 f12 = f8, f10 /* q0 = a*y0 */
- (p6) fnma.s1 f11 = f9, f10, f1 /* e = 1 - b*y0 */
- ;;
- (p6) fma.s1 f12 = f11, f12, f12 /* q1 = q0 + e * q0 */
- (p6) fmpy.s1 f13 = f11, f11 /* f13 = e^2 */
- ;;
- (p6) fma.s1 f10 = f11, f10, f10 /* y1 = y0 + e * y0 */
- (p6) fma.s1 f11 = f13, f12, f12 /* q2 = q1 + e^2 * q1 (f11 now holds a quotient, not e) */
- ;;
- sub in1 = r0, in1 /* in1 = -b (integer), prepared for the final xma.l */
- (p6) fma.s1 f10 = f13, f10, f10 /* y2 = y1 + e^2 * y1 */
- (p6) fnma.s1 f12 = f9, f11, f8 /* rem = a - b*q2; last read of the floating-point b in f9 */
- ;;
- setf.sig f9 = in1 /* f9 = integer -b */
- (p6) fma.s1 f10 = f12, f10, f11 /* q3 = q2 + rem * y2 */
- ;;
- /* Truncate the quotient to an unsigned integer. Not predicated on
- p6. */
- fcvt.fxu.trunc.s1 f10 = f10
- ;;
- /* r = q * (-b) + a */
- xma.l f10 = f10, f9, f14
- ;;
- /* Transfer result to GP registers. */
- getf.sig ret0 = f10
- br.ret.sptk rp
- ;;
- END(___umoddi3)
- .symver ___umoddi3, __umoddi3@GLIBC_2.2 /* publish as versioned symbol __umoddi3@GLIBC_2.2 */
- /* __multi3
- Compute a 128-bit multiply of 128-bit multiplicands.
- in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b).
- The product is returned in ret0/ret1.
- Notation used in the comments below: aL = in0, aH = in1, bL = in2,
- bH = in3; a0/a1 are the low/high 32 bits of aL and b0/b1 the low/high
- 32 bits of bL. The code splits only in0 and in2 into 32-bit halves,
- so it treats them as the low words and in1/in3 as the high words.
- With mid = a0*b1 + (a0*b0 >> 32) + a1*b0 (computed modulo 2^64):
- ret0 = ((mid & 0xffffffff) << 32) + (a0*b0 & 0xffffffff)
- ret1 = (aL*bH + aH*bL) + a1*b1 + (mid >> 32), plus 0x100000000
- when the addition that forms mid wrapped.
- r14 is reused as a temporary several times; some instructions read
- the value set in the previous instruction group while a later
- instruction in the same group overwrites it.
- Registers written: r14-r22, f6-f11, p6, p7, ret0, ret1. */
- ENTRY(___multi3)
- .regstk 4,0,0,0 /* register frame: 4 inputs, nothing else */
- setf.sig f6 = in1 /* f6 = aH */
- movl r19 = 0xffffffff /* r19 = mask selecting the low 32 bits */
- setf.sig f7 = in2 /* f7 = bL */
- ;;
- and r14 = r19, in0 /* r14 = a0 */
- ;;
- setf.sig f10 = r14 /* f10 = a0 */
- and r14 = r19, in2 /* r14 = b0 */
- xmpy.l f9 = f6, f7 /* f9 = low 64 bits of aH*bL */
- ;;
- setf.sig f6 = r14 /* f6 = b0 */
- shr.u r14 = in0, 32 /* r14 = a1 */
- ;;
- setf.sig f7 = r14 /* f7 = a1 */
- shr.u r14 = in2, 32 /* r14 = b1 */
- ;;
- setf.sig f8 = r14 /* f8 = b1 */
- xmpy.l f11 = f10, f6 /* f11 = a0*b0 */
- xmpy.l f6 = f7, f6 /* f6 = a1*b0 */
- ;;
- getf.sig r16 = f11 /* r16 = a0*b0 */
- xmpy.l f7 = f7, f8 /* f7 = a1*b1 */
- ;;
- shr.u r14 = r16, 32 /* r14 = high 32 bits of a0*b0 */
- and r16 = r19, r16 /* r16 = low 32 bits of a0*b0 */
- getf.sig r17 = f6 /* r17 = a1*b0 */
- setf.sig f6 = in0 /* f6 = aL (the old f6 was read by the getf.sig above) */
- ;;
- setf.sig f11 = r14 /* f11 = high 32 bits of a0*b0 */
- getf.sig r21 = f7 /* r21 = a1*b1 */
- setf.sig f7 = in3 /* f7 = bH */
- ;;
- xma.l f11 = f10, f8, f11 /* f11 = a0*b1 + (a0*b0 >> 32) */
- xma.l f6 = f6, f7, f9 /* f6 = low 64 bits of aL*bH + aH*bL */
- ;;
- getf.sig r18 = f11
- ;;
- add r18 = r18, r17 /* r18 = mid = a0*b1 + (a0*b0 >> 32) + a1*b0, modulo 2^64 */
- ;;
- and r15 = r19, r18 /* r15 = low 32 bits of mid */
- cmp.ltu p7, p6 = r18, r17 /* p7 = sum below its addend, i.e. the add wrapped; p6 = complement */
- ;;
- getf.sig r22 = f6 /* r22 = cross terms aL*bH + aH*bL */
- (p7) adds r14 = 1, r19 /* r14 = 0x100000000, the weight of the wrapped carry in the high word */
- ;;
- (p7) add r21 = r21, r14 /* fold the carry into a1*b1 (reads r14 before the shr.u below rewrites it) */
- shr.u r14 = r18, 32 /* r14 = high 32 bits of mid */
- shl r15 = r15, 32 /* r15 = (mid & 0xffffffff) << 32 */
- ;;
- add r20 = r21, r14 /* r20 = a1*b1 (+ carry) + (mid >> 32) */
- ;;
- add ret0 = r15, r16 /* first result word */
- add ret1 = r22, r20 /* second result word */
- br.ret.sptk rp
- ;;
- END(___multi3)
- .symver ___multi3, __multi3@GLIBC_2.2 /* publish as versioned symbol __multi3@GLIBC_2.2 */
- #endif
|