ia64libgcc.S 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* From the Intel IA-64 Optimization Guide, choose the minimum latency
  2. alternative. */
  3. #include <sysdep.h>
  4. #undef ret
  5. #include <shlib-compat.h>
  6. #if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)
  7. /* __divtf3
  8. Compute a 80-bit IEEE double-extended quotient.
  9. farg0 holds the dividend. farg1 holds the divisor. */
     /* Notation used below: a = farg0, b = farg1, y = estimate of 1/b, q = estimate of a/b. */
     /* frcpa writes a first estimate to f10 and sets p6 when that estimate must be refined. */
     /* All refinement steps are predicated on p6; the last fma.s0 writes the result to fret0. */
     /* When p6 is clear, p7 stays set and the frcpa output in f10 is returned unchanged */
     /* (per the IA-64 ISA description of frcpa, f10 then already holds the quotient). */
     /* Intermediate steps use status field .s1; only frcpa and the final fma use .s0. */
  10. ENTRY(___divtf3)
     /* p7 = 1 unconditionally (r0 == r0). */
  11. cmp.eq p7, p0 = r0, r0
  12. frcpa.s0 f10, p6 = farg0, farg1
  13. ;;
     /* If refinement is needed, clear p7 so that exactly one of p6/p7 is set. */
  14. (p6) cmp.ne p7, p0 = r0, r0
  15. .pred.rel.mutex p6, p7
     /* f11 = e = 1 - b*y0;  f12 = q0 = a*y0. */
  16. (p6) fnma.s1 f11 = farg1, f10, f1
  17. (p6) fma.s1 f12 = farg0, f10, f0
  18. ;;
     /* f13 = e^2;  f14 = e + e^2. */
  19. (p6) fma.s1 f13 = f11, f11, f0
  20. (p6) fma.s1 f14 = f11, f11, f11
  21. ;;
     /* f11 = e + e^4;  f13 = y1 = y0 + y0*(e + e^2). */
  22. (p6) fma.s1 f11 = f13, f13, f11
  23. (p6) fma.s1 f13 = f14, f10, f10
  24. ;;
     /* f10 = y2 = y0 + y1*(e + e^4);  f11 = r0 = a - b*q0. */
  25. (p6) fma.s1 f10 = f13, f11, f10
  26. (p6) fnma.s1 f11 = farg1, f12, farg0
  27. ;;
     /* f11 = q1 = q0 + r0*y2;  f12 = e2 = 1 - b*y2. */
  28. (p6) fma.s1 f11 = f11, f10, f12
  29. (p6) fnma.s1 f12 = farg1, f10, f1
  30. ;;
     /* f10 = y3 = y2 + e2*y2;  f12 = r1 = a - b*q1. */
  31. (p6) fma.s1 f10 = f12, f10, f10
  32. (p6) fnma.s1 f12 = farg1, f11, farg0
  33. ;;
     /* Final correction q1 + r1*y3, computed under status field .s0. */
  34. (p6) fma.s0 fret0 = f12, f10, f11
     /* No-refinement case: return the frcpa output directly. */
  35. (p7) mov fret0 = f10
  36. br.ret.sptk rp
  37. END(___divtf3)
     /* Bind the ___divtf3 implementation to the versioned symbol __divtf3@GLIBC_2.2. */
  38. .symver ___divtf3, __divtf3@GLIBC_2.2
  39. /* __divdf3
  40. Compute a 64-bit IEEE double quotient.
  41. farg0 holds the dividend. farg1 holds the divisor. */
     /* Notation used below: a = farg0, b = farg1, y = estimate of 1/b, q = estimate of a/b. */
     /* frcpa writes a first estimate to f10 and sets p6 when that estimate must be refined. */
     /* The (p6) chain refines q and y in parallel using e = 1 - b*y0 and its powers, */
     /* then applies one remainder-based correction in double precision (.d). */
     /* When p6 is clear, p7 stays set and the frcpa output in f10 is returned unchanged. */
  42. ENTRY(___divdf3)
     /* p7 = 1 unconditionally (r0 == r0). */
  43. cmp.eq p7, p0 = r0, r0
  44. frcpa.s0 f10, p6 = farg0, farg1
  45. ;;
     /* If refinement is needed, clear p7 so that exactly one of p6/p7 is set. */
  46. (p6) cmp.ne p7, p0 = r0, r0
  47. .pred.rel.mutex p6, p7
     /* f11 = q0 = a*y0;  f12 = e = 1 - b*y0. */
  48. (p6) fmpy.s1 f11 = farg0, f10
  49. (p6) fnma.s1 f12 = farg1, f10, f1
  50. ;;
     /* f11 = q1 = q0 + e*q0;  f13 = e^2. */
  51. (p6) fma.s1 f11 = f12, f11, f11
  52. (p6) fmpy.s1 f13 = f12, f12
  53. ;;
     /* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1. */
  54. (p6) fma.s1 f10 = f12, f10, f10
  55. (p6) fma.s1 f11 = f13, f11, f11
  56. ;;
     /* f12 = e^4;  f10 = y2 = y1 + e^2*y1. */
  57. (p6) fmpy.s1 f12 = f13, f13
  58. (p6) fma.s1 f10 = f13, f10, f10
  59. ;;
     /* f11 = q3 = q2 + e^4*q2 (double precision);  f10 = y3 = y2 + e^4*y2. */
  60. (p6) fma.d.s1 f11 = f12, f11, f11
  61. (p6) fma.s1 f10 = f12, f10, f10
  62. ;;
     /* f8 = r = a - b*q3 (double precision). */
     /* NOTE(review): f8 is scratch here; if farg0 aliases f8 (defined outside this file, */
     /* confirm in sysdep.h) the dividend is overwritten, but it is not read again below. */
  63. (p6) fnma.d.s1 f8 = farg1, f11, farg0
  64. ;;
     /* Final correction q3 + r*y3 in double precision; no explicit status field is given. */
  65. (p6) fma.d fret0 = f8, f10, f11
     /* No-refinement case: return the frcpa output directly. */
  66. (p7) mov fret0 = f10
  67. br.ret.sptk rp
  68. ;;
  69. END(___divdf3)
     /* Bind the ___divdf3 implementation to the versioned symbol __divdf3@GLIBC_2.2. */
  70. .symver ___divdf3, __divdf3@GLIBC_2.2
  71. /* __divsf3
  72. Compute a 32-bit IEEE float quotient.
  73. farg0 holds the dividend. farg1 holds the divisor. */
     /* Notation used below: a = farg0, b = farg1, y0 = frcpa estimate of 1/b, q = estimate of a/b. */
     /* Only the quotient is refined here (q <- q + e^k * q with e = 1 - b*y0); */
     /* the result is then rounded to single precision by fnorm.s. */
     /* When p6 is clear, p7 stays set and the frcpa output in f10 is returned unchanged. */
     /* NOTE(review): f8/f9 are used as scratch; farg0/farg1 are not read after the first */
     /* refinement group, so this is safe even if they alias f8/f9 (confirm in sysdep.h). */
  74. ENTRY(___divsf3)
     /* p7 = 1 unconditionally (r0 == r0). */
  75. cmp.eq p7, p0 = r0, r0
  76. frcpa.s0 f10, p6 = farg0, farg1
  77. ;;
     /* If refinement is needed, clear p7 so that exactly one of p6/p7 is set. */
  78. (p6) cmp.ne p7, p0 = r0, r0
  79. .pred.rel.mutex p6, p7
     /* f8 = q0 = a*y0;  f9 = e = 1 - b*y0. */
  80. (p6) fmpy.s1 f8 = farg0, f10
  81. (p6) fnma.s1 f9 = farg1, f10, f1
  82. ;;
     /* f8 = q1 = q0 + e*q0;  f9 = e^2. */
  83. (p6) fma.s1 f8 = f9, f8, f8
  84. (p6) fmpy.s1 f9 = f9, f9
  85. ;;
     /* f8 = q2 = q1 + e^2*q1;  f9 = e^4. */
  86. (p6) fma.s1 f8 = f9, f8, f8
  87. (p6) fmpy.s1 f9 = f9, f9
  88. ;;
     /* f10 = q3 = q2 + e^4*q2, computed in double precision. */
  89. (p6) fma.d.s1 f10 = f9, f8, f8
  90. ;;
     /* Round q3 to single precision under status field .s0. */
  91. (p6) fnorm.s.s0 fret0 = f10
     /* No-refinement case: return the frcpa output directly. */
  92. (p7) mov fret0 = f10
  93. br.ret.sptk rp
  94. ;;
  95. END(___divsf3)
     /* Bind the ___divsf3 implementation to the versioned symbol __divsf3@GLIBC_2.2. */
  96. .symver ___divsf3, __divsf3@GLIBC_2.2
  97. /* __divdi3
  98. Compute a 64-bit integer quotient.
  99. in0 holds the dividend. in1 holds the divisor. */
     /* Method: convert both signed operands to floating point, form the quotient with a */
     /* frcpa seed plus fused multiply-add refinement, truncate it back to an integer. */
     /* Notation used below: a = f8, b = f9, y = estimate of 1/b, q = estimate of a/b. */
     /* All refinement steps are predicated on p6 from frcpa; when p6 is clear they are */
     /* skipped and the frcpa output in f10 is converted as-is. */
  100. ENTRY(___divdi3)
  101. .regstk 2,0,0,0
  102. /* Transfer inputs to FP registers. */
  103. setf.sig f8 = in0
  104. setf.sig f9 = in1
  105. ;;
  106. /* Convert the inputs to FP, so that they won't be treated as
  107. unsigned. */
  108. fcvt.xf f8 = f8
  109. fcvt.xf f9 = f9
  110. ;;
  111. /* Compute the reciprocal approximation. */
  112. frcpa.s1 f10, p6 = f8, f9
  113. ;;
  114. /* 3 Newton-Raphson iterations. */
     /* f11 = e = 1 - b*y0;  f12 = q0 = a*y0. */
  115. (p6) fnma.s1 f11 = f9, f10, f1
  116. (p6) fmpy.s1 f12 = f8, f10
  117. ;;
     /* f13 = e^2;  f12 = q1 = q0 + e*q0. */
  118. (p6) fmpy.s1 f13 = f11, f11
  119. (p6) fma.s1 f12 = f11, f12, f12
  120. ;;
     /* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1. */
  121. (p6) fma.s1 f10 = f11, f10, f10
  122. (p6) fma.s1 f11 = f13, f12, f12
  123. ;;
     /* f10 = y2 = y1 + e^2*y1;  f12 = r = a - b*q2. */
  124. (p6) fma.s1 f10 = f13, f10, f10
  125. (p6) fnma.s1 f12 = f9, f11, f8
  126. ;;
     /* f10 = q3 = q2 + r*y2 (final quotient estimate). */
  127. (p6) fma.s1 f10 = f12, f10, f11
  128. ;;
  129. /* Round quotient to an integer. */
     /* The .trunc form is used, i.e. the fraction is discarded (signed conversion). */
  130. fcvt.fx.trunc.s1 f10 = f10
  131. ;;
  132. /* Transfer result to GP registers. */
  133. getf.sig ret0 = f10
  134. br.ret.sptk rp
  135. ;;
  136. END(___divdi3)
     /* Bind the ___divdi3 implementation to the versioned symbol __divdi3@GLIBC_2.2. */
  137. .symver ___divdi3, __divdi3@GLIBC_2.2
  138. /* __moddi3
  139. Compute a 64-bit integer modulus.
  140. in0 holds the dividend (a). in1 holds the divisor (b). */
     /* Method: compute the truncated quotient q = a/b in floating point, then form */
     /* the remainder with one integer multiply-add: r = q*(-b) + a. */
     /* f14 keeps the raw integer a for that last step; f8/f9 hold the FP copies of a/b. */
     /* All refinement steps are predicated on p6 from frcpa; when p6 is clear they are */
     /* skipped and the frcpa output in f10 is what gets truncated. */
  141. ENTRY(___moddi3)
  142. .regstk 2,0,0,0
  143. /* Transfer inputs to FP registers. */
  144. setf.sig f14 = in0
  145. setf.sig f9 = in1
  146. ;;
  147. /* Convert the inputs to FP, so that they won't be treated as
  148. unsigned. */
  149. fcvt.xf f8 = f14
  150. fcvt.xf f9 = f9
  151. ;;
  152. /* Compute the reciprocal approximation. */
  153. frcpa.s1 f10, p6 = f8, f9
  154. ;;
  155. /* 3 Newton-Raphson iterations. */
     /* f12 = q0 = a*y0;  f11 = e = 1 - b*y0. */
  156. (p6) fmpy.s1 f12 = f8, f10
  157. (p6) fnma.s1 f11 = f9, f10, f1
  158. ;;
     /* f12 = q1 = q0 + e*q0;  f13 = e^2. */
  159. (p6) fma.s1 f12 = f11, f12, f12
  160. (p6) fmpy.s1 f13 = f11, f11
  161. ;;
     /* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1. */
  162. (p6) fma.s1 f10 = f11, f10, f10
  163. (p6) fma.s1 f11 = f13, f12, f12
  164. ;;
     /* in1 = -b (integer), prepared for the final multiply-add. */
     /* f10 = y2 = y1 + e^2*y1;  f12 = r = a - b*q2 (last read of the FP copy of b in f9). */
  165. sub in1 = r0, in1
  166. (p6) fma.s1 f10 = f13, f10, f10
  167. (p6) fnma.s1 f12 = f9, f11, f8
  168. ;;
     /* f9 is now reused to carry the integer -b;  f10 = q3 = q2 + r*y2. */
  169. setf.sig f9 = in1
  170. (p6) fma.s1 f10 = f12, f10, f11
  171. ;;
     /* Truncate the quotient to a signed integer. */
  172. fcvt.fx.trunc.s1 f10 = f10
  173. ;;
  174. /* r = q * (-b) + a */
  175. xma.l f10 = f10, f9, f14
  176. ;;
  177. /* Transfer result to GP registers. */
  178. getf.sig ret0 = f10
  179. br.ret.sptk rp
  180. ;;
  181. END(___moddi3)
     /* Bind the ___moddi3 implementation to the versioned symbol __moddi3@GLIBC_2.2. */
  182. .symver ___moddi3, __moddi3@GLIBC_2.2
  183. /* __udivdi3
  184. Compute a 64-bit unsigned integer quotient.
  185. in0 holds the dividend. in1 holds the divisor. */
     /* Same scheme as the signed divide, but with unsigned conversions (fcvt.xuf / fcvt.fxu). */
     /* Notation used below: a = f8, b = f9, y = estimate of 1/b, q = estimate of a/b. */
     /* All refinement steps are predicated on p6 from frcpa; when p6 is clear they are */
     /* skipped and the frcpa output in f10 is converted as-is. */
  186. ENTRY(___udivdi3)
  187. .regstk 2,0,0,0
  188. /* Transfer inputs to FP registers. */
  189. setf.sig f8 = in0
  190. setf.sig f9 = in1
  191. ;;
  192. /* Convert the inputs to FP, to avoid FP software-assist faults. */
  193. fcvt.xuf.s1 f8 = f8
  194. fcvt.xuf.s1 f9 = f9
  195. ;;
  196. /* Compute the reciprocal approximation. */
  197. frcpa.s1 f10, p6 = f8, f9
  198. ;;
  199. /* 3 Newton-Raphson iterations. */
     /* f11 = e = 1 - b*y0;  f12 = q0 = a*y0. */
  200. (p6) fnma.s1 f11 = f9, f10, f1
  201. (p6) fmpy.s1 f12 = f8, f10
  202. ;;
     /* f13 = e^2;  f12 = q1 = q0 + e*q0. */
  203. (p6) fmpy.s1 f13 = f11, f11
  204. (p6) fma.s1 f12 = f11, f12, f12
  205. ;;
     /* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1. */
  206. (p6) fma.s1 f10 = f11, f10, f10
  207. (p6) fma.s1 f11 = f13, f12, f12
  208. ;;
     /* f10 = y2 = y1 + e^2*y1;  f12 = r = a - b*q2. */
  209. (p6) fma.s1 f10 = f13, f10, f10
  210. (p6) fnma.s1 f12 = f9, f11, f8
  211. ;;
     /* f10 = q3 = q2 + r*y2 (final quotient estimate). */
  212. (p6) fma.s1 f10 = f12, f10, f11
  213. ;;
  214. /* Round quotient to an unsigned integer. */
     /* The .trunc form is used, i.e. the fraction is discarded. */
  215. fcvt.fxu.trunc.s1 f10 = f10
  216. ;;
  217. /* Transfer result to GP registers. */
  218. getf.sig ret0 = f10
  219. br.ret.sptk rp
  220. ;;
  221. END(___udivdi3)
     /* Bind the ___udivdi3 implementation to the versioned symbol __udivdi3@GLIBC_2.2. */
  222. .symver ___udivdi3, __udivdi3@GLIBC_2.2
  223. /* __umoddi3
  224. Compute a 64-bit unsigned integer modulus.
  225. in0 holds the dividend (a). in1 holds the divisor (b). */
     /* Method: compute the truncated unsigned quotient q = a/b in floating point, then */
     /* form the remainder with one integer multiply-add: r = q*(-b) + a (low 64 bits). */
     /* f14 keeps the raw integer a for that last step; f8/f9 hold the FP copies of a/b. */
     /* All refinement steps are predicated on p6 from frcpa; when p6 is clear they are */
     /* skipped and the frcpa output in f10 is what gets truncated. */
  226. ENTRY(___umoddi3)
  227. .regstk 2,0,0,0
  228. /* Transfer inputs to FP registers. */
  229. setf.sig f14 = in0
  230. setf.sig f9 = in1
  231. ;;
  232. /* Convert the inputs to FP, to avoid FP software assist faults. */
  233. fcvt.xuf.s1 f8 = f14
  234. fcvt.xuf.s1 f9 = f9
  235. ;;
  236. /* Compute the reciprocal approximation. */
  237. frcpa.s1 f10, p6 = f8, f9
  238. ;;
  239. /* 3 Newton-Raphson iterations. */
     /* f12 = q0 = a*y0;  f11 = e = 1 - b*y0. */
  240. (p6) fmpy.s1 f12 = f8, f10
  241. (p6) fnma.s1 f11 = f9, f10, f1
  242. ;;
     /* f12 = q1 = q0 + e*q0;  f13 = e^2. */
  243. (p6) fma.s1 f12 = f11, f12, f12
  244. (p6) fmpy.s1 f13 = f11, f11
  245. ;;
     /* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1. */
  246. (p6) fma.s1 f10 = f11, f10, f10
  247. (p6) fma.s1 f11 = f13, f12, f12
  248. ;;
     /* in1 = -b (two's complement), prepared for the final multiply-add. */
     /* f10 = y2 = y1 + e^2*y1;  f12 = r = a - b*q2 (last read of the FP copy of b in f9). */
  249. sub in1 = r0, in1
  250. (p6) fma.s1 f10 = f13, f10, f10
  251. (p6) fnma.s1 f12 = f9, f11, f8
  252. ;;
     /* f9 is now reused to carry the integer -b;  f10 = q3 = q2 + r*y2. */
  253. setf.sig f9 = in1
  254. (p6) fma.s1 f10 = f12, f10, f11
  255. ;;
  256. /* Round quotient to an unsigned integer. */
  257. fcvt.fxu.trunc.s1 f10 = f10
  258. ;;
  259. /* r = q * (-b) + a */
  260. xma.l f10 = f10, f9, f14
  261. ;;
  262. /* Transfer result to GP registers. */
  263. getf.sig ret0 = f10
  264. br.ret.sptk rp
  265. ;;
  266. END(___umoddi3)
     /* Bind the ___umoddi3 implementation to the versioned symbol __umoddi3@GLIBC_2.2. */
  267. .symver ___umoddi3, __umoddi3@GLIBC_2.2
  268. /* __multi3
  269. Compute a 128-bit multiply of 128-bit multiplicands.
  270. in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b). */
     /* As used below, in0 and in2 are the low 64-bit halves, in1 and in3 the high halves. */
     /* The low halves are split into 32-bit limbs: in0 = a1:a0 and in2 = b1:b0. */
     /* ret0 = low 64 bits of in0*in2. */
     /* ret1 = high 64 bits of in0*in2 + low 64 bits of (in0*in3 + in1*in2). */
     /* All products go through the FP multiplier (setf.sig / xmpy.l / xma.l / getf.sig). */
  271. ENTRY(___multi3)
  272. .regstk 4,0,0,0
     /* f6 = in1, f7 = in2 for the first cross product;  r19 = 32-bit limb mask. */
  273. setf.sig f6 = in1
  274. movl r19 = 0xffffffff
  275. setf.sig f7 = in2
  276. ;;
     /* r14 = a0. */
  277. and r14 = r19, in0
  278. ;;
     /* f10 = a0;  r14 = b0;  f9 = low 64 bits of in1*in2. */
  279. setf.sig f10 = r14
  280. and r14 = r19, in2
  281. xmpy.l f9 = f6, f7
  282. ;;
     /* f6 = b0;  r14 = a1. */
  283. setf.sig f6 = r14
  284. shr.u r14 = in0, 32
  285. ;;
     /* f7 = a1;  r14 = b1. */
  286. setf.sig f7 = r14
  287. shr.u r14 = in2, 32
  288. ;;
     /* f8 = b1;  f11 = a0*b0;  f6 = a1*b0. */
  289. setf.sig f8 = r14
  290. xmpy.l f11 = f10, f6
  291. xmpy.l f6 = f7, f6
  292. ;;
     /* r16 = a0*b0;  f7 = a1*b1. */
  293. getf.sig r16 = f11
  294. xmpy.l f7 = f7, f8
  295. ;;
     /* r14 = upper 32 bits of a0*b0;  r16 = its lower 32 bits;  r17 = a1*b0;  f6 = in0. */
  296. shr.u r14 = r16, 32
  297. and r16 = r19, r16
  298. getf.sig r17 = f6
  299. setf.sig f6 = in0
  300. ;;
     /* f11 = (a0*b0 >> 32);  r21 = a1*b1;  f7 = in3. */
  301. setf.sig f11 = r14
  302. getf.sig r21 = f7
  303. setf.sig f7 = in3
  304. ;;
     /* f11 = a0*b1 + (a0*b0 >> 32);  f6 = in0*in3 + in1*in2 (low 64 bits). */
  305. xma.l f11 = f10, f8, f11
  306. xma.l f6 = f6, f7, f9
  307. ;;
  308. getf.sig r18 = f11
  309. ;;
     /* r18 = middle sum a0*b1 + a1*b0 + (a0*b0 >> 32), modulo 2^64. */
  310. add r18 = r18, r17
  311. ;;
     /* r15 = low 32 bits of the middle sum;  p7 is set when the add above wrapped. */
  312. and r15 = r19, r18
  313. cmp.ltu p7, p6 = r18, r17
  314. ;;
     /* r22 = cross-term contribution to the high word;  on carry, r14 = 0xffffffff + 1 = 2^32. */
  315. getf.sig r22 = f6
  316. (p7) adds r14 = 1, r19
  317. ;;
     /* Fold the carry (worth 2^32 in the high word) into a1*b1;  r14 = middle sum >> 32; */
     /* r15 = low limb of the middle sum moved to bits 32..63. */
  318. (p7) add r21 = r21, r14
  319. shr.u r14 = r18, 32
  320. shl r15 = r15, 32
  321. ;;
     /* r20 = high 64 bits of in0*in2. */
  322. add r20 = r21, r14
  323. ;;
     /* ret0 = low 64 bits of the product;  ret1 = high 64 bits including the cross terms. */
  324. add ret0 = r15, r16
  325. add ret1 = r22, r20
  326. br.ret.sptk rp
  327. ;;
  328. END(___multi3)
     /* Bind the ___multi3 implementation to the versioned symbol __multi3@GLIBC_2.2. */
  329. .symver ___multi3, __multi3@GLIBC_2.2
  330. #endif