powl_helper.c 8.3 KB


  /* Implement powl for x86 using extra-precision log.
     Copyright (C) 2012-2019 Free Software Foundation, Inc.
     This file is part of the GNU C Library.

     The GNU C Library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     License as published by the Free Software Foundation; either
     version 2.1 of the License, or (at your option) any later version.

     The GNU C Library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public License for more details.

     You should have received a copy of the GNU Lesser General Public
     License along with the GNU C Library; if not, see
     <http://www.gnu.org/licenses/>.  */

  #include <math.h>
  #include <math_private.h>
  #include <math-underflow.h>
  #include <stdbool.h>

  /* High parts and low parts of -log (k/16), for integer k from 12 to
     24.  The table holds 13 (high, low) pairs: element 2 * (k - 12) is
     the high part for K and element 2 * (k - 12) + 1 is the matching
     low-order correction, so the sum of a pair approximates -log (k/16)
     to more precision than one long double holds.  The k == 16 pair is
     exactly zero.  */

  static const long double powl_log_table[] =
    {
      0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L,   /* k = 12 */
      0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L,   /* k = 13 */
      0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L,  /* k = 14 */
      0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L,   /* k = 15 */
      -0x0p+0L, 0x0p+0L,                                     /* k = 16 */
      -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L,   /* k = 17 */
      -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L,  /* k = 18 */
      -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L,   /* k = 19 */
      -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L,  /* k = 20 */
      -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L,    /* k = 21 */
      -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L,  /* k = 22 */
      -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L,  /* k = 23 */
      -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L,   /* k = 24 */
    };

  /* High 32 bits of log2 (e), and remainder rounded to 64 bits.
     LOG2E_HI + LOG2E_LO together approximate log2 (e); keeping the
     high part short lets a product with another 32-bit value be
     formed without rounding in a 64-bit significand.  */
  static const long double log2e_hi = 0x1.71547652p+0L;
  static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L;

  /* Given a number with high part HI and low part LO, add the number X
     to it and store the result in *RHI and *RLO.  It is given that
     either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are
     small enough that no overflow occurs.  The result does not need to
     be exact to 128 bits; 78-bit accuracy of the final accumulated
     result suffices.

     RHI and RLO are written only after HI, LO and X (all passed by
     value) have been consumed, so callers may pass the addresses of
     the same variables they pass as HI and LO.  */

  static inline void
  acc_split (long double *rhi, long double *rlo, long double hi, long double lo,
             long double x)
  {
    /* Rounded sum of the high part and X.  */
    long double thi = hi + x;
    /* What that rounding dropped, plus the incoming low part.  The
       left-to-right evaluation order matters here.  */
    long double tlo = (hi - thi) + x + lo;
    /* Renormalize so the stored low part is small relative to the
       stored high part.  */
    *rhi = thi + tlo;
    *rlo = (thi - *rhi) + tlo;
  }

  55. extern long double __powl_helper (long double x, long double y);
  56. libm_hidden_proto (__powl_helper)
  57. /* Given X a value that is finite and nonzero, or a NaN, and Y a
  58. finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to
  59. the power Y. */
  60. long double
  61. __powl_helper (long double x, long double y)
  62. {
  63. if (isnan (x))
  64. return __ieee754_expl (y * __ieee754_logl (x));
  65. bool negate;
  66. if (x < 0)
  67. {
  68. long double absy = fabsl (y);
  69. if (absy >= 0x1p64L)
  70. negate = false;
  71. else
  72. {
  73. unsigned long long yll = absy;
  74. if (yll != absy)
  75. return __ieee754_expl (y * __ieee754_logl (x));
  76. negate = (yll & 1) != 0;
  77. }
  78. x = fabsl (x);
  79. }
  80. else
  81. negate = false;
  82. /* We need to compute Y * log2 (X) to at least 64 bits after the
  83. point for normal results (that is, to at least 78 bits
  84. precision). */
  85. int x_int_exponent;
  86. long double x_frac;
  87. x_frac = __frexpl (x, &x_int_exponent);
  88. if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */
  89. {
  90. x_frac *= 2.0;
  91. x_int_exponent--;
  92. }
  93. long double log_x_frac_hi, log_x_frac_lo;
  94. /* Determine an initial approximation to log (X_FRAC) using
  95. POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an
  96. interval (24/25, 26/25). */
  97. int k = (int) ((16.0L / x_frac) + 0.5L);
  98. log_x_frac_hi = powl_log_table[2 * k - 24];
  99. log_x_frac_lo = powl_log_table[2 * k - 23];
  100. long double x_frac_low;
  101. if (k == 16)
  102. x_frac_low = 0.0L;
  103. else
  104. {
  105. /* Mask off low 5 bits of X_FRAC so the multiplication by K/16
  106. is exact. These bits are small enough that they can be
  107. corrected for by adding log2 (e) * X_FRAC_LOW to the final
  108. result. */
  109. int32_t se;
  110. uint32_t i0, i1;
  111. GET_LDOUBLE_WORDS (se, i0, i1, x_frac);
  112. x_frac_low = x_frac;
  113. i1 &= 0xffffffe0;
  114. SET_LDOUBLE_WORDS (x_frac, se, i0, i1);
  115. x_frac_low -= x_frac;
  116. x_frac_low /= x_frac;
  117. x_frac *= k / 16.0L;
  118. }
  119. /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25). Separate
  120. W = X_FRAC - 1 into high 16 bits and remaining bits, so that
  121. multiplications for low-order power series terms are exact. The
  122. remaining bits are small enough that adding a 64-bit value of
  123. log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for
  124. them. */
  125. long double w = x_frac - 1;
  126. long double w_hi, w_lo;
  127. int32_t se;
  128. uint32_t i0, i1;
  129. GET_LDOUBLE_WORDS (se, i0, i1, w);
  130. i0 &= 0xffff0000;
  131. i1 = 0;
  132. SET_LDOUBLE_WORDS (w_hi, se, i0, i1);
  133. w_lo = w - w_hi;
  134. long double wp = w_hi;
  135. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp);
  136. wp *= -w_hi;
  137. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
  138. wp / 2.0L);
  139. wp *= -w_hi;
  140. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
  141. wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part. */
  142. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
  143. wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part. */
  144. wp *= -w_hi;
  145. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
  146. wp / 4.0L);
  147. /* Subsequent terms are small enough that they only need be computed
  148. to 64 bits. */
  149. for (int i = 5; i <= 17; i++)
  150. {
  151. wp *= -w_hi;
  152. acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
  153. wp / i);
  154. }
  155. /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm. */
  156. long double log2_x_frac_hi, log2_x_frac_lo;
  157. long double log_x_frac_hi32, log_x_frac_lo64;
  158. GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi);
  159. i1 = 0;
  160. SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1);
  161. log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo;
  162. long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi;
  163. long double log2_x_frac_lo1
  164. = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo;
  165. log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1;
  166. log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1;
  167. /* Correct for the masking off of W_LO. */
  168. long double log2_1p_w_lo;
  169. asm ("fyl2xp1"
  170. : "=t" (log2_1p_w_lo)
  171. : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L)
  172. : "st(1)");
  173. acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
  174. log2_1p_w_lo);
  175. /* Correct for the masking off of X_FRAC_LOW. */
  176. acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
  177. x_frac_low * M_LOG2El);
  178. /* Add the integer and fractional parts of the base-2 logarithm. */
  179. long double log2_x_hi, log2_x_lo;
  180. log2_x_hi = x_int_exponent + log2_x_frac_hi;
  181. log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo;
  182. /* Compute the base-2 logarithm of the result. */
  183. long double log2_res_hi, log2_res_lo;
  184. long double log2_x_hi32, log2_x_lo64;
  185. GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi);
  186. i1 = 0;
  187. SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1);
  188. log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo;
  189. long double y_hi32, y_lo32;
  190. GET_LDOUBLE_WORDS (se, i0, i1, y);
  191. i1 = 0;
  192. SET_LDOUBLE_WORDS (y_hi32, se, i0, i1);
  193. y_lo32 = y - y_hi32;
  194. log2_res_hi = log2_x_hi32 * y_hi32;
  195. log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y;
  196. /* Split the base-2 logarithm of the result into integer and
  197. fractional parts. */
  198. long double log2_res_int = roundl (log2_res_hi);
  199. long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo;
  200. /* If the integer part is very large, the computed fractional part
  201. may be outside the valid range for f2xm1. */
  202. if (fabsl (log2_res_int) > 16500)
  203. log2_res_frac = 0;
  204. /* Compute the final result. */
  205. long double res;
  206. asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac));
  207. res += 1.0L;
  208. if (negate)
  209. res = -res;
  210. asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int));
  211. math_check_force_underflow (res);
  212. return res;
  213. }
  214. libm_hidden_def (__powl_helper)