Skip to content

Commit 468096b

Browse files
committed
Revert "Use libc's lgamma/tgamma instead of custom implementations"
This reverts commit 7d14105.
1 parent 7d14105 commit 468096b

File tree

1 file changed

+264
-12
lines changed

1 file changed

+264
-12
lines changed

Modules/mathmodule.c

+264-12
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,16 @@ get_math_module_state(PyObject *module)
9292
return (math_module_state *)state;
9393
}
9494

95+
/*
96+
sin(pi*x), giving accurate results for all finite x (especially x
97+
integral or close to an integer). This is here for use in the
98+
reflection formula for the gamma function. It conforms to IEEE
99+
754-2008 for finite arguments, but not for infinities or nans.
100+
*/
101+
102+
static const double pi = 3.141592653589793238462643383279502884197;
103+
static const double logpi = 1.144729885849400174143427351353058711647;
104+
95105
/* Version of PyFloat_AsDouble() with in-line fast paths
96106
for exact floats and integers. Gives a substantial
97107
speed improvement for extracting float arguments.
@@ -114,6 +124,162 @@ get_math_module_state(PyObject *module)
114124
} \
115125
}
116126

127+
static double
128+
m_sinpi(double x)
129+
{
130+
double y, r;
131+
int n;
132+
/* this function should only ever be called for finite arguments */
133+
assert(Py_IS_FINITE(x));
134+
y = fmod(fabs(x), 2.0);
135+
n = (int)round(2.0*y);
136+
assert(0 <= n && n <= 4);
137+
switch (n) {
138+
case 0:
139+
r = sin(pi*y);
140+
break;
141+
case 1:
142+
r = cos(pi*(y-0.5));
143+
break;
144+
case 2:
145+
/* N.B. -sin(pi*(y-1.0)) is *not* equivalent: it would give
146+
-0.0 instead of 0.0 when y == 1.0. */
147+
r = sin(pi*(1.0-y));
148+
break;
149+
case 3:
150+
r = -cos(pi*(y-1.5));
151+
break;
152+
case 4:
153+
r = sin(pi*(y-2.0));
154+
break;
155+
default:
156+
Py_UNREACHABLE();
157+
}
158+
return copysign(1.0, x)*r;
159+
}
160+
161+
/* Implementation of the real gamma function. In extensive but non-exhaustive
162+
random tests, this function proved accurate to within <= 10 ulps across the
163+
entire float domain. Note that accuracy may depend on the quality of the
164+
system math functions, the pow function in particular. Special cases
165+
follow C99 annex F. The parameters and method are tailored to platforms
166+
whose double format is the IEEE 754 binary64 format.
167+
168+
Method: for x > 0.0 we use the Lanczos approximation with parameters N=13
169+
and g=6.024680040776729583740234375; these parameters are amongst those
170+
used by the Boost library. Following Boost (again), we re-express the
171+
Lanczos sum as a rational function, and compute it that way. The
172+
coefficients below were computed independently using MPFR, and have been
173+
double-checked against the coefficients in the Boost source code.
174+
175+
For x < 0.0 we use the reflection formula.
176+
177+
There's one minor tweak that deserves explanation: Lanczos' formula for
178+
Gamma(x) involves computing pow(x+g-0.5, x-0.5) / exp(x+g-0.5). For many x
179+
values, x+g-0.5 can be represented exactly. However, in cases where it
180+
can't be represented exactly the small error in x+g-0.5 can be magnified
181+
significantly by the pow and exp calls, especially for large x. A cheap
182+
correction is to multiply by (1 + e*g/(x+g-0.5)), where e is the error
183+
involved in the computation of x+g-0.5 (that is, e = computed value of
184+
x+g-0.5 - exact value of x+g-0.5). Here's the proof:
185+
186+
Correction factor
187+
-----------------
188+
Write x+g-0.5 = y-e, where y is exactly representable as an IEEE 754
189+
double, and e is tiny. Then:
190+
191+
pow(x+g-0.5,x-0.5)/exp(x+g-0.5) = pow(y-e, x-0.5)/exp(y-e)
192+
= pow(y, x-0.5)/exp(y) * C,
193+
194+
where the correction_factor C is given by
195+
196+
C = pow(1-e/y, x-0.5) * exp(e)
197+
198+
Since e is tiny, pow(1-e/y, x-0.5) ~ 1-(x-0.5)*e/y, and exp(x) ~ 1+e, so:
199+
200+
C ~ (1-(x-0.5)*e/y) * (1+e) ~ 1 + e*(y-(x-0.5))/y
201+
202+
But y-(x-0.5) = g+e, and g+e ~ g. So we get C ~ 1 + e*g/y, and
203+
204+
pow(x+g-0.5,x-0.5)/exp(x+g-0.5) ~ pow(y, x-0.5)/exp(y) * (1 + e*g/y),
205+
206+
Note that for accuracy, when computing r*C it's better to do
207+
208+
r + e*g/y*r;
209+
210+
than
211+
212+
r * (1 + e*g/y);
213+
214+
since the addition in the latter throws away most of the bits of
215+
information in e*g/y.
216+
*/
217+
218+
#define LANCZOS_N 13
219+
static const double lanczos_g = 6.024680040776729583740234375;
220+
static const double lanczos_g_minus_half = 5.524680040776729583740234375;
221+
static const double lanczos_num_coeffs[LANCZOS_N] = {
222+
23531376880.410759688572007674451636754734846804940,
223+
42919803642.649098768957899047001988850926355848959,
224+
35711959237.355668049440185451547166705960488635843,
225+
17921034426.037209699919755754458931112671403265390,
226+
6039542586.3520280050642916443072979210699388420708,
227+
1439720407.3117216736632230727949123939715485786772,
228+
248874557.86205415651146038641322942321632125127801,
229+
31426415.585400194380614231628318205362874684987640,
230+
2876370.6289353724412254090516208496135991145378768,
231+
186056.26539522349504029498971604569928220784236328,
232+
8071.6720023658162106380029022722506138218516325024,
233+
210.82427775157934587250973392071336271166969580291,
234+
2.5066282746310002701649081771338373386264310793408
235+
};
236+
237+
/* denominator is x*(x+1)*...*(x+LANCZOS_N-2) */
238+
static const double lanczos_den_coeffs[LANCZOS_N] = {
239+
0.0, 39916800.0, 120543840.0, 150917976.0, 105258076.0, 45995730.0,
240+
13339535.0, 2637558.0, 357423.0, 32670.0, 1925.0, 66.0, 1.0};
241+
242+
/* gamma values for small positive integers, 1 though NGAMMA_INTEGRAL */
243+
#define NGAMMA_INTEGRAL 23
244+
static const double gamma_integral[NGAMMA_INTEGRAL] = {
245+
1.0, 1.0, 2.0, 6.0, 24.0, 120.0, 720.0, 5040.0, 40320.0, 362880.0,
246+
3628800.0, 39916800.0, 479001600.0, 6227020800.0, 87178291200.0,
247+
1307674368000.0, 20922789888000.0, 355687428096000.0,
248+
6402373705728000.0, 121645100408832000.0, 2432902008176640000.0,
249+
51090942171709440000.0, 1124000727777607680000.0,
250+
};
251+
252+
/* Lanczos' sum L_g(x), for positive x */
253+
254+
static double
255+
lanczos_sum(double x)
256+
{
257+
double num = 0.0, den = 0.0;
258+
int i;
259+
assert(x > 0.0);
260+
/* evaluate the rational function lanczos_sum(x). For large
261+
x, the obvious algorithm risks overflow, so we instead
262+
rescale the denominator and numerator of the rational
263+
function by x**(1-LANCZOS_N) and treat this as a
264+
rational function in 1/x. This also reduces the error for
265+
larger x values. The choice of cutoff point (5.0 below) is
266+
somewhat arbitrary; in tests, smaller cutoff values than
267+
this resulted in lower accuracy. */
268+
if (x < 5.0) {
269+
for (i = LANCZOS_N; --i >= 0; ) {
270+
num = num * x + lanczos_num_coeffs[i];
271+
den = den * x + lanczos_den_coeffs[i];
272+
}
273+
}
274+
else {
275+
for (i = 0; i < LANCZOS_N; i++) {
276+
num = num / x + lanczos_num_coeffs[i];
277+
den = den / x + lanczos_den_coeffs[i];
278+
}
279+
}
280+
return num/den;
281+
}
282+
117283
/* Constant for +infinity, generated in the same way as float('inf'). */
118284

119285
static double
@@ -143,46 +309,113 @@ m_nan(void)
143309

144310
#endif
145311

146-
/*
147-
gamma: the real gamma function.
148-
*/
149-
150312
static double
151-
m_gamma(double x)
313+
m_tgamma(double x)
152314
{
315+
double absx, r, y, z, sqrtpow;
316+
153317
/* special cases */
154318
if (!Py_IS_FINITE(x)) {
155319
if (Py_IS_NAN(x) || x > 0.0)
156-
return x; /* gamma(nan) = nan, gamma(inf) = inf */
320+
return x; /* tgamma(nan) = nan, tgamma(inf) = inf */
157321
else {
158322
errno = EDOM;
159-
return Py_NAN; /* gamma(-inf) = nan, invalid */
323+
return Py_NAN; /* tgamma(-inf) = nan, invalid */
160324
}
161325
}
162326
if (x == 0.0) {
163327
errno = EDOM;
164-
/* gamma(+-0.0) = +-inf, divide-by-zero */
328+
/* tgamma(+-0.0) = +-inf, divide-by-zero */
165329
return copysign(Py_HUGE_VAL, x);
166330
}
167331

168332
/* integer arguments */
169333
if (x == floor(x)) {
170334
if (x < 0.0) {
171-
errno = EDOM; /* gamma(n) = nan, invalid for */
335+
errno = EDOM; /* tgamma(n) = nan, invalid for */
172336
return Py_NAN; /* negative integers n */
173337
}
338+
if (x <= NGAMMA_INTEGRAL)
339+
return gamma_integral[(int)x - 1];
340+
}
341+
absx = fabs(x);
342+
343+
/* tiny arguments: tgamma(x) ~ 1/x for x near 0 */
344+
if (absx < 1e-20) {
345+
r = 1.0/x;
346+
if (Py_IS_INFINITY(r))
347+
errno = ERANGE;
348+
return r;
349+
}
350+
351+
/* large arguments: assuming IEEE 754 doubles, tgamma(x) overflows for
352+
x > 200, and underflows to +-0.0 for x < -200, not a negative
353+
integer. */
354+
if (absx > 200.0) {
355+
if (x < 0.0) {
356+
return 0.0/m_sinpi(x);
357+
}
358+
else {
359+
errno = ERANGE;
360+
return Py_HUGE_VAL;
361+
}
174362
}
175363

176-
return tgamma(x);
364+
y = absx + lanczos_g_minus_half;
365+
/* compute error in sum */
366+
if (absx > lanczos_g_minus_half) {
367+
/* note: the correction can be foiled by an optimizing
368+
compiler that (incorrectly) thinks that an expression like
369+
a + b - a - b can be optimized to 0.0. This shouldn't
370+
happen in a standards-conforming compiler. */
371+
double q = y - absx;
372+
z = q - lanczos_g_minus_half;
373+
}
374+
else {
375+
double q = y - lanczos_g_minus_half;
376+
z = q - absx;
377+
}
378+
z = z * lanczos_g / y;
379+
if (x < 0.0) {
380+
r = -pi / m_sinpi(absx) / absx * exp(y) / lanczos_sum(absx);
381+
r -= z * r;
382+
if (absx < 140.0) {
383+
r /= pow(y, absx - 0.5);
384+
}
385+
else {
386+
sqrtpow = pow(y, absx / 2.0 - 0.25);
387+
r /= sqrtpow;
388+
r /= sqrtpow;
389+
}
390+
}
391+
else {
392+
r = lanczos_sum(absx) / exp(y);
393+
r += z * r;
394+
if (absx < 140.0) {
395+
r *= pow(y, absx - 0.5);
396+
}
397+
else {
398+
sqrtpow = pow(y, absx / 2.0 - 0.25);
399+
r *= sqrtpow;
400+
r *= sqrtpow;
401+
}
402+
}
403+
if (Py_IS_INFINITY(r))
404+
errno = ERANGE;
405+
return r;
177406
}
178407

179408
/*
180409
lgamma: natural log of the absolute value of the Gamma function.
410+
For large arguments, Lanczos' formula works extremely well here.
181411
*/
182412

183413
static double
184414
m_lgamma(double x)
185415
{
416+
double r;
417+
double absx;
418+
186419
/* special cases */
187420
if (!Py_IS_FINITE(x)) {
188421
if (Py_IS_NAN(x))
@@ -197,9 +430,28 @@ m_lgamma(double x)
197430
errno = EDOM; /* lgamma(n) = inf, divide-by-zero for */
198431
return Py_HUGE_VAL; /* integers n <= 0 */
199432
}
433+
else {
434+
return 0.0; /* lgamma(1) = lgamma(2) = 0.0 */
435+
}
200436
}
201437

202-
return lgamma(x);
438+
absx = fabs(x);
439+
/* tiny arguments: lgamma(x) ~ -log(fabs(x)) for small x */
440+
if (absx < 1e-20)
441+
return -log(absx);
442+
443+
/* Lanczos' formula. We could save a fraction of a ulp in accuracy by
444+
having a second set of numerator coefficients for lanczos_sum that
445+
absorbed the exp(-lanczos_g) term, and throwing out the lanczos_g
446+
subtraction below; it's probably not worth it. */
447+
r = log(lanczos_sum(absx)) - lanczos_g;
448+
r += (absx - 0.5) * (log(absx + lanczos_g - 0.5) - 1);
449+
if (x < 0.0)
450+
/* Use reflection formula to get value for negative x. */
451+
r = logpi - log(fabs(m_sinpi(absx))) - log(absx) - r;
452+
if (Py_IS_INFINITY(r))
453+
errno = ERANGE;
454+
return r;
203455
}
204456

205457
/*
@@ -907,7 +1159,7 @@ math_floor(PyObject *module, PyObject *number)
9071159
return PyLong_FromDouble(floor(x));
9081160
}
9091161

910-
FUNC1A(gamma, m_gamma,
1162+
FUNC1A(gamma, m_tgamma,
9111163
"gamma($module, x, /)\n--\n\n"
9121164
"Gamma function at x.")
9131165
FUNC1A(lgamma, m_lgamma,

0 commit comments

Comments
 (0)