Skip to content

Commit 4c54410

Browse files
committed
ENH: Single pass, stable roll_corr
Uses a Welford-like method for general computations, and includes detection of exact zeros in the denominator and of exactly identical sequences.
1 parent 7fba69d commit 4c54410

File tree

1 file changed

+179
-17
lines changed

1 file changed

+179
-17
lines changed

Diff for: pandas/algos.pyx

+179-17
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,166 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
12531253
return result
12541254

12551255

1256+
#----------------------------------------------------------------------
1257+
# Rolling correlation
1258+
1259+
def roll_corr(ndarray[double_t] x, ndarray[double_t] y, int win, int minp,
1260+
int ddof=1):
1261+
"""
1262+
Numerically stable implementation using a Welford-like method, and
1263+
detection of exactly matching sequences and exactly zero denominators.
1264+
"""
1265+
cdef double val_x, val_y, prev_x, prev_y, rep_x = NaN, rep_y = NaN
1266+
cdef double mean_x = 0, mean_y = 0, ssqdm_x = 0, ssqdm_y = 0
1267+
cdef double sproddm_xy = 0, delta_x, delta_y
1268+
cdef Py_ssize_t nobs = 0, nrep_x = 0, nrep_y = 0, ndup = 0, i, N = len(x)
1269+
cdef bint val_not_nan, prev_not_nan
1270+
1271+
cdef ndarray[double_t] output = np.empty(N, dtype=float)
1272+
1273+
minp = _check_minp(win, minp, N)
1274+
1275+
for i from 0 <= i < N:
1276+
val_x = x[i]
1277+
val_y = y[i]
1278+
val_not_nan = val_x == val_x and val_y == val_y
1279+
if i < win:
1280+
prev_x = prev_y = NaN
1281+
prev_not_nan = 0
1282+
else:
1283+
prev_x = x[i - win]
1284+
prev_y = y[i - win]
1285+
prev_not_nan = prev_x == prev_x and prev_y == prev_y
1286+
1287+
# First, count the number of observations and consecutive repeats
1288+
if prev_not_nan:
1289+
# prev is not NaN, removing an observation...
1290+
if nobs == nrep_x:
1291+
# ...and removing a repeat from x
1292+
nrep_x -= 1
1293+
if nrep_x == 0:
1294+
rep_x = NaN
1295+
if nobs == nrep_y:
1296+
# ...and removing a repeat from y
1297+
nrep_y -= 1
1298+
if nrep_y == 0:
1299+
rep_y = NaN
1300+
if nobs == ndup:
1301+
# ...and removing a duplicate
1302+
ndup -= 1
1303+
nobs -= 1
1304+
if val_not_nan:
1305+
# val is not NaN, adding an observation
1306+
if val_x == rep_x:
1307+
# ...and adding a repeat to x
1308+
nrep_x += 1
1309+
else:
1310+
# ...and resetting x repeats
1311+
nrep_x = 1
1312+
rep_x = val_x
1313+
if val_y == rep_y:
1314+
# ...and adding a repeat to y
1315+
nrep_y += 1
1316+
else:
1317+
# ...and resetting y repeats
1318+
nrep_y = 1
1319+
rep_y = val_y
1320+
if val_x == val_y:
1321+
# ...and adding a duplicate
1322+
ndup += 1
1323+
else:
1324+
# ...and resetting duplicates
1325+
ndup = 0
1326+
nobs += 1
1327+
1328+
# Then, compute the new mean, sums of squared differences from the
1329+
# mean and sum of product of differences from the mean
1330+
if nobs == nrep_x and nobs == nrep_y:
1331+
if nobs > 0:
1332+
mean_x = rep_x
1333+
mean_y = rep_y
1334+
else:
1335+
mean_x = mean_y = 0
1336+
ssqdm_x = ssqdm_y = sproddm_xy = 0
1337+
elif val_not_nan:
1338+
# Adding one observation...
1339+
if prev_not_nan:
1340+
# ...and removing another
1341+
delta_x = val_x - prev_x
1342+
prev_x -= mean_x
1343+
if nobs == nrep_x:
1344+
mean_x = rep_x
1345+
val_x = 0
1346+
ssqdm_x = 0
1347+
else:
1348+
mean_x += delta_x / nobs
1349+
val_x -= mean_x
1350+
ssqdm_x += (val_x + prev_x) * delta_x
1351+
delta_y = val_y - prev_y
1352+
prev_y -= mean_y
1353+
if nobs == nrep_y:
1354+
mean_y = rep_y
1355+
val_y = 0
1356+
ssqdm_y = 0
1357+
else:
1358+
mean_y += delta_y / nobs
1359+
val_y -= mean_y
1360+
ssqdm_y += (val_y + prev_y) * delta_y
1361+
sproddm_xy += (delta_x * (val_y + prev_y) +
1362+
delta_y * (val_x + prev_x)) * 0.5
1363+
else:
1364+
# ...and not removing any
1365+
if nobs == nrep_x:
1366+
delta_x = 0
1367+
mean_x = rep_x
1368+
ssqdm_x = 0
1369+
else:
1370+
delta_x = val_x - mean_x
1371+
mean_x += delta_x / nobs
1372+
ssqdm_x += delta_x * (val_x - mean_x)
1373+
if nobs == nrep_y:
1374+
delta_y = 0
1375+
mean_y = rep_y
1376+
ssqdm_y = 0
1377+
else:
1378+
delta_y = val_y - mean_y
1379+
mean_y += delta_y / nobs
1380+
ssqdm_y += delta_y * (val_y - mean_y)
1381+
sproddm_xy += ((val_x - mean_x) * delta_y +
1382+
(val_y - mean_y) * delta_x) * 0.5
1383+
elif prev_not_nan:
1384+
# Adding no new observation, but removing one
1385+
if nobs == nrep_x:
1386+
delta_x = 0
1387+
mean_x = rep_x
1388+
ssqdm_x = 0
1389+
else:
1390+
delta_x = prev_x - mean_x
1391+
mean_x -= delta_x / nobs
1392+
ssqdm_x -= delta_x * (prev_x - mean_x)
1393+
if nobs == nrep_y:
1394+
delta_y = 0
1395+
mean_y = rep_y
1396+
ssqdm_y = 0
1397+
else:
1398+
delta_y = prev_y - mean_y
1399+
mean_y -= delta_y / nobs
1400+
ssqdm_y -= delta_y * (prev_y - mean_y)
1401+
sproddm_xy -= ((prev_x - mean_x) * delta_y +
1402+
(prev_y - mean_y) * delta_x) * 0.5
1403+
# Correlation is unchanged if no observation is added or removed
1404+
1405+
# Finally, compute and write the rolling correlation to output
1406+
if nobs < minp or nobs < ddof or ssqdm_x <= 0 or ssqdm_y <= 0:
1407+
output[i] = NaN
1408+
elif nobs == ndup:
1409+
output[i] = 1.0
1410+
else:
1411+
output[i] = sproddm_xy / sqrt(ssqdm_x * ssqdm_y)
1412+
1413+
return output
1414+
1415+
12561416
#----------------------------------------------------------------------
12571417
# Rolling covariance
12581418

@@ -1261,8 +1421,8 @@ def roll_cov(ndarray[double_t] x, ndarray[double_t] y, int win, int minp,
12611421
"""
12621422
Numerically stable implementation using a Welford-like method.
12631423
"""
1264-
cdef double val_x, val_y, prev_x, prev_y, delta_x, delta_y, temp_x, temp_y
1265-
cdef double mean_x = 0, mean_y = 0, proddm_xy = 0
1424+
cdef double val_x, val_y, prev_x, prev_y, delta_x, delta_y
1425+
cdef double mean_x = 0, mean_y = 0, sproddm_xy = 0
12661426
cdef Py_ssize_t nobs = 0, i, N = len(x)
12671427
cdef bint val_not_nan, prev_not_nan
12681428

@@ -1294,16 +1454,17 @@ def roll_cov(ndarray[double_t] x, ndarray[double_t] y, int win, int minp,
12941454
prev_y -= mean_y
12951455
mean_y += delta_y / nobs
12961456
val_y -= mean_y
1297-
proddm_xy += (delta_x * (val_y + prev_y) + delta_y * (val_x + prev_x)) / 2
1457+
sproddm_xy += (delta_x * (val_y + prev_y) +
1458+
delta_y * (val_x + prev_x)) * 0.5
12981459
else:
12991460
# ...and not removing any
13001461
nobs += 1
13011462
delta_x = val_x - mean_x
13021463
mean_x += delta_x / nobs
13031464
delta_y = val_y - mean_y
13041465
mean_y += delta_y / nobs
1305-
proddm_xy += ((val_x - mean_x) * delta_y +
1306-
(val_y - mean_y) * delta_x) * 0.5
1466+
sproddm_xy += ((val_x - mean_x) * delta_y +
1467+
(val_y - mean_y) * delta_x) * 0.5
13071468
elif prev_not_nan:
13081469
# Adding no new observation, but removing one
13091470
nobs -= 1
@@ -1312,15 +1473,15 @@ def roll_cov(ndarray[double_t] x, ndarray[double_t] y, int win, int minp,
13121473
mean_x -= delta_x / nobs
13131474
delta_y = prev_y - mean_y
13141475
mean_y -= delta_y / nobs
1315-
proddm_xy -= ((prev_x - mean_x) * delta_y +
1316-
(prev_y - mean_y) * delta_x) * 0.5
1476+
sproddm_xy -= ((prev_x - mean_x) * delta_y +
1477+
(prev_y - mean_y) * delta_x) * 0.5
13171478
else:
1318-
mean_x = mean_y = proddm_xy = 0
1479+
mean_x = mean_y = sproddm_xy = 0
13191480
# Covariance is unchanged if no observation is added or removed
13201481

13211482
# Finally, compute and write the rolling covariance to output
13221483
if nobs >= minp and nobs > ddof:
1323-
output[i] = proddm_xy / (nobs - ddof)
1484+
output[i] = sproddm_xy / (nobs - ddof)
13241485
else:
13251486
output[i] = NaN
13261487

@@ -1334,7 +1495,7 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
13341495
"""
13351496
Numerically stable implementation using Welford's method.
13361497
"""
1337-
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, out
1498+
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
13381499
cdef Py_ssize_t i
13391500
cdef Py_ssize_t N = len(input)
13401501

@@ -1350,7 +1511,7 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
13501511
# Adding one observation...
13511512
if prev == prev:
13521513
# ...and removing another
1353-
delta val - prev
1514+
delta = val - prev
13541515
prev -= mean_x
13551516
mean_x += delta / nobs
13561517
val -= mean_x
@@ -1362,7 +1523,7 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
13621523
mean_x += delta / nobs
13631524
ssqdm_x += delta * (val - mean_x)
13641525
elif prev == prev:
1365-
# adding no new observation, but removing one
1526+
# Adding no new observation, but removing one
13661527
nobs -= 1
13671528
if nobs:
13681529
delta = prev - mean_x
@@ -1372,13 +1533,14 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
13721533
mean_x = ssqdm_x = 0
13731534
# Variance is unchanged if no observation is added or removed
13741535

1536+
# Sum of squared differences from the mean must be non-negative
1537+
ssqdm_x = 0 if ssqdm_x < 0 else ssqdm_x
1538+
1539+
# Finally, compute and write the rolling variance to output
13751540
if nobs >= minp and nobs > ddof:
1376-
out = ssqdm_x / (nobs - ddof)
1377-
out = 0 if out < 0 else out
1541+
output[i] = ssqdm_x / (nobs - ddof)
13781542
else:
1379-
out = NaN
1380-
1381-
output[i] = out
1543+
output[i] = NaN
13821544

13831545
return output
13841546

0 commit comments

Comments
 (0)