Refactor groupby group_prod, group_var, group_mean, group_ohlc #25249
Changes from 1 commit
@@ -382,6 +382,10 @@ def group_any_all(uint8_t[:] out,
            if values[i] == flag_val:
                out[lab] = flag_val


# ----------------------------------------------------------------------
# group_add, group_prod, group_var, group_mean, group_ohlc
# ----------------------------------------------------------------------


@cython.wraparound(False)
@cython.boundscheck(False)
@@ -433,5 +437,213 @@ def _group_add(floating[:, :] out,
group_add_float32 = _group_add['float']
group_add_float64 = _group_add['double']
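Context for readers unfamiliar with the pattern: indexing a fused-type function by a type name, as in _group_add['float'] above and in the blocks that follow, selects one concrete specialization of a single Cython source function. A minimal standalone sketch of the mechanism, using a hypothetical _scale function that is not part of this PR:

```cython
# fused_demo.pyx -- hypothetical illustration of fused-type specialization
cimport cython

ctypedef fused floating:
    float
    double


def _scale(floating[:] out, floating factor):
    # One source definition; Cython compiles one version per type in `floating`.
    cdef Py_ssize_t i
    for i in range(out.shape[0]):
        out[i] *= factor


# Indexing by the specialized type name exposes a concrete function object,
# mirroring group_add_float32 = _group_add['float'] in the diff above.
scale_float32 = _scale['float']
scale_float64 = _scale['double']
```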
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_prod(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=0):
    """
    Only aggregates on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count
        ndarray[floating, ndim=2] prodx, nobs
    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

Review thread (on the comparison above):

Reviewer: !=

Author: I made the minimum amount of changes when I refactored from tempita to fused types. So just to make sure: would you like me to take this opportunity to fix small issues inside the functions themselves?

Reviewer: Yes please. Be sure to comment to point out where you have made changes.

Reviewer: Generally when moving large blocks of code it is best to just move them and not make any other changes; I generally prefer to have such changes in a follow-up, though if very small it is ok.

Author: Fixed.
    nobs = np.zeros_like(out)
    prodx = np.ones_like(out)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    prodx[lab, j] *= val
Review thread (on the prodx accumulation above):

Reviewer: Is this overflow safe?

Author: I am not sure; I kept the same logic as was in tempita.

Reviewer: OK. Definitely not something to worry about for this PR. Ditto with the performance in the variance calculation.
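An aside on the overflow question (not part of the PR): with IEEE-754 doubles the running product saturates to inf rather than wrapping around, so the worst case is a non-finite result rather than silent corruption. For example:

```python
import numpy as np

# A product that exceeds the float64 maximum (~1.8e308) becomes inf.
vals = np.full(5, 1e200, dtype=np.float64)
with np.errstate(over="ignore"):   # silence the overflow RuntimeWarning
    print(vals.prod())             # inf
```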
        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = prodx[i, j]


group_prod_float32 = _group_prod['float']
group_prod_float64 = _group_prod['double']
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def _group_var(floating[:, :] out,
               int64_t[:] counts,
               floating[:, :] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, ct, oldmean
        ndarray[floating, ndim=2] nobs, mean

    assert min_count == -1, "'min_count' only used in add and prod"

    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

    nobs = np.zeros_like(out)
    mean = np.zeros_like(out)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    oldmean = mean[lab, j]
                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
Review thread (on the mean/variance update above):

Reviewer: Is this performant? Seems like a lot of unnecessary operations.

Author: I am not sure; I kept the logic as-is.
        for i in range(ncounts):
            for j in range(K):
                ct = nobs[i, j]
                if ct < 2:
                    out[i, j] = NAN
                else:
                    out[i, j] /= (ct - 1)


group_var_float32 = _group_var['float']
group_var_float64 = _group_var['double']
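On the performance question above: the inner update is Welford's online algorithm, which accumulates the sum of squared deviations in a single pass over the data at the cost of one extra subtraction and division per element. A plain-Python sketch (not the PR's code) showing that it agrees with the two-pass ddof=1 variance:

```python
import numpy as np

def welford_var(col):
    # Mirrors the per-group update in _group_var for a single column.
    nobs = 0
    mean = 0.0
    ssqdm = 0.0                      # running sum of squared deviations
    for val in col:
        if val == val:               # skip NaN
            nobs += 1
            oldmean = mean
            mean += (val - oldmean) / nobs
            ssqdm += (val - mean) * (val - oldmean)
    return np.nan if nobs < 2 else ssqdm / (nobs - 1)

rng = np.random.default_rng(0)
col = rng.normal(size=1_000)
assert np.isclose(welford_var(col), col.var(ddof=1))
```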
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_mean(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count
        ndarray[floating, ndim=2] sumx, nobs

    assert min_count == -1, "'min_count' only used in add and prod"

    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

    nobs = np.zeros_like(out)
    sumx = np.zeros_like(out)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]
                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    sumx[lab, j] += val

        for i in range(ncounts):
            for j in range(K):
                count = nobs[i, j]
                if nobs[i, j] == 0:
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j] / count


group_mean_float32 = _group_mean['float']
group_mean_float64 = _group_mean['double']
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_ohlc(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Only aggregates on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, lab
        floating val, count
        Py_ssize_t ngroups = len(counts)

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(labels) == 0:
        return

    N, K = (<object>values).shape

    if out.shape[1] != 4:
        raise ValueError('Output array must have 4 columns')

    if K > 1:
        raise NotImplementedError("Argument 'values' must have only "
                                  "one dimension")
    out[:] = np.nan

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab == -1:
                continue

            counts[lab] += 1
            val = values[i, 0]
            if val != val:
                continue

            if out[lab, 0] != out[lab, 0]:
                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
            else:
                out[lab, 1] = max(out[lab, 1], val)
                out[lab, 2] = min(out[lab, 2], val)
                out[lab, 3] = val


group_ohlc_float32 = _group_ohlc['float']
group_ohlc_float64 = _group_ohlc['double']


# generated from template
include "groupby_helper.pxi"
Review thread (on the ndarray scratch-array declarations above):

Reviewer: Can these be memoryviews?

Author: Probably, but I wonder if we want them to be, because they are assigned ndarrays a few lines below. Would you like me to declare them as memoryviews?

Reviewer: Supposedly lookups, assignment, and slicing are slightly faster with memoryviews. Since I think that's all we do with these objects, and we are doing it in a loop, using memoryviews should be preferred. (Cython takes care of converting the ndarray to a memoryview, so the nobs = np.zeros_like(...) lines don't need to be changed.)

Author: Fixed. Also in a few other locations in the copied functions.
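A minimal sketch of the kind of change being asked for, shown as a hypothetical standalone function rather than the PR's code: the scratch array is declared as a typed memoryview instead of an ndarray buffer, while the np.ones_like-style assignment can stay as-is because Cython coerces the returned ndarray to the view.

```cython
# memview_demo.pyx -- hypothetical illustration, not the PR's code
cimport cython
import numpy as np


@cython.wraparound(False)
@cython.boundscheck(False)
def column_prod(double[:, :] values):
    cdef:
        Py_ssize_t i, j
        double[:, :] prodx               # memoryview declaration, instead of
        # ndarray[double, ndim=2] prodx  # the buffer-typed form used in the diff

    # Unchanged relative to the buffer version: the ndarray returned by
    # np.ones is coerced to the typed memoryview automatically.
    prodx = np.ones((1, values.shape[1]), dtype=np.float64)

    for i in range(values.shape[0]):
        for j in range(values.shape[1]):
            prodx[0, j] *= values[i, j]

    return np.asarray(prodx[0])          # hand back a regular ndarray
```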