-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Added 'direction' parameter to merge_asof() (#14887) #15129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
50431ad
ce5caaa
879c9f0
da38483
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,13 +33,15 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, | |
ndarray[{{by_dtype}}] left_by_values, | ||
ndarray[{{by_dtype}}] right_by_values, | ||
bint allow_exact_matches=1, | ||
tolerance=None): | ||
tolerance=None, | ||
int64_t direction_enum=0): | ||
|
||
cdef: | ||
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos | ||
ndarray[int64_t] left_indexer, right_indexer | ||
ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri | ||
bint has_tolerance = 0 | ||
{{on_dtype}} tolerance_ | ||
{{on_dtype}} diff, bdiff, fdiff | ||
{{table_type}} hash_table | ||
{{by_dtype}} by_value | ||
|
||
|
@@ -56,37 +58,94 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, | |
|
||
hash_table = {{table_type}}(right_size) | ||
|
||
right_pos = 0 | ||
for left_pos in range(left_size): | ||
# restart right_pos if it went negative in a previous iteration | ||
if right_pos < 0: | ||
right_pos = 0 | ||
|
||
# find last position in right whose value is less than left's value | ||
if allow_exact_matches: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] <= left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos += 1 | ||
else: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] < left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos += 1 | ||
right_pos -= 1 | ||
|
||
# save positions as the desired index | ||
by_value = left_by_values[left_pos] | ||
found_right_pos = hash_table.get_item(by_value)\ | ||
if by_value in hash_table else -1 | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = found_right_pos | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and found_right_pos != -1: | ||
diff = left_values[left_pos] - right_values[found_right_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
if direction_enum == 0: #backward | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. btw I think all of this can be |
||
right_pos = 0 | ||
for left_pos in range(left_size): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can we pull each of these directions out as separate functions? (to make this top level a bit more clear) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good point. I'll take a shot at that. |
||
# restart right_pos if it went negative in a previous iteration | ||
if right_pos < 0: | ||
right_pos = 0 | ||
|
||
# find last position in right whose value is less than left's | ||
if allow_exact_matches: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] <= left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos += 1 | ||
else: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] < left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos += 1 | ||
right_pos -= 1 | ||
|
||
# save positions as the desired index | ||
by_value = left_by_values[left_pos] | ||
found_right_pos = hash_table.get_item(by_value)\ | ||
if by_value in hash_table else -1 | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = found_right_pos | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and found_right_pos != -1: | ||
diff = left_values[left_pos] - right_values[found_right_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
elif direction_enum == 1: # forward | ||
right_pos = right_size - 1 | ||
for left_pos in range(left_size - 1, -1, -1): | ||
# restart right_pos if it went over in a previous iteration | ||
if right_pos == right_size: | ||
right_pos = right_size - 1 | ||
|
||
# find first position in right whose value is greater than left's | ||
if allow_exact_matches: | ||
while right_pos >= 0 and\ | ||
right_values[right_pos] >= left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos -= 1 | ||
else: | ||
while right_pos >= 0 and\ | ||
right_values[right_pos] > left_values[left_pos]: | ||
hash_table.set_item(right_by_values[right_pos], right_pos) | ||
right_pos -= 1 | ||
right_pos += 1 | ||
|
||
# save positions as the desired index | ||
by_value = left_by_values[left_pos] | ||
found_right_pos = hash_table.get_item(by_value)\ | ||
if by_value in hash_table else -1 | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = found_right_pos | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and found_right_pos != -1: | ||
diff = right_values[found_right_pos] - left_values[left_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
else: # nearest | ||
# search both forward and backward | ||
bli, bri = asof_join_{{on_dtype}}_by_{{by_dtype}}(left_values, | ||
right_values, | ||
left_by_values, | ||
right_by_values, | ||
allow_exact_matches, | ||
tolerance, 0) | ||
fli, fri = asof_join_{{on_dtype}}_by_{{by_dtype}}(left_values, | ||
right_values, | ||
left_by_values, | ||
right_by_values, | ||
allow_exact_matches, | ||
tolerance, 1) | ||
|
||
for i in range(len(bri)): | ||
# choose timestamp from right with smaller difference | ||
if bri[i] != -1 and fri[i] != -1: | ||
bdiff = left_values[bli[i]] - right_values[bri[i]] | ||
fdiff = right_values[fri[i]] - left_values[fli[i]] | ||
right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] | ||
else: | ||
right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] | ||
left_indexer[i] = bli[i] | ||
|
||
return left_indexer, right_indexer | ||
|
||
|
@@ -113,13 +172,15 @@ dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', | |
def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, | ||
ndarray[{{on_dtype}}] right_values, | ||
bint allow_exact_matches=1, | ||
tolerance=None): | ||
tolerance=None, | ||
int64_t direction_enum=0): | ||
|
||
cdef: | ||
Py_ssize_t left_pos, right_pos, left_size, right_size | ||
ndarray[int64_t] left_indexer, right_indexer | ||
ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri | ||
bint has_tolerance = 0 | ||
{{on_dtype}} tolerance_ | ||
{{on_dtype}} diff, bdiff, fdiff | ||
|
||
# if we are using tolerance, set our objects | ||
if tolerance is not None: | ||
|
@@ -132,32 +193,77 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, | |
left_indexer = np.empty(left_size, dtype=np.int64) | ||
right_indexer = np.empty(left_size, dtype=np.int64) | ||
|
||
right_pos = 0 | ||
for left_pos in range(left_size): | ||
# restart right_pos if it went negative in a previous iteration | ||
if right_pos < 0: | ||
right_pos = 0 | ||
|
||
# find last position in right whose value is less than left's value | ||
if allow_exact_matches: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] <= left_values[left_pos]: | ||
right_pos += 1 | ||
else: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] < left_values[left_pos]: | ||
right_pos += 1 | ||
right_pos -= 1 | ||
|
||
# save positions as the desired index | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = right_pos | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and right_pos != -1: | ||
diff = left_values[left_pos] - right_values[right_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
if direction_enum == 0: # backward | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why is this repeated here (the actual searching part)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This search is in the non-by logic. The search above tracks the last-seen position for each element in the `by` values. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ahh right, now I remember, you directly did the groupby one and the non-groupby. ok sure. that's more of a reason then to have some helpers to avoid repeating things (which in and of itself is not a perf issue), but a readability one. |
||
right_pos = 0 | ||
for left_pos in range(left_size): | ||
# restart right_pos if it went negative in a previous iteration | ||
if right_pos < 0: | ||
right_pos = 0 | ||
|
||
# find last position in right whose value is less than left's | ||
if allow_exact_matches: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] <= left_values[left_pos]: | ||
right_pos += 1 | ||
else: | ||
while right_pos < right_size and\ | ||
right_values[right_pos] < left_values[left_pos]: | ||
right_pos += 1 | ||
right_pos -= 1 | ||
|
||
# save positions as the desired index | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = right_pos | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and right_pos != -1: | ||
diff = left_values[left_pos] - right_values[right_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
elif direction_enum == 1: # forward | ||
right_pos = right_size - 1 | ||
for left_pos in range(left_size - 1, -1, -1): | ||
# restart right_pos if it went over in a previous iteration | ||
if right_pos == right_size: | ||
right_pos = right_size - 1 | ||
|
||
# find first position in right whose value is greater than left's | ||
if allow_exact_matches: | ||
while right_pos >= 0 and\ | ||
right_values[right_pos] >= left_values[left_pos]: | ||
right_pos -= 1 | ||
else: | ||
while right_pos >= 0 and\ | ||
right_values[right_pos] > left_values[left_pos]: | ||
right_pos -= 1 | ||
right_pos += 1 | ||
|
||
# save positions as the desired index | ||
left_indexer[left_pos] = left_pos | ||
right_indexer[left_pos] = right_pos\ | ||
if right_pos != right_size else -1 | ||
|
||
# if needed, verify that tolerance is met | ||
if has_tolerance and right_pos != right_size: | ||
diff = right_values[right_pos] - left_values[left_pos] | ||
if diff > tolerance_: | ||
right_indexer[left_pos] = -1 | ||
else: # nearest | ||
# search both forward and backward | ||
bli, bri = asof_join_{{on_dtype}}(left_values, right_values, | ||
allow_exact_matches, tolerance, 0) | ||
fli, fri = asof_join_{{on_dtype}}(left_values, right_values, | ||
allow_exact_matches, tolerance, 1) | ||
|
||
for i in range(len(bri)): | ||
# choose timestamp from right with smaller difference | ||
if bri[i] != -1 and fri[i] != -1: | ||
bdiff = left_values[bli[i]] - right_values[bri[i]] | ||
fdiff = right_values[fri[i]] - left_values[fli[i]] | ||
right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] | ||
else: | ||
right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] | ||
left_indexer[i] = bli[i] | ||
|
||
return left_indexer, right_indexer | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any reason not to pass a string here instead?