Skip to content

ENH: Added 'direction' parameter to merge_asof() (#14887) #15129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ Other enhancements
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)

- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`)
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)


.. _whatsnew_0200.api_breaking:
Expand Down
228 changes: 167 additions & 61 deletions pandas/src/joins_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,
ndarray[{{by_dtype}}] left_by_values,
ndarray[{{by_dtype}}] right_by_values,
bint allow_exact_matches=1,
tolerance=None):
tolerance=None,
int64_t direction_enum=0):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason not to pass a string here instead?


cdef:
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[int64_t] left_indexer, right_indexer
ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
bint has_tolerance = 0
{{on_dtype}} tolerance_
{{on_dtype}} diff, bdiff, fdiff
{{table_type}} hash_table
{{by_dtype}} by_value

Expand All @@ -56,37 +58,94 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,

hash_table = {{table_type}}(right_size)

right_pos = 0
for left_pos in range(left_size):
# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's value
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
else:
while right_pos < right_size and\
right_values[right_pos] < left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
right_pos -= 1

# save positions as the desired index
by_value = left_by_values[left_pos]
found_right_pos = hash_table.get_item(by_value)\
if by_value in hash_table else -1
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos

# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = left_values[left_pos] - right_values[found_right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
if direction_enum == 0: #backward
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, I think all of this can run under ``nogil``.

right_pos = 0
for left_pos in range(left_size):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we pull each of these directions out into separate functions (to make this top level a bit clearer)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I'll take a shot at that.

# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
else:
while right_pos < right_size and\
right_values[right_pos] < left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
right_pos -= 1

# save positions as the desired index
by_value = left_by_values[left_pos]
found_right_pos = hash_table.get_item(by_value)\
if by_value in hash_table else -1
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos

# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = left_values[left_pos] - right_values[found_right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
elif direction_enum == 1: # forward
right_pos = right_size - 1
for left_pos in range(left_size - 1, -1, -1):
# restart right_pos if it went over in a previous iteration
if right_pos == right_size:
right_pos = right_size - 1

# find first position in right whose value is greater than left's
if allow_exact_matches:
while right_pos >= 0 and\
right_values[right_pos] >= left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos -= 1
else:
while right_pos >= 0 and\
right_values[right_pos] > left_values[left_pos]:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos -= 1
right_pos += 1

# save positions as the desired index
by_value = left_by_values[left_pos]
found_right_pos = hash_table.get_item(by_value)\
if by_value in hash_table else -1
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos

# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = right_values[found_right_pos] - left_values[left_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
else: # nearest
# search both forward and backward
bli, bri = asof_join_{{on_dtype}}_by_{{by_dtype}}(left_values,
right_values,
left_by_values,
right_by_values,
allow_exact_matches,
tolerance, 0)
fli, fri = asof_join_{{on_dtype}}_by_{{by_dtype}}(left_values,
right_values,
left_by_values,
right_by_values,
allow_exact_matches,
tolerance, 1)

for i in range(len(bri)):
# choose timestamp from right with smaller difference
if bri[i] != -1 and fri[i] != -1:
bdiff = left_values[bli[i]] - right_values[bri[i]]
fdiff = right_values[fri[i]] - left_values[fli[i]]
right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
else:
right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
left_indexer[i] = bli[i]

return left_indexer, right_indexer

Expand All @@ -113,13 +172,15 @@ dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,
ndarray[{{on_dtype}}] right_values,
bint allow_exact_matches=1,
tolerance=None):
tolerance=None,
int64_t direction_enum=0):

cdef:
Py_ssize_t left_pos, right_pos, left_size, right_size
ndarray[int64_t] left_indexer, right_indexer
ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
bint has_tolerance = 0
{{on_dtype}} tolerance_
{{on_dtype}} diff, bdiff, fdiff

# if we are using tolerance, set our objects
if tolerance is not None:
Expand All @@ -132,32 +193,77 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,
left_indexer = np.empty(left_size, dtype=np.int64)
right_indexer = np.empty(left_size, dtype=np.int64)

right_pos = 0
for left_pos in range(left_size):
# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's value
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
right_pos += 1
else:
while right_pos < right_size and\
right_values[right_pos] < left_values[left_pos]:
right_pos += 1
right_pos -= 1

# save positions as the desired index
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = right_pos

# if needed, verify that tolerance is met
if has_tolerance and right_pos != -1:
diff = left_values[left_pos] - right_values[right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
if direction_enum == 0: # backward
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this repeated here (the actual searching part)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This search is in the non-by logic. The search above tracks the last-seen position for each element in the by column. (I assume you are asking why there are two asof_join_* functions.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh right, now I remember, you directly did the groupby one and the non-groupby.

OK, sure. That's all the more reason to have some helpers to avoid repeating things (which in and of itself is not a perf issue, but a readability one).

right_pos = 0
for left_pos in range(left_size):
# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
right_pos += 1
else:
while right_pos < right_size and\
right_values[right_pos] < left_values[left_pos]:
right_pos += 1
right_pos -= 1

# save positions as the desired index
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = right_pos

# if needed, verify that tolerance is met
if has_tolerance and right_pos != -1:
diff = left_values[left_pos] - right_values[right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
elif direction_enum == 1: # forward
right_pos = right_size - 1
for left_pos in range(left_size - 1, -1, -1):
# restart right_pos if it went over in a previous iteration
if right_pos == right_size:
right_pos = right_size - 1

# find first position in right whose value is greater than left's
if allow_exact_matches:
while right_pos >= 0 and\
right_values[right_pos] >= left_values[left_pos]:
right_pos -= 1
else:
while right_pos >= 0 and\
right_values[right_pos] > left_values[left_pos]:
right_pos -= 1
right_pos += 1

# save positions as the desired index
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = right_pos\
if right_pos != right_size else -1

# if needed, verify that tolerance is met
if has_tolerance and right_pos != right_size:
diff = right_values[right_pos] - left_values[left_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
else: # nearest
# search both forward and backward
bli, bri = asof_join_{{on_dtype}}(left_values, right_values,
allow_exact_matches, tolerance, 0)
fli, fri = asof_join_{{on_dtype}}(left_values, right_values,
allow_exact_matches, tolerance, 1)

for i in range(len(bri)):
# choose timestamp from right with smaller difference
if bri[i] != -1 and fri[i] != -1:
bdiff = left_values[bli[i]] - right_values[bri[i]]
fdiff = right_values[fri[i]] - left_values[fli[i]]
right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
else:
right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
left_indexer[i] = bli[i]

return left_indexer, right_indexer

Expand Down
Loading