-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
DataFrame.join Copy-on-Write optimization tests #52751
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
ecf04f1
1497064
4c6c1bf
967c46c
481ab5e
c4e7f2a
7f6635c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -310,3 +310,93 @@ def test_merge_copy_keyword(using_copy_on_write, copy): | |
else: | ||
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) | ||
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) | ||
|
||
|
||
def test_join_on_key(using_copy_on_write):
    """Check memory sharing of ``DataFrame.join`` on a key column.

    ``df1`` is joined with ``df2.set_index("key")`` using ``on="key"``.

    When ``using_copy_on_write`` is true, the result's ``"a"`` and ``"key"``
    columns are asserted to share memory with ``df1`` and ``"b"`` with
    ``df2``; each of ``"a"`` and ``"b"`` is asserted to stop sharing only
    after the result is written to in that column. Otherwise ``"a"`` and
    ``"b"`` are asserted not to share memory with the inputs at all.

    In both modes the input frames must compare equal to the copies taken
    before the join.
    """
    df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
    df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
    # Review thread recorded here on the PR page: the reviewer's request
    # ("Can you define ...") is truncated in the source; the author replied
    # that the requested changes were implemented.

    df1_orig = df1.copy()
    df2_orig = df2.copy()

    result = df1.join(df2.set_index("key"), on="key")

    if using_copy_on_write:
        assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
        assert np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
        assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
    else:
        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

    # Column position 1 of the result is "a": writing there must detach only "a".
    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

    # Column position 2 of the result is "b": writing there must detach "b".
    result.iloc[0, 2] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

    # Checked in both modes: writes to the result must never reach the inputs.
    tm.assert_frame_equal(df1, df1_orig)
    tm.assert_frame_equal(df2, df2_orig)
|
||
|
||
def test_join_multiple_dataframes_on_key(using_copy_on_write):
    """Check memory sharing of ``DataFrame.join`` with a list of frames.

    ``df1`` (indexed by ``"key"``) is joined with a list of two frames that
    are also indexed by ``"key"``.

    When ``using_copy_on_write`` is true, the result's ``"a"``, ``"b"`` and
    ``"c"`` columns are asserted to share memory with their source frames,
    the result index with ``df1``'s index only, and each column is asserted
    to stop sharing only after the result is written to in that column.
    Otherwise none of the three columns may share memory with the inputs.

    In both modes every input frame must compare equal to the copy taken
    before the join.
    """
    df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}).set_index("key")
    dfs_list = [
        DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}).set_index("key"),
        DataFrame({"key": ["a", "b", "c"], "c": [7, 8, 9]}).set_index("key"),
    ]
    # Local aliases for the two joined frames; these are the same objects as
    # dfs_list[0] and dfs_list[1].
    df_b, df_c = dfs_list

    df1_orig = df1.copy()
    dfs_list_orig = [df.copy() for df in dfs_list]

    result = df1.join(dfs_list)

    if using_copy_on_write:
        assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert np.shares_memory(get_array(result, "b"), get_array(df_b, "b"))
        assert np.shares_memory(get_array(result, "c"), get_array(df_c, "c"))
        assert np.shares_memory(get_array(result.index), get_array(df1.index))
        assert not np.shares_memory(get_array(result.index), get_array(df_b.index))
        assert not np.shares_memory(get_array(result.index), get_array(df_c.index))
    else:
        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert not np.shares_memory(get_array(result, "b"), get_array(df_b, "b"))
        assert not np.shares_memory(get_array(result, "c"), get_array(df_c, "c"))

    # Write to "a" (position 0): only "a" may detach from its source.
    result.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
        assert np.shares_memory(get_array(result, "b"), get_array(df_b, "b"))
        assert np.shares_memory(get_array(result, "c"), get_array(df_c, "c"))

    # Write to "b" (position 1): "b" detaches, "c" is still shared.
    result.iloc[0, 1] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "b"), get_array(df_b, "b"))
        assert np.shares_memory(get_array(result, "c"), get_array(df_c, "c"))

    # Write to "c" (position 2): the last shared column detaches.
    result.iloc[0, 2] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "c"), get_array(df_c, "c"))

    # Checked in both modes: writes to the result must never reach the inputs.
    tm.assert_frame_equal(df1, df1_orig)
    for df, df_orig in zip(dfs_list, dfs_list_orig):
        tm.assert_frame_equal(df, df_orig)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remove the comment? We generally don't add comments in tests.