Skip to content

DataFrame.join Copy-on-Write optimization tests #52751

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 21, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,93 @@ def test_merge_copy_keyword(using_copy_on_write, copy):
else:
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))


def test_join_on_key(using_copy_on_write):
"""Test if DataFrame.join applies Copy-On-Write optimization.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove the comment? We generally don't add comments in tests


GIVEN two DataFrame instances
WHEN DataFrame.join is called for one of them
THEN check that the result DataFrame instance
shares the same memory with original dataframes until it is edited.
"""
df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you define index=Index(["a", "b", "c"], name="key") instead of using it as a column? We always try to create a test with the least number of operations possible

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, thank you for the review. Implemented changes based on the comments.

df1_orig = df1.copy()
df2_orig = df2.copy()

result = df1.join(df2.set_index("key"), on="key")

if using_copy_on_write:
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
assert np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
else:
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

result.iloc[0, 1] = 0
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))

result.iloc[0, 2] = 0
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
tm.assert_frame_equal(df1, df1_orig)
tm.assert_frame_equal(df2, df2_orig)


def test_join_multiple_dataframes_on_key(using_copy_on_write):
    """Check memory sharing when DataFrame.join receives a list of frames.

    Three frames indexed by ``key`` are joined. With ``using_copy_on_write``
    set, every result column must share memory with the frame it came from
    until that column is written to through ``iloc``; without it, no column
    may share memory right after the join. In both modes the input frames
    must still equal their pre-join copies at the end.
    """

    def keyed_frame(column, values):
        # Same construction for every input: a "key" column promoted to index.
        return DataFrame({"key": ["a", "b", "c"], column: values}).set_index("key")

    left = keyed_frame("a", [1, 2, 3])
    others = [keyed_frame("b", [4, 5, 6]), keyed_frame("c", [7, 8, 9])]
    left_before = left.copy()
    others_before = [frame.copy() for frame in others]

    joined = left.join(others)

    # Source frame of each result column, listed in result column order so the
    # positions line up with the ``iloc`` writes below.
    sources = {"a": left, "b": others[0], "c": others[1]}
    columns = list(sources)

    def shares(column):
        # Looked up at call time, so it reflects writes made to ``joined``.
        return np.shares_memory(
            get_array(joined, column), get_array(sources[column], column)
        )

    if using_copy_on_write:
        for column in columns:
            assert shares(column)
        assert np.shares_memory(get_array(joined.index), get_array(left.index))
        for other in others:
            assert not np.shares_memory(
                get_array(joined.index), get_array(other.index)
            )
    else:
        for column in columns:
            assert not shares(column)

    for position, written in enumerate(columns):
        joined.iloc[0, position] = 0
        if not using_copy_on_write:
            continue
        # Only the column just written may have been detached from its source;
        # columns not yet written must still be shared.
        assert not shares(written)
        for untouched in columns[position + 1 :]:
            assert shares(untouched)

    tm.assert_frame_equal(left, left_before)
    for frame, frame_before in zip(others, others_before):
        tm.assert_frame_equal(frame, frame_before)