-
Notifications
You must be signed in to change notification settings - Fork 180
ENH some steps to make cloudpickle dynamic function/classes more deterministic #524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3cfaf20
0018fac
edbd021
f16a63f
124826b
00febd0
9e2325c
9e7fa91
7eec003
e90c088
cadb9f2
252989c
9510a0e
5e6314d
8df4ceb
5ed2b21
32191f4
2d3e917
9cb6bfc
e18678f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def): | |
|
||
|
||
def register_pickle_by_value(module): | ||
"""Register a module to make it functions and classes picklable by value. | ||
"""Register a module to make its functions and classes picklable by value. | ||
|
||
By default, functions and classes that are attributes of an importable | ||
module are to be pickled by reference, that is relying on re-importing | ||
|
@@ -369,7 +369,7 @@ def func(): | |
# sys.modules. | ||
if name is not None and name.startswith(prefix): | ||
# check whether the function can address the sub-module | ||
tokens = set(name[len(prefix) :].split(".")) | ||
tokens = set(name[len(prefix):].split(".")) | ||
if not tokens - set(code.co_names): | ||
subimports.append(sys.modules[name]) | ||
return subimports | ||
|
@@ -409,7 +409,10 @@ def _walk_global_ops(code): | |
|
||
def _extract_class_dict(cls): | ||
"""Retrieve a copy of the dict of a class without the inherited method.""" | ||
clsdict = dict(cls.__dict__) # copy dict proxy to a dict | ||
# Hack to circumvent non-predictable memoization caused by string interning. | ||
# See the inline comment in _class_setstate for details. | ||
clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)} | ||
|
||
if len(cls.__bases__) == 1: | ||
inherited_dict = cls.__bases__[0].__dict__ | ||
else: | ||
|
@@ -533,9 +536,15 @@ class id will also reuse this class definition. | |
The "extra" variable is meant to be a dict (or None) that can be used for | ||
forward compatibility shall the need arise. | ||
""" | ||
# We need to intern the keys of the type_kwargs dict to avoid having | ||
# different pickles for the same dynamic class depending on whether it was | ||
# dynamically created or reconstructed from a pickled stream. | ||
type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()} | ||
|
||
skeleton_class = types.new_class( | ||
name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs) | ||
) | ||
|
||
return _lookup_class_or_track(class_tracker_id, skeleton_class) | ||
|
||
|
||
|
@@ -694,7 +703,9 @@ def _function_getstate(func): | |
# unpickling time by iterating over slotstate and calling setattr(func, | ||
# slotname, slotvalue) | ||
slotstate = { | ||
"__name__": func.__name__, | ||
# Hack to circumvent non-predictable memoization caused by string interning. | ||
# See the inline comment in _class_setstate for details. | ||
"__name__": "".join(func.__name__), | ||
"__qualname__": func.__qualname__, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any idea why we don't need this treatment for the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, at least for the latter, there are still things to solver. I will push a slight change to the test to show what I mean. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issue with string interning is mostly due to collision between the string defined in two different places: typically method names that are interned as class attributes, vs the dynamic func name. |
||
"__annotations__": func.__annotations__, | ||
"__kwdefaults__": func.__kwdefaults__, | ||
|
@@ -721,7 +732,9 @@ def _function_getstate(func): | |
) | ||
slotstate["__globals__"] = f_globals | ||
|
||
state = func.__dict__ | ||
# Hack to circumvent non-predictable memoization caused by string interning. | ||
# See the inline comment in _class_setstate for details. | ||
state = {"".join(k): v for k, v in func.__dict__.items()} | ||
return state, slotstate | ||
|
||
|
||
|
@@ -802,6 +815,19 @@ def _code_reduce(obj): | |
# of the specific type from types, for example: | ||
# >>> from types import CodeType | ||
# >>> help(CodeType) | ||
|
||
# Hack to circumvent non-predictable memoization caused by string interning. | ||
# See the inline comment in _class_setstate for details. | ||
co_name = "".join(obj.co_name) | ||
|
||
# Create shallow copies of these tuple to make cloudpickle payload deterministic. | ||
# When creating a code object during load, copies of these four tuples are | ||
# created, while in the main process, these tuples can be shared. | ||
# By always creating copies, we make sure the resulting payload is deterministic. | ||
co_names = tuple(name for name in obj.co_names) | ||
co_varnames = tuple(name for name in obj.co_varnames) | ||
co_freevars = tuple(name for name in obj.co_freevars) | ||
co_cellvars = tuple(name for name in obj.co_cellvars) | ||
if hasattr(obj, "co_exceptiontable"): | ||
# Python 3.11 and later: there are some new attributes | ||
# related to the enhanced exceptions. | ||
|
@@ -814,16 +840,16 @@ def _code_reduce(obj): | |
obj.co_flags, | ||
obj.co_code, | ||
obj.co_consts, | ||
obj.co_names, | ||
obj.co_varnames, | ||
co_names, | ||
co_varnames, | ||
obj.co_filename, | ||
obj.co_name, | ||
co_name, | ||
obj.co_qualname, | ||
obj.co_firstlineno, | ||
obj.co_linetable, | ||
obj.co_exceptiontable, | ||
obj.co_freevars, | ||
obj.co_cellvars, | ||
co_freevars, | ||
co_cellvars, | ||
) | ||
elif hasattr(obj, "co_linetable"): | ||
# Python 3.10 and later: obj.co_lnotab is deprecated and constructor | ||
|
@@ -837,14 +863,14 @@ def _code_reduce(obj): | |
obj.co_flags, | ||
obj.co_code, | ||
obj.co_consts, | ||
obj.co_names, | ||
obj.co_varnames, | ||
co_names, | ||
co_varnames, | ||
obj.co_filename, | ||
obj.co_name, | ||
co_name, | ||
obj.co_firstlineno, | ||
obj.co_linetable, | ||
obj.co_freevars, | ||
obj.co_cellvars, | ||
co_freevars, | ||
co_cellvars, | ||
) | ||
elif hasattr(obj, "co_nmeta"): # pragma: no cover | ||
# "nogil" Python: modified attributes from 3.9 | ||
|
@@ -859,15 +885,15 @@ def _code_reduce(obj): | |
obj.co_flags, | ||
obj.co_code, | ||
obj.co_consts, | ||
obj.co_varnames, | ||
co_varnames, | ||
obj.co_filename, | ||
obj.co_name, | ||
co_name, | ||
obj.co_firstlineno, | ||
obj.co_lnotab, | ||
obj.co_exc_handlers, | ||
obj.co_jump_table, | ||
obj.co_freevars, | ||
obj.co_cellvars, | ||
co_freevars, | ||
co_cellvars, | ||
obj.co_free2reg, | ||
obj.co_cell2reg, | ||
) | ||
|
@@ -882,14 +908,14 @@ def _code_reduce(obj): | |
obj.co_flags, | ||
obj.co_code, | ||
obj.co_consts, | ||
obj.co_names, | ||
obj.co_varnames, | ||
co_names, | ||
co_varnames, | ||
obj.co_filename, | ||
obj.co_name, | ||
co_name, | ||
obj.co_firstlineno, | ||
obj.co_lnotab, | ||
obj.co_freevars, | ||
obj.co_cellvars, | ||
co_freevars, | ||
co_cellvars, | ||
) | ||
return types.CodeType, args | ||
|
||
|
@@ -1127,6 +1153,18 @@ def _class_setstate(obj, state): | |
if attrname == "_abc_impl": | ||
registry = attr | ||
else: | ||
# Note: setting attribute names on a class automatically triggers their | ||
# interning in CPython: | ||
# https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957 | ||
# | ||
# This means that to get deterministic pickling for a dynamic class that | ||
# was initially defined in a different Python process, the pickler | ||
# needs to ensure that dynamic class and function attribute names are | ||
# systematically copied into a non-interned version to avoid | ||
# unpredictable pickle payloads. | ||
# | ||
# Indeed the Pickler's memoizer relies on physical object identity to break | ||
# cycles in the reference graph of the object being serialized. | ||
setattr(obj, attrname, attr) | ||
if registry is not None: | ||
for subclass in registry: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
See: tests/test_backward_compat.py | ||
""" | ||
|
||
from . import cloudpickle | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import pytest | ||
|
||
pytest.register_assert_rewrite("tests.testutils") |
Uh oh!
There was an error while loading. Please reload this page.