Skip to content

Commit d003266

Browse files
tomMoral and ogrisel authored
ENH some steps to make cloudpickle dynamic function/classes more deterministic (#524)
Co-authored-by: Olivier Grisel <[email protected]>
1 parent 25aef95 commit d003266

File tree

13 files changed

+273
-49
lines changed

13 files changed

+273
-49
lines changed

CHANGES.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
3.1.0 (in development)
22
======================
33

4+
- Some improvements to make cloudpickle more deterministic when pickling
5+
dynamic functions and classes.
6+
([PR #524](https://github.com/cloudpipe/cloudpickle/pull/524))
7+
48

59
3.0.0
610
=====

ci/install_coverage_subprocess_pth.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
import coverage; coverage.process_startup()
1010
"""
1111

12-
filename = op.join(get_path('purelib'), 'coverage_subprocess.pth')
13-
with open(filename, 'wb') as f:
14-
f.write(FILE_CONTENT.encode('ascii'))
12+
filename = op.join(get_path("purelib"), "coverage_subprocess.pth")
13+
with open(filename, "wb") as f:
14+
f.write(FILE_CONTENT.encode("ascii"))
1515

16-
print('Installed subprocess coverage support: %s' % filename)
16+
print("Installed subprocess coverage support: %s" % filename)

cloudpickle/cloudpickle.py

+62-24
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def):
126126

127127

128128
def register_pickle_by_value(module):
129-
"""Register a module to make it functions and classes picklable by value.
129+
"""Register a module to make its functions and classes picklable by value.
130130
131131
By default, functions and classes that are attributes of an importable
132132
module are to be pickled by reference, that is relying on re-importing
@@ -369,7 +369,7 @@ def func():
369369
# sys.modules.
370370
if name is not None and name.startswith(prefix):
371371
# check whether the function can address the sub-module
372-
tokens = set(name[len(prefix) :].split("."))
372+
tokens = set(name[len(prefix):].split("."))
373373
if not tokens - set(code.co_names):
374374
subimports.append(sys.modules[name])
375375
return subimports
@@ -409,7 +409,10 @@ def _walk_global_ops(code):
409409

410410
def _extract_class_dict(cls):
411411
"""Retrieve a copy of the dict of a class without the inherited method."""
412-
clsdict = dict(cls.__dict__) # copy dict proxy to a dict
412+
# Hack to circumvent non-predictable memoization caused by string interning.
413+
# See the inline comment in _class_setstate for details.
414+
clsdict = {"".join(k): cls.__dict__[k] for k in sorted(cls.__dict__)}
415+
413416
if len(cls.__bases__) == 1:
414417
inherited_dict = cls.__bases__[0].__dict__
415418
else:
@@ -533,9 +536,15 @@ class id will also reuse this class definition.
533536
The "extra" variable is meant to be a dict (or None) that can be used for
534537
forward compatibility shall the need arise.
535538
"""
539+
# We need to intern the keys of the type_kwargs dict to avoid having
540+
# different pickles for the same dynamic class depending on whether it was
541+
# dynamically created or reconstructed from a pickled stream.
542+
type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()}
543+
536544
skeleton_class = types.new_class(
537545
name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs)
538546
)
547+
539548
return _lookup_class_or_track(class_tracker_id, skeleton_class)
540549

541550

@@ -694,7 +703,9 @@ def _function_getstate(func):
694703
# unpickling time by iterating over slotstate and calling setattr(func,
695704
# slotname, slotvalue)
696705
slotstate = {
697-
"__name__": func.__name__,
706+
# Hack to circumvent non-predictable memoization caused by string interning.
707+
# See the inline comment in _class_setstate for details.
708+
"__name__": "".join(func.__name__),
698709
"__qualname__": func.__qualname__,
699710
"__annotations__": func.__annotations__,
700711
"__kwdefaults__": func.__kwdefaults__,
@@ -721,7 +732,9 @@ def _function_getstate(func):
721732
)
722733
slotstate["__globals__"] = f_globals
723734

724-
state = func.__dict__
735+
# Hack to circumvent non-predictable memoization caused by string interning.
736+
# See the inline comment in _class_setstate for details.
737+
state = {"".join(k): v for k, v in func.__dict__.items()}
725738
return state, slotstate
726739

727740

@@ -802,6 +815,19 @@ def _code_reduce(obj):
802815
# of the specific type from types, for example:
803816
# >>> from types import CodeType
804817
# >>> help(CodeType)
818+
819+
# Hack to circumvent non-predictable memoization caused by string interning.
820+
# See the inline comment in _class_setstate for details.
821+
co_name = "".join(obj.co_name)
822+
823+
# Create shallow copies of these tuple to make cloudpickle payload deterministic.
824+
# When creating a code object during load, copies of these four tuples are
825+
# created, while in the main process, these tuples can be shared.
826+
# By always creating copies, we make sure the resulting payload is deterministic.
827+
co_names = tuple(name for name in obj.co_names)
828+
co_varnames = tuple(name for name in obj.co_varnames)
829+
co_freevars = tuple(name for name in obj.co_freevars)
830+
co_cellvars = tuple(name for name in obj.co_cellvars)
805831
if hasattr(obj, "co_exceptiontable"):
806832
# Python 3.11 and later: there are some new attributes
807833
# related to the enhanced exceptions.
@@ -814,16 +840,16 @@ def _code_reduce(obj):
814840
obj.co_flags,
815841
obj.co_code,
816842
obj.co_consts,
817-
obj.co_names,
818-
obj.co_varnames,
843+
co_names,
844+
co_varnames,
819845
obj.co_filename,
820-
obj.co_name,
846+
co_name,
821847
obj.co_qualname,
822848
obj.co_firstlineno,
823849
obj.co_linetable,
824850
obj.co_exceptiontable,
825-
obj.co_freevars,
826-
obj.co_cellvars,
851+
co_freevars,
852+
co_cellvars,
827853
)
828854
elif hasattr(obj, "co_linetable"):
829855
# Python 3.10 and later: obj.co_lnotab is deprecated and constructor
@@ -837,14 +863,14 @@ def _code_reduce(obj):
837863
obj.co_flags,
838864
obj.co_code,
839865
obj.co_consts,
840-
obj.co_names,
841-
obj.co_varnames,
866+
co_names,
867+
co_varnames,
842868
obj.co_filename,
843-
obj.co_name,
869+
co_name,
844870
obj.co_firstlineno,
845871
obj.co_linetable,
846-
obj.co_freevars,
847-
obj.co_cellvars,
872+
co_freevars,
873+
co_cellvars,
848874
)
849875
elif hasattr(obj, "co_nmeta"): # pragma: no cover
850876
# "nogil" Python: modified attributes from 3.9
@@ -859,15 +885,15 @@ def _code_reduce(obj):
859885
obj.co_flags,
860886
obj.co_code,
861887
obj.co_consts,
862-
obj.co_varnames,
888+
co_varnames,
863889
obj.co_filename,
864-
obj.co_name,
890+
co_name,
865891
obj.co_firstlineno,
866892
obj.co_lnotab,
867893
obj.co_exc_handlers,
868894
obj.co_jump_table,
869-
obj.co_freevars,
870-
obj.co_cellvars,
895+
co_freevars,
896+
co_cellvars,
871897
obj.co_free2reg,
872898
obj.co_cell2reg,
873899
)
@@ -882,14 +908,14 @@ def _code_reduce(obj):
882908
obj.co_flags,
883909
obj.co_code,
884910
obj.co_consts,
885-
obj.co_names,
886-
obj.co_varnames,
911+
co_names,
912+
co_varnames,
887913
obj.co_filename,
888-
obj.co_name,
914+
co_name,
889915
obj.co_firstlineno,
890916
obj.co_lnotab,
891-
obj.co_freevars,
892-
obj.co_cellvars,
917+
co_freevars,
918+
co_cellvars,
893919
)
894920
return types.CodeType, args
895921

@@ -1127,6 +1153,18 @@ def _class_setstate(obj, state):
11271153
if attrname == "_abc_impl":
11281154
registry = attr
11291155
else:
1156+
# Note: setting attribute names on a class automatically triggers their
1157+
# interning in CPython:
1158+
# https://github.com/python/cpython/blob/v3.12.0/Objects/object.c#L957
1159+
#
1160+
# This means that to get deterministic pickling for a dynamic class that
1161+
# was initially defined in a different Python process, the pickler
1162+
# needs to ensure that dynamic class and function attribute names are
1163+
# systematically copied into a non-interned version to avoid
1164+
# unpredictable pickle payloads.
1165+
#
1166+
# Indeed the Pickler's memoizer relies on physical object identity to break
1167+
# cycles in the reference graph of the object being serialized.
11301168
setattr(obj, attrname, attr)
11311169
if registry is not None:
11321170
for subclass in registry:

cloudpickle/cloudpickle_fast.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
See: tests/test_backward_compat.py
88
"""
9+
910
from . import cloudpickle
1011

1112

tests/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import pytest
2+
3+
pytest.register_assert_rewrite("tests.testutils")

0 commit comments

Comments
 (0)