Skip to content

GH-106176, GH-104702: Fix reference leak when importing across multiple threads #108497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 128 additions & 12 deletions Lib/importlib/_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,132 @@ def _new_module(name):

# Module-level locking ########################################################

# A dict mapping module names to weakrefs of _ModuleLock instances
# Dictionary protected by the global import lock
# For a list that can have a weakref to it.
class List(list):
pass
Copy link
Member

@Yhg1s Yhg1s Aug 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I think it would be nice to use __slots__ for this class, to reduce the size of each instance (especially since the setdefault call creates one each time, even when it isn't necessary).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As in you're going to make a PR to make that change, or you're hoping someone else will open an issue on your behalf to track the idea?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latter (although not on my behalf) -- I don't know if there's a reason not to do it, and I don't have the spare cycles to dig in and find out.



# From weakref.py with some simplifications.
class WeakValueDictionary:

def __init__(self):
self_weakref = _weakref.ref(self)

# Inlined to avoid issues with inheriting from _weakref before it's
# set. Since there's only one instance of this class, this is
# not expensive.
class KeyedRef(_weakref.ref):

__slots__ = "key",

def __new__(type, ob, key):
self = super().__new__(type, ob, type.remove)
self.key = key
return self

def __init__(self, ob, key):
super().__init__(ob, self.remove)

@staticmethod
def remove(wr):
nonlocal self_weakref

self = self_weakref()
if self is not None:
if self._iterating:
self._pending_removals.append(wr.key)
else:
# Atomic removal is necessary since this function
# can be called asynchronously by the GC
_weakref._remove_dead_weakref(self.data, wr.key)

self._KeyedRef = KeyedRef
self.clear()

def clear(self):
self._pending_removals = []
self._iterating = set()
self.data = {}

def _commit_removals(self):
pop = self._pending_removals.pop
d = self.data
# We shouldn't encounter any KeyError, because this method should
# always be called *before* mutating the dict.
while True:
try:
key = pop()
except IndexError:
return
_weakref._remove_dead_weakref(d, key)

def __getitem__(self, key):
if self._pending_removals:
self._commit_removals()
o = self.data[key]()
if o is None:
raise KeyError(key)
else:
return o

def __delitem__(self, key):
if self._pending_removals:
self._commit_removals()
del self.data[key]

def __repr__(self):
return "<%s at %#x>" % (self.__class__.__name__, id(self))

def __setitem__(self, key, value):
if self._pending_removals:
self._commit_removals()
self.data[key] = self._KeyedRef(value, key)

def get(self, key, default=None):
if self._pending_removals:
self._commit_removals()
try:
wr = self.data[key]
except KeyError:
return default
else:
o = wr()
if o is None:
# This should only happen
return default
else:
return o

def setdefault(self, key, default=None):
try:
o = self.data[key]()
except KeyError:
o = None
if o is None:
if self._pending_removals:
self._commit_removals()
self.data[key] = self._KeyedRef(default, key)
return default
else:
return o


# A dict mapping module names to weakrefs of _ModuleLock instances.
# Dictionary protected by the global import lock.
_module_locks = {}

# A dict mapping thread IDs to lists of _ModuleLock instances. This maps a
# thread to the module locks it is blocking on acquiring. The values are
# lists because a single thread could perform a re-entrant import and be "in
# the process" of blocking on locks for more than one module. A thread can
# be "in the process" because a thread cannot actually block on acquiring
# more than one lock but it can have set up bookkeeping that reflects that
# it intends to block on acquiring more than one lock.
_blocking_on = {}
# A dict mapping thread IDs to weakref'ed lists of _ModuleLock instances.
# This maps a thread to the module locks it is blocking on acquiring. The
# values are lists because a single thread could perform a re-entrant import
# and be "in the process" of blocking on locks for more than one module. A
# thread can be "in the process" because a thread cannot actually block on
# acquiring more than one lock but it can have set up bookkeeping that reflects
# that it intends to block on acquiring more than one lock.
#
# The dictionary uses a WeakValueDictionary to avoid keeping unnecessary
# lists around, regardless of GC runs. This way there's no memory leak if
# the list is no longer needed.
_blocking_on = None


class _BlockingOnManager:
Expand All @@ -79,7 +193,7 @@ def __enter__(self):
# re-entrant (i.e., a single thread may take it more than once) so it
# wouldn't help us be correct in the face of re-entrancy either.

self.blocked_on = _blocking_on.setdefault(self.thread_id, [])
self.blocked_on = _blocking_on.setdefault(self.thread_id, List())
self.blocked_on.append(self.lock)

def __exit__(self, *args, **kwargs):
Expand Down Expand Up @@ -1409,7 +1523,7 @@ def _setup(sys_module, _imp_module):
modules, those two modules must be explicitly passed in.

"""
global _imp, sys
global _imp, sys, _blocking_on
_imp = _imp_module
sys = sys_module

Expand Down Expand Up @@ -1437,6 +1551,8 @@ def _setup(sys_module, _imp_module):
builtin_module = sys.modules[builtin_name]
setattr(self_module, builtin_name, builtin_module)

_blocking_on = WeakValueDictionary()


def _install(sys_module, _imp_module):
"""Install importers for builtin and frozen modules"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Use a ``WeakValueDictionary`` to track the lists containing the modules each
thread is currently importing. This helps avoid a reference leak from
keeping the list around longer than necessary. Weakrefs are used as GC can't
interrupt the cleanup.