
Commit 39bc70e

gh-97912: Avoid quadratic behavior when adding LOAD_FAST_CHECK (GH-97952)
* The compiler analyzes the usage of the first 64 local variables all at once using bit masks.
* Local variables beyond the first 64 are only partially analyzed, achieving linear time.
1 parent 6f15ca8 commit 39bc70e
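
For orientation, the effect this pass produces is easy to observe with dis. The snippet below is illustrative only (it is not part of the commit) and assumes a CPython where this analysis exists (3.12 or 3.13; opcode names may differ in later versions): a local that may be read before assignment compiles to LOAD_FAST_CHECK, while a definitely-bound local compiles to plain LOAD_FAST.

    import dis

    def f(flag):
        if flag:
            maybe_unbound = 1
        always_bound = 2
        print(always_bound)   # definitely assigned: plain LOAD_FAST
        print(maybe_unbound)  # may be unassigned: LOAD_FAST_CHECK

    # Opcode names as of CPython 3.12/3.13.
    ops = {(i.opname, i.argval) for i in dis.get_instructions(f)}
    assert ('LOAD_FAST', 'always_bound') in ops
    assert ('LOAD_FAST_CHECK', 'maybe_unbound') in ops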

File tree

3 files changed: +175 −64 lines

* Lib/test/test_peepholer.py
* Misc/NEWS.d/… (news entry)
* Python/compile.c

Lib/test/test_peepholer.py

Lines changed: 39 additions & 0 deletions
@@ -776,6 +776,45 @@ def f():
         self.assertInBytecode(f, 'LOAD_FAST_CHECK')
         self.assertNotInBytecode(f, 'LOAD_FAST')
 
+    def test_load_fast_too_many_locals(self):
+        # When there get to be too many locals to analyze completely,
+        # later locals are all converted to LOAD_FAST_CHECK, except
+        # when a store or prior load occurred in the same basicblock.
+        def f():
+            a00 = a01 = a02 = a03 = a04 = a05 = a06 = a07 = a08 = a09 = 1
+            a10 = a11 = a12 = a13 = a14 = a15 = a16 = a17 = a18 = a19 = 1
+            a20 = a21 = a22 = a23 = a24 = a25 = a26 = a27 = a28 = a29 = 1
+            a30 = a31 = a32 = a33 = a34 = a35 = a36 = a37 = a38 = a39 = 1
+            a40 = a41 = a42 = a43 = a44 = a45 = a46 = a47 = a48 = a49 = 1
+            a50 = a51 = a52 = a53 = a54 = a55 = a56 = a57 = a58 = a59 = 1
+            a60 = a61 = a62 = a63 = a64 = a65 = a66 = a67 = a68 = a69 = 1
+            a70 = a71 = a72 = a73 = a74 = a75 = a76 = a77 = a78 = a79 = 1
+            del a72, a73
+            print(a73)
+            print(a70, a71, a72, a73)
+            while True:
+                print(a00, a01, a62, a63)
+                print(a64, a65, a78, a79)
+
+        for i in 0, 1, 62, 63:
+            # First 64 locals: analyze completely
+            self.assertInBytecode(f, 'LOAD_FAST', f"a{i:02}")
+            self.assertNotInBytecode(f, 'LOAD_FAST_CHECK', f"a{i:02}")
+        for i in 64, 65, 78, 79:
+            # Locals >=64 not in the same basicblock
+            self.assertInBytecode(f, 'LOAD_FAST_CHECK', f"a{i:02}")
+            self.assertNotInBytecode(f, 'LOAD_FAST', f"a{i:02}")
+        for i in 70, 71:
+            # Locals >=64 in the same basicblock
+            self.assertInBytecode(f, 'LOAD_FAST', f"a{i:02}")
+            self.assertNotInBytecode(f, 'LOAD_FAST_CHECK', f"a{i:02}")
+        # del statements should invalidate within basicblocks.
+        self.assertInBytecode(f, 'LOAD_FAST_CHECK', "a72")
+        self.assertNotInBytecode(f, 'LOAD_FAST', "a72")
+        # previous checked loads within a basicblock enable unchecked loads
+        self.assertInBytecode(f, 'LOAD_FAST_CHECK', "a73")
+        self.assertInBytecode(f, 'LOAD_FAST', "a73")
+
     def test_setting_lineno_adds_check(self):
         code = textwrap.dedent("""\
             def f():
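
The assertions above rely on test.support.bytecode_helper. Outside the test suite, the same checks can be approximated with dis; uses_opcode below is a hypothetical helper written for this illustration, not a CPython API.

    import dis

    def uses_opcode(func, opname, argval=None):
        # True if func's bytecode contains opname (with that argval, if given).
        return any(ins.opname == opname and (argval is None or ins.argval == argval)
                   for ins in dis.get_instructions(func))

    # With f defined as in the test above:
    #   assert uses_opcode(f, 'LOAD_FAST_CHECK', 'a64')
    #   assert not uses_opcode(f, 'LOAD_FAST', 'a64')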
Misc/NEWS.d/… (news entry)

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+The compiler now avoids quadratic behavior when finding which instructions should use the :opcode:`LOAD_FAST_CHECK` opcode.
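
The quadratic behavior being fixed shows up as compile() time growing roughly with the square of the number of locals. A rough way to see the trend is to time compilation of generated functions with many locals (an illustrative micro-benchmark, not part of the commit; absolute numbers depend on the interpreter build):

    import time

    def make_src(nlocals):
        # One store plus one load per local, all reachable.
        lines = ['def f():']
        lines += [f'    v{i} = {i}' for i in range(nlocals)]
        lines += [f'    v{i} = v{i} + 1' for i in range(nlocals)]
        return '\n'.join(lines)

    for n in (1000, 2000, 4000):
        src = make_src(n)
        t0 = time.perf_counter()
        compile(src, '<generated>', 'exec')
        print(n, time.perf_counter() - t0)

Before this change, doubling n would roughly quadruple the time spent in this pass; after it, the growth is close to linear.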

Python/compile.c

Lines changed: 135 additions & 64 deletions
@@ -114,6 +114,13 @@
          (opcode) == RAISE_VARARGS || \
          (opcode) == RERAISE)
 
+#define IS_SUPERINSTRUCTION_OPCODE(opcode) \
+    ((opcode) == LOAD_FAST__LOAD_FAST || \
+     (opcode) == LOAD_FAST__LOAD_CONST || \
+     (opcode) == LOAD_CONST__LOAD_FAST || \
+     (opcode) == STORE_FAST__LOAD_FAST || \
+     (opcode) == STORE_FAST__STORE_FAST)
+
 #define IS_TOP_LEVEL_AWAIT(c) ( \
         (c->c_flags->cf_flags & PyCF_ALLOW_TOP_LEVEL_AWAIT) \
         && (c->u->u_ste->ste_type == ModuleBlock))
@@ -258,6 +265,8 @@ typedef struct basicblock_ {
     int b_iused;
     /* length of instruction array (b_instr) */
     int b_ialloc;
+    /* Used by add_checks_for_loads_of_unknown_variables */
+    uint64_t b_unsafe_locals_mask;
     /* Number of predecessors that a block has. */
     int b_predecessors;
     /* depth of stack upon entry of block, computed by stackdepth() */
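
The new field packs the "potentially uninitialized" state of the first 64 locals into a single integer, so one pass can track them all at once. A sketch of the bit operations involved, written in Python for brevity (the C code does the same with a uint64_t):

    mask = 0                    # bit i set <=> local i potentially uninitialized
    i = 5                       # index of some local variable
    mask |= 1 << i              # DELETE_FAST: local i may now be unbound
    assert mask & (1 << i)      # a LOAD_FAST of i here would need a check
    mask &= ~(1 << i)           # STORE_FAST (or a checked load): i is bound again
    assert not mask & (1 << i)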
@@ -8052,103 +8061,165 @@ assemble_jump_offsets(basicblock *entryblock)
 }
 
 
-// Ensure each basicblock is only put onto the stack once.
-#define MAYBE_PUSH(B) do { \
-        if ((B)->b_visited == 0) { \
-            *(*stack_top)++ = (B); \
-            (B)->b_visited = 1; \
-        } \
-    } while (0)
+// helper functions for add_checks_for_loads_of_unknown_variables
+static inline void
+maybe_push(basicblock *b, uint64_t unsafe_mask, basicblock ***sp)
+{
+    // Push b if the unsafe mask is giving us any new information.
+    // To avoid overflowing the stack, only allow each block once.
+    // Use b->b_visited=1 to mean that b is currently on the stack.
+    uint64_t both = b->b_unsafe_locals_mask | unsafe_mask;
+    if (b->b_unsafe_locals_mask != both) {
+        b->b_unsafe_locals_mask = both;
+        // More work left to do.
+        if (!b->b_visited) {
+            // not on the stack, so push it.
+            *(*sp)++ = b;
+            b->b_visited = 1;
+        }
+    }
+}
 
 static void
-scan_block_for_local(int target, basicblock *b, bool unsafe_to_start,
-                     basicblock ***stack_top)
+scan_block_for_locals(basicblock *b, basicblock ***sp)
 {
-    bool unsafe = unsafe_to_start;
+    // bit i is set if local i is potentially uninitialized
+    uint64_t unsafe_mask = b->b_unsafe_locals_mask;
     for (int i = 0; i < b->b_iused; i++) {
         struct instr *instr = &b->b_instr[i];
         assert(instr->i_opcode != EXTENDED_ARG);
         assert(instr->i_opcode != EXTENDED_ARG_QUICK);
-        assert(instr->i_opcode != LOAD_FAST__LOAD_FAST);
-        assert(instr->i_opcode != STORE_FAST__LOAD_FAST);
-        assert(instr->i_opcode != LOAD_CONST__LOAD_FAST);
-        assert(instr->i_opcode != STORE_FAST__STORE_FAST);
-        assert(instr->i_opcode != LOAD_FAST__LOAD_CONST);
-        if (unsafe && instr->i_except != NULL) {
-            MAYBE_PUSH(instr->i_except);
-        }
-        if (instr->i_oparg != target) {
+        assert(!IS_SUPERINSTRUCTION_OPCODE(instr->i_opcode));
+        if (instr->i_except != NULL) {
+            maybe_push(instr->i_except, unsafe_mask, sp);
+        }
+        if (instr->i_oparg >= 64) {
             continue;
         }
+        assert(instr->i_oparg >= 0);
+        uint64_t bit = (uint64_t)1 << instr->i_oparg;
         switch (instr->i_opcode) {
+            case DELETE_FAST:
+                unsafe_mask |= bit;
+                break;
+            case STORE_FAST:
+                unsafe_mask &= ~bit;
+                break;
             case LOAD_FAST_CHECK:
-                // if this doesn't raise, then var is defined
-                unsafe = false;
+                // If this doesn't raise, then the local is defined.
+                unsafe_mask &= ~bit;
                 break;
             case LOAD_FAST:
-                if (unsafe) {
+                if (unsafe_mask & bit) {
                     instr->i_opcode = LOAD_FAST_CHECK;
                 }
-                unsafe = false;
-                break;
-            case STORE_FAST:
-                unsafe = false;
-                break;
-            case DELETE_FAST:
-                unsafe = true;
+                unsafe_mask &= ~bit;
                 break;
         }
     }
-    if (unsafe) {
-        // unsafe at end of this block,
-        // so unsafe at start of next blocks
-        if (b->b_next && BB_HAS_FALLTHROUGH(b)) {
-            MAYBE_PUSH(b->b_next);
-        }
-        struct instr *last = basicblock_last_instr(b);
-        if (last != NULL) {
-            if (is_jump(last)) {
-                assert(last->i_target != NULL);
-                MAYBE_PUSH(last->i_target);
+    if (b->b_next && BB_HAS_FALLTHROUGH(b)) {
+        maybe_push(b->b_next, unsafe_mask, sp);
+    }
+    struct instr *last = basicblock_last_instr(b);
+    if (last && is_jump(last)) {
+        assert(last->i_target != NULL);
+        maybe_push(last->i_target, unsafe_mask, sp);
+    }
+}
+
+static int
+fast_scan_many_locals(basicblock *entryblock, int nlocals)
+{
+    assert(nlocals > 64);
+    Py_ssize_t *states = PyMem_Calloc(nlocals - 64, sizeof(Py_ssize_t));
+    if (states == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    Py_ssize_t blocknum = 0;
+    // state[i - 64] == blocknum if local i is guaranteed to
+    // be initialized, i.e., if it has had a previous LOAD_FAST or
+    // STORE_FAST within that basicblock (not followed by DELETE_FAST).
+    for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
+        blocknum++;
+        for (int i = 0; i < b->b_iused; i++) {
+            struct instr *instr = &b->b_instr[i];
+            assert(instr->i_opcode != EXTENDED_ARG);
+            assert(instr->i_opcode != EXTENDED_ARG_QUICK);
+            assert(!IS_SUPERINSTRUCTION_OPCODE(instr->i_opcode));
+            int arg = instr->i_oparg;
+            if (arg < 64) {
+                continue;
+            }
+            assert(arg >= 0);
+            switch (instr->i_opcode) {
+                case DELETE_FAST:
+                    states[arg - 64] = blocknum - 1;
+                    break;
+                case STORE_FAST:
+                    states[arg - 64] = blocknum;
+                    break;
+                case LOAD_FAST:
+                    if (states[arg - 64] != blocknum) {
+                        instr->i_opcode = LOAD_FAST_CHECK;
+                    }
+                    states[arg - 64] = blocknum;
+                    break;
+                case LOAD_FAST_CHECK:
+                    Py_UNREACHABLE();
             }
         }
     }
+    PyMem_Free(states);
+    return 0;
 }
-#undef MAYBE_PUSH
 
 static int
 add_checks_for_loads_of_uninitialized_variables(basicblock *entryblock,
                                                 struct compiler *c)
 {
+    int nlocals = (int)PyDict_GET_SIZE(c->u->u_varnames);
+    if (nlocals == 0) {
+        return 0;
+    }
+    if (nlocals > 64) {
+        // To avoid O(nlocals**2) compilation, locals beyond the first
+        // 64 are only analyzed one basicblock at a time: initialization
+        // info is not passed between basicblocks.
+        if (fast_scan_many_locals(entryblock, nlocals) < 0) {
+            return -1;
+        }
+        nlocals = 64;
+    }
     basicblock **stack = make_cfg_traversal_stack(entryblock);
     if (stack == NULL) {
         return -1;
     }
-    Py_ssize_t nparams = PyList_GET_SIZE(c->u->u_ste->ste_varnames);
-    int nlocals = (int)PyDict_GET_SIZE(c->u->u_varnames);
-    for (int target = 0; target < nlocals; target++) {
-        for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
-            b->b_visited = 0;
-        }
-        basicblock **stack_top = stack;
+    basicblock **sp = stack;
 
-        // First pass: find the relevant DFS starting points:
-        // the places where "being uninitialized" originates,
-        // which are the entry block and any DELETE_FAST statements.
-        if (target >= nparams) {
-            // only non-parameter locals start out uninitialized.
-            *(stack_top++) = entryblock;
-            entryblock->b_visited = 1;
-        }
-        for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
-            scan_block_for_local(target, b, false, &stack_top);
-        }
+    // First origin of being uninitialized:
+    // The non-parameter locals in the entry block.
+    int nparams = (int)PyList_GET_SIZE(c->u->u_ste->ste_varnames);
+    uint64_t start_mask = 0;
+    for (int i = nparams; i < nlocals; i++) {
+        start_mask |= (uint64_t)1 << i;
+    }
+    maybe_push(entryblock, start_mask, &sp);
 
-        // Second pass: Depth-first search to propagate uncertainty
-        while (stack_top > stack) {
-            basicblock *b = *--stack_top;
-            scan_block_for_local(target, b, true, &stack_top);
-        }
+    // Second origin of being uninitialized:
+    // There could be DELETE_FAST somewhere, so
+    // be sure to scan each basicblock at least once.
+    for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
+        scan_block_for_locals(b, &sp);
+    }
+
+    // Now propagate the uncertainty from the origins we found: Use
+    // LOAD_FAST_CHECK for any LOAD_FAST where the local could be undefined.
+    while (sp > stack) {
+        basicblock *b = *--sp;
+        // mark as no longer on stack
+        b->b_visited = 0;
+        scan_block_for_locals(b, &sp);
     }
     PyMem_Free(stack);
     return 0;
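
To pull the pieces together, here is a compact Python model of the pass as implemented above. It is a sketch, not the implementation: Block is a hypothetical stand-in for basicblock, instructions are mutable [opcode, oparg] pairs, only the first 64 locals are modeled (the C code routes the rest through fast_scan_many_locals), and successor handling is simplified (the C code pushes exception targets with the mask as of each instruction rather than the end-of-block mask).

    class Block:
        def __init__(self, instrs):
            self.instrs = instrs      # list of [opcode, oparg] pairs
            self.successors = []      # fallthrough / jump targets
            self.unsafe_mask = 0      # bit i set: local i may be unbound on entry
            self.on_stack = False

    def maybe_push(block, mask, stack):
        # Push only if the mask brings new information. Masks only ever
        # grow, so each block is pushed at most 64 times in total; this is
        # what replaces the old one-CFG-pass-per-local (quadratic) scheme.
        both = block.unsafe_mask | mask
        if both != block.unsafe_mask:
            block.unsafe_mask = both
            if not block.on_stack:
                block.on_stack = True
                stack.append(block)

    def scan(block, stack):
        mask = block.unsafe_mask
        for ins in block.instrs:
            op, arg = ins
            bit = 1 << arg
            if op == 'DELETE_FAST':
                mask |= bit                     # may be unbound from here on
            elif op in ('STORE_FAST', 'LOAD_FAST_CHECK'):
                mask &= ~bit                    # definitely bound from here on
            elif op == 'LOAD_FAST':
                if mask & bit:
                    ins[0] = 'LOAD_FAST_CHECK'  # the rewrite this pass exists for
                mask &= ~bit
        for succ in block.successors:
            maybe_push(succ, mask, stack)

    def add_checks(blocks, nparams, nlocals):
        # Origin 1: non-parameter locals are unbound at function entry.
        stack = []
        entry_mask = 0
        for i in range(nparams, nlocals):
            entry_mask |= 1 << i
        maybe_push(blocks[0], entry_mask, stack)
        # Origin 2: scan every block once so DELETE_FASTs are seen.
        for b in blocks:
            scan(b, stack)
        # Propagate to a fixed point.
        while stack:
            b = stack.pop()
            b.on_stack = False
            scan(b, stack)

A block is re-scanned only when an incoming edge adds new bits to its mask, so total work is bounded by 64 re-scans per block rather than one full traversal per local.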
