Skip to content

Commit d78c872

Browse files
authored
GH-111646: Simplify optimizer, by compacting uops when making executor. (GH-111647)
1 parent c8faa35 commit d78c872

File tree

2 files changed

+119
-117
lines changed

2 files changed

+119
-117
lines changed

Python/optimizer.c

Lines changed: 87 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
384384
.tp_methods = executor_methods,
385385
};
386386

387-
static int
388-
move_stubs(
389-
_PyUOpInstruction *trace,
390-
int trace_length,
391-
int stubs_start,
392-
int stubs_end
393-
)
394-
{
395-
memmove(trace + trace_length,
396-
trace + stubs_start,
397-
(stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
398-
// Patch up the jump targets
399-
for (int i = 0; i < trace_length; i++) {
400-
if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
401-
trace[i].opcode == _POP_JUMP_IF_TRUE)
402-
{
403-
int target = trace[i].oparg;
404-
if (target >= stubs_start) {
405-
target += trace_length - stubs_start;
406-
trace[i].oparg = target;
407-
}
408-
}
409-
}
410-
return trace_length + stubs_end - stubs_start;
411-
}
412-
413387
#define TRACE_STACK_SIZE 5
414388

389+
/* Returns 1 on success,
390+
* 0 if it failed to produce a worthwhile trace,
391+
* and -1 on an error.
392+
*/
415393
static int
416394
translate_bytecode_to_trace(
417395
PyCodeObject *code,
@@ -790,7 +768,7 @@ translate_bytecode_to_trace(
790768
}
791769
assert(code == initial_code);
792770
// Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
793-
if (trace_length > 3) {
771+
if (trace_length > 4) {
794772
ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
795773
DPRINTF(1,
796774
"Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ translate_bytecode_to_trace(
800778
2 * INSTR_IP(initial_instr, code),
801779
trace_length,
802780
buffer_size - max_length);
803-
if (max_length < buffer_size) {
804-
// There are stubs
805-
if (trace_length < max_length) {
806-
// There's a gap before the stubs
807-
// Move the stubs back to be immediately after the main trace
808-
// (which ends at trace_length)
809-
DPRINTF(2,
810-
"Moving %d stub uops back by %d\n",
811-
buffer_size - max_length,
812-
max_length - trace_length);
813-
trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
814-
}
815-
else {
816-
assert(trace_length == max_length);
817-
// There's no gap
818-
trace_length = buffer_size;
819-
}
820-
}
821-
return trace_length;
781+
OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
782+
return 1;
822783
}
823784
else {
824785
OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ translate_bytecode_to_trace(
838799
#undef DPRINTF
839800
}
840801

802+
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
803+
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
804+
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
805+
806+
/* Count the number of used uops, and mark them in the bit vector `used`.
807+
* This can be done in a single pass using simple reachability analysis,
808+
* as there are no backward jumps.
809+
* NOPs are excluded from the count.
810+
*/
841811
static int
842-
remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
812+
compute_used(_PyUOpInstruction *buffer, uint32_t *used)
843813
{
844-
// Stage 1: Replace unneeded _SET_IP uops with NOP.
845-
// Note that we don't enter stubs, those SET_IPs are needed.
846-
int last_set_ip = -1;
847-
int last_instr = 0;
848-
bool need_ip = true;
849-
for (int pc = 0; pc < trace_length; pc++) {
850-
int opcode = trace[pc].opcode;
851-
if (opcode == _SET_IP) {
852-
if (!need_ip && last_set_ip >= 0) {
853-
trace[last_set_ip].opcode = NOP;
854-
}
855-
need_ip = false;
856-
last_set_ip = pc;
814+
int count = 0;
815+
SET_BIT(used, 0);
816+
for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
817+
if (!BIT_IS_SET(used, i)) {
818+
continue;
857819
}
858-
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
859-
last_instr = pc + 1;
860-
break;
820+
count++;
821+
int opcode = buffer[i].opcode;
822+
if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
823+
continue;
861824
}
862-
else {
863-
// If opcode has ERROR or DEOPT, set need_ip to true
864-
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
865-
need_ip = true;
866-
}
825+
/* All other micro-ops fall through, so i+1 is reachable */
826+
SET_BIT(used, i+1);
827+
switch(opcode) {
828+
case NOP:
829+
/* Don't count NOPs as used */
830+
count--;
831+
UNSET_BIT(used, i);
832+
break;
833+
case _POP_JUMP_IF_FALSE:
834+
case _POP_JUMP_IF_TRUE:
835+
/* Mark target as reachable */
836+
SET_BIT(used, buffer[i].oparg);
867837
}
868838
}
869-
// Stage 2: Squash NOP opcodes (pre-existing or set above).
870-
int dest = 0;
871-
for (int pc = 0; pc < last_instr; pc++) {
872-
int opcode = trace[pc].opcode;
873-
if (opcode != NOP) {
874-
if (pc != dest) {
875-
trace[dest] = trace[pc];
876-
}
877-
dest++;
878-
}
839+
return count;
840+
}
841+
842+
/* Makes an executor from a buffer of uops.
843+
* Account for the buffer having gaps and NOPs by computing a "used"
844+
* bit vector and only copying the used uops. Here "used" means reachable
845+
* and not a NOP.
846+
*/
847+
static _PyExecutorObject *
848+
make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
849+
{
850+
uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
851+
int length = compute_used(buffer, used);
852+
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
853+
if (executor == NULL) {
854+
return NULL;
879855
}
880-
// Stage 3: Move the stubs back.
881-
if (dest < last_instr) {
882-
int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
883-
#ifdef Py_DEBUG
884-
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
885-
int lltrace = 0;
886-
if (python_lltrace != NULL && *python_lltrace >= '0') {
887-
lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
856+
int dest = length - 1;
857+
/* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
858+
for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
859+
if (!BIT_IS_SET(used, i)) {
860+
continue;
888861
}
889-
if (lltrace >= 2) {
890-
printf("Optimized trace (length %d+%d = %d, saved %d):\n",
891-
dest, trace_length - last_instr, new_trace_length,
892-
trace_length - new_trace_length);
893-
for (int pc = 0; pc < new_trace_length; pc++) {
894-
printf("%4d: (%s, %d, %" PRIu64 ")\n",
895-
pc,
896-
uop_name(trace[pc].opcode),
897-
(trace[pc].oparg),
898-
(uint64_t)(trace[pc].operand));
899-
}
862+
executor->trace[dest] = buffer[i];
863+
int opcode = buffer[i].opcode;
864+
if (opcode == _POP_JUMP_IF_FALSE ||
865+
opcode == _POP_JUMP_IF_TRUE)
866+
{
867+
/* The oparg of the target will already have been set to its new offset */
868+
int oparg = executor->trace[dest].oparg;
869+
executor->trace[dest].oparg = buffer[oparg].oparg;
900870
}
901-
#endif
902-
trace_length = new_trace_length;
871+
/* Set the oparg to be the destination offset,
872+
* so that we can set the oparg of earlier jumps correctly. */
873+
buffer[i].oparg = dest;
874+
dest--;
903875
}
904-
return trace_length;
876+
assert(dest == -1);
877+
executor->base.execute = _PyUopExecute;
878+
_Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
879+
return (_PyExecutorObject *)executor;
905880
}
906881

907882
static int
@@ -914,28 +889,26 @@ uop_optimize(
914889
{
915890
_PyBloomFilter dependencies;
916891
_Py_BloomFilter_Init(&dependencies);
917-
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
918-
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
919-
if (trace_length <= 0) {
892+
_PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
893+
int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
894+
if (err <= 0) {
920895
// Error or nothing translated
921-
return trace_length;
896+
return err;
922897
}
923-
OPT_HIST(trace_length, trace_length_hist);
924898
OPT_STAT_INC(traces_created);
925899
char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
926-
if (uop_optimize != NULL && *uop_optimize > '0') {
927-
trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
900+
if (uop_optimize == NULL || *uop_optimize > '0') {
901+
err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
902+
if (err < 0) {
903+
return -1;
904+
}
928905
}
929-
trace_length = remove_unneeded_uops(trace, trace_length);
930-
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
906+
_PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
931907
if (executor == NULL) {
932908
return -1;
933909
}
934-
OPT_HIST(trace_length, optimized_trace_length_hist);
935-
executor->base.execute = _PyUopExecute;
936-
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
937-
_Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
938-
*exec_ptr = (_PyExecutorObject *)executor;
910+
OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
911+
*exec_ptr = executor;
939912
return 1;
940913
}
941914

Python/optimizer_analysis.c

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,42 @@
1313
#include "pycore_optimizer.h"
1414

1515

16+
static void
17+
remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
18+
{
19+
// Note that we don't enter stubs, those SET_IPs are needed.
20+
int last_set_ip = -1;
21+
bool need_ip = true;
22+
for (int pc = 0; pc < buffer_size; pc++) {
23+
int opcode = buffer[pc].opcode;
24+
if (opcode == _SET_IP) {
25+
if (!need_ip && last_set_ip >= 0) {
26+
buffer[last_set_ip].opcode = NOP;
27+
}
28+
need_ip = false;
29+
last_set_ip = pc;
30+
}
31+
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
32+
break;
33+
}
34+
else {
35+
// If opcode has ERROR or DEOPT, set need_ip to true
36+
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
37+
need_ip = true;
38+
}
39+
}
40+
}
41+
}
42+
43+
1644
int
1745
_Py_uop_analyze_and_optimize(
1846
PyCodeObject *co,
19-
_PyUOpInstruction *trace,
20-
int trace_len,
47+
_PyUOpInstruction *buffer,
48+
int buffer_size,
2149
int curr_stacklen
2250
)
2351
{
24-
return trace_len;
52+
remove_unneeded_uops(buffer, buffer_size);
53+
return 0;
2554
}

0 commit comments

Comments (0)