GH-111646: Simplify optimizer, by compacting uops when making executor. #111647

Merged 5 commits on Nov 6, 2023
201 changes: 87 additions & 114 deletions Python/optimizer.c
@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
.tp_methods = executor_methods,
};

static int
move_stubs(
_PyUOpInstruction *trace,
int trace_length,
int stubs_start,
int stubs_end
)
{
memmove(trace + trace_length,
trace + stubs_start,
(stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
// Patch up the jump targets
for (int i = 0; i < trace_length; i++) {
if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
trace[i].opcode == _POP_JUMP_IF_TRUE)
{
int target = trace[i].oparg;
if (target >= stubs_start) {
target += trace_length - stubs_start;
trace[i].oparg = target;
}
}
}
return trace_length + stubs_end - stubs_start;
}

#define TRACE_STACK_SIZE 5

/* Returns 1 on success,
* 0 if it failed to produce a worthwhile trace,
* and -1 on an error.
*/
static int
translate_bytecode_to_trace(
PyCodeObject *code,
@@ -790,7 +768,7 @@ translate_bytecode_to_trace(
}
assert(code == initial_code);
// Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
if (trace_length > 3) {
if (trace_length > 4) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
DPRINTF(1,
"Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@
2 * INSTR_IP(initial_instr, code),
trace_length,
buffer_size - max_length);
if (max_length < buffer_size) {
// There are stubs
if (trace_length < max_length) {
// There's a gap before the stubs
// Move the stubs back to be immediately after the main trace
// (which ends at trace_length)
DPRINTF(2,
"Moving %d stub uops back by %d\n",
buffer_size - max_length,
max_length - trace_length);
trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
}
else {
assert(trace_length == max_length);
// There's no gap
trace_length = buffer_size;
}
}
return trace_length;
OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
return 1;
}
else {
OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ translate_bytecode_to_trace(
#undef DPRINTF
}

#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))

/* Count the number of used uops, and mark them in the bit vector `used`.
* This can be done in a single pass using simple reachability analysis,
* as there are no backward jumps.
* NOPs are excluded from the count.
*/
static int
remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
compute_used(_PyUOpInstruction *buffer, uint32_t *used)
{
// Stage 1: Replace unneeded _SET_IP uops with NOP.
// Note that we don't enter stubs, those SET_IPs are needed.
int last_set_ip = -1;
int last_instr = 0;
bool need_ip = true;
for (int pc = 0; pc < trace_length; pc++) {
int opcode = trace[pc].opcode;
if (opcode == _SET_IP) {
if (!need_ip && last_set_ip >= 0) {
trace[last_set_ip].opcode = NOP;
}
need_ip = false;
last_set_ip = pc;
int count = 0;
SET_BIT(used, 0);
for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
if (!BIT_IS_SET(used, i)) {
continue;
}
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
last_instr = pc + 1;
break;
count++;
int opcode = buffer[i].opcode;
if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
continue;
}
else {
// If opcode has ERROR or DEOPT, set need_ip to true
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
need_ip = true;
}
/* All other micro-ops fall through, so i+1 is reachable */
SET_BIT(used, i+1);
switch(opcode) {
case NOP:
/* Don't count NOPs as used */
count--;
UNSET_BIT(used, i);
break;
case _POP_JUMP_IF_FALSE:
case _POP_JUMP_IF_TRUE:
/* Mark target as reachable */
SET_BIT(used, buffer[i].oparg);
}
}
// Stage 2: Squash NOP opcodes (pre-existing or set above).
int dest = 0;
for (int pc = 0; pc < last_instr; pc++) {
int opcode = trace[pc].opcode;
if (opcode != NOP) {
if (pc != dest) {
trace[dest] = trace[pc];
}
dest++;
}
return count;
}
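
As an aid to reading the pass above, here is a minimal, self-contained sketch (not taken from the patch) of the same single-pass reachability marking. The Insn struct and the OP_* opcodes are invented stand-ins for _PyUOpInstruction and the real uops, the macros mirror the ones in the patch, and only forward jumps are assumed, as the comment above states.

#include <stdint.h>
#include <stdio.h>

#define SET_BIT(array, bit)    (array[(bit)>>5] |= (1u<<((bit)&31)))
#define UNSET_BIT(array, bit)  (array[(bit)>>5] &= ~(1u<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1u<<((bit)&31)))

/* Hypothetical mini instruction set, for illustration only. */
enum { OP_NOP, OP_WORK, OP_JUMP_IF, OP_EXIT };
typedef struct { int opcode; int oparg; } Insn;

/* Single forward pass: mark reachable, non-NOP instructions and count them.
 * This works because every jump target is a forward (larger) index. */
static int
mark_used(const Insn *buf, int size, uint32_t *used)
{
    int count = 0;
    SET_BIT(used, 0);                      /* the entry point is reachable */
    for (int i = 0; i < size; i++) {
        if (!BIT_IS_SET(used, i)) {
            continue;                      /* unreachable gap */
        }
        count++;
        if (buf[i].opcode == OP_EXIT) {
            continue;                      /* does not fall through */
        }
        SET_BIT(used, i + 1);              /* fall-through successor */
        if (buf[i].opcode == OP_NOP) {
            count--;                       /* reachable, but not emitted */
            UNSET_BIT(used, i);
        }
        else if (buf[i].opcode == OP_JUMP_IF) {
            SET_BIT(used, buf[i].oparg);   /* forward jump target */
        }
    }
    return count;
}

int main(void)
{
    Insn buf[8] = {
        {OP_WORK, 0}, {OP_NOP, 0}, {OP_JUMP_IF, 5}, {OP_WORK, 0},
        {OP_EXIT, 0}, {OP_WORK, 0}, {OP_EXIT, 0}, {OP_NOP, 0},
    };
    uint32_t used[(8 + 31) / 32] = {0};
    printf("used uops: %d\n", mark_used(buf, 8, used));  /* prints 6 */
    return 0;
}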

/* Makes an executor from a buffer of uops.
* Account for the buffer having gaps and NOPs by computing a "used"
* bit vector and only copying the used uops. Here "used" means reachable
* and not a NOP.
*/
static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
{
uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
int length = compute_used(buffer, used);
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
if (executor == NULL) {
return NULL;
}
// Stage 3: Move the stubs back.
if (dest < last_instr) {
int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
#ifdef Py_DEBUG
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
int lltrace = 0;
if (python_lltrace != NULL && *python_lltrace >= '0') {
lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
int dest = length - 1;
/* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
if (!BIT_IS_SET(used, i)) {
continue;
}
if (lltrace >= 2) {
printf("Optimized trace (length %d+%d = %d, saved %d):\n",
dest, trace_length - last_instr, new_trace_length,
trace_length - new_trace_length);
for (int pc = 0; pc < new_trace_length; pc++) {
printf("%4d: (%s, %d, %" PRIu64 ")\n",
pc,
uop_name(trace[pc].opcode),
(trace[pc].oparg),
(uint64_t)(trace[pc].operand));
}
executor->trace[dest] = buffer[i];
int opcode = buffer[i].opcode;
if (opcode == _POP_JUMP_IF_FALSE ||
opcode == _POP_JUMP_IF_TRUE)
{
/* The oparg of the target will already have been set to its new offset */
int oparg = executor->trace[dest].oparg;
executor->trace[dest].oparg = buffer[oparg].oparg;
}
#endif
trace_length = new_trace_length;
/* Set the oparg to be the destination offset,
* so that we can set the oparg of earlier jumps correctly. */
buffer[i].oparg = dest;
dest--;
}
return trace_length;
assert(dest == -1);
executor->base.execute = _PyUopExecute;
_Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
return (_PyExecutorObject *)executor;
}
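
The backward scan in make_executor_from_uops is the subtle part: because every jump is a forward jump, walking the buffer from the end guarantees that a jump's target has already been copied, so its new offset can be stashed in buffer[target].oparg before any jump that refers to it is rewritten. A standalone sketch of just that compaction step, using the same invented Insn/OP_* names as the sketch above and a plain output array in place of the executor:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SET_BIT(array, bit)    (array[(bit)>>5] |= (1u<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1u<<((bit)&31)))

/* Hypothetical mini instruction set, for illustration only. */
enum { OP_WORK, OP_JUMP_IF, OP_EXIT };
typedef struct { int opcode; int oparg; } Insn;

#define BUF_SIZE 8

/* Compact the used entries of `buffer` into `out` (which holds `length`
 * entries), scanning backwards so every jump target is relocated before
 * the jump that refers to it. */
static void
compact(Insn *buffer, uint32_t *used, Insn *out, int length)
{
    int dest = length - 1;
    for (int i = BUF_SIZE - 1; i >= 0; i--) {
        if (!BIT_IS_SET(used, i)) {
            continue;
        }
        out[dest] = buffer[i];
        if (buffer[i].opcode == OP_JUMP_IF) {
            /* The target sits at a higher index, so it was already visited
             * and buffer[target].oparg now holds its new offset. */
            out[dest].oparg = buffer[out[dest].oparg].oparg;
        }
        /* Record this entry's new offset for any earlier jump to it. */
        buffer[i].oparg = dest;
        dest--;
    }
    assert(dest == -1);
}

int main(void)
{
    /* Slots 1 and 4 are unused gaps; the jump at index 2 targets slot 5. */
    Insn buffer[BUF_SIZE] = {
        [0] = {OP_WORK, 0}, [2] = {OP_JUMP_IF, 5}, [3] = {OP_EXIT, 0},
        [5] = {OP_WORK, 0}, [6] = {OP_EXIT, 0},
    };
    uint32_t used[(BUF_SIZE + 31) / 32] = {0};
    SET_BIT(used, 0); SET_BIT(used, 2); SET_BIT(used, 3);
    SET_BIT(used, 5); SET_BIT(used, 6);

    Insn out[5];
    compact(buffer, used, out, 5);
    printf("jump now targets offset %d\n", out[1].oparg);  /* prints 3 */
    return 0;
}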

static int
@@ -914,28 +889,26 @@ uop_optimize(
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
if (trace_length <= 0) {
_PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
if (err <= 0) {
// Error or nothing translated
return trace_length;
return err;
}
OPT_HIST(trace_length, trace_length_hist);
OPT_STAT_INC(traces_created);
char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
if (uop_optimize != NULL && *uop_optimize > '0') {
trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
if (uop_optimize == NULL || *uop_optimize > '0') {
err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
if (err < 0) {
return -1;
}
}
trace_length = remove_unneeded_uops(trace, trace_length);
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
_PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
if (executor == NULL) {
return -1;
}
OPT_HIST(trace_length, optimized_trace_length_hist);
executor->base.execute = _PyUopExecute;
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
_Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
*exec_ptr = (_PyExecutorObject *)executor;
OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
*exec_ptr = executor;
return 1;
}

35 changes: 32 additions & 3 deletions Python/optimizer_analysis.c
@@ -13,13 +13,42 @@
#include "pycore_optimizer.h"


static void
remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
{
// Note that we don't enter stubs, those SET_IPs are needed.
int last_set_ip = -1;
bool need_ip = true;
for (int pc = 0; pc < buffer_size; pc++) {
int opcode = buffer[pc].opcode;
if (opcode == _SET_IP) {
if (!need_ip && last_set_ip >= 0) {
buffer[last_set_ip].opcode = NOP;
}
need_ip = false;
last_set_ip = pc;
}
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
break;
}
else {
// If opcode has ERROR or DEOPT, set need_ip to true
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
need_ip = true;
}
}
}
}


int
_Py_uop_analyze_and_optimize(
PyCodeObject *co,
_PyUOpInstruction *trace,
int trace_len,
_PyUOpInstruction *buffer,
int buffer_size,
int curr_stacklen
)
{
return trace_len;
remove_unneeded_uops(buffer, buffer_size);
return 0;
}
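
To make the _SET_IP thinning concrete, here is a small self-contained sketch of the same rule applied to a toy uop set. The U_* opcodes are invented for illustration, and a single U_MAY_DEOPT opcode stands in for the real check against the opcode metadata table (HAS_ERROR_FLAG, HAS_DEOPT_FLAG, and _PUSH_FRAME).

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mini uop set, for illustration only. */
enum { U_NOP, U_SET_IP, U_PURE, U_MAY_DEOPT, U_EXIT };
typedef struct { int opcode; int oparg; } Uop;

/* Turn a _SET_IP into a NOP when no instruction between it and the next
 * _SET_IP can observe the frame's instruction pointer. */
static void
thin_set_ips(Uop *buf, int size)
{
    int last_set_ip = -1;
    bool need_ip = true;
    for (int pc = 0; pc < size; pc++) {
        int opcode = buf[pc].opcode;
        if (opcode == U_SET_IP) {
            if (!need_ip && last_set_ip >= 0) {
                buf[last_set_ip].opcode = U_NOP;
            }
            need_ip = false;
            last_set_ip = pc;
        }
        else if (opcode == U_EXIT) {
            break;
        }
        else if (opcode == U_MAY_DEOPT) {
            need_ip = true;     /* the IP set above must survive */
        }
    }
}

int main(void)
{
    /* SET_IP, PURE, SET_IP, MAY_DEOPT, SET_IP, EXIT: only the first
     * SET_IP is redundant, since nothing after it needs the IP before
     * the next SET_IP. */
    Uop buf[] = {
        {U_SET_IP, 0}, {U_PURE, 0}, {U_SET_IP, 1},
        {U_MAY_DEOPT, 0}, {U_SET_IP, 2}, {U_EXIT, 0},
    };
    thin_set_ips(buf, 6);
    for (int i = 0; i < 6; i++) {
        printf("%d ", buf[i].opcode);   /* prints: 0 2 1 3 1 4 */
    }
    printf("\n");
    return 0;
}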