GH-111646: Simplify optimizer, by compacting uops when making executor. #111647

Merged
merged 5 commits into from Nov 6, 2023

Changes from 3 commits
164 changes: 58 additions & 106 deletions Python/optimizer.c

@@ -384,32 +384,6 @@ PyTypeObject _PyUOpExecutor_Type = {
.tp_methods = executor_methods,
};

static int
move_stubs(
_PyUOpInstruction *trace,
int trace_length,
int stubs_start,
int stubs_end
)
{
memmove(trace + trace_length,
trace + stubs_start,
(stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
// Patch up the jump targets
for (int i = 0; i < trace_length; i++) {
if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
trace[i].opcode == _POP_JUMP_IF_TRUE)
{
int target = trace[i].oparg;
if (target >= stubs_start) {
target += trace_length - stubs_start;
trace[i].oparg = target;
}
}
}
return trace_length + stubs_end - stubs_start;
}
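
(Review note on the deleted helper: move_stubs relocated the side-exit stubs to sit immediately after the main trace, then rebased every conditional-jump oparg that pointed into the old stub region. A worked example with illustrative numbers, not taken from the PR: with trace_length = 10, stubs_start = 15 and stubs_end = 18, the memmove shifts the three stub uops from slots 15..17 down to slots 10..12, and a jump whose oparg was 16 is rebased to 16 + (10 - 15) = 11. The compaction pass introduced below makes this relocation, and the gap bookkeeping around it, unnecessary.)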

#define TRACE_STACK_SIZE 5

static int
@@ -790,7 +764,7 @@ translate_bytecode_to_trace(
}
assert(code == initial_code);
// Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
if (trace_length > 3) {
if (trace_length > 4) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
DPRINTF(1,
"Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,24 +774,6 @@
2 * INSTR_IP(initial_instr, code),
trace_length,
buffer_size - max_length);
if (max_length < buffer_size) {
// There are stubs
if (trace_length < max_length) {
// There's a gap before the stubs
// Move the stubs back to be immediately after the main trace
// (which ends at trace_length)
DPRINTF(2,
"Moving %d stub uops back by %d\n",
buffer_size - max_length,
max_length - trace_length);
trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
}
else {
assert(trace_length == max_length);
// There's no gap
trace_length = buffer_size;
}
}
return trace_length;
}
else {
@@ -838,70 +794,70 @@ translate_bytecode_to_trace(
#undef DPRINTF
}

#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
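
(Review note: the three macros added here implement a flat bitset over an array of 32-bit words: bit >> 5 selects the word and bit & 31 the position inside it. A minimal standalone sketch of their behavior, written as a hypothetical demo program rather than anything in this diff:

    #include <assert.h>
    #include <stdint.h>

    #define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
    #define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
    #define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))

    int main(void)
    {
        uint32_t used[(512 + 31) / 32] = { 0 };  /* 512 bits, rounded up to whole 32-bit words */
        SET_BIT(used, 40);                       /* sets word 1, bit 8 */
        assert(BIT_IS_SET(used, 40));
        assert(!BIT_IS_SET(used, 41));
        UNSET_BIT(used, 40);
        assert(!BIT_IS_SET(used, 40));
        return 0;
    }

This mirrors how make_executor_from_uops below sizes its local used array from _Py_UOP_MAX_TRACE_LENGTH.)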

static int
remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
compute_used(_PyUOpInstruction *buffer, uint32_t *used)
{
// Stage 1: Replace unneeded _SET_IP uops with NOP.
// Note that we don't enter stubs, those SET_IPs are needed.
int last_set_ip = -1;
int last_instr = 0;
bool need_ip = true;
for (int pc = 0; pc < trace_length; pc++) {
int opcode = trace[pc].opcode;
if (opcode == _SET_IP) {
if (!need_ip && last_set_ip >= 0) {
trace[last_set_ip].opcode = NOP;
}
need_ip = false;
last_set_ip = pc;
int length = 0;
SET_BIT(used, 0);
for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
if (!BIT_IS_SET(used, i)) {
continue;
}
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
last_instr = pc + 1;
break;
int opcode = buffer[i].opcode;
if (opcode == NOP) {
UNSET_BIT(used, i);
SET_BIT(used, i+1);
continue;
}
else {
// If opcode has ERROR or DEOPT, set need_ip to true
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
need_ip = true;
}
length ++;
if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
continue;
}
}
// Stage 2: Squash NOP opcodes (pre-existing or set above).
int dest = 0;
for (int pc = 0; pc < last_instr; pc++) {
int opcode = trace[pc].opcode;
if (opcode != NOP) {
if (pc != dest) {
trace[dest] = trace[pc];
}
dest++;
SET_BIT(used, i+1);
if (opcode == _POP_JUMP_IF_FALSE ||
opcode == _POP_JUMP_IF_TRUE)
{
int target = buffer[i].oparg;
SET_BIT(used, target);
}
}
// Stage 3: Move the stubs back.
if (dest < last_instr) {
int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
#ifdef Py_DEBUG
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
int lltrace = 0;
if (python_lltrace != NULL && *python_lltrace >= '0') {
lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
return length;
}
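
(Review note: compute_used is a single forward reachability pass over the fixed-size buffer. Slot 0 is seeded as live; every live uop that can fall through marks slot i+1; NOPs are un-marked again while still propagating fall-through, so they disappear; _JUMP_TO_TOP and _EXIT_TRACE propagate nothing; and the two conditional jumps additionally mark their oparg target, which is how the stubs stay reachable without any special-casing. The return value is the number of live, non-NOP uops. An illustrative buffer, with made-up opcodes and offsets:

    0: _SET_IP              live, marks 1
    1: NOP                  un-marked, marks 2
    2: _POP_JUMP_IF_FALSE   live, marks 3 and its target 5
    3: _EXIT_TRACE          live, no fall-through
    4: (never written)      never marked, never copied
    5: _SET_IP              live stub entry, marks 6
    6: _EXIT_TRACE          live, no fall-through

Here compute_used returns 5: the gap at slot 4 and the NOP at slot 1 cost nothing in the final executor.)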

/* Account for the buffer having gaps and NOPs. */
static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
{
uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
int length = compute_used(buffer, used);
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
if (executor == NULL) {
return NULL;
}
int dest = length - 1;
for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
if (!BIT_IS_SET(used, i)) {
continue;
}
if (lltrace >= 2) {
printf("Optimized trace (length %d+%d = %d, saved %d):\n",
dest, trace_length - last_instr, new_trace_length,
trace_length - new_trace_length);
for (int pc = 0; pc < new_trace_length; pc++) {
printf("%4d: (%s, %d, %" PRIu64 ")\n",
pc,
uop_name(trace[pc].opcode),
(trace[pc].oparg),
(uint64_t)(trace[pc].operand));
}
executor->trace[dest] = buffer[i];
int opcode = buffer[i].opcode;
if (opcode == _POP_JUMP_IF_FALSE ||
opcode == _POP_JUMP_IF_TRUE)
{
int oparg = executor->trace[dest].oparg;
executor->trace[dest].oparg = buffer[oparg].oparg;
}
#endif
trace_length = new_trace_length;
buffer[i].oparg = dest;
dest--;
}
return trace_length;
assert(dest == -1);
executor->base.execute = _PyUopExecute;
_Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
return (_PyExecutorObject *)executor;
}
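
(Review note: the copy loop walks the buffer backwards, filling the executor from its last slot. Because stubs sit at higher offsets than the jumps that reach them, a jump's target has already been copied, and its new offset stashed in buffer[target].oparg, by the time the jump itself is visited; the jump's oparg is then rewritten through that stash. Continuing the illustrative buffer above (length 5, dest starting at 4): slots 6, 5 and 3 land at executor offsets 4, 3 and 2; when the _POP_JUMP_IF_FALSE at slot 2 is copied to offset 1, its oparg 5 is replaced by buffer[5].oparg, which now holds 3; finally slot 0 lands at offset 0 and dest reaches -1, which the assert checks.)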

static int
@@ -926,16 +882,12 @@ uop_optimize(
if (uop_optimize != NULL && *uop_optimize > '0') {
trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
}
trace_length = remove_unneeded_uops(trace, trace_length);
_PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
OPT_HIST(trace_length, optimized_trace_length_hist);
_PyExecutorObject *executor = make_executor_from_uops(trace, &dependencies);
if (executor == NULL) {
return -1;
}
OPT_HIST(trace_length, optimized_trace_length_hist);
executor->base.execute = _PyUopExecute;
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
_Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
*exec_ptr = (_PyExecutorObject *)executor;
*exec_ptr = executor;
return 1;
}

29 changes: 29 additions & 0 deletions Python/optimizer_analysis.c

@@ -13,6 +13,34 @@
#include "pycore_optimizer.h"


static void
remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
{
// Note that we don't enter stubs, those SET_IPs are needed.
int last_set_ip = -1;
bool need_ip = true;
for (int pc = 0; pc < trace_length; pc++) {
int opcode = trace[pc].opcode;
if (opcode == _SET_IP) {
if (!need_ip && last_set_ip >= 0) {
trace[last_set_ip].opcode = NOP;
}
need_ip = false;
last_set_ip = pc;
}
else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
break;
}
else {
// If opcode has ERROR or DEOPT, set need_ip to true
if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
need_ip = true;
}
}
}
}
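
(Review note: this pass, moved here from optimizer.c, keeps only the _SET_IP uops the interpreter can actually observe. A _SET_IP matters only if some uop after it can raise, deopt or push a frame before the next _SET_IP arrives; otherwise it is rewritten to NOP, and the executor-building pass in optimizer.c then drops the NOP entirely. An illustrative trace: in [_SET_IP, _SET_IP, _GUARD_BOTH_INT, _SET_IP, _EXIT_TRACE], the first _SET_IP becomes NOP because nothing between it and the second one consults the instruction pointer, while the second survives because _GUARD_BOTH_INT can deopt and therefore sets need_ip before the third _SET_IP is reached.)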


int
_Py_uop_analyze_and_optimize(
PyCodeObject *co,
@@ -21,5 +49,6 @@ _Py_uop_analyze_and_optimize(
int curr_stacklen
)
{
remove_unneeded_uops(trace, trace_len);
return trace_len;
}