Skip to content

gh-106581: Split CALL_PY_EXACT_ARGS into uops #107760

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions Include/internal/pycore_opcode_metadata.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions Lib/test/test_capi/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2618,6 +2618,23 @@ def testfunc(it):
with self.assertRaises(StopIteration):
next(it)

# Checks that the executor built for a loop calling a plain Python function
# includes the "_PUSH_FRAME" uop.
def test_call_py_exact_args(self):
def testfunc(n):
# dummy takes exactly one positional argument and is called with exactly one.
def dummy(x):
return x+1
for i in range(n):
dummy(i)

# Run testfunc under the uop optimizer
# (presumably temporary_optimizer restores the previous optimizer on exit -- confirm).
opt = _testinternalcapi.get_uop_optimizer()
with temporary_optimizer(opt):
testfunc(10)

# An executor must have been attached to testfunc; iterating it yields
# 3-tuples whose first element is the uop name.
ex = get_first_executor(testfunc)
self.assertIsNotNone(ex)
uops = {opname for opname, _, _ in ex}
self.assertIn("_PUSH_FRAME", uops)



if __name__ == "__main__":
unittest.main()
24 changes: 24 additions & 0 deletions Python/abstract_interp_cases.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

71 changes: 55 additions & 16 deletions Python/bytecodes.c
Original file line number Diff line number Diff line change
Expand Up @@ -957,13 +957,13 @@ dummy_func(
{
PyGenObject *gen = (PyGenObject *)receiver;
_PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
frame->return_offset = oparg;
STACK_SHRINK(1);
_PyFrame_StackPush(gen_frame, v);
gen->gi_frame_state = FRAME_EXECUTING;
gen->gi_exc_state.previous_item = tstate->exc_info;
tstate->exc_info = &gen->gi_exc_state;
SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
frame->return_offset = oparg;
DISPATCH_INLINED(gen_frame);
}
if (Py_IsNone(v) && PyIter_Check(receiver)) {
Expand Down Expand Up @@ -996,13 +996,13 @@ dummy_func(
DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND);
STAT_INC(SEND, hit);
_PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
frame->return_offset = oparg;
STACK_SHRINK(1);
_PyFrame_StackPush(gen_frame, v);
gen->gi_frame_state = FRAME_EXECUTING;
gen->gi_exc_state.previous_item = tstate->exc_info;
tstate->exc_info = &gen->gi_exc_state;
SKIP_OVER(INLINE_CACHE_ENTRIES_SEND);
frame->return_offset = oparg;
DISPATCH_INLINED(gen_frame);
}

Expand Down Expand Up @@ -2588,14 +2588,14 @@ dummy_func(
DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER);
STAT_INC(FOR_ITER, hit);
_PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe;
frame->return_offset = oparg;
_PyFrame_StackPush(gen_frame, Py_None);
gen->gi_frame_state = FRAME_EXECUTING;
gen->gi_exc_state.previous_item = tstate->exc_info;
tstate->exc_info = &gen->gi_exc_state;
SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER);
assert(next_instr[oparg].op.code == END_FOR ||
next_instr[oparg].op.code == INSTRUMENTED_END_FOR);
frame->return_offset = oparg;
DISPATCH_INLINED(gen_frame);
}

Expand Down Expand Up @@ -2950,32 +2950,71 @@ dummy_func(
GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS);
}

inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) {
ASSERT_KWNAMES_IS_NULL();
op(_CHECK_PEP_523, (--)) {
DEOPT_IF(tstate->interp->eval_frame, CALL);
int argcount = oparg;
if (self_or_null != NULL) {
args--;
argcount++;
}
}

op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) {
ASSERT_KWNAMES_IS_NULL();
DEOPT_IF(!PyFunction_Check(callable), CALL);
PyFunctionObject *func = (PyFunctionObject *)callable;
DEOPT_IF(func->func_version != func_version, CALL);
PyCodeObject *code = (PyCodeObject *)func->func_code;
DEOPT_IF(code->co_argcount != argcount, CALL);
DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL);
}

op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) {
PyFunctionObject *func = (PyFunctionObject *)callable;
PyCodeObject *code = (PyCodeObject *)func->func_code;
DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL);
}

op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) {
int argcount = oparg;
if (self_or_null != NULL) {
args--;
argcount++;
}
STAT_INC(CALL, hit);
_PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
PyFunctionObject *func = (PyFunctionObject *)callable;
new_frame = _PyFrame_PushUnchecked(tstate, func, argcount);
for (int i = 0; i < argcount; i++) {
new_frame->localsplus[i] = args[i];
}
// Manipulate stack directly since we leave using DISPATCH_INLINED().
STACK_SHRINK(oparg + 2);
SKIP_OVER(INLINE_CACHE_ENTRIES_CALL);
}

// The 'unused' output effect represents the return value
// (which will be pushed when the frame returns).
// It is needed so CALL_PY_EXACT_ARGS matches its family.
op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) {
Copy link
Member

@markshannon markshannon Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since _PUSH_FRAME is just frame->return_offset = 0; DISPATCH_INLINED(new_frame);, it would make sense to spell out DISPATCH_INLINED to clarify which operations need to differ between tier 1 and tier 2.
Something like:

op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) {
    SAVE_FRAME_STATE();   // Equivalent to frame->prev_instr = next_instr - 1; _PyFrame_SetStackPointer(frame, stack_pointer);
    frame->return_offset = 0;
    new_frame->previous = frame;
    frame = cframe.current_frame = new_frame;
    CALL_STAT_INC(inlined_py_calls);
    if (_Py_EnterRecursivePy(tstate)) {
        goto exit_unwind;
    }
    START_FRAME(); // Equivalent to next_instr = frame->prev_instr + 1; stack_pointer = _PyFrame_GetStackPointer(frame);
}

Copy link
Member

@markshannon markshannon Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example: 2b3e6f2

In which case the code generator needs to know to push the temporary stack values to the real stack before SAVE_FRAME_STATE()

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to study this more. A problem is that the Tier 1 and Tier 2 versions of _PUSH_FRAME are so different. I am working on a mechanism to be able to say

#if TIER_ONE
<code for Tier 1>
#else
<code for Tier 2>
#endif

I'm not sure yet what you mean with your last remark about pushing temp stack values.

Copy link
Member Author

@gvanrossum gvanrossum Aug 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comparing carefully the two versions of DISPATCH_INLINED (adding frame->return_offset = 0 which precedes it in both cases):

In Tier 1:

        frame->return_offset = 0;
        assert(tstate->interp->eval_frame == NULL);
        _PyFrame_SetStackPointer(frame, stack_pointer);
        frame->prev_instr = next_instr - 1;
        (NEW_FRAME)->previous = frame;
        frame = cframe.current_frame = (NEW_FRAME);
        CALL_STAT_INC(inlined_py_calls);
        goto start_frame;

In Tier 2:

        frame->return_offset = 0;
        assert(tstate->interp->eval_frame == NULL);
        _PyFrame_SetStackPointer(frame, stack_pointer);
        frame->prev_instr -= 1;
        (NEW_FRAME)->previous = frame;
        frame = tstate->cframe->current_frame = (NEW_FRAME);
        CALL_STAT_INC(inlined_py_calls);
        stack_pointer = _PyFrame_GetStackPointer(frame);
        ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;

Diff:

@@ -1,8 +1,9 @@
         frame->return_offset = 0;
         assert(tstate->interp->eval_frame == NULL);
         _PyFrame_SetStackPointer(frame, stack_pointer);
-        frame->prev_instr = next_instr - 1;
+        frame->prev_instr -= 1;
         (NEW_FRAME)->previous = frame;
-        frame = cframe.current_frame = (NEW_FRAME);
+        frame = tstate->cframe->current_frame = (NEW_FRAME);
         CALL_STAT_INC(inlined_py_calls);
-        goto start_frame;
+        stack_pointer = _PyFrame_GetStackPointer(frame);
+        ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;

Note that the Tier 2 version must be preceded by a SAVE_IP, which does the equivalent of frame->prev_instr = next_instr. If we had a Tier 1 version of SAVE_IP we could include it in the macro definition:

        macro(CALL_PY_EXACT_ARGS) =
            unused/1 + // Skip over the counter
            _CHECK_PEP_523 +
            _CHECK_FUNCTION_EXACT_ARGS +
            _CHECK_STACK_SPACE +
            _INIT_CALL_PY_EXACT_ARGS +
            SAVE_IP +      // <-------------- added
            _PUSH_FRAME;

which would reduce the special-casing in the code generator a bit (it would still need to do something special for SAVE_IP to ensure that its oparg has the right value, different from the oparg of the macro (which is the argument count)). This would take care of the first diff chunk (what to assign to frame->prev_instr), but it would still be pretty fragile. (Like my current version, it would entice the optimizer to incorrectly try to remove the SAVE_IP uop.)

The second diff chunk relates to how we set cframe.current_frame -- in Tier 2 we must access this through the tstate.

The third and final diff chunk relates to really start using the new frame. In Tier 1, this must actually do the following:

  • Check recursion
  • Load stack_pointer
  • Load next_instr
  • Dispatch to the next opcode.

This is done by the code at start_frame.

In Tier 2 there is no start_frame label (the only uop that can go to a label is EXIT_TRACE, and of course DEOPT_IF and ERROR_IF also jump). So we load stack_pointer here. There is no direct equivalent to next_instr, but we have to set ip_offset, which SAVE_IP adds to its oparg to get the prev_instr value. (This variable is a cache for frame->code->co_code_adaptive, to save some memory loads, so whenever frame changes we must update it.)

(More later.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's another thing though, and I think that is what Mark meant. In Tier 1 the code generation for macros is special-cased for _PUSH_FRAME so that both the stack adjustment and the next_instr adjustment are emitted before the _PUSH_FRAME opcode. This is done so that the DISPATCH_INLINED macro flushes the correct values of these variables to the frame.

But this is really ugly and unprincipled, and the logic is much hairier than the other special cases for _PUSH_FRAME. One of Mark's ideas here is to make this special case look for uops using the SAVE_FRAME_STATE macro rather than for the specific uop _PUSH_FRAME. But detecting when to trigger the special case is only part of the problem -- IMO the worse problem is that the special case itself is so ugly:

        dispatch_inlined_special_case = False
        if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME":
            dispatch_inlined_special_case = True
            temp = mgr.final_offset.clone()
            temp.deeper(StackEffect(UNUSED))  # Hack
            out.stack_adjust(temp.deep, temp.high)
            # Use clone() since adjust_inverse() mutates final_offset
            mgr.adjust_inverse(mgr.final_offset.clone())
            if cache_adjust:
                out.emit(f"next_instr += {cache_adjust};")

The last 4 lines here, starting with # Use clone(), occur further down too, for the normal case (after the final uop). I don't even recall why the temp.deeper() call is needed!

I'll mull this over some more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I have addressed this. @markshannon Please have another look. Assuming the tests look okay I'll un-draft this.

// Write it out explicitly because it's subtly different.
// Eventually this should be the only occurrence of this code.
frame->return_offset = 0;
DISPATCH_INLINED(new_frame);
assert(tstate->interp->eval_frame == NULL);
SAVE_FRAME_STATE(); // Signals to the code generator
new_frame->previous = frame;
CALL_STAT_INC(inlined_py_calls);
#if TIER_ONE
frame = cframe.current_frame = new_frame;
goto start_frame;
#endif
#if TIER_TWO
frame = tstate->cframe->current_frame = new_frame;
ERROR_IF(_Py_EnterRecursivePy(tstate), exit_unwind);
stack_pointer = _PyFrame_GetStackPointer(frame);
ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
#endif
}

macro(CALL_PY_EXACT_ARGS) =
unused/1 + // Skip over the counter
_CHECK_PEP_523 +
_CHECK_FUNCTION_EXACT_ARGS +
_CHECK_STACK_SPACE +
_INIT_CALL_PY_EXACT_ARGS +
SAVE_IP + // Tier 2 only; special-cased oparg
_PUSH_FRAME;

inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) {
ASSERT_KWNAMES_IS_NULL();
DEOPT_IF(tstate->interp->eval_frame, CALL);
Expand Down
6 changes: 1 addition & 5 deletions Python/ceval.c
Original file line number Diff line number Diff line change
Expand Up @@ -602,11 +602,6 @@ int _Py_CheckRecursiveCallPy(
return 0;
}

/* Account for entering one more Python-to-Python call.
 *
 * Always decrements tstate->py_recursion_remaining. If the counter was
 * still positive before the decrement, returns 0 without further checks.
 * Otherwise defers to _Py_CheckRecursiveCallPy() and returns 1 if that
 * call reports a nonzero result, 0 if it reports zero.
 */
static inline int
_Py_EnterRecursivePy(PyThreadState *tstate)
{
    /* Fast path: budget not yet exhausted (post-decrement tests the old value). */
    if (tstate->py_recursion_remaining-- > 0) {
        return 0;
    }
    /* Normalize to 0/1, matching the result of the logical-AND form. */
    return _Py_CheckRecursiveCallPy(tstate) != 0;
}


static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) {
tstate->py_recursion_remaining++;
Expand Down Expand Up @@ -770,6 +765,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
#endif
{

#define TIER_ONE 1
#include "generated_cases.c.h"

/* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c,
Expand Down
14 changes: 12 additions & 2 deletions Python/ceval_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,16 @@
DISPATCH_GOTO(); \
}

#define SAVE_FRAME_STATE() \
Copy link
Member

@markshannon markshannon Aug 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my comment below about splitting this into SAVE_CURRENT_IP; SAVE_SP

do { \
frame->prev_instr = next_instr - 1; \
_PyFrame_SetStackPointer(frame, stack_pointer); \
} while (0)

#define DISPATCH_INLINED(NEW_FRAME) \
do { \
assert(tstate->interp->eval_frame == NULL); \
_PyFrame_SetStackPointer(frame, stack_pointer); \
frame->prev_instr = next_instr - 1; \
SAVE_FRAME_STATE(); \
(NEW_FRAME)->previous = frame; \
frame = cframe.current_frame = (NEW_FRAME); \
CALL_STAT_INC(inlined_py_calls); \
Expand Down Expand Up @@ -364,3 +369,8 @@ static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = {
#else
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL)
#endif

/* Consume one unit of the Python recursion budget held in tstate.
 *
 * tstate->py_recursion_remaining is decremented unconditionally. The
 * slow-path check _Py_CheckRecursiveCallPy() is only consulted when the
 * counter was already <= 0 before the decrement.
 *
 * Returns 1 when the budget was exhausted and the slow-path check returned
 * nonzero; returns 0 in every other case.
 */
static inline int
_Py_EnterRecursivePy(PyThreadState *tstate)
{
    int exhausted = (tstate->py_recursion_remaining-- <= 0);
    if (!exhausted) {
        return 0;
    }
    /* The comparison yields 0 or 1, the same as the original && expression. */
    return _Py_CheckRecursiveCallPy(tstate) != 0;
}
10 changes: 10 additions & 0 deletions Python/executor.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0

#undef SAVE_FRAME_STATE
#define SAVE_FRAME_STATE() \
Copy link
Member

@markshannon markshannon Aug 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than a macro, I think the code generator needs to understand this.

Given that SAVE_FRAME_STATE is basically SAVE_CURRENT_IP followed by _PyFrame_SetStackPointer(frame, stack_pointer); we could convert it to two micro-ops: SAVE_CURRENT_IP and SAVE_SP.

In general, we want to avoid macros in the generated C code.
The generated code can be explicit and verbose.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, there are already many macros (and static inline functions) in the generated code. The generator recognizes the presence of SAVE_FRAME_STATE(), but it doesn't expand it -- the C preprocessor can do that for us more easily. Currently we only do the expansion in the generator for things whose expansion requires information that only the generator has (like the stack adjustment for ERROR_IF).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are proposing that the macro expansion for CALL_PY_EXACT_ARGS become

        macro(CALL_PY_EXACT_ARGS) =
            unused/1 + // Skip over the counter
            _CHECK_PEP_523 +
            _CHECK_FUNCTION_EXACT_ARGS +
            _CHECK_STACK_SPACE +
            _INIT_CALL_PY_EXACT_ARGS +
            SAVE_IP +  // Tier 2 only; special-cased oparg
            SAVE_CURRENT_IP +  // <------------------- Addition
            _PUSH_FRAME;

where SAVE_CURRENT_IP is something like this:

op(SAVE_CURRENT_IP, (--)) {
    #if TIER_ONE
    frame->prev_instr = next_instr - 1;
    #endif
    #if TIER_TWO
    frame->prev_instr--;
    #endif
}

Or we could special-case its expansion in the generator, potayto-potato. But it has to differ between tiers because in Tier 1 it must store next_instr whereas in Tier 2 it must rely on the preceding SAVE_IP to set frame->prev_instr. (Ideally at some point in the future we won't need the prev_instr-- at all, but that's a tricky change.)

The _PyFrame_SetStackPointer(frame, stack_pointer); call should be moved back into _PUSH_FRAME (at the point where I currently call SAVE_FRAME_STATE).

If I can get this to work I'll apply it and merge the PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did get this working (see 05af848), and will test and benchmark it before merging it.

Note that there are still some #if TIER_ONE and #if TIER_TWO sections, but they are unavoidable.

do { \
/* Assume preceding SAVE_IP has set frame->prev_instr */ \
frame->prev_instr--; \
_PyFrame_SetStackPointer(frame, stack_pointer); \
} while (0)


_PyInterpreterFrame *
_PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer)
Expand Down Expand Up @@ -81,6 +89,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
OBJECT_STAT_INC(optimization_uops_executed);
switch (opcode) {

#define TIER_TWO 2
#include "executor_cases.c.h"

default:
Expand All @@ -106,6 +115,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
pop_2_error:
STACK_SHRINK(1);
pop_1_error:
pop_1_exit_unwind:
STACK_SHRINK(1);
error:
// On ERROR_IF we return NULL as the frame.
Expand Down
Loading