Skip to content

Commit 2f233fc

Browse files
gvanrossumvstinnersweeneyde
authored
gh-89279: In ceval.c, redefine some macros for speed (#32387)
Macros Py_DECREF, Py_XDECREF, Py_IS_TYPE, _Py_atomic_load_32bit_impl and _Py_DECREF_SPECIALIZED are redefined as macros that completely replace the inline functions of the same name. These three came out in the top four of functions that (in MSVC) somehow weren't inlined. Co-authored-by: Victor Stinner <[email protected]> Co-authored-by: Dennis Sweeney <[email protected]>
1 parent 9fe82d0 commit 2f233fc

File tree

2 files changed

+61
-13
lines changed

2 files changed

+61
-13
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve interpreter performance on Windows by redefining a few specific static inline functions as macros in ceval.c, so that they are always inlined.

Python/ceval.c

+60-13
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,55 @@
4444
# error "ceval.c must be build with Py_BUILD_CORE define for best performance"
4545
#endif
4646

47+
#ifndef Py_DEBUG
48+
// GH-89279: The MSVC compiler does not inline these static inline functions
49+
// in PGO build in _PyEval_EvalFrameDefault(), because this function is over
50+
// the limit of PGO, and that limit cannot be configured.
51+
// Define them as macros to make sure that they are always inlined by the
52+
// preprocessor.
53+
54+
// Macro replacement for Py_DECREF: pre-decrements op->ob_refcnt and, when the
// result is zero, calls the tp_dealloc function pointer read from Py_TYPE(op).
// The do { ... } while (0) wrapper makes the expansion a single statement.
// `arg` is evaluated exactly once (it only initializes the local `op`).
// NOTE: the expansion declares a local named `op`; an `arg` expression that
// itself mentions `op` would refer to that new, not-yet-initialized local.
#undef Py_DECREF
55+
#define Py_DECREF(arg) \
56+
do { \
57+
PyObject *op = _PyObject_CAST(arg); \
58+
if (--op->ob_refcnt == 0) { \
59+
destructor dealloc = Py_TYPE(op)->tp_dealloc; \
60+
(*dealloc)(op); \
61+
} \
62+
} while (0)
63+
64+
// Macro replacement for Py_XDECREF: NULL-tolerant decref. `arg` is cast once
// into the local `xop`; Py_DECREF is applied only when `xop` is not NULL.
// The local is named `xop` rather than `op`, so passing it to a Py_DECREF
// expansion that declares its own `op` does not make that local refer to
// itself (presumably the reason for the distinct name).
#undef Py_XDECREF
65+
#define Py_XDECREF(arg) \
66+
do { \
67+
PyObject *xop = _PyObject_CAST(arg); \
68+
if (xop != NULL) { \
69+
Py_DECREF(xop); \
70+
} \
71+
} while (0)
72+
73+
// Macro replacement for Py_IS_TYPE: an expression that is non-zero when the
// ob_type field of `ob` (after _PyObject_CAST) is pointer-equal to `type`.
// This is an exact identity comparison; the expression performs no subtype
// check. Each argument is evaluated once.
#undef Py_IS_TYPE
74+
#define Py_IS_TYPE(ob, type) \
75+
(_PyObject_CAST(ob)->ob_type == (type))
76+
77+
// Macro replacement for _Py_DECREF_SPECIALIZED: same refcount decrement as
// Py_DECREF, but the deallocator is supplied by the caller in `dealloc`
// (cast to `destructor`) instead of being read from the object's type.
// `dealloc` is only evaluated when the refcount reaches zero.
// NOTE: the expansion declares locals named `op` and `d`; argument
// expressions that mention those names would bind to the new locals.
#undef _Py_DECREF_SPECIALIZED
78+
#define _Py_DECREF_SPECIALIZED(arg, dealloc) \
79+
do { \
80+
PyObject *op = _PyObject_CAST(arg); \
81+
if (--op->ob_refcnt == 0) { \
82+
destructor d = (destructor)(dealloc); \
83+
d(op); \
84+
} \
85+
} while (0)
86+
#endif
87+
88+
// GH-89279: Similar to above, force inlining by using a macro.
89+
// _Py_atomic_load_relaxed_int32(ATOMIC_VAL): read the `_value` member of the
// object ATOMIC_VAL points to.
//  - MSVC with a 4-byte int: a comma expression that first asserts that
//    `_value` is 4 bytes wide, then reads it directly through a
//    `volatile int *`, bypassing _Py_atomic_load_relaxed.
//  - Every other configuration: forwards unchanged to
//    _Py_atomic_load_relaxed(ATOMIC_VAL).
// ATOMIC_VAL appears twice in the MSVC expansion, but the first occurrence is
// inside sizeof and is therefore not evaluated.
#if defined(_MSC_VER) && SIZEOF_INT == 4
90+
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) (assert(sizeof((ATOMIC_VAL)->_value) == 4), *((volatile int*)&((ATOMIC_VAL)->_value)))
91+
#else
92+
#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL)
93+
#endif
94+
95+
4796
/* Forward declarations */
4897
static PyObject *trace_call_function(
4998
PyThreadState *tstate, PyObject *callable, PyObject **stack,
@@ -192,10 +241,10 @@ COMPUTE_EVAL_BREAKER(PyInterpreterState *interp,
192241
struct _ceval_state *ceval2)
193242
{
194243
_Py_atomic_store_relaxed(&ceval2->eval_breaker,
195-
_Py_atomic_load_relaxed(&ceval2->gil_drop_request)
196-
| (_Py_atomic_load_relaxed(&ceval->signals_pending)
244+
_Py_atomic_load_relaxed_int32(&ceval2->gil_drop_request)
245+
| (_Py_atomic_load_relaxed_int32(&ceval->signals_pending)
197246
&& _Py_ThreadCanHandleSignals(interp))
198-
| (_Py_atomic_load_relaxed(&ceval2->pending.calls_to_do)
247+
| (_Py_atomic_load_relaxed_int32(&ceval2->pending.calls_to_do)
199248
&& _Py_ThreadCanHandlePendingCalls())
200249
| ceval2->pending.async_exc);
201250
}
@@ -740,7 +789,7 @@ _Py_FinishPendingCalls(PyThreadState *tstate)
740789

741790
struct _pending_calls *pending = &tstate->interp->ceval.pending;
742791

743-
if (!_Py_atomic_load_relaxed(&(pending->calls_to_do))) {
792+
if (!_Py_atomic_load_relaxed_int32(&(pending->calls_to_do))) {
744793
return;
745794
}
746795

@@ -1187,22 +1236,22 @@ eval_frame_handle_pending(PyThreadState *tstate)
11871236
struct _ceval_runtime_state *ceval = &runtime->ceval;
11881237

11891238
/* Pending signals */
1190-
if (_Py_atomic_load_relaxed(&ceval->signals_pending)) {
1239+
if (_Py_atomic_load_relaxed_int32(&ceval->signals_pending)) {
11911240
if (handle_signals(tstate) != 0) {
11921241
return -1;
11931242
}
11941243
}
11951244

11961245
/* Pending calls */
11971246
struct _ceval_state *ceval2 = &tstate->interp->ceval;
1198-
if (_Py_atomic_load_relaxed(&ceval2->pending.calls_to_do)) {
1247+
if (_Py_atomic_load_relaxed_int32(&ceval2->pending.calls_to_do)) {
11991248
if (make_pending_calls(tstate->interp) != 0) {
12001249
return -1;
12011250
}
12021251
}
12031252

12041253
/* GIL drop request */
1205-
if (_Py_atomic_load_relaxed(&ceval2->gil_drop_request)) {
1254+
if (_Py_atomic_load_relaxed_int32(&ceval2->gil_drop_request)) {
12061255
/* Give another thread a chance */
12071256
if (_PyThreadState_Swap(&runtime->gilstate, NULL) != tstate) {
12081257
Py_FatalError("tstate mix-up");
@@ -1360,7 +1409,7 @@ eval_frame_handle_pending(PyThreadState *tstate)
13601409

13611410
#define CHECK_EVAL_BREAKER() \
13621411
_Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \
1363-
if (_Py_atomic_load_relaxed(eval_breaker)) { \
1412+
if (_Py_atomic_load_relaxed_int32(eval_breaker)) { \
13641413
goto handle_eval_breaker; \
13651414
}
13661415

@@ -1640,10 +1689,8 @@ typedef struct {
16401689
PyObject *kwnames;
16411690
} CallShape;
16421691

1643-
static inline bool
1644-
is_method(PyObject **stack_pointer, int args) {
1645-
return PEEK(args+2) != NULL;
1646-
}
1692+
// GH-89279: Must be a macro to be sure it's inlined by MSVC.
// Evaluates to non-zero when PEEK((args)+2) is not NULL.
// NOTE(review): the `stack_pointer` parameter is not named in the expansion
// itself; PEEK is defined outside this view and presumably reads a
// `stack_pointer` variable from the scope where the macro is used - confirm.
1693+
#define is_method(stack_pointer, args) (PEEK((args)+2) != NULL)
16471694

16481695
#define KWNAMES_LEN() \
16491696
(call_shape.kwnames == NULL ? 0 : ((int)PyTuple_GET_SIZE(call_shape.kwnames)))
@@ -1796,7 +1843,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
17961843
PREDICTED(RESUME_QUICK);
17971844
assert(tstate->cframe == &cframe);
17981845
assert(frame == cframe.current_frame);
1799-
if (_Py_atomic_load_relaxed(eval_breaker) && oparg < 2) {
1846+
if (_Py_atomic_load_relaxed_int32(eval_breaker) && oparg < 2) {
18001847
goto handle_eval_breaker;
18011848
}
18021849
DISPATCH();

0 commit comments

Comments
 (0)