@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
     .tp_methods = executor_methods,
 };
 
-static int
-move_stubs(
-    _PyUOpInstruction *trace,
-    int trace_length,
-    int stubs_start,
-    int stubs_end
-)
-{
-    memmove(trace + trace_length,
-            trace + stubs_start,
-            (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
-    // Patch up the jump targets
-    for (int i = 0; i < trace_length; i++) {
-        if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
-            trace[i].opcode == _POP_JUMP_IF_TRUE)
-        {
-            int target = trace[i].oparg;
-            if (target >= stubs_start) {
-                target += trace_length - stubs_start;
-                trace[i].oparg = target;
-            }
-        }
-    }
-    return trace_length + stubs_end - stubs_start;
-}
-
 #define TRACE_STACK_SIZE 5
 
+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
 static int
 translate_bytecode_to_trace(
     PyCodeObject *code,
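For reference, callers are expected to treat this result as a tri-state. A minimal sketch of that check (it simply mirrors what the updated uop_optimize does later in this diff, using its variables):

    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
    int err = translate_bytecode_to_trace(code, instr, buffer,
                                          _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
    if (err <= 0) {
        return err;  /* 0: no worthwhile trace, -1: error */
    }
    /* err == 1: buffer now holds a usable trace */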
@@ -790,7 +768,7 @@ translate_bytecode_to_trace(
     }
     assert(code == initial_code);
     // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
-    if (trace_length > 3) {
+    if (trace_length > 4) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
         DPRINTF(1,
                 "Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ translate_bytecode_to_trace(
                 2 * INSTR_IP(initial_instr, code),
                 trace_length,
                 buffer_size - max_length);
-        if (max_length < buffer_size) {
-            // There are stubs
-            if (trace_length < max_length) {
-                // There's a gap before the stubs
-                // Move the stubs back to be immediately after the main trace
-                // (which ends at trace_length)
-                DPRINTF(2,
-                        "Moving %d stub uops back by %d\n",
-                        buffer_size - max_length,
-                        max_length - trace_length);
-                trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
-            }
-            else {
-                assert(trace_length == max_length);
-                // There's no gap
-                trace_length = buffer_size;
-            }
-        }
-        return trace_length;
+        OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+        return 1;
     }
     else {
         OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ translate_bytecode_to_trace(
 #undef DPRINTF
 }
 
+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+ */
 static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
 {
-    // Stage 1: Replace unneeded _SET_IP uops with NOP.
-    // Note that we don't enter stubs, those SET_IPs are needed.
-    int last_set_ip = -1;
-    int last_instr = 0;
-    bool need_ip = true;
-    for (int pc = 0; pc < trace_length; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode == _SET_IP) {
-            if (!need_ip && last_set_ip >= 0) {
-                trace[last_set_ip].opcode = NOP;
-            }
-            need_ip = false;
-            last_set_ip = pc;
+    int count = 0;
+    SET_BIT(used, 0);
+    for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
         }
-        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
-            last_instr = pc + 1;
-            break;
+        count++;
+        int opcode = buffer[i].opcode;
+        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            continue;
         }
-        else {
-            // If opcode has ERROR or DEOPT, set need_ip to true
-            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
-                need_ip = true;
-            }
+        /* All other micro-ops fall through, so i+1 is reachable */
+        SET_BIT(used, i + 1);
+        switch (opcode) {
+            case NOP:
+                /* Don't count NOPs as used */
+                count--;
+                UNSET_BIT(used, i);
+                break;
+            case _POP_JUMP_IF_FALSE:
+            case _POP_JUMP_IF_TRUE:
+                /* Mark target as reachable */
+                SET_BIT(used, buffer[i].oparg);
         }
     }
-    // Stage 2: Squash NOP opcodes (pre-existing or set above).
-    int dest = 0;
-    for (int pc = 0; pc < last_instr; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode != NOP) {
-            if (pc != dest) {
-                trace[dest] = trace[pc];
-            }
-            dest++;
-        }
+    return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
+    uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+    int length = compute_used(buffer, used);
+    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+    if (executor == NULL) {
+        return NULL;
     }
-    // Stage 3: Move the stubs back.
-    if (dest < last_instr) {
-        int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
-        char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
-        int lltrace = 0;
-        if (python_lltrace != NULL && *python_lltrace >= '0') {
-            lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
+    int dest = length - 1;
+    /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+    for (int i = _Py_UOP_MAX_TRACE_LENGTH - 1; i >= 0; i--) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
         }
-        if (lltrace >= 2) {
-            printf("Optimized trace (length %d+%d = %d, saved %d):\n",
-                   dest, trace_length - last_instr, new_trace_length,
-                   trace_length - new_trace_length);
-            for (int pc = 0; pc < new_trace_length; pc++) {
-                printf("%4d: (%s, %d, %" PRIu64 ")\n",
-                       pc,
-                       uop_name(trace[pc].opcode),
-                       (trace[pc].oparg),
-                       (uint64_t)(trace[pc].operand));
-            }
+        executor->trace[dest] = buffer[i];
+        int opcode = buffer[i].opcode;
+        if (opcode == _POP_JUMP_IF_FALSE ||
+            opcode == _POP_JUMP_IF_TRUE)
+        {
+            /* The oparg of the target will already have been set to its new offset */
+            int oparg = executor->trace[dest].oparg;
+            executor->trace[dest].oparg = buffer[oparg].oparg;
         }
-#endif
-        trace_length = new_trace_length;
+        /* Set the oparg to be the destination offset,
+         * so that we can set the oparg of earlier jumps correctly. */
+        buffer[i].oparg = dest;
+        dest--;
     }
-    return trace_length;
+    assert(dest == -1);
+    executor->base.execute = _PyUopExecute;
+    _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+    return (_PyExecutorObject *)executor;
 }
 
 static int
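To make the compaction step above concrete, here is a small self-contained sketch, not CPython code: the opcode names, the Inst struct, and the 8-slot buffer are invented for illustration. It applies the same two passes as compute_used and make_executor_from_uops: mark reachable, non-NOP slots in a bit vector, then copy them backwards so a jump's target is relocated before the jump itself, stashing each slot's new index in its old oparg field.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { OP_NOP, OP_LOAD, OP_JUMP_IF_FALSE, OP_EXIT };   /* hypothetical uops */

    typedef struct { int opcode; int oparg; } Inst;

    #define MAXLEN 8
    #define SET_BIT(a, b)    ((a)[(b) >> 5] |= (1u << ((b) & 31)))
    #define UNSET_BIT(a, b)  ((a)[(b) >> 5] &= ~(1u << ((b) & 31)))
    #define BIT_IS_SET(a, b) ((a)[(b) >> 5] & (1u << ((b) & 31)))

    /* Forward reachability pass: no backward jumps, so one scan suffices. */
    static int
    compute_used_sketch(Inst *buf, uint32_t *used)
    {
        int count = 0;
        SET_BIT(used, 0);
        for (int i = 0; i < MAXLEN; i++) {
            if (!BIT_IS_SET(used, i)) {
                continue;
            }
            count++;
            if (buf[i].opcode == OP_EXIT) {
                continue;               /* trace ends here; nothing falls through */
            }
            SET_BIT(used, i + 1);       /* everything else falls through */
            if (buf[i].opcode == OP_NOP) {
                count--;                /* reachable but not copied */
                UNSET_BIT(used, i);
            }
            else if (buf[i].opcode == OP_JUMP_IF_FALSE) {
                SET_BIT(used, buf[i].oparg);   /* the jump target is reachable too */
            }
        }
        return count;
    }

    int
    main(void)
    {
        /* Slots 0-4 form the trace; slot 5 is the jump's stub target; 6-7 are unused. */
        Inst buf[MAXLEN] = {
            {OP_LOAD, 0}, {OP_NOP, 0}, {OP_JUMP_IF_FALSE, 5}, {OP_LOAD, 1},
            {OP_EXIT, 0}, {OP_EXIT, 0}, {OP_NOP, 0}, {OP_NOP, 0},
        };
        uint32_t used[(MAXLEN + 31) / 32] = { 0 };
        int length = compute_used_sketch(buf, used);
        Inst out[MAXLEN];

        /* Backward compaction: targets are relocated before the jumps that use them. */
        int dest = length - 1;
        for (int i = MAXLEN - 1; i >= 0; i--) {
            if (!BIT_IS_SET(used, i)) {
                continue;
            }
            out[dest] = buf[i];
            if (out[dest].opcode == OP_JUMP_IF_FALSE) {
                /* buf[target].oparg already holds the target's new index */
                out[dest].oparg = buf[out[dest].oparg].oparg;
            }
            buf[i].oparg = dest;   /* record this slot's new index for earlier jumps */
            dest--;
        }
        assert(dest == -1);

        for (int i = 0; i < length; i++) {
            printf("%d: opcode=%d oparg=%d\n", i, out[i].opcode, out[i].oparg);
        }
        return 0;
    }

With this input the NOP at slot 1 and the unreachable slots 6-7 are dropped, and the jump that targeted slot 5 is rewritten to target slot 4, the new position of its stub.
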
@@ -914,28 +889,26 @@ uop_optimize(
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
-    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
-    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
-    if (trace_length <= 0) {
+    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+    int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+    if (err <= 0) {
         // Error or nothing translated
-        return trace_length;
+        return err;
     }
-    OPT_HIST(trace_length, trace_length_hist);
     OPT_STAT_INC(traces_created);
     char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
-    if (uop_optimize != NULL && *uop_optimize > '0') {
-        trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
+    if (uop_optimize == NULL || *uop_optimize > '0') {
+        err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+        if (err < 0) {
+            return -1;
+        }
     }
-    trace_length = remove_unneeded_uops(trace, trace_length);
-    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
     if (executor == NULL) {
         return -1;
     }
-    OPT_HIST(trace_length, optimized_trace_length_hist);
-    executor->base.execute = _PyUopExecute;
-    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
-    _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
-    *exec_ptr = (_PyExecutorObject *)executor;
+    OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+    *exec_ptr = executor;
     return 1;
 }
 