@@ -598,16 +598,22 @@ add_to_trace(
598598 ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
599599
600600
601- /* Branch penalty: 0 if fully biased, FITNESS_BRANCH_BALANCED if 50/50,
602- * 2*FITNESS_BRANCH_BALANCED if fully against the traced direction. */
601+ /* Branch penalty: off-trace outcomes in the 16-bit history, scaled so that
602+ * 0 of 16 costs nothing and 8 of 16 costs FITNESS_BRANCH_BALANCED. Anything
603+ * above that is capped, so no single branch costs more than a balanced one.
604+ */
603605static inline int
604606compute_branch_penalty (uint16_t history )
605607{
606608 bool branch_taken = history & 1 ; /* low bit selects the direction counted as on-trace */
607609 int taken_count = _Py_popcount32 ((uint32_t )history ); /* presumably the number of set bits in history -- confirm against _Py_popcount32 */
608610 int on_trace_count = branch_taken ? taken_count : 16 - taken_count ; /* history entries agreeing with that direction */
609611 int off_trace = 16 - on_trace_count ; /* history entries that went the other way */
610- return off_trace * FITNESS_BRANCH_BALANCED / 8 ;
612+ int penalty = off_trace * FITNESS_BRANCH_BALANCED / 8 ; /* integer division; 8 off-trace entries == FITNESS_BRANCH_BALANCED */
613+ if (penalty > FITNESS_BRANCH_BALANCED ) { /* clamp: more than 8 off-trace entries costs no extra */
614+ penalty = FITNESS_BRANCH_BALANCED ;
615+ }
616+ return penalty ;
611617}
612618
613619/* Compute exit quality for the current trace position.
@@ -818,10 +824,9 @@ _PyJit_translate_single_bytecode_to_trace(
818824 goto done ;
819825 }
820826
821- // Snapshot the buffer before reserving tail slots. The later charge
822- // includes both emitted uops and capacity reserved for exits/deopts/errors.
823- _PyUOpInstruction * next_before = trace -> next ;
824- _PyUOpInstruction * end_before = trace -> end ;
827+ // Snapshot remaining space so the later fitness charge reflects all buffer
828+ // space this bytecode consumed, including reserved tail slots.
829+ int32_t remaining_before = uop_buffer_remaining_space (trace );
825830
826831 // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
827832 trace -> end -= 2 ;
@@ -1002,10 +1007,13 @@ _PyJit_translate_single_bytecode_to_trace(
10021007 _PyJitTracerTranslatorState * ts_depth = & tracer -> translator_state ;
10031008 int32_t frame_penalty = compute_frame_penalty (tstate -> interp -> opt_config .fitness_initial );
10041009 if (ts_depth -> frame_depth <= 0 ) {
1005- // Returning from a frame we didn't enter — penalize.
1006- ts_depth -> fitness -= frame_penalty ;
1010+ // Returning past the traced root is normal for guarded
1011+ // caller continuation. Charge a small penalty so these
1012+ // paths still terminate.
1013+ int32_t underflow_penalty = frame_penalty / 4 ;
1014+ ts_depth -> fitness -= underflow_penalty ;
10071015 DPRINTF (3 , " %s: underflow penalty=-%d -> fitness=%d\n" ,
1008- _PyOpcode_uop_name [uop ], frame_penalty ,
1016+ _PyOpcode_uop_name [uop ], underflow_penalty ,
10091017 ts_depth -> fitness );
10101018 }
10111019 else {
@@ -1063,12 +1071,9 @@ _PyJit_translate_single_bytecode_to_trace(
10631071 // Charge fitness by trace-buffer capacity consumed for this bytecode,
10641072 // including both emitted uops and tail reservations.
10651073 {
1066- int32_t slots_fwd = (int32_t )(trace -> next - next_before );
1067- int32_t slots_rev = (int32_t )(end_before - trace -> end );
1068- int32_t slots_used = slots_fwd + slots_rev ;
1074+ int32_t slots_used = remaining_before - uop_buffer_remaining_space (trace );
10691075 tracer -> translator_state .fitness -= slots_used ;
1070- DPRINTF (3 , " per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n" ,
1071- slots_used , slots_fwd , slots_rev ,
1076+ DPRINTF (3 , " per-insn cost: -%d -> fitness=%d\n" , slots_used ,
10721077 tracer -> translator_state .fitness );
10731078 }
10741079 DPRINTF (2 , "Trace continuing (fitness=%d)\n" , tracer -> translator_state .fitness );
0 commit comments