Skip to content

Commit f2bde9e

Browse files
committed
fine tuning
1 parent bafa264 commit f2bde9e

2 files changed

Lines changed: 27 additions & 21 deletions

File tree

Include/internal/pycore_optimizer.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ extern "C" {
3030
* 4. A push followed by a matching return is net-zero on frame-specific
3131
* fitness, excluding per-slot costs.
3232
*/
/* Reconstructed post-commit state of this diff hunk (the capture interleaved
 * old/new lines with diff line-number tokens). */

/* Trace-length budget: half the uop buffer (raised from 2/5 in this commit). */
#define MAX_TARGET_LENGTH (UOP_MAX_TRACE_LENGTH / 2)
#define OPTIMIZER_EFFECTIVENESS 2
/* Starting fitness budget for a new trace; per-slot and branch costs are
 * subtracted from this until an exit-quality threshold is crossed. */
#define FITNESS_INITIAL (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS)

/* Exit quality thresholds: trace stops when fitness < exit_quality.
 * Higher = trace is more willing to stop here. */
#define EXIT_QUALITY_CLOSE_LOOP (FITNESS_INITIAL - AVG_SLOTS_PER_INSTRUCTION*4)
#define EXIT_QUALITY_ENTER_EXECUTOR (FITNESS_INITIAL * 1 / 8)
/* Lowered from FITNESS_INITIAL / 8 in this commit: traces run longer before
 * the default exit threshold is reached. */
#define EXIT_QUALITY_DEFAULT (FITNESS_INITIAL / 40)
#define EXIT_QUALITY_SPECIALIZABLE (FITNESS_INITIAL / 80)

/* Estimated buffer slots per bytecode, used only to derive heuristics.
 * NOTE(review): the remainder of this comment and the definitions between the
 * two diff hunks (original lines ~38-50, including AVG_SLOTS_PER_INSTRUCTION)
 * are not visible in this capture — confirm against the full header. */

#define N_BACKWARD_SLACK 10
#define EXIT_QUALITY_BACKWARD_EDGE (EXIT_QUALITY_CLOSE_LOOP / 2 - N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION)

/* Penalty for a balanced branch.
 * It is sized so repeated balanced branches can drive a trace toward
 * EXIT_QUALITY_DEFAULT, while compute_branch_penalty() keeps any single branch
 * from dominating the budget. (Divisor raised 7 -> 14 in this commit, so each
 * balanced branch costs half as much and twice as many fit in the budget.)
 */
#define FITNESS_BRANCH_BALANCED ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT - \
    (MAX_TARGET_LENGTH / 14 * AVG_SLOTS_PER_INSTRUCTION)) / (14))
6061

6162

6263
typedef struct _PyJitUopBuffer {

Python/optimizer.c

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -598,16 +598,22 @@ add_to_trace(
598598
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
599599

600600

601-
/* Branch penalty: 0 if fully biased, FITNESS_BRANCH_BALANCED if 50/50,
602-
* 2*FITNESS_BRANCH_BALANCED if fully against the traced direction. */
601+
/* Branch penalty: 0 for a fully biased branch and FITNESS_BRANCH_BALANCED for
602+
* a balanced or fully off-trace branch. This keeps any single branch from
603+
* consuming more than one balanced-branch cost.
604+
*/
603605
static inline int
604606
compute_branch_penalty(uint16_t history)
605607
{
606608
bool branch_taken = history & 1;
607609
int taken_count = _Py_popcount32((uint32_t)history);
608610
int on_trace_count = branch_taken ? taken_count : 16 - taken_count;
609611
int off_trace = 16 - on_trace_count;
610-
return off_trace * FITNESS_BRANCH_BALANCED / 8;
612+
int penalty = off_trace * FITNESS_BRANCH_BALANCED / 8;
613+
if (penalty > FITNESS_BRANCH_BALANCED) {
614+
penalty = FITNESS_BRANCH_BALANCED;
615+
}
616+
return penalty;
611617
}
612618

613619
/* Compute exit quality for the current trace position.
@@ -818,10 +824,9 @@ _PyJit_translate_single_bytecode_to_trace(
818824
goto done;
819825
}
820826

821-
// Snapshot the buffer before reserving tail slots. The later charge
822-
// includes both emitted uops and capacity reserved for exits/deopts/errors.
823-
_PyUOpInstruction *next_before = trace->next;
824-
_PyUOpInstruction *end_before = trace->end;
827+
// Snapshot remaining space so the later fitness charge reflects all buffer
828+
// space this bytecode consumed, including reserved tail slots.
829+
int32_t remaining_before = uop_buffer_remaining_space(trace);
825830

826831
// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
827832
trace->end -= 2;
@@ -1002,10 +1007,13 @@ _PyJit_translate_single_bytecode_to_trace(
10021007
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
10031008
int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
10041009
if (ts_depth->frame_depth <= 0) {
1005-
// Returning from a frame we didn't enter — penalize.
1006-
ts_depth->fitness -= frame_penalty;
1010+
// Returning past the traced root is normal for guarded
1011+
// caller continuation. Charge a small penalty so these
1012+
// paths still terminate.
1013+
int32_t underflow_penalty = frame_penalty / 4;
1014+
ts_depth->fitness -= underflow_penalty;
10071015
DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n",
1008-
_PyOpcode_uop_name[uop], frame_penalty,
1016+
_PyOpcode_uop_name[uop], underflow_penalty,
10091017
ts_depth->fitness);
10101018
}
10111019
else {
@@ -1063,12 +1071,9 @@ _PyJit_translate_single_bytecode_to_trace(
10631071
// Charge fitness by trace-buffer capacity consumed for this bytecode,
10641072
// including both emitted uops and tail reservations.
10651073
{
1066-
int32_t slots_fwd = (int32_t)(trace->next - next_before);
1067-
int32_t slots_rev = (int32_t)(end_before - trace->end);
1068-
int32_t slots_used = slots_fwd + slots_rev;
1074+
int32_t slots_used = remaining_before - uop_buffer_remaining_space(trace);
10691075
tracer->translator_state.fitness -= slots_used;
1070-
DPRINTF(3, " per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n",
1071-
slots_used, slots_fwd, slots_rev,
1076+
DPRINTF(3, " per-insn cost: -%d -> fitness=%d\n", slots_used,
10721077
tracer->translator_state.fitness);
10731078
}
10741079
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);

0 commit comments

Comments
 (0)