Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion pyrit/score/scorer_evaluation/scorer_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,21 @@ def _compute_metrics(
diff[np.abs(diff) < 1e-10] = 0.0

abs_error = np.abs(diff)
t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))
# ttest_1samp on a zero-variance sample returns NaN and emits scipy
# divide-by-zero / catastrophic-cancellation warnings. Two degenerate cases
# warrant explicit handling (np.allclose tolerates the float noise that
# creeps in from `np.median(...)` differences):
# - Perfect agreement (diff effectively all zeros): the null hypothesis
# (mean diff = 0) is exactly satisfied, so report t=0.0, p=1.0.
# - Systematic bias with no variance (constant non-zero diff): the t-test
# is undefined; report NaN explicitly. MAE captures the bias magnitude.
if diff.size > 0 and np.allclose(diff, diff[0]):
if np.isclose(diff[0], 0.0):
t_statistic, p_value = 0.0, 1.0
else:
t_statistic, p_value = float("nan"), float("nan")
else:
t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))

num_responses = all_human_scores.shape[1]
num_human_raters = all_human_scores.shape[0]
Expand Down
6 changes: 5 additions & 1 deletion pyrit/score/scorer_evaluation/scorer_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,13 @@ class HarmScorerMetrics(ScorerMetrics):
a confidence interval for the mean absolute error.
t_statistic (float): The t-statistic for the one-sample t-test comparing model scores to human scores with a
null hypothesis that the mean difference is 0. A high positive t-statistic (along with a low p-value)
indicates that the model scores are typically higher than the human scores.
indicates that the model scores are typically higher than the human scores. When the model perfectly
agrees with the gold labels (zero difference everywhere), this is reported as 0.0. When all differences
are equal and non-zero (a systematic constant bias with no variance), the t-test is undefined and this
is reported as NaN; consult `mean_absolute_error` for the bias magnitude in that case.
p_value (float): The p-value for the one-sample t-test above. It represents the probability of obtaining a
difference in means as extreme as the observed difference, assuming the null hypothesis is true.
Reported as 1.0 on perfect agreement and NaN on the constant-non-zero-bias case (see `t_statistic`).
krippendorff_alpha_combined (float): Krippendorff's alpha for the reliability data, which includes both
human and model scores. This measures the agreement between all the human raters and model scoring trials
and ranges from -1.0 to 1.0, where 1.0 indicates perfect agreement, 0.0 indicates no agreement, and
Expand Down
23 changes: 22 additions & 1 deletion tests/unit/score/test_scorer_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,20 +170,41 @@ def test_compute_harm_metrics_perfect_agreement(mock_harm_scorer):
)
assert metrics.mean_absolute_error == 0.0
assert metrics.mae_standard_error == 0.0
# Perfect agreement: diff is all zeros, t-test guarded to avoid NaN propagation.
assert metrics.t_statistic == 0.0
assert metrics.p_value == 1.0
assert metrics.krippendorff_alpha_combined == 1.0
assert metrics.krippendorff_alpha_humans == 1.0
assert metrics.krippendorff_alpha_model == 1.0


def test_compute_harm_metrics_partial_agreement(mock_harm_scorer):
    """Constant model-vs-human offset: MAE is reported, t-test outputs are NaN.

    Three human raters and two scorer trials cover two responses; every model
    score sits 0.1 above the matching human score, so the differences carry a
    constant bias with zero variance.
    """
    harm_evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)

    human = np.array([[0.1, 0.2]] * 3)
    model = np.array([[0.2, 0.3]] * 2)

    metrics = harm_evaluator._compute_metrics(
        all_human_scores=human,
        all_model_scores=model,
        num_scorer_trials=2,
    )

    # The bias magnitude is still visible through the mean absolute error.
    assert np.isclose(metrics.mean_absolute_error, 0.1)

    # A constant non-zero diff has no within-sample variance, so the t-test is
    # undefined and both of its outputs are expected to be NaN.
    assert np.isnan(metrics.t_statistic)
    assert np.isnan(metrics.p_value)


def test_compute_harm_metrics_partial_agreement_with_variance(mock_harm_scorer):
    """Non-constant differences: the t-test yields finite, in-range values.

    The human scores differ from the model scores by a different amount on
    each response (expected diff: [0.1, -0.2]), so the one-sample t-test is
    well-defined here.
    """
    harm_evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)

    human = np.array([[0.1, 0.5]] * 3)
    model = np.array([[0.2, 0.3]] * 2)

    metrics = harm_evaluator._compute_metrics(
        all_human_scores=human,
        all_model_scores=model,
        num_scorer_trials=2,
    )

    # Both outputs must be real numbers rather than NaN or infinity.
    assert np.isfinite(metrics.t_statistic)
    assert np.isfinite(metrics.p_value)

    # A p-value is a probability and must lie in the closed unit interval.
    assert 0.0 <= metrics.p_value <= 1.0


@patch("pyrit.score.scorer_evaluation.scorer_evaluator.find_objective_metrics_by_eval_hash")
Expand Down
Loading