diff --git a/pyrit/score/scorer_evaluation/scorer_evaluator.py b/pyrit/score/scorer_evaluation/scorer_evaluator.py
index ff16fcef0..5f203753f 100644
--- a/pyrit/score/scorer_evaluation/scorer_evaluator.py
+++ b/pyrit/score/scorer_evaluation/scorer_evaluator.py
@@ -585,7 +585,21 @@ def _compute_metrics(
         diff[np.abs(diff) < 1e-10] = 0.0
 
         abs_error = np.abs(diff)
-        t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))
+        # ttest_1samp on a zero-variance sample returns NaN and emits scipy
+        # divide-by-zero / catastrophic-cancellation warnings. Two degenerate cases
+        # warrant explicit handling (np.allclose tolerates the float noise that
+        # creeps in from `np.median(...)` differences):
+        #   - Perfect agreement (diff effectively all zeros): the null hypothesis
+        #     (mean diff = 0) is exactly satisfied, so report t=0.0, p=1.0.
+        #   - Systematic bias with no variance (constant non-zero diff): the t-test
+        #     is undefined; report NaN explicitly. MAE captures the bias magnitude.
+        if diff.size > 0 and np.allclose(diff, diff[0]):
+            if np.isclose(diff[0], 0.0):
+                t_statistic, p_value = 0.0, 1.0
+            else:
+                t_statistic, p_value = float("nan"), float("nan")
+        else:
+            t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))
 
         num_responses = all_human_scores.shape[1]
         num_human_raters = all_human_scores.shape[0]
diff --git a/pyrit/score/scorer_evaluation/scorer_metrics.py b/pyrit/score/scorer_evaluation/scorer_metrics.py
index 0dbccec57..a85638ccc 100644
--- a/pyrit/score/scorer_evaluation/scorer_metrics.py
+++ b/pyrit/score/scorer_evaluation/scorer_metrics.py
@@ -94,9 +94,13 @@ class HarmScorerMetrics(ScorerMetrics):
             a confidence interval for the mean absolute error.
         t_statistic (float): The t-statistic for the one-sample t-test comparing model scores to human scores with a
             null hypothesis that the mean difference is 0. A high positive t-statistic (along with a low p-value)
-            indicates that the model scores are typically higher than the human scores.
+            indicates that the model scores are typically higher than the human scores. When the model perfectly
+            agrees with the gold labels (zero difference everywhere), this is reported as 0.0. When all differences
+            are equal and non-zero (a systematic constant bias with no variance), the t-test is undefined and this
+            is reported as NaN; consult `mean_absolute_error` for the bias magnitude in that case.
         p_value (float): The p-value for the one-sample t-test above. It represents the probability of obtaining a
             difference in means as extreme as the observed difference, assuming the null hypothesis is true.
+            Reported as 1.0 on perfect agreement and NaN on the constant-non-zero-bias case (see `t_statistic`).
         krippendorff_alpha_combined (float): Krippendorff's alpha for the reliability data, which includes both human
             and model scores. This measures the agreement between all the human raters and model scoring trials and
             ranges between -1.0 to 1.0 where 1.0 indicates perfect agreement, 0.0 indicates no agreement, and
diff --git a/tests/unit/score/test_scorer_evaluator.py b/tests/unit/score/test_scorer_evaluator.py
index 77b489767..a2d201671 100644
--- a/tests/unit/score/test_scorer_evaluator.py
+++ b/tests/unit/score/test_scorer_evaluator.py
@@ -170,6 +170,9 @@ def test_compute_harm_metrics_perfect_agreement(mock_harm_scorer):
     )
     assert metrics.mean_absolute_error == 0.0
     assert metrics.mae_standard_error == 0.0
+    # Perfect agreement: diff is all zeros, t-test guarded to avoid NaN propagation.
+    assert metrics.t_statistic == 0.0
+    assert metrics.p_value == 1.0
     assert metrics.krippendorff_alpha_combined == 1.0
     assert metrics.krippendorff_alpha_humans == 1.0
     assert metrics.krippendorff_alpha_model == 1.0
@@ -177,13 +180,31 @@ def test_compute_harm_metrics_perfect_agreement(mock_harm_scorer):
 
 def test_compute_harm_metrics_partial_agreement(mock_harm_scorer):
     evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)
-    # 2 responses, 3 human scores each, model is off by 0.1 for each
+    # 2 responses, 3 human scores each, model is off by 0.1 for each (constant bias, zero variance)
     all_human_scores = np.array([[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]])
     all_model_scores = np.array([[0.2, 0.3], [0.2, 0.3]])
     metrics = evaluator._compute_metrics(
         all_human_scores=all_human_scores, all_model_scores=all_model_scores, num_scorer_trials=2
     )
     assert np.isclose(metrics.mean_absolute_error, 0.1)
+    # Constant non-zero diff has no within-sample variance: t-test undefined, reported as NaN.
+    # MAE captures the bias magnitude.
+    assert np.isnan(metrics.t_statistic)
+    assert np.isnan(metrics.p_value)
+
+
+def test_compute_harm_metrics_partial_agreement_with_variance(mock_harm_scorer):
+    evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)
+    # Model scores have variance across responses so ttest_1samp is well-defined.
+    all_human_scores = np.array([[0.1, 0.5], [0.1, 0.5], [0.1, 0.5]])
+    all_model_scores = np.array([[0.2, 0.3], [0.2, 0.3]])
+    metrics = evaluator._compute_metrics(
+        all_human_scores=all_human_scores, all_model_scores=all_model_scores, num_scorer_trials=2
+    )
+    # diff = [0.1, -0.2]; both t_statistic and p_value should be finite floats.
+    assert np.isfinite(metrics.t_statistic)
+    assert np.isfinite(metrics.p_value)
+    assert 0.0 <= metrics.p_value <= 1.0
 
 
 @patch("pyrit.score.scorer_evaluation.scorer_evaluator.find_objective_metrics_by_eval_hash")