microsoft · romanlutz · May 27, 2026 · May 25, 2026 · May 26, 2026
diff --git a/pyrit/score/scorer_evaluation/scorer_evaluator.py b/pyrit/score/scorer_evaluation/scorer_evaluator.py
@@ -585,7 +585,21 @@ def _compute_metrics(
         diff[np.abs(diff) < 1e-10] = 0.0
 
         abs_error = np.abs(diff)
-        t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))
+        # ttest_1samp on a zero-variance sample returns NaN and emits scipy
+        # divide-by-zero / catastrophic-cancellation warnings. Two degenerate cases
+        # warrant explicit handling (np.allclose tolerates the float noise that
+        # creeps in from `np.median(...)` differences):
+        #   - Perfect agreement (diff effectively all zeros): the null hypothesis
+        #     (mean diff = 0) is exactly satisfied, so report t=0.0, p=1.0.
+        #   - Systematic bias with no variance (constant non-zero diff): the t-test
+        #     is undefined; report NaN explicitly. MAE captures the bias magnitude.
+        if diff.size > 0 and np.allclose(diff, diff[0]):
+            if np.isclose(diff[0], 0.0):
+                t_statistic, p_value = 0.0, 1.0
+            else:
+                t_statistic, p_value = float("nan"), float("nan")
+        else:
+            t_statistic, p_value = cast("tuple[float, float]", ttest_1samp(diff, 0))
 
         num_responses = all_human_scores.shape[1]
         num_human_raters = all_human_scores.shape[0]

diff --git a/pyrit/score/scorer_evaluation/scorer_metrics.py b/pyrit/score/scorer_evaluation/scorer_metrics.py
@@ -94,9 +94,13 @@ class HarmScorerMetrics(ScorerMetrics):
             a confidence interval for the mean absolute error.
         t_statistic (float): The t-statistic for the one-sample t-test comparing model scores to human scores with a
             null hypothesis that the mean difference is 0. A high positive t-statistic (along with a low p-value)
-            indicates that the model scores are typically higher than the human scores.
+            indicates that the model scores are typically higher than the human scores. When the model perfectly
+            agrees with the gold labels (zero difference everywhere), this is reported as 0.0. When all differences
+            are equal and non-zero (a systematic constant bias with no variance), the t-test is undefined and this
+            is reported as NaN; consult `mean_absolute_error` for the bias magnitude in that case.
         p_value (float): The p-value for the one-sample t-test above. It represents the probability of obtaining a
             difference in means as extreme as the observed difference, assuming the null hypothesis is true.
+            Reported as 1.0 on perfect agreement and NaN on the constant-non-zero-bias case (see `t_statistic`).
         krippendorff_alpha_combined (float): Krippendorff's alpha for the reliability data, which includes both
             human and model scores. This measures the agreement between all the human raters and model scoring trials
             and ranges between -1.0 to 1.0 where 1.0 indicates perfect agreement, 0.0 indicates no agreement, and

diff --git a/tests/unit/score/test_scorer_evaluator.py b/tests/unit/score/test_scorer_evaluator.py
@@ -170,20 +170,41 @@ def test_compute_harm_metrics_perfect_agreement(mock_harm_scorer):
     )
     assert metrics.mean_absolute_error == 0.0
     assert metrics.mae_standard_error == 0.0
+    # Perfect agreement: diff is all zeros, t-test guarded to avoid NaN propagation.
+    assert metrics.t_statistic == 0.0
+    assert metrics.p_value == 1.0
     assert metrics.krippendorff_alpha_combined == 1.0
     assert metrics.krippendorff_alpha_humans == 1.0
     assert metrics.krippendorff_alpha_model == 1.0
 
 
 def test_compute_harm_metrics_partial_agreement(mock_harm_scorer):
     evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)
-    # 2 responses, 3 human scores each, model is off by 0.1 for each
+    # 2 responses, 3 human scores each, model is off by 0.1 for each (constant bias, zero variance)
     all_human_scores = np.array([[0.1, 0.2], [0.1, 0.2], [0.1, 0.2]])
     all_model_scores = np.array([[0.2, 0.3], [0.2, 0.3]])
     metrics = evaluator._compute_metrics(
         all_human_scores=all_human_scores, all_model_scores=all_model_scores, num_scorer_trials=2
     )
     assert np.isclose(metrics.mean_absolute_error, 0.1)
+    # Constant non-zero diff has no within-sample variance: t-test undefined, reported as NaN.
+    # MAE captures the bias magnitude.
+    assert np.isnan(metrics.t_statistic)
+    assert np.isnan(metrics.p_value)
+
+
+def test_compute_harm_metrics_partial_agreement_with_variance(mock_harm_scorer):
+    evaluator = HarmScorerEvaluator(scorer=mock_harm_scorer)
+    # The model-minus-human differences vary across responses, so ttest_1samp is well-defined.
+    all_human_scores = np.array([[0.1, 0.5], [0.1, 0.5], [0.1, 0.5]])
+    all_model_scores = np.array([[0.2, 0.3], [0.2, 0.3]])
+    metrics = evaluator._compute_metrics(
+        all_human_scores=all_human_scores, all_model_scores=all_model_scores, num_scorer_trials=2
+    )
+    # diff = [0.1, -0.2]; both t_statistic and p_value should be finite floats.
+    assert np.isfinite(metrics.t_statistic)
+    assert np.isfinite(metrics.p_value)
+    assert 0.0 <= metrics.p_value <= 1.0
 
 
 @patch("pyrit.score.scorer_evaluation.scorer_evaluator.find_objective_metrics_by_eval_hash")