55 changes: 46 additions & 9 deletions examples/jax/mnist/test_single_gpu_mnist.py
@@ -3,6 +3,7 @@
# See LICENSE for license information.
"""MNIST training on single GPU"""
import argparse
import math
import unittest
from functools import partial
import sys
@@ -223,6 +224,10 @@ def train_and_evaluate(args):
print("PASSED")
return None

train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []
for epoch in range(1, args.epochs + 1):
rng, input_rng = jax.random.split(rng)
rng, dropout_rng = jax.random.split(rng)
@@ -233,6 +238,11 @@
)
test_loss, test_accuracy = eval_model(state, test_ds, args.test_batch_size, var_collect)

train_losses.append(train_loss)
train_accuracies.append(train_accuracy)
test_losses.append(test_loss)
test_accuracies.append(test_accuracy)

print(
f"Epoch: {epoch:>2} "
f"Train Loss: {train_loss:.6f} "
@@ -241,7 +251,7 @@
f"Test Accuracy: {test_accuracy:.6f} "
)

return [train_loss, train_accuracy, test_loss, test_accuracy]
return [train_losses, train_accuracies, test_losses, test_accuracies]


def mnist_parser(args):
@@ -324,15 +334,42 @@ def setUpClass(cls):

@staticmethod
def verify(actual):
"""Check If loss and accuracy match target"""
desired_traing_loss = 0.055
"""Check that loss and accuracy match target.

``actual`` is ``[train_losses, train_accuracies, test_losses, test_accuracies]``,
i.e. per-epoch lists of metrics. To avoid flakiness from stochastic noise in
the final epoch near convergence (especially under FP8), the check considers
a tail window of the last ~10% of epochs (at least 2) and asserts on the
best metric within that window.
"""
train_losses, train_accuracies, test_losses, test_accuracies = actual
Comment on lines 336 to +345
P2 verify() will raise an unhelpful TypeError if passed None

train_and_evaluate returns None when args.dry_run is True. If a future test (or a direct call) passes None to verify(), the tuple-unpack on line 345 raises TypeError: cannot unpack non-iterable NoneType object with no context. Adding an early guard keeps failure messages readable:

if actual is None:
    return  # dry_run path; nothing to verify
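
For illustration only, a standalone sketch (not part of the PR) of how such a guard changes the failure mode; the free function and the call sites below are hypothetical stand-ins for the static method:

def verify(actual):
    # Guarded version: skip verification on the dry_run path instead of
    # failing with "TypeError: cannot unpack non-iterable NoneType object".
    if actual is None:
        return  # dry_run: train_and_evaluate produced no metrics
    train_losses, train_accuracies, test_losses, test_accuracies = actual
    # ... the tail-window assertions would follow here, as in the diff above.

verify(None)                               # returns quietly, no TypeError
verify([[0.05], [0.99], [0.04], [0.99]])   # unpacks and proceeds as usual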

epochs = len(train_losses)
tail = max(2, math.ceil(epochs * 0.1))
tail = min(tail, epochs)
Comment on lines +347 to +348
P2 tail window is 40% of epochs, not ~10%, for the default epoch count

With --epochs 5 (the value used in setUpClass), math.ceil(5 × 0.1) = 1, so tail = max(2, 1) = 2, which covers the last 2 of 5 epochs (40%). The docstring says "last ~10% of epochs", which is misleading for the typical test configuration. This isn't a bug — 2 epochs is perfectly reasonable — but the comment could note that the minimum of 2 dominates for small epoch counts, so callers aren't surprised when they see the window covering a large fraction of training.
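
As a quick illustration (not part of the PR), the window computed by the hunk above behaves as follows for a few epoch counts; the minimum of 2 dominates until roughly 20 epochs:

import math

for epochs in (5, 10, 20, 50):
    tail = max(2, math.ceil(epochs * 0.1))
    tail = min(tail, epochs)
    print(f"epochs={epochs:>2} -> tail={tail} ({tail / epochs:.0%} of training)")
# epochs= 5 -> tail=2 (40% of training)
# epochs=10 -> tail=2 (20% of training)
# epochs=20 -> tail=2 (10% of training)
# epochs=50 -> tail=5 (10% of training)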


best_train_loss = min(train_losses[-tail:])
best_train_accuracy = max(train_accuracies[-tail:])
best_test_loss = min(test_losses[-tail:])
best_test_accuracy = max(test_accuracies[-tail:])

desired_traing_loss = 0.06
desired_traing_accuracy = 0.98
desired_test_loss = 0.045
desired_test_accuracy = 0.098
assert actual[0] < desired_traing_loss
assert actual[1] > desired_traing_accuracy
assert actual[2] < desired_test_loss
assert actual[3] > desired_test_accuracy
desired_test_loss = 0.05
desired_test_accuracy = 0.98
assert (
best_train_loss < desired_traing_loss
), f"best train loss over last {tail} epochs {best_train_loss} >= {desired_traing_loss}"
assert best_train_accuracy > desired_traing_accuracy, (
f"best train accuracy over last {tail} epochs {best_train_accuracy} "
f"<= {desired_traing_accuracy}"
)
assert (
best_test_loss < desired_test_loss
), f"best test loss over last {tail} epochs {best_test_loss} >= {desired_test_loss}"
assert best_test_accuracy > desired_test_accuracy, (
f"best test accuracy over last {tail} epochs {best_test_accuracy} "
f"<= {desired_test_accuracy}"
)

@unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
def test_te_bf16(self):
4 changes: 2 additions & 2 deletions qa/L2_jax_unittest/test.sh
@@ -31,11 +31,11 @@ mkdir -p "$XML_LOG_DIR"
NVTE_JAX_UNITTEST_LEVEL="L2" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_jax_not_distributed.xml $TE_PATH/tests/jax -k 'not distributed' || test_fail "tests/jax/*not_distributed_*"

pip3 install -r $TE_PATH/examples/jax/mnist/requirements.txt || error_exit "Failed to install mnist requirements"
# Make mnist and encoder tests run-to-run deterministic for stable CI results
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
NVTE_JAX_UNITTEST_LEVEL="L2" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_mnist.xml $TE_PATH/examples/jax/mnist || test_fail "mnist"

pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Failed to install encoder requirements"
# Make encoder tests to have run-to-run deterministic to have the stable CI results
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
NVTE_JAX_UNITTEST_LEVEL="L2" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_test_single_gpu_encoder.xml $TE_PATH/examples/jax/encoder/test_single_gpu_encoder.py || test_fail "test_single_gpu_encoder.py"
# Test without custom calls
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"