From a182f2f89667bbf7f9f846f7244c7fa951151ef2 Mon Sep 17 00:00:00 2001
From: Mohamed Zeidan <zeidmo@amazon.com>
Date: Thu, 30 Apr 2026 11:52:00 -0700
Subject: [PATCH 1/4] fix: Fix failing train tests for v3

---
 .../integ/jumpstart/test_jumpstart_train.py   |  2 +-
 .../train/test_llm_as_judge_base_model_fix.py | 48 +++++++++----------
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py b/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py
index 2ee33f760c..298ea85e3e 100644
--- a/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py
+++ b/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py
@@ -30,7 +30,7 @@
             },
             # Override default instance type; the model's default
             # (ml.p3.2xlarge) is deprecated.
-            "compute": Compute(instance_type="ml.g5.xlarge"),
+            "compute": Compute(instance_type="ml.g4dn.xlarge"),
         },
         {"model_id": "xgboost-classification-model"},
         {"model_id": "catboost-regression-model"},
diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
index dcef7d4881..d53bb44dc4 100644
--- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
+++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
@@ -273,32 +273,6 @@ def test_base_model_false_still_works(self):
         logger.info(f"✓ Pipeline started successfully")
         logger.info(f"  Execution ARN: {execution.arn}")
         
-        # Verify pipeline structure - should only have custom inference step
-        execution.refresh()
-        step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
-        
-        logger.info(f"Pipeline steps ({len(step_names)}): {step_names}")
-        
-        # If no steps yet, wait a bit for pipeline to initialize
-        if not step_names:
-            logger.info("No steps found yet, waiting for pipeline initialization...")
-            import time
-            time.sleep(10)
-            execution.refresh()
-            step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
-            logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}")
-        
-        # Should NOT have base inference step (case-insensitive, flexible matching)
-        has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names)
-        has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names)
-        
-        assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}"
-        assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}"
-        
-        logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False")
-        logger.info(f"  Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}")
-        logger.info(f"  Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}")
-        
         # Wait for completion
         logger.info(f"\nWaiting for evaluation to complete...")
         
@@ -308,6 +282,28 @@ def test_base_model_false_still_works(self):
             
             assert execution.status.overall_status == "Succeeded"
             
+            # Verify pipeline structure - should only have custom inference step
+            # Check after completion — step details are
+            # only reliable once the execution has finished.  Checking earlier
+            # is racy when tests run in parallel (pytest-xdist) because
+            # _get_or_create_pipeline may update a shared pipeline resource
+            # and the service can briefly return stale step metadata.
+            execution.refresh()
+            step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
+            
+            logger.info(f"Pipeline steps ({len(step_names)}): {step_names}")
+            
+            # Should NOT have base inference step (case-insensitive, flexible matching)
+            has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names)
+            has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names)
+            
+            assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}"
+            assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}"
+            
+            logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False")
+            logger.info(f"  Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}")
+            logger.info(f"  Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}")
+            
             logger.info("\n" + "=" * 80)
             logger.info("Backward Compatibility Test: PASSED")
             logger.info("=" * 80)

From dc4eff3365668214a5ad9d1bbab3a73b164e745a Mon Sep 17 00:00:00 2001
From: Mohamed Zeidan <zeidmo@amazon.com>
Date: Thu, 30 Apr 2026 13:33:09 -0700
Subject: [PATCH 2/4] test: mark TestLLMAsJudgeBaseModelFix with pytest.mark.serial

---
 .../tests/integ/train/test_llm_as_judge_base_model_fix.py        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
index d53bb44dc4..5bae4684e5 100644
--- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
+++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
@@ -76,6 +76,7 @@
 }
 
 
+@pytest.mark.serial
 class TestLLMAsJudgeBaseModelFix:
     """Integration test for base model fix in LLMAsJudgeEvaluator"""
 

From c9743cf5ab570cc517021a469298c7c755fb7811 Mon Sep 17 00:00:00 2001
From: Mohamed Zeidan <zeidmo@amazon.com>
Date: Thu, 30 Apr 2026 14:02:09 -0700
Subject: [PATCH 3/4] revert: undo serial marker and step-check relocation; only keep instance type fix

---
 .../train/test_llm_as_judge_base_model_fix.py | 49 ++++++++++---------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
index 5bae4684e5..dcef7d4881 100644
--- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
+++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
@@ -76,7 +76,6 @@
 }
 
 
-@pytest.mark.serial
 class TestLLMAsJudgeBaseModelFix:
     """Integration test for base model fix in LLMAsJudgeEvaluator"""
 
@@ -274,6 +273,32 @@ def test_base_model_false_still_works(self):
         logger.info(f"✓ Pipeline started successfully")
         logger.info(f"  Execution ARN: {execution.arn}")
         
+        # Verify pipeline structure - should only have custom inference step
+        execution.refresh()
+        step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
+        
+        logger.info(f"Pipeline steps ({len(step_names)}): {step_names}")
+        
+        # If no steps yet, wait a bit for pipeline to initialize
+        if not step_names:
+            logger.info("No steps found yet, waiting for pipeline initialization...")
+            import time
+            time.sleep(10)
+            execution.refresh()
+            step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
+            logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}")
+        
+        # Should NOT have base inference step (case-insensitive, flexible matching)
+        has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names)
+        has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names)
+        
+        assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}"
+        assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}"
+        
+        logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False")
+        logger.info(f"  Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}")
+        logger.info(f"  Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}")
+        
         # Wait for completion
         logger.info(f"\nWaiting for evaluation to complete...")
         
@@ -283,28 +308,6 @@ def test_base_model_false_still_works(self):
             
             assert execution.status.overall_status == "Succeeded"
             
-            # Verify pipeline structure - should only have custom inference step
-            # Check after completion — step details are
-            # only reliable once the execution has finished.  Checking earlier
-            # is racy when tests run in parallel (pytest-xdist) because
-            # _get_or_create_pipeline may update a shared pipeline resource
-            # and the service can briefly return stale step metadata.
-            execution.refresh()
-            step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else []
-            
-            logger.info(f"Pipeline steps ({len(step_names)}): {step_names}")
-            
-            # Should NOT have base inference step (case-insensitive, flexible matching)
-            has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names)
-            has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names)
-            
-            assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}"
-            assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}"
-            
-            logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False")
-            logger.info(f"  Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}")
-            logger.info(f"  Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}")
-            
             logger.info("\n" + "=" * 80)
             logger.info("Backward Compatibility Test: PASSED")
             logger.info("=" * 80)

From 9de4f23b55069378284f61436b880f5833521cfa Mon Sep 17 00:00:00 2001
From: Mohamed Zeidan <zeidmo@amazon.com>
Date: Thu, 30 Apr 2026 15:00:33 -0700
Subject: [PATCH 4/4] test: re-add pytest.mark.serial to TestLLMAsJudgeBaseModelFix since it worked

---
 .../tests/integ/train/test_llm_as_judge_base_model_fix.py        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
index dcef7d4881..7490a373b5 100644
--- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
+++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py
@@ -76,6 +76,7 @@
 }
 
 
+@pytest.mark.serial
 class TestLLMAsJudgeBaseModelFix:
     """Integration test for base model fix in LLMAsJudgeEvaluator"""