From a182f2f89667bbf7f9f846f7244c7fa951151ef2 Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan Date: Thu, 30 Apr 2026 11:52:00 -0700 Subject: [PATCH 1/4] fix: Fix failing train tests for v3 --- .../integ/jumpstart/test_jumpstart_train.py | 2 +- .../train/test_llm_as_judge_base_model_fix.py | 48 +++++++++---------- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py b/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py index 2ee33f760c..298ea85e3e 100644 --- a/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py +++ b/sagemaker-train/tests/integ/jumpstart/test_jumpstart_train.py @@ -30,7 +30,7 @@ }, # Override default instance type; the model's default # (ml.p3.2xlarge) is deprecated. - "compute": Compute(instance_type="ml.g5.xlarge"), + "compute": Compute(instance_type="ml.g4dn.xlarge"), }, {"model_id": "xgboost-classification-model"}, {"model_id": "catboost-regression-model"}, diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index dcef7d4881..d53bb44dc4 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -273,32 +273,6 @@ def test_base_model_false_still_works(self): logger.info(f"✓ Pipeline started successfully") logger.info(f" Execution ARN: {execution.arn}") - # Verify pipeline structure - should only have custom inference step - execution.refresh() - step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] - - logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") - - # If no steps yet, wait a bit for pipeline to initialize - if not step_names: - logger.info("No steps found yet, waiting for pipeline initialization...") - import time - time.sleep(10) - execution.refresh() - step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] - logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}") - - # Should NOT have base inference step (case-insensitive, flexible matching) - has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) - has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) - - assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}" - assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}" - - logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False") - logger.info(f" Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}") - logger.info(f" Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}") - # Wait for completion logger.info(f"\nWaiting for evaluation to complete...") @@ -308,6 +282,28 @@ def test_base_model_false_still_works(self): assert execution.status.overall_status == "Succeeded" + # Verify pipeline structure - should only have custom inference step + # Check after completion — step details are + # only reliable once the execution has finished. Checking earlier + # is racy when tests run in parallel (pytest-xdist) because + # _get_or_create_pipeline may update a shared pipeline resource + # and the service can briefly return stale step metadata. + execution.refresh() + step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] + + logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") + + # Should NOT have base inference step (case-insensitive, flexible matching) + has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) + has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) + + assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}" + assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}" + + logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False") + logger.info(f" Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}") + logger.info(f" Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}") + logger.info("\n" + "=" * 80) logger.info("Backward Compatibility Test: PASSED") logger.info("=" * 80) From dc4eff3365668214a5ad9d1bbab3a73b164e745a Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan Date: Thu, 30 Apr 2026 13:33:09 -0700 Subject: [PATCH 2/4] serial --- .../tests/integ/train/test_llm_as_judge_base_model_fix.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index d53bb44dc4..5bae4684e5 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -76,6 +76,7 @@ } +@pytest.mark.serial class TestLLMAsJudgeBaseModelFix: """Integration test for base model fix in LLMAsJudgeEvaluator""" From c9743cf5ab570cc517021a469298c7c755fb7811 Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan Date: Thu, 30 Apr 2026 14:02:09 -0700 Subject: [PATCH 3/4] only keep instance type fix --- .../train/test_llm_as_judge_base_model_fix.py | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index 5bae4684e5..dcef7d4881 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -76,7 +76,6 @@ } -@pytest.mark.serial class TestLLMAsJudgeBaseModelFix: """Integration test for base model fix in LLMAsJudgeEvaluator""" @@ -274,6 +273,32 @@ def test_base_model_false_still_works(self): logger.info(f"✓ Pipeline started successfully") logger.info(f" Execution ARN: {execution.arn}") + # Verify pipeline structure - should only have custom inference step + execution.refresh() + step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] + + logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") + + # If no steps yet, wait a bit for pipeline to initialize + if not step_names: + logger.info("No steps found yet, waiting for pipeline initialization...") + import time + time.sleep(10) + execution.refresh() + step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] + logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}") + + # Should NOT have base inference step (case-insensitive, flexible matching) + has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) + has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) + + assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}" + assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}" + + logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False") + logger.info(f" Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}") + logger.info(f" Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}") + # Wait for completion logger.info(f"\nWaiting for evaluation to complete...") @@ -283,28 +308,6 @@ def test_base_model_false_still_works(self): assert execution.status.overall_status == "Succeeded" - # Verify pipeline structure - should only have custom inference step - # Check after completion — step details are - # only reliable once the execution has finished. Checking earlier - # is racy when tests run in parallel (pytest-xdist) because - # _get_or_create_pipeline may update a shared pipeline resource - # and the service can briefly return stale step metadata. - execution.refresh() - step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] - - logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") - - # Should NOT have base inference step (case-insensitive, flexible matching) - has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) - has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) - - assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}" - assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}" - - logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False") - logger.info(f" Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}") - logger.info(f" Custom model step: {'Found (correct)' if has_custom_step else 'Missing (ERROR!)'}") - logger.info("\n" + "=" * 80) logger.info("Backward Compatibility Test: PASSED") logger.info("=" * 80) From 9de4f23b55069378284f61436b880f5833521cfa Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan Date: Thu, 30 Apr 2026 15:00:33 -0700 Subject: [PATCH 4/4] added serial back since it worked --- .../tests/integ/train/test_llm_as_judge_base_model_fix.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index dcef7d4881..7490a373b5 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -76,6 +76,7 @@ } +@pytest.mark.serial class TestLLMAsJudgeBaseModelFix: """Integration test for base model fix in LLMAsJudgeEvaluator"""