Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
498 commits
Select commit Hold shift + click to select a range
a889a30
Fail fast on Megatron job failures
FurtherAI May 15, 2026
a35be61
Fix CP exchange collective participation
FurtherAI May 15, 2026
6cc1e49
Fix CP empty-rank collective participation
FurtherAI May 15, 2026
b03853b
Add streaming weight offload validation hooks
FurtherAI May 16, 2026
83da657
Fix qwen35 gdn compile boundaries
FurtherAI May 16, 2026
0c527d8
Narrow expert lora compile boundary
FurtherAI May 16, 2026
c325859
Fix oracle routing trace for variable micros
FurtherAI May 16, 2026
646b748
Refine oracle LoRA reference controls
FurtherAI May 17, 2026
c872195
Use native Megatron MoE routing replay
FurtherAI May 18, 2026
f6a369f
Add production MoE routing replay plumbing
FurtherAI May 18, 2026
a5d6a26
Expose trajectory routing replay train flag
FurtherAI May 18, 2026
211b7e2
Make expert replay a backend setting
FurtherAI May 18, 2026
f3f619c
Add real-path train inf mismatch test
FurtherAI May 18, 2026
9ab0308
Disable async scheduling for expert replay
FurtherAI May 18, 2026
f5f1714
Forward false vLLM runtime flags
FurtherAI May 18, 2026
3b84202
Use nonzero advantages in real mismatch test
FurtherAI May 18, 2026
45627c8
Align real mismatch rollout chat template
FurtherAI May 18, 2026
200494c
Allow replay to omit terminal generated route
FurtherAI May 18, 2026
cde0316
Replay known routes and live-route terminal gaps
FurtherAI May 18, 2026
2d043df
Gather TP logits in mismatch extractor
FurtherAI May 18, 2026
cb815e4
Run real mismatch test without opt-in env
FurtherAI May 18, 2026
3f3cc5f
Make routing replay native and cp2 by default
FurtherAI May 18, 2026
3470a2b
Fix mismatch test topology world size
FurtherAI May 18, 2026
b72a01a
Restore tp2 ep2 mismatch defaults
FurtherAI May 18, 2026
8125f8a
Fix CP attention backward grad layout
FurtherAI May 18, 2026
d9dbdb6
Wire weight offload config into attention oracle
FurtherAI May 18, 2026
f61d43c
Document mismatch threshold diagnostics
FurtherAI May 18, 2026
bec322b
Fix CP flash grad handoff
FurtherAI May 19, 2026
85583fb
Default oracle validation to Qwen3.5
FurtherAI May 19, 2026
75a4abb
Allow streaming offload with compiled layers
FurtherAI May 20, 2026
9aff411
Tolerate job tensor cleanup races
FurtherAI May 20, 2026
bac0f1a
Revert job tensor cleanup retry
FurtherAI May 20, 2026
a6e1749
Raise train-inf mismatch bf16 gate
FurtherAI May 20, 2026
22aa60f
Fix oracle routing replay capture
FurtherAI May 20, 2026
7e44709
Tune streaming weight offload defaults
FurtherAI May 20, 2026
9a2abc0
Keep full-model streaming offload defaults
FurtherAI May 20, 2026
6a0a9c2
Optimize CP block mask refinement
FurtherAI May 20, 2026
0e688f8
Fix MoE replay topology parity
FurtherAI May 21, 2026
050d6cb
Spread synthetic replay routes
FurtherAI May 21, 2026
bf3ec9b
Clean up Megatron compile workarounds
FurtherAI May 21, 2026
ea92cb8
Remove temporary flex compile options
FurtherAI May 21, 2026
48cb055
Move routing replay trace bundle builder to tests
FurtherAI May 21, 2026
7c5548d
Fix flex attention compile defaults
FurtherAI May 21, 2026
ae9933b
Move model support validation APIs to tests
FurtherAI May 21, 2026
bbfe210
Clean up Qwen3.5 text bridge registration
FurtherAI May 21, 2026
9dba103
Merge branch 'main' into austin/train_inf_mismatch
FurtherAI May 21, 2026
2bef373
Clean up routing replay merge state
FurtherAI May 23, 2026
1a0adcd
Drop stale megatron core build config
FurtherAI May 23, 2026
2566b64
Clean up train inf mismatch real path gate
FurtherAI May 23, 2026
7b9a0c6
Restore explicit NCCL weight transfer contract
FurtherAI May 23, 2026
a8f07ea
Lower train-inf mismatch rollout temperature
FurtherAI May 23, 2026
bf99ef8
Seed train-inf mismatch rollouts
FurtherAI May 23, 2026
04ac948
Use lower train-inf rollout temperature without seeds
FurtherAI May 23, 2026
2d6de24
Restore train-inf rollout temperature
FurtherAI May 23, 2026
66451c0
Refactor Megatron provider runtime env handling
FurtherAI May 23, 2026
f761ead
Refactor Megatron train support helpers
FurtherAI May 23, 2026
8da254e
Move Megatron microbatch helpers out of train
FurtherAI May 23, 2026
9def1cf
Move Megatron runtime patches out of compile helpers
FurtherAI May 23, 2026
71882dd
Group Megatron flex attention helpers
FurtherAI May 23, 2026
b358a4a
Move provider helpers and Megatron backend into main module
FurtherAI May 23, 2026
1ce63a7
Use compact non-CP oracle topology matrix
FurtherAI May 23, 2026
b3f6f4b
Merge origin/main into vllm merge worktree
FurtherAI May 23, 2026
28fcde8
Fix Megatron type checking
FurtherAI May 23, 2026
98b1cd7
Add durable model support workflow CLI
FurtherAI May 23, 2026
850ce28
Remove native LoRA exclusion from workflow CLI
FurtherAI May 23, 2026
082d0aa
Add vLLM routed expert prefix sidecar
FurtherAI May 23, 2026
923f025
Make CP prepare keep planning metadata on CPU
FurtherAI May 24, 2026
c4aacbe
Fix empty-rank GDN CP autograd participation
FurtherAI May 24, 2026
137c4d3
Fix GDN CP oracle metadata paths
FurtherAI May 24, 2026
53cd24c
Fix routed expert prefix cache sidecar dependencies
FurtherAI May 24, 2026
fb5d442
Refresh native GDN CP packed test assertions
FurtherAI May 24, 2026
003b433
Tune train-inf mismatch gates
FurtherAI May 24, 2026
09937e0
Relax qwen3 train-inf gates
FurtherAI May 24, 2026
f12dd5a
Relax bf16 attention oracle thresholds
FurtherAI May 24, 2026
a9f79bd
Set bf16 attention oracle threshold to two percent
FurtherAI May 24, 2026
5cda0b2
Fix fused expert LoRA ETP sharding
FurtherAI May 24, 2026
54855ec
Recognize fused moe lora coverage
FurtherAI May 25, 2026
0f70173
Enable managed MoE routing replay
FurtherAI May 25, 2026
aedd9ea
Clean up oracle trace UID handling
FurtherAI May 25, 2026
fdeb42b
Release routing replay before job cleanup
FurtherAI May 25, 2026
456ee60
Update Qwen3.5 train-inf invariant gate
FurtherAI May 25, 2026
bdd6c0e
Support dense real-path train-inf topology
FurtherAI May 25, 2026
491ef59
Ignore token-only MoE routing metadata
FurtherAI May 25, 2026
7822790
Treat null route fields as absent
FurtherAI May 25, 2026
7192d07
Fix dense real-path score matching
FurtherAI May 25, 2026
4604b9e
Fix CP GDN forward trace canonicalization
FurtherAI May 25, 2026
6593840
Add real-path base mismatch diagnostics
FurtherAI May 26, 2026
d7a381c
Fix real-path base diagnostic scoring
FurtherAI May 26, 2026
db3cffb
Freeze base diagnostic Megatron worker
FurtherAI May 26, 2026
3084544
Add real-path base mismatch diagnostic
FurtherAI May 26, 2026
47991e1
Move GDN trace UID helpers to oracle tests
FurtherAI May 26, 2026
4ab349d
Add train-inf forward trace diagnostic
FurtherAI May 26, 2026
7931829
Lease scheduled eval adapters
FurtherAI May 26, 2026
5e940a1
Keep forward trace on default vLLM path
FurtherAI May 26, 2026
fd3c3d4
Limit vLLM forward trace tensor dumps
FurtherAI May 26, 2026
f6e07d9
Capture Megatron final hidden in trace
FurtherAI May 26, 2026
87cd3a4
Save Megatron logits in forward trace
FurtherAI May 26, 2026
19297a9
Capture Megatron trace submodules for train-inf diagnostics
FurtherAI May 26, 2026
0286d1e
Trace vLLM projection submodules for diagnostics
FurtherAI May 26, 2026
9b4e340
Add all-architectures model support workflow
FurtherAI May 26, 2026
9a9cd7a
Clean up Megatron weight offload status logging
FurtherAI May 26, 2026
c97dbd8
Clean train-inf adapter artifacts on pass
FurtherAI May 26, 2026
58464af
Share external vLLM runtime lifecycle
FurtherAI May 26, 2026
651b354
Rename CP token UID tracing flag
FurtherAI May 26, 2026
3281ac5
Clean up weight transfer communicator lifetime
FurtherAI May 26, 2026
c17cc4e
Deduplicate Megatron test artifact helpers
FurtherAI May 26, 2026
3ca6f94
Use main loss for context-parallel RL
FurtherAI May 27, 2026
512b48a
Keep context-parallel loss reductions isolated
FurtherAI May 27, 2026
f07e733
Route loss inputs through explicit alignment adapters
FurtherAI May 27, 2026
244fd56
Require group ids in aligned loss inputs
FurtherAI May 27, 2026
4f27a8e
Avoid mutating aligned loss advantages
FurtherAI May 27, 2026
4836a67
Merge origin/main into vllm merge worktree
FurtherAI May 27, 2026
17e387e
Fix packed tensor cleanup for CP lookahead
FurtherAI May 27, 2026
ebdc538
Optimize Megatron LoRA checkpoint publishing
FurtherAI May 27, 2026
76177d6
Merge remote-tracking branch 'origin/main' into austin/train_inf_mismatch
FurtherAI May 27, 2026
1baa5eb
Batch Megatron LoRA publish transfers
FurtherAI May 27, 2026
2bebf03
Optimize Megatron LoRA publish metadata
FurtherAI May 28, 2026
ad3c368
Derive Megatron LoRA publish metadata locally
FurtherAI May 28, 2026
ecc4d86
Optimize packed expert LoRA publish
FurtherAI May 28, 2026
c1d8020
Lazy load Megatron model support handlers
FurtherAI May 28, 2026
7226e60
Merge train-inf mismatch workflow and validation fixes
FurtherAI May 29, 2026
393d682
Adjust Qwen3 MoE train-inf parity gates
FurtherAI May 29, 2026
be6bfab
Cover Qwen3 MoE train-inf route-conflict KL
FurtherAI May 29, 2026
50b5c30
Adjust Qwen3 dense train-inf parity gate
FurtherAI May 29, 2026
4376533
Fix Qwen3.5 validation regressions
FurtherAI May 29, 2026
877b725
Patch Qwen3.5 fp32 parity GDN reference
FurtherAI May 29, 2026
e55f598
Fix packed GDN parity reference slicing
FurtherAI May 29, 2026
6a51b57
Run Qwen3.5 HF parity in bf16
FurtherAI May 29, 2026
c826d91
Allow bf16 HF parity validation
FurtherAI May 29, 2026
be653e5
Set Qwen3.5 dense train-inf parity gates
FurtherAI May 29, 2026
8e037ba
Revert "Set Qwen3.5 dense train-inf parity gates"
FurtherAI May 29, 2026
8c4aaeb
Use CP-first Megatron default topology
FurtherAI May 29, 2026
1b8f93e
Revert diagnostic validation threshold changes
FurtherAI May 29, 2026
fc190ed
Restore Qwen3.5 fused expert LoRA export
FurtherAI May 29, 2026
d6fd491
Restore bf16 real GDN CP validation
FurtherAI May 29, 2026
da54952
Pin Megatron integration artifacts to commits
FurtherAI May 29, 2026
bfcebb2
test: add gdn fp32 oracle reference
FurtherAI May 30, 2026
5ddc79e
test: stabilize gdn output loss checks
FurtherAI May 30, 2026
d0ff976
Fix train inf mismatch CP scoring harness
FurtherAI May 30, 2026
d3e8807
Fix CP routing replay explicit uid targets
FurtherAI May 30, 2026
63eb044
Optimize CP routing replay UID handoff
FurtherAI May 30, 2026
f909c41
Cache routing replay target refreshes
FurtherAI May 30, 2026
d427295
Prestage routing replay targets before forward
FurtherAI May 30, 2026
f095db5
Test prestaged routing replay layout switches
FurtherAI May 30, 2026
5705a00
Keep workflow architecture inspection single-rank
FurtherAI May 30, 2026
6fcacdb
Stage routing replay targets in validation harnesses
FurtherAI May 31, 2026
aedb7ed
Remove branch-only assertion tests
FurtherAI May 31, 2026
88d4f15
Keep CP scoring token UIDs on CPU
FurtherAI May 31, 2026
8de63fd
Retry train inf mismatch workflow stage
FurtherAI Jun 1, 2026
d486c38
Fix CP routing replay trace token uids
FurtherAI Jun 1, 2026
139a64b
Relax router score oracle for CP replay
FurtherAI Jun 1, 2026
a0df118
Drop padded expert rows from forward traces
FurtherAI Jun 1, 2026
9f80b5c
Pack oracle LoRA snapshots before safetensors save
FurtherAI Jun 1, 2026
dcc25a8
Disable compiled qwen35 routed expert compute
FurtherAI Jun 1, 2026
ad055f8
Normalize Megatron identity LoRA through model support
FurtherAI Jun 1, 2026
899b917
Preserve GDN layout across checkpoint recompute
FurtherAI Jun 1, 2026
18dad24
Tighten router score oracle threshold
FurtherAI Jun 1, 2026
075031c
Narrow Qwen3.5 MoE compile workaround
FurtherAI Jun 1, 2026
23d32d4
Use GDN island boundary layout state
FurtherAI Jun 1, 2026
0264231
Remove GDN layout inference fallback
FurtherAI Jun 1, 2026
3470ce8
Patch weighted SwiGLU compile autograd
FurtherAI Jun 1, 2026
d8b2209
Remove no-op CP training guard
FurtherAI Jun 2, 2026
342b100
Remove CP timing from production training results
FurtherAI Jun 2, 2026
64144ca
Trim GDN shared-prefix PR test surface
FurtherAI Jun 2, 2026
81fc8b2
Drop GDN shared-prefix README from PR surface
FurtherAI Jun 2, 2026
a2b0ec8
Remove dead GDN production helpers
FurtherAI Jun 2, 2026
25e0a7f
Merge latest main into vllm merge worktree
FurtherAI Jun 4, 2026
36e469e
Add Gemma 4 Megatron probe support
FurtherAI Jun 4, 2026
e5f4c47
Update Megatron deps for Gemma 4 bridge
FurtherAI Jun 5, 2026
016b016
Update Transformers mask patch for 5.6
FurtherAI Jun 6, 2026
daaba69
Handle Gemma 4 K equals V QKV loading
FurtherAI Jun 6, 2026
627029c
Patch Gemma 4 router for Megatron Core 0.17
FurtherAI Jun 6, 2026
198c67a
Use selective recompute for Gemma 4
FurtherAI Jun 6, 2026
745cdc8
Apply Gemma 4 text fusion helpers
FurtherAI Jun 6, 2026
c38ed25
Type Gemma 4 handler patches
FurtherAI Jun 6, 2026
b2a79ec
Capture Gemma 4 HF router replay
FurtherAI Jun 6, 2026
1806d67
Match Gemma 4 proportional RoPE
FurtherAI Jun 6, 2026
26d7705
Handle Gemma 4 fused gradient parity
FurtherAI Jun 8, 2026
31cb4cb
Handle Gemma 4 expert slice loading
FurtherAI Jun 8, 2026
51f7881
Add Gemma4 MoE merged serving support
FurtherAI Jun 8, 2026
3dc0446
Pass unvalidated arch flag to train-inf mismatch workflow
FurtherAI Jun 8, 2026
e0bb928
Respect merged rollout mode in train-inf real path
FurtherAI Jun 8, 2026
607688f
Patch Gemma4 MoE routed expert config for vLLM
FurtherAI Jun 8, 2026
53429e1
Fix chat template sentinel tokenization
FurtherAI Jun 8, 2026
da5eb91
Gather Gemma4 rotary tables for packed positions
FurtherAI Jun 8, 2026
f0dcb63
Check tuple rotary outputs in packed position validation
FurtherAI Jun 8, 2026
ef327cd
Disable multimodal limits in yes-no validation
FurtherAI Jun 8, 2026
761cecd
Patch Gemma4 merged weight update reload
FurtherAI Jun 8, 2026
1ef6b64
Revert "Check tuple rotary outputs in packed position validation"
FurtherAI Jun 8, 2026
0aa620c
Use handler rotary outputs in packed position validation
FurtherAI Jun 8, 2026
8261749
Validate rotary outputs from test-owned handler mapping
FurtherAI Jun 8, 2026
2951fdd
Add ART flex sliding-window masks
FurtherAI Jun 9, 2026
e360da3
Use vLLM token ids for RL tokenization
FurtherAI Jun 11, 2026
97179cd
Fix packed position SWA mask setup
FurtherAI Jun 12, 2026
3fca647
Use Triton flex fallback for wide heads
FurtherAI Jun 12, 2026
f96d505
Relax router score parity tolerance
FurtherAI Jun 12, 2026
741c381
Remove stale train-inf routing replay flag
FurtherAI Jun 12, 2026
62d52a1
Fix Gemma4 attention LoRA postnorm placement
FurtherAI Jun 12, 2026
7f4b902
Use compact LoRA deltas for merged serving
FurtherAI Jun 15, 2026
c1f5789
Fix chat template rollout synthetic choices
FurtherAI Jun 16, 2026
e415629
Handle Gemma 4 shared expert overlap compile path
FurtherAI Jun 16, 2026
175ec40
Fix Gemma 4 k-eq-v LoRA export
FurtherAI Jun 16, 2026
34ed1b1
Avoid double applying Gemma 4 k-eq-v LoRA deltas
FurtherAI Jun 16, 2026
013ecc6
Rescale Gemma 4 shared expert LoRA export
FurtherAI Jun 16, 2026
cc17a4a
Propagate mismatch LoRA target overrides
FurtherAI Jun 16, 2026
708bbbf
Propagate Megatron LoRA target modules
FurtherAI Jun 16, 2026
0375b31
Fix Gemma 4 shared-only LoRA export
FurtherAI Jun 16, 2026
adc2862
Exercise SWA length in train-inf mismatch
FurtherAI Jun 16, 2026
7bc8822
Tighten SWA prompt length default
FurtherAI Jun 16, 2026
7633322
Build SWA masks in mismatch scoring
FurtherAI Jun 16, 2026
c123dfe
Size Gemma 4 rotary tables for ART CP
FurtherAI Jun 16, 2026
16c4f30
Set Gemma 4 mismatch thresholds
FurtherAI Jun 17, 2026
9b19b02
Fix merged weight transfer communicator lifecycle
FurtherAI Jun 17, 2026
2661db1
Avoid batch index literals in flex block masks
FurtherAI Jun 17, 2026
708530f
Add managed length trainability smoke
FurtherAI Jun 17, 2026
927be75
Upgrade vLLM runtime to 0.23.0
FurtherAI Jun 18, 2026
8b33363
Merge remote-tracking branch 'origin/main' into austin/gemma_4_model_…
FurtherAI Jun 18, 2026
b09733a
Restore Qwen3 mismatch threshold
FurtherAI Jun 18, 2026
ceec8e3
Update vLLM runtime patches for 0.23
FurtherAI Jun 18, 2026
3e1865a
Align FlashInfer runtime dependency
FurtherAI Jun 18, 2026
17b0915
Decode vLLM routed expert responses
FurtherAI Jun 18, 2026
2db6f19
Derive max sequence length from model config
FurtherAI Jun 19, 2026
d479300
Fix dense shared topology expectation
FurtherAI Jun 19, 2026
b01b641
Add explicit Megatron runtime config
FurtherAI Jun 19, 2026
ef6d9f3
Lazy import optional Unsloth service in runtime test
FurtherAI Jun 19, 2026
d1ccaea
Keep live length trainability varied
FurtherAI Jun 19, 2026
081e911
Stop length trainability after target error
FurtherAI Jun 19, 2026
f09bf76
Mark Gemma4 native vLLM LoRA wip
FurtherAI Jun 19, 2026
f6f0407
Use native LoRA default for Gemma4 trainability
FurtherAI Jun 19, 2026
fa6e65a
Patch Gemma4 MoE LoRA metadata in vLLM runtime
FurtherAI Jun 19, 2026
749ee9b
Tie Gemma4 k-eq-v LoRA export for vLLM
FurtherAI Jun 20, 2026
e5454d5
Tighten Gemma4 train-inf mismatch thresholds
FurtherAI Jun 20, 2026
e955012
Enable Gemma4 LoRA grads through preprocess
FurtherAI Jun 20, 2026
f260f6b
Support Gemma4 full activation recompute
FurtherAI Jun 20, 2026
799bad0
Use length trainability in model support workflow
FurtherAI Jun 21, 2026
08f30ef
Set Gemma 4 mismatch thresholds
FurtherAI Jun 21, 2026
e4c8eaa
Clean up Gemma 4 model support branch
FurtherAI Jun 21, 2026
f25ca8e
Merge remote-tracking branch 'origin/main' into austin/gemma_4_model_…
FurtherAI Jun 22, 2026
9d98029
Restore strict CP block mask preparation
FurtherAI Jun 22, 2026
5faa770
Use backend-only Triton flex options
FurtherAI Jun 22, 2026
41742d6
Use direct vLLM token metadata fields
FurtherAI Jun 22, 2026
ab605bb
Clean up Gemma4 branch typing diagnostics
FurtherAI Jun 23, 2026
086eb15
Preserve chat template kwargs coverage
FurtherAI Jun 23, 2026
edee967
Fix tinker token id return type
FurtherAI Jun 23, 2026
3654bbd
Probe Gemma4 HF text token types
FurtherAI Jun 23, 2026
b9f279e
Use Gemma4-compatible backend deps
FurtherAI Jun 23, 2026
239d612
Revert "Use Gemma4-compatible backend deps"
FurtherAI Jun 23, 2026
f2947f9
Revert "Probe Gemma4 HF text token types"
FurtherAI Jun 23, 2026
9f818a2
Split Unsloth and Megatron dependency extras
FurtherAI Jun 23, 2026
ef085ef
Remove duplicate Unsloth extra
FurtherAI Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions .github/workflows/prek.yml
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ jobs:
done < "${part_paths_file}" | zstd -d -c | tar -xf - -C "${UV_CACHE_DIR}"
du -sh "${UV_CACHE_DIR}"

- name: Install dependencies (with all optional extras for complete type checking)
- name: Install Megatron dependencies
run: |
original_pyproject="$(mktemp)"
cp pyproject.toml "${original_pyproject}"
Expand All @@ -229,12 +229,31 @@ jobs:
--apex-nvcc-threads "${CI_APEX_NVCC_THREADS}"
echo "CI uv build overrides: APEX_PARALLEL_BUILD=${CI_APEX_PARALLEL_BUILD}, NVCC_APPEND_FLAGS=--threads ${CI_APEX_NVCC_THREADS}, UV_CONCURRENT_BUILDS=${CI_UV_BUILD_SLOTS}"
uv --version
uv sync --all-extras --group dev --frozen --python "${CI_PYTHON_MM}"
uv sync --extra megatron --extra langgraph --extra plotting --group dev --frozen --python "${CI_PYTHON_MM}"

- name: Run prek hooks (lint, format, typecheck, uv.lock, tests)
- name: Run prek hooks (lint, format, typecheck, uv.lock)
run: |
uv run --no-sync prek run --all-files
uv run --no-sync prek run ruff --all-files
uv run --no-sync prek run ruff-format --all-files
uv run --no-sync prek run ty --all-files
uv run --no-sync prek run uv-lock-check --all-files

- name: Run unit tests (via prek)
- name: Run Megatron unit tests
run: |
uv run --no-sync prek run pytest
uv run --no-sync pytest --nbval --current-env --tb=short \
tests/unit/test_megatron_reference_logprobs.py \
tests/unit/test_moe_routing_replay.py \
tests/unit/test_moe_routing_real_path.py \
tests/unit/test_pipeline_trainer_local_backend.py

- name: Install backend dependencies
run: |
uv sync --extra backend --extra tinker --extra langgraph --extra plotting --group dev --frozen --python "${CI_PYTHON_MM}"

- name: Run unit tests
run: |
uv run --no-sync pytest --nbval --current-env --tb=short tests/unit \
--ignore=tests/unit/test_megatron_reference_logprobs.py \
--ignore=tests/unit/test_moe_routing_replay.py \
--ignore=tests/unit/test_moe_routing_real_path.py \
--ignore=tests/unit/test_pipeline_trainer_local_backend.py
64 changes: 57 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ backend = [
"bitsandbytes>=0.45.2",
"unsloth==2026.3.3",
"unsloth-zoo==2026.3.1",
"torch>=2.11.0",
"torch==2.11.0",
"torchao==0.16.0",
"accelerate==1.7.0",
"awscli>=1.38.1",
Expand All @@ -43,7 +43,7 @@ backend = [
]
megatron = [
"numpy<2",
"torch>=2.11.0",
"torch==2.11.0",
"flash-attn-4==4.0.0b5",
"ninja>=1.11.1",
"quack-kernels==0.3.7",
Expand All @@ -61,6 +61,7 @@ megatron = [
"nvidia-ml-py==13.580.82",
"nvidia-modelopt>=0.42.0a0 ; sys_platform != 'darwin'",
"nvidia-resiliency-ext<0.5 ; sys_platform == 'linux'",
"transformers==5.6.2",
"ml-dtypes>=0.5.0 ; python_full_version < '3.13'",
]

Expand All @@ -79,8 +80,8 @@ tinker = [
"protobuf>=6.31.1",
"tinker-cookbook>=0.4.1,<0.5",
"tinker>=0.21.0,<0.22",
"torch>=2.11.0",
"transformers==5.2.0",
"torch==2.11.0",
"transformers>=5.2.0,<=5.5.3",
"uvicorn>=0.35.0",
"datrie>=0.8.3",
]
Expand Down Expand Up @@ -150,14 +151,23 @@ markers = [

[tool.uv]
required-version = ">=0.11.7"
conflicts = [
[
{ extra = "backend" },
{ extra = "megatron" },
],
[
{ extra = "tinker" },
{ extra = "megatron" },
],
]
override-dependencies = [
"flashinfer-python==0.6.1",
"flashinfer-python==0.6.8.post1",
"megatron-core==0.17.0",
"numpy<2",
"nvidia-resiliency-ext<0.5",
"quack-kernels==0.3.7",
"transformer-engine==2.11.0",
"transformers==5.2.0",
"torch==2.11.0",
]
exclude-dependencies = ["pynvml", "emerging-optimizers", "causal-conv1d", "mamba-ssm"]
Expand All @@ -184,6 +194,46 @@ name = "deep-ep"
version = "1.2.1+9af0e0d"
requires-dist = []

# The Megatron Bridge source metadata currently requires Transformers 5.8.x,
# but this branch is validated against Transformers 5.6.2 for Gemma 4.
# Keep Bridge's runtime deps explicit here and let ART's megatron extra own the
# Transformers pin.
[[tool.uv.dependency-metadata]]
name = "megatron-bridge"
version = "0.5.0+e1a207ac"
requires-dist = [
"accelerate",
"comet-ml",
"datasets",
"diffusers",
"einops",
"flash-linear-attention",
"flashinfer-cubin",
"flashinfer-python",
"hydra-core",
"imageio",
"imageio-ffmpeg",
"megatron-core",
"mistral-common",
"mlflow",
"nvidia-resiliency-ext",
"omegaconf",
"open-clip-torch",
"peft",
"pyyaml",
"qwen-vl-utils",
"regex",
"rich",
"six",
"tensorboard",
"timm",
"torch",
"tqdm",
"transformers",
"typing-extensions",
"wandb",
]

[[tool.uv.dependency-metadata]]
name = "transformer-engine-torch"
version = "2.11.0"
Expand Down Expand Up @@ -276,7 +326,7 @@ torch = [{ index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_pla
apex = { git = "https://github.com/NVIDIA/apex.git", rev = "25.09" }
deep-ep = { git = "https://github.com/deepseek-ai/DeepEP.git", rev = "v1.2.1" }
flash-attn-4 = { url = "https://files.pythonhosted.org/packages/24/f7/01ee2576ce41f9884d291ee21861ef194afc0b2b1ce3bd175fc7a6e1b133/flash_attn_4-4.0.0b5-py3-none-any.whl" }
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "e049cc00c24d03e2ae45d2608c7a44e2d2364e3d" }
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "e1a207ac757e5d0ed94d8ffbe1cbd28e81d8c084" }
panza = { git = "https://github.com/corbt/panza.git" }
transformer-engine-torch = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "v2.11", subdirectory = "transformer_engine/pytorch" }

Expand Down
5 changes: 3 additions & 2 deletions scripts/ci/build_and_push_uv_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,9 @@ build_cache_archive() {
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

log "Building full uv cache with compile_jobs=${compile_jobs}, apex_parallel_build=${apex_parallel_build}, nvcc_threads=${CI_APEX_NVCC_THREADS}, cuda_arch_list=${TORCH_CUDA_ARCH_LIST}, and uv_concurrent_builds=${UV_BUILD_SLOTS}."
uv sync --frozen --all-extras --group dev --no-install-project --python "${PYTHON_MM}"
log "Building split uv cache with compile_jobs=${compile_jobs}, apex_parallel_build=${apex_parallel_build}, nvcc_threads=${CI_APEX_NVCC_THREADS}, cuda_arch_list=${TORCH_CUDA_ARCH_LIST}, and uv_concurrent_builds=${UV_BUILD_SLOTS}."
uv sync --frozen --extra megatron --extra langgraph --extra plotting --group dev --no-install-project --python "${PYTHON_MM}"
uv sync --frozen --extra backend --extra tinker --extra langgraph --extra plotting --group dev --no-install-project --python "${PYTHON_MM}"
rm -rf .venv

log "Packing uv cache archive to ${archive_path}."
Expand Down
4 changes: 2 additions & 2 deletions scripts/ci/compute_uv_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def main() -> int:
"uv_lock_sha256": _sha256_file(args.uv_lock),
},
"ci_context": {
"fingerprint_schema_version": 9,
"fingerprint_schema_version": 10,
"cache_kind": "full_uv_cache",
"cache_scope": "prek_all_extras_group_dev",
"cache_scope": "prek_split_extras_group_dev",
"cache_target": "uv_cache",
"cache_python_platform": "linux_x86_64",
"cache_package_manager": "uv",
Expand Down
2 changes: 1 addition & 1 deletion scripts/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fi

# Sync the dependencies
if [ "${INSTALL_EXTRAS:-false}" = "true" ]; then
uv sync --all-extras --frozen
uv sync --extra backend --extra tinker --extra langgraph --extra plotting --frozen
else
uv sync --extra backend --frozen
fi
8 changes: 8 additions & 0 deletions src/art/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,16 @@
from .batches import trajectory_group_batches
from .dev import LoRAConfig
from .gather import gather_trajectories, gather_trajectory_groups
from .megatron.runtime_config import (
get_megatron_runtime_config,
init_megatron_runtime_config,
)
from .model import Model, TrainableModel
from .serverless import ServerlessBackend
from .trajectories import Trajectory, TrajectoryGroup
from .types import (
LocalTrainResult,
MegatronRuntimeConfig,
MegatronTopologyConfig,
Messages,
MessagesAndChoices,
Expand All @@ -91,7 +96,10 @@
"Backend",
"LocalTrainResult",
"LoRAConfig",
"MegatronRuntimeConfig",
"MegatronTopologyConfig",
"get_megatron_runtime_config",
"init_megatron_runtime_config",
"ServerlessBackend",
"ServerlessTrainResult",
"Messages",
Expand Down
7 changes: 1 addition & 6 deletions src/art/_backend_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
summarize_trajectory_groups,
)
from .trajectories import TrajectoryGroup
from .types import MegatronTopologyConfig, TrainConfig
from .types import TrainConfig


def build_rl_train_configs(
Expand All @@ -35,7 +35,6 @@ def build_rl_train_configs(
scale_learning_rate_by_reward_std_dev: bool | None = None,
logprob_calculation_chunk_size: int | None = None,
packed_sequence_length: int | None = None,
megatron_topology: MegatronTopologyConfig | dict[str, int | None] | None = None,
num_trajectories_learning_rate_multiplier_power: float | None = None,
kl_ref_adapter_path: str | None = None,
) -> tuple[TrainConfig, dev.TrainConfig]:
Expand Down Expand Up @@ -69,10 +68,6 @@ def build_rl_train_configs(
dev_config["logprob_calculation_chunk_size"] = logprob_calculation_chunk_size
if packed_sequence_length is not None:
dev_config["packed_sequence_length"] = packed_sequence_length
if megatron_topology is not None:
dev_config["megatron_topology"] = MegatronTopologyConfig.model_validate(
megatron_topology
).model_dump(mode="json")
if num_trajectories_learning_rate_multiplier_power is not None:
dev_config["num_trajectories_learning_rate_multiplier_power"] = (
num_trajectories_learning_rate_multiplier_power
Expand Down
17 changes: 17 additions & 0 deletions src/art/auto_trajectory.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from .openai import init_chat_completion, update_chat_completion
from .preprocessing.moe_routing import attach_moe_routing_metadata_to_choice
from .preprocessing.vllm_tokens import attach_vllm_token_metadata_to_choice
from .trajectories import History, Trajectory

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -105,9 +106,25 @@ def handle_httpx_response(self, response: httpx._models.Response) -> None:
# Parse SSE content directly from buffered bytes
chat_completion = parse_sse_to_chat_completion(content)
choice = chat_completion.choices[0]
response_payload = chat_completion.model_dump(mode="python")
attach_vllm_token_metadata_to_choice(
choice=choice,
response_payload=response_payload,
choice_index=0,
)
attach_moe_routing_metadata_to_choice(
choice=choice,
response_payload=response_payload,
choice_index=0,
)
else:
response_payload = json.loads(content)
choice = Choice(**response_payload["choices"][0])
attach_vllm_token_metadata_to_choice(
choice=choice,
response_payload=response_payload,
choice_index=0,
)
attach_moe_routing_metadata_to_choice(
choice=choice,
response_payload=response_payload,
Expand Down
12 changes: 8 additions & 4 deletions src/art/dev/get_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
LoRAConfig,
TrainerArgs,
)
from .sequence_lengths import max_seq_length_from_model_config
from .validate import is_dedicated_mode


Expand Down Expand Up @@ -36,9 +37,14 @@ def get_model_config(
else:
enable_sleep_mode = config.get("engine_args", {}).get("enable_sleep_mode", True)

configured_init_args = config.get("init_args", {})
init_args = InitArgs(
load_in_4bit=True,
max_seq_length=32768,
max_seq_length=max_seq_length_from_model_config(
base_model,
revision=configured_init_args.get("revision"),
token=configured_init_args.get("token"),
),
model_name=base_model,
)
engine_args = EngineArgs(
Expand All @@ -48,7 +54,7 @@ def get_model_config(
model=base_model,
)
engine_args.update(config.get("engine_args", {}))
init_args.update(config.get("init_args", {}))
init_args.update(configured_init_args)
if last_checkpoint_dir := get_last_checkpoint_dir(output_dir):
init_args["model_name"] = last_checkpoint_dir
merged_lora_config = LoRAConfig(
Expand Down Expand Up @@ -95,6 +101,4 @@ def get_model_config(
result["trainer_gpu_ids"] = config["trainer_gpu_ids"]
if "inference_gpu_ids" in config:
result["inference_gpu_ids"] = config["inference_gpu_ids"]
if "megatron_topology" in config:
result["megatron_topology"] = config["megatron_topology"]
return result
7 changes: 1 addition & 6 deletions src/art/dev/model.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from enum import Enum
from typing import TYPE_CHECKING, Literal, NoReturn
from typing import Literal, NoReturn

from typing_extensions import Required, TypedDict

from .engine import EngineArgs

if TYPE_CHECKING:
from ..types import MegatronTopologyConfig

RolloutWeightsMode = Literal["lora", "merged"]


Expand Down Expand Up @@ -138,7 +135,6 @@ class InternalModelConfig(TypedDict, total=False):
chat_template_content_format: vLLM chat template content format.
chat_template_tool_schema_format: Tool schema rendering format used for
local training tokenization.
megatron_topology: Fixed Megatron parallel topology for this model.
allow_unvalidated_arch: Permit model-support validation workflows to run
architectures that are not yet in the supported-model registry.
"""
Expand All @@ -156,7 +152,6 @@ class InternalModelConfig(TypedDict, total=False):
chat_template_path: str
chat_template_content_format: str
chat_template_tool_schema_format: Literal["default", "vllm_openai"]
megatron_topology: "MegatronTopologyConfig | dict[str, int | None]"
allow_unvalidated_arch: bool


Expand Down
Loading
Loading