Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
20ad279
megatron: integrate lora grad sync with finalize_model_grads
FurtherAI Mar 10, 2026
112e97c
megatron: harden sharded lora merge validation
FurtherAI Mar 10, 2026
4d5c345
tests: add megatron lora oracle correctness harness
FurtherAI Mar 10, 2026
fde2ff3
Minor typing changes
FurtherAI Mar 10, 2026
d2c1161
megatron: extend LoRA grad-sync semantics across tp/expert-tp
FurtherAI Mar 12, 2026
e418018
megatron: add MoE routing replay core and unit tests
FurtherAI Mar 12, 2026
bc5e7a4
megatron runtime/service: wire routing replay into training jobs
FurtherAI Mar 12, 2026
c5e06d9
oracle worker/trace: capture forward traces and emit replay bundles
FurtherAI Mar 12, 2026
a73ca1a
oracle harness/tests: refactor suite and add oracle-replay parity flow
FurtherAI Mar 12, 2026
ec83716
typing: clear blocking ty errors in oracle replay and LoRA paths
FurtherAI Mar 12, 2026
83d871b
megatron: reduce oracle variance with sequence grad accumulation
FurtherAI Mar 14, 2026
84e2ea7
megatron lora: fix TP/EP export participation rules
FurtherAI Mar 14, 2026
0bc9919
oracle trace: canonicalize MoE outputs across arbitrary topologies
FurtherAI Mar 14, 2026
8370c7d
oracle harness: stabilize scoring and expand sensitivity mutations
FurtherAI Mar 14, 2026
d396bfd
oracle tests: write suite output tables to log files
FurtherAI Mar 14, 2026
5385fbb
Add correct data parallelism.
FurtherAI Mar 16, 2026
7525567
Fix per-token DP normalization in Megatron training
FurtherAI Mar 17, 2026
7eb96e5
Expand the oracle harness for DP correctness checks
FurtherAI Mar 17, 2026
204e580
Merge origin/main into austin/megatron_lora_correctness_oracle_tests
FurtherAI Mar 17, 2026
9cde0d4
Clean up type errors in Megatron correctness changes
FurtherAI Mar 17, 2026
b2494ea
Testing harness was working, but real training surfaced a few errors,…
FurtherAI Mar 20, 2026
a98fafc
Cut over Megatron LoRA to QuACK
FurtherAI Mar 20, 2026
45e32f5
Del held packed tensors so dir can be removed.
FurtherAI Mar 20, 2026
a77bd7c
Fuse LoRA scale into QuACK grouped GEMM
FurtherAI Mar 21, 2026
8b83fb2
Avoid grad_out copy in QuACK LoRA backward
FurtherAI Mar 21, 2026
f39a5b2
Fuse MoE FC1 gate and up LoRA paths
FurtherAI Mar 23, 2026
92858a9
Tune QuACK low-rank tiles and rank contract
FurtherAI Mar 23, 2026
8cc45b8
Inline FC1 QuACK dual call
FurtherAI Mar 23, 2026
ed671b1
Merge remote-tracking branch 'origin/main' into austin/megatron_lora_…
FurtherAI Mar 24, 2026
6494108
Revert unnecessary python 3.12 requirement.
FurtherAI Mar 24, 2026
c26c00b
Merge branch 'main' into austin/megatron_lora_correctness_oracle_tests
FurtherAI Mar 24, 2026
fccec46
Create lora without instantiating full model by using meta device.
FurtherAI Mar 25, 2026
04fe905
Update Megatron dependencies for transformers v5 change.
FurtherAI Mar 25, 2026
31552e8
Update megatron tests for new lora kernel and avg grads across expert…
FurtherAI Mar 25, 2026
bd332a2
Limit max build jobs when building the uv cache.
FurtherAI Mar 25, 2026
2de9a2d
Fix CI uv cache build robustness
FurtherAI Mar 25, 2026
8d58bea
Tune CI uv cache build concurrency
FurtherAI Mar 25, 2026
42116aa
Trigger CI cache rebuild
FurtherAI Mar 25, 2026
4994f38
Bump CI uv cache fingerprint schema
FurtherAI Mar 25, 2026
4eba002
Try 16x2 CI cache build
FurtherAI Mar 25, 2026
d0c6dad
Try 8x4 CI cache build
FurtherAI Mar 25, 2026
e927ed6
Try 8x8 CI cache build
FurtherAI Mar 25, 2026
585c1b4
Restore 8x2 CI cache defaults
FurtherAI Mar 25, 2026
646f361
Fix CI Apex cache contract
FurtherAI Mar 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions .github/workflows/prek.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ env:
CI_PYTHON_MM: "3.11"
CI_UV_CACHE_RELEASE_TAG: "prek-uv-cache"
CI_UV_CACHE_ASSET_PREFIX: "prek-uv-cache"
CI_APEX_PARALLEL_BUILD: "8"
CI_APEX_NVCC_THREADS: "1"
CI_UV_BUILD_SLOTS: "2"
UV_CACHE_DIR: "/root/.cache/uv"
UV_LINK_MODE: "copy"
TORCH_CUDA_ARCH_LIST: "8.0"
Expand All @@ -34,7 +37,9 @@ jobs:
--pyproject pyproject.toml \
--uv-lock uv.lock \
--base-image "${CI_BASE_IMAGE}" \
--python-mm "${CI_PYTHON_MM}")"
--python-mm "${CI_PYTHON_MM}" \
--ci-apex-parallel-build "${CI_APEX_PARALLEL_BUILD}" \
--ci-apex-nvcc-threads "${CI_APEX_NVCC_THREADS}")"
echo "fingerprint=${fp}" >> "${GITHUB_OUTPUT}"
echo "Expected uv cache fingerprint: ${fp}"
Expand Down Expand Up @@ -198,6 +203,13 @@ jobs:
- name: Install dependencies (with all optional extras for complete type checking)
run: |
original_pyproject="$(mktemp)"
cp pyproject.toml "${original_pyproject}"
cleanup() {
mv "${original_pyproject}" pyproject.toml
}
trap cleanup EXIT
py_mm="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
cudnn_path="${GITHUB_WORKSPACE}/.venv/lib/python${py_mm}/site-packages/nvidia/cudnn"
export CUDNN_PATH="${cudnn_path}"
Expand All @@ -207,13 +219,22 @@ jobs:
export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export UV_CONCURRENT_BUILDS="${CI_UV_BUILD_SLOTS}"
export CMAKE_BUILD_PARALLEL_LEVEL="${CI_APEX_PARALLEL_BUILD}"
export MAX_JOBS="${CI_APEX_PARALLEL_BUILD}"
export NINJAFLAGS="-j${CI_APEX_PARALLEL_BUILD}"
python3 scripts/ci/apply_ci_uv_build_overrides.py \
--pyproject pyproject.toml \
--apex-parallel-build "${CI_APEX_PARALLEL_BUILD}" \
--apex-nvcc-threads "${CI_APEX_NVCC_THREADS}"
echo "CI uv build overrides: APEX_PARALLEL_BUILD=${CI_APEX_PARALLEL_BUILD}, NVCC_APPEND_FLAGS=--threads ${CI_APEX_NVCC_THREADS}, UV_CONCURRENT_BUILDS=${CI_UV_BUILD_SLOTS}"
uv --version
uv sync --all-extras --group dev --frozen
- name: Run prek hooks (lint, format, typecheck, uv.lock, tests)
run: |
uv run prek run --all-files
uv run --no-sync prek run --all-files
- name: Run unit tests (via prek)
run: |
uv run prek run pytest
uv run --no-sync prek run pytest
Loading
Loading