From 7f0f6c860d48edaf9db43ddf010e80f0bac75447 Mon Sep 17 00:00:00 2001 From: Ivan Basov Date: Mon, 30 Mar 2026 11:54:58 -0700 Subject: [PATCH 1/4] fix(ci): disable torch.compile in orientation training to prevent segfault torch.compile=on combined with DataLoader spawn workers during LER validation causes a segfault (20 leaked semaphores, core dumped). Set PREDECODER_TORCH_COMPILE=0 for the Train all orientations step. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/long-running-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/long-running-tests.yml b/.github/workflows/long-running-tests.yml index f536780..3c9e268 100644 --- a/.github/workflows/long-running-tests.yml +++ b/.github/workflows/long-running-tests.yml @@ -184,6 +184,7 @@ jobs: PREDECODER_VAL_SAMPLES: "4096" PREDECODER_TEST_SAMPLES: "4096" PREDECODER_TRAIN_EPOCHS: "1" + PREDECODER_TORCH_COMPILE: "0" - name: Multi-orientation inference (O1–O4) with LER output check shell: bash From 9d3fa086d9768091054aabe95606fea3424002f6 Mon Sep 17 00:00:00 2001 From: Ivan Basov Date: Mon, 30 Mar 2026 11:57:04 -0700 Subject: [PATCH 2/4] Revert "fix(ci): disable torch.compile in orientation training to prevent segfault" This reverts commit 7f0f6c860d48edaf9db43ddf010e80f0bac75447. --- .github/workflows/long-running-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/long-running-tests.yml b/.github/workflows/long-running-tests.yml index 3c9e268..f536780 100644 --- a/.github/workflows/long-running-tests.yml +++ b/.github/workflows/long-running-tests.yml @@ -184,7 +184,6 @@ jobs: PREDECODER_VAL_SAMPLES: "4096" PREDECODER_TEST_SAMPLES: "4096" PREDECODER_TRAIN_EPOCHS: "1" - PREDECODER_TORCH_COMPILE: "0" - name: Multi-orientation inference (O1–O4) with LER output check shell: bash From 7f49d4d69757b7ea01dfc3eb321b7cf3e78e03c3 Mon Sep 17 00:00:00 2001 From: Ivan Basov Date: Fri, 1 May 2026 12:32:10 -0700 Subject: [PATCH 3/4] ci: avoid deadsnakes in GPU workflows --- .github/workflows/ci-gpu.yml | 25 +++++++------- .github/workflows/long-running-tests.yml | 44 +++++++++++++----------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci-gpu.yml b/.github/workflows/ci-gpu.yml index e2345ef..eebe986 100644 --- a/.github/workflows/ci-gpu.yml +++ b/.github/workflows/ci-gpu.yml @@ -59,15 +59,13 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y \ - python${{ matrix.python-version }} \ - python${{ matrix.python-version }}-venv \ - python${{ matrix.python-version }}-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v4 with: lfs: true @@ -78,7 +76,7 @@ jobs: - name: Install dependencies and run tests run: bash code/scripts/check_python_compat.sh env: - PYTHON_BIN: python${{ matrix.python-version }} + PYTHON_BIN: python MODE: train SKIP_TESTS: "0" REQUIRE_GPU: "1" @@ -141,12 +139,13 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y python3.13 python3.13-venv python3.13-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + - uses: actions/checkout@v4 with: lfs: true @@ -156,7 +155,7 @@ jobs: - name: Install Python dependencies run: | - python3.13 -m venv .venv_mid + python -m venv .venv_mid . .venv_mid/bin/activate python -m pip install --upgrade pip setuptools wheel # TODO: matrix by CUDA major version [cu12, cu13] diff --git a/.github/workflows/long-running-tests.yml b/.github/workflows/long-running-tests.yml index 957f4e3..f8e2bfe 100644 --- a/.github/workflows/long-running-tests.yml +++ b/.github/workflows/long-running-tests.yml @@ -99,19 +99,20 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-venv python${{ env.PYTHON_VERSION }}-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/checkout@v4 with: lfs: true - name: Install Python dependencies run: | - python${{ env.PYTHON_VERSION }} -m venv .venv + python -m venv .venv . .venv/bin/activate python -m pip install --upgrade pip setuptools wheel pip install -r code/requirements_public_inference.txt @@ -156,19 +157,20 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-venv python${{ env.PYTHON_VERSION }}-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/checkout@v4 with: lfs: true - name: Install Python dependencies run: | - python${{ env.PYTHON_VERSION }} -m venv .venv + python -m venv .venv . .venv/bin/activate python -m pip install --upgrade pip setuptools wheel # TODO: matrix by CUDA major version [cu12, cu13] @@ -237,19 +239,20 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-venv python${{ env.PYTHON_VERSION }}-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/checkout@v4 with: lfs: true - name: Install Python dependencies run: | - python${{ env.PYTHON_VERSION }} -m venv .venv + python -m venv .venv . .venv/bin/activate python -m pip install --upgrade pip setuptools wheel pip install -r code/requirements_public_inference.txt @@ -293,19 +296,20 @@ jobs: run: | export DEBIAN_FRONTEND=noninteractive apt-get update - apt-get install -y git git-lfs gcc software-properties-common - add-apt-repository -y ppa:deadsnakes/ppa - apt-get update - apt-get install -y python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-venv python${{ env.PYTHON_VERSION }}-dev + apt-get install -y git git-lfs gcc git lfs install + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/checkout@v4 with: lfs: true - name: Install Python dependencies run: | - python${{ env.PYTHON_VERSION }} -m venv .venv + python -m venv .venv . .venv/bin/activate python -m pip install --upgrade pip setuptools wheel # TODO: matrix by CUDA major version [cu12, cu13] From a8aaa3fec820be8602d065eb243d51e9b70fa416 Mon Sep 17 00:00:00 2001 From: Ivan Basov Date: Fri, 1 May 2026 12:46:59 -0700 Subject: [PATCH 4/4] ci: skip multi-gpu tests on PR branches --- .github/workflows/ci-gpu.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-gpu.yml b/.github/workflows/ci-gpu.yml index eebe986..4ae1cc5 100644 --- a/.github/workflows/ci-gpu.yml +++ b/.github/workflows/ci-gpu.yml @@ -196,6 +196,7 @@ jobs: # Runs only after merge to main (not on PR branches) to conserve GPU quota. # --------------------------------------------------------------------------- multi-gpu-tests: + if: github.ref == 'refs/heads/main' needs: gpu-tests runs-on: linux-amd64-gpu-rtxpro6000-latest-2 container: