From 7f0f6c860d48edaf9db43ddf010e80f0bac75447 Mon Sep 17 00:00:00 2001
From: Ivan Basov <ibasov@nvidia.com>
Date: Mon, 30 Mar 2026 11:54:58 -0700
Subject: [PATCH 1/5] fix(ci): disable torch.compile in orientation training to
 prevent segfault

Enabling torch.compile together with DataLoader spawn workers during LER
validation causes a segfault (20 leaked semaphores, core dumped).
Set PREDECODER_TORCH_COMPILE=0 for the "Train all orientations" step.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/long-running-tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/long-running-tests.yml b/.github/workflows/long-running-tests.yml
index f536780..3c9e268 100644
--- a/.github/workflows/long-running-tests.yml
+++ b/.github/workflows/long-running-tests.yml
@@ -184,6 +184,7 @@ jobs:
           PREDECODER_VAL_SAMPLES: "4096"
           PREDECODER_TEST_SAMPLES: "4096"
           PREDECODER_TRAIN_EPOCHS: "1"
+          PREDECODER_TORCH_COMPILE: "0"
 
       - name: Multi-orientation inference (O1–O4) with LER output check
         shell: bash

From 9d3fa086d9768091054aabe95606fea3424002f6 Mon Sep 17 00:00:00 2001
From: Ivan Basov <ibasov@nvidia.com>
Date: Mon, 30 Mar 2026 11:57:04 -0700
Subject: [PATCH 2/5] Revert "fix(ci): disable torch.compile in orientation
 training to prevent segfault"

This reverts commit 7f0f6c860d48edaf9db43ddf010e80f0bac75447.
---
 .github/workflows/long-running-tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/long-running-tests.yml b/.github/workflows/long-running-tests.yml
index 3c9e268..f536780 100644
--- a/.github/workflows/long-running-tests.yml
+++ b/.github/workflows/long-running-tests.yml
@@ -184,7 +184,6 @@ jobs:
           PREDECODER_VAL_SAMPLES: "4096"
           PREDECODER_TEST_SAMPLES: "4096"
           PREDECODER_TRAIN_EPOCHS: "1"
-          PREDECODER_TORCH_COMPILE: "0"
 
       - name: Multi-orientation inference (O1–O4) with LER output check
         shell: bash

From 96b6165dc2cabaacb148adc205529f83ff28804f Mon Sep 17 00:00:00 2001
From: Ivan Basov <ibasov@nvidia.com>
Date: Mon, 20 Apr 2026 11:01:45 -0700
Subject: [PATCH 3/5] fix(timing): warmup pass before timing loop to amortise
 torch.compile JIT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without a warmup pass, the first batch in the timing loop bears the full
torch.compile lazy-compilation cost (~887 ms vs ~1 ms steady-state),
skewing Phase Timing numbers — especially at low sample counts like
PREDECODER_INFERENCE_NUM_SAMPLES=1.  The warmup runs a single forward
pass on stim_dets[:1] under torch.no_grad(), and only when torch.compile
is active and TRT is not in use; on CUDA devices it is followed by
torch.cuda.synchronize().

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 code/evaluation/logical_error_rate.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/code/evaluation/logical_error_rate.py b/code/evaluation/logical_error_rate.py
index 54b65e9..63df333 100644
--- a/code/evaluation/logical_error_rate.py
+++ b/code/evaluation/logical_error_rate.py
@@ -1316,6 +1316,17 @@ def run_inference_and_decode_pre_decoder_memory(model, device, dist, cfg) -> dic
     else:
         data_iter = test_dataloader
 
+    # Warmup: trigger torch.compile lazy compilation before the timing loop.
+    if trt_context is None and _applied_compile:
+        with torch.no_grad():
+            _warmup_tensor = torch.as_tensor(
+                stim_dets[:1], dtype=torch.float32, device=device
+            )
+            _ = pipeline_module(_warmup_tensor)
+            del _warmup_tensor
+            if device.type == "cuda":
+                torch.cuda.synchronize()
+
     # Timing instrumentation accumulators (used when timing_rank0 is True)
     residual_syndrome_density_sum = 0.0
     predecoder_batch_times = [] if timing_rank0 else None

From ef0a5a6254ae3afad3924b87107937c85d33aa24 Mon Sep 17 00:00:00 2001
From: Ivan Basov <ibasov@nvidia.com>
Date: Mon, 20 Apr 2026 11:07:32 -0700
Subject: [PATCH 4/5] test(timing): extract _maybe_warmup_compile helper + unit
 tests

Extracts the warmup block into a named helper, _maybe_warmup_compile,
so it can be tested in isolation.  Five tests cover: the pipeline is
called when compile is active (CPU), skipped when compile is off,
skipped when a TRT context is present, CUDA sync called on a GPU device,
and CUDA sync not called on a CPU device.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 code/evaluation/logical_error_rate.py       | 22 ++++-----
 code/tests/test_inference_latency_timing.py | 49 ++++++++++++++++++++-
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/code/evaluation/logical_error_rate.py b/code/evaluation/logical_error_rate.py
index 63df333..db06dec 100644
--- a/code/evaluation/logical_error_rate.py
+++ b/code/evaluation/logical_error_rate.py
@@ -501,6 +501,17 @@ def map_grid_to_stabilizer_tensor(grid_btdd, stab_indices_1d):
     return flat_bdt.index_select(dim=1, index=stab_indices_1d)
 
 
+def _maybe_warmup_compile(pipeline_module, stim_dets, device, trt_context, applied_compile):
+    """Fire one forward pass to trigger torch.compile JIT before the timing loop."""
+    if trt_context is not None or not applied_compile:
+        return
+    with torch.no_grad():
+        _warmup_tensor = torch.as_tensor(stim_dets[:1], dtype=torch.float32, device=device)
+        pipeline_module(_warmup_tensor)
+        if device.type == "cuda":
+            torch.cuda.synchronize()
+
+
 class PreDecoderMemoryEvalModule(nn.Module):
     """
     nn.Module that encapsulates the full pre-decoder eval path: batch input -> trainX,
@@ -1316,16 +1327,7 @@ def run_inference_and_decode_pre_decoder_memory(model, device, dist, cfg) -> dic
     else:
         data_iter = test_dataloader
 
-    # Warmup: trigger torch.compile lazy compilation before the timing loop.
-    if trt_context is None and _applied_compile:
-        with torch.no_grad():
-            _warmup_tensor = torch.as_tensor(
-                stim_dets[:1], dtype=torch.float32, device=device
-            )
-            _ = pipeline_module(_warmup_tensor)
-            del _warmup_tensor
-            if device.type == "cuda":
-                torch.cuda.synchronize()
+    _maybe_warmup_compile(pipeline_module, stim_dets, device, trt_context, _applied_compile)
 
     # Timing instrumentation accumulators (used when timing_rank0 is True)
     residual_syndrome_density_sum = 0.0
diff --git a/code/tests/test_inference_latency_timing.py b/code/tests/test_inference_latency_timing.py
index d537412..3e16c13 100644
--- a/code/tests/test_inference_latency_timing.py
+++ b/code/tests/test_inference_latency_timing.py
@@ -15,11 +15,12 @@
 
 import math
 import unittest
-from unittest.mock import patch
+from unittest.mock import MagicMock, call, patch
 
 import numpy as np
+import torch
 
-from evaluation.logical_error_rate import _time_single_shot_latency_stim
+from evaluation.logical_error_rate import _maybe_warmup_compile, _time_single_shot_latency_stim
 
 
 class _FakeMatcher:
@@ -84,5 +85,49 @@ def test_time_single_shot_latency_handles_empty(self) -> None:
         self.assertEqual(len(matcher.calls), 0)
 
 
+class TestMaybeWarmupCompile(unittest.TestCase):
+
+    def _cpu_device(self):
+        return torch.device("cpu")
+
+    def test_calls_pipeline_module_when_compile_active(self):
+        pipeline = MagicMock(return_value=torch.zeros(1))
+        dets = np.zeros((4, 8), dtype=np.uint8)
+        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True)
+        self.assertEqual(pipeline.call_count, 1)
+        tensor_arg = pipeline.call_args[0][0]
+        self.assertEqual(tensor_arg.shape[0], 1)
+        self.assertEqual(tensor_arg.dtype, torch.float32)
+
+    def test_skipped_when_compile_not_applied(self):
+        pipeline = MagicMock()
+        dets = np.zeros((4, 8), dtype=np.uint8)
+        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=False)
+        pipeline.assert_not_called()
+
+    def test_skipped_when_trt_context_present(self):
+        pipeline = MagicMock()
+        dets = np.zeros((4, 8), dtype=np.uint8)
+        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=object(), applied_compile=True)
+        pipeline.assert_not_called()
+
+    def test_cuda_sync_called_on_gpu_device(self):
+        pipeline = MagicMock(return_value=torch.zeros(1))
+        dets = np.zeros((4, 8), dtype=np.uint8)
+        gpu_device = MagicMock(spec=torch.device)
+        gpu_device.type = "cuda"
+        with patch("evaluation.logical_error_rate.torch.as_tensor", return_value=torch.zeros(1)) as _mock_tensor, \
+             patch("evaluation.logical_error_rate.torch.cuda.synchronize") as mock_sync:
+            _maybe_warmup_compile(pipeline, dets, gpu_device, trt_context=None, applied_compile=True)
+        mock_sync.assert_called_once()
+
+    def test_cuda_sync_not_called_on_cpu_device(self):
+        pipeline = MagicMock(return_value=torch.zeros(1))
+        dets = np.zeros((4, 8), dtype=np.uint8)
+        with patch("evaluation.logical_error_rate.torch.cuda.synchronize") as mock_sync:
+            _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True)
+        mock_sync.assert_not_called()
+
+
 if __name__ == "__main__":
     unittest.main()

From b3471574ccf9bb22af1de18a337947443b911531 Mon Sep 17 00:00:00 2001
From: Ivan Basov <ibasov@nvidia.com>
Date: Mon, 20 Apr 2026 11:10:25 -0700
Subject: [PATCH 5/5] style: yapf formatting on test_inference_latency_timing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 code/tests/test_inference_latency_timing.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/code/tests/test_inference_latency_timing.py b/code/tests/test_inference_latency_timing.py
index 3e16c13..cfba0a2 100644
--- a/code/tests/test_inference_latency_timing.py
+++ b/code/tests/test_inference_latency_timing.py
@@ -93,7 +93,9 @@ def _cpu_device(self):
     def test_calls_pipeline_module_when_compile_active(self):
         pipeline = MagicMock(return_value=torch.zeros(1))
         dets = np.zeros((4, 8), dtype=np.uint8)
-        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True)
+        _maybe_warmup_compile(
+            pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True
+        )
         self.assertEqual(pipeline.call_count, 1)
         tensor_arg = pipeline.call_args[0][0]
         self.assertEqual(tensor_arg.shape[0], 1)
@@ -102,13 +104,17 @@ def test_calls_pipeline_module_when_compile_active(self):
     def test_skipped_when_compile_not_applied(self):
         pipeline = MagicMock()
         dets = np.zeros((4, 8), dtype=np.uint8)
-        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=False)
+        _maybe_warmup_compile(
+            pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=False
+        )
         pipeline.assert_not_called()
 
     def test_skipped_when_trt_context_present(self):
         pipeline = MagicMock()
         dets = np.zeros((4, 8), dtype=np.uint8)
-        _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=object(), applied_compile=True)
+        _maybe_warmup_compile(
+            pipeline, dets, self._cpu_device(), trt_context=object(), applied_compile=True
+        )
         pipeline.assert_not_called()
 
     def test_cuda_sync_called_on_gpu_device(self):
@@ -118,14 +124,18 @@ def test_cuda_sync_called_on_gpu_device(self):
         gpu_device.type = "cuda"
         with patch("evaluation.logical_error_rate.torch.as_tensor", return_value=torch.zeros(1)) as _mock_tensor, \
              patch("evaluation.logical_error_rate.torch.cuda.synchronize") as mock_sync:
-            _maybe_warmup_compile(pipeline, dets, gpu_device, trt_context=None, applied_compile=True)
+            _maybe_warmup_compile(
+                pipeline, dets, gpu_device, trt_context=None, applied_compile=True
+            )
         mock_sync.assert_called_once()
 
     def test_cuda_sync_not_called_on_cpu_device(self):
         pipeline = MagicMock(return_value=torch.zeros(1))
         dets = np.zeros((4, 8), dtype=np.uint8)
         with patch("evaluation.logical_error_rate.torch.cuda.synchronize") as mock_sync:
-            _maybe_warmup_compile(pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True)
+            _maybe_warmup_compile(
+                pipeline, dets, self._cpu_device(), trt_context=None, applied_compile=True
+            )
         mock_sync.assert_not_called()