From 4eec7e0c8dff221b6892d4fd0d08db3e0dade8d8 Mon Sep 17 00:00:00 2001 From: kvmto Date: Wed, 15 Apr 2026 11:04:03 +0000 Subject: [PATCH 1/2] docs: add Windows & Blackwell troubleshooting, improve model-not-found error Address workarounds reported in #66 (RTX 5080 + Windows success report): - README: add Blackwell GPU note (cu128 nightly required for SM 12.0) - README: add Windows section (Triton unsupported, TORCH_COMPILE_DISABLE, PYTHONPATH) - README: add pre-trained model not found guidance with explicit path option - run.py: improve find_best_model FileNotFoundError with actionable hint Closes #66 Signed-off-by: kvmto --- README.md | 28 ++++++++++++++++++++++++++++ code/workflows/run.py | 8 +++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 671134b..72a9b55 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,34 @@ Inference note: - Some environments crash during `torch.compile`. - Disable compile: `TORCH_COMPILE=0 bash code/scripts/local_run.sh`. - Or try a safer mode: `TORCH_COMPILE=1 TORCH_COMPILE_MODE=reduce-overhead bash code/scripts/local_run.sh`. +- **Blackwell GPUs (RTX 5080/5090, GB200/GB300)**: + - Stable PyTorch wheels (`cu124`) do not ship SM 12.0 kernels yet. Install the nightly build with the `cu128` index: + ```bash + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 + ``` +- **Windows (Git Bash / WSL)**: + - Triton is not supported on native Windows, which causes `torch.compile` to + fail.
Disable it before running: + ```bash + export TORCH_COMPILE_DISABLE=1 # PyTorch-level flag + # or, equivalently for the repo scripts: + export PREDECODER_TORCH_COMPILE=0 + ``` + - When running scripts directly (outside the notebook or `local_run.sh`), + set the Python path so that repo modules are importable: + ```bash + export PYTHONPATH="code" + ``` + - **Pre-trained model not found during inference**: + - `find_best_model` searches inside `{output}/models/best_model/` first, + then falls back to `{output}/models/`. If you placed the downloaded + `.pt` file elsewhere, either move it into one of those directories or + point to it directly: + ```bash + PREDECODER_MODEL_CHECKPOINT_FILE=path/to/Ising-Decoder-SurfaceCode-1-Accurate.pt \ + WORKFLOW=inference bash code/scripts/local_run.sh + ``` ## Inference (pre-trained models) diff --git a/code/workflows/run.py b/code/workflows/run.py index 4f68c28..4c8b443 100644 --- a/code/workflows/run.py +++ b/code/workflows/run.py @@ -140,7 +140,13 @@ def find_best_model(path, *, rank: int = 0): print(f" [{marker}] (unknown) (epoch {epoch_str})") if best_file is None: - raise FileNotFoundError(f"No valid model checkpoint files found in {path}") + raise FileNotFoundError( + f"No valid model checkpoint files found in {path}\n" + f"Expected .pt files (e.g. Ising-Decoder-SurfaceCode-1-Fast.pt or " + f"PreDecoderModelMemory_*.pt).\n" + f"Hint: download the pretrained weights and place them in this directory, " + f"or set model_checkpoint_file in your config to an explicit path."
+ ) best_model_path = os.path.join(path, best_file) if rank == 0: From 7696500afff029deda4a79e7d62ab81acc0f4b43 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Fri, 17 Apr 2026 12:06:31 -0700 Subject: [PATCH 2/2] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72a9b55..378a105 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ Inference note: - Disable compile: `TORCH_COMPILE=0 bash code/scripts/local_run.sh`. - Or try a safer mode: `TORCH_COMPILE=1 TORCH_COMPILE_MODE=reduce-overhead bash code/scripts/local_run.sh`. - **Blackwell GPUs (RTX 5080/5090, GB200/GB300)**: - - Stable PyTorch wheels (`cu124`) do not ship SM 12.0 kernels yet. + - Stable PyTorch wheels (`cu124`) do not ship SM 12.0 kernels. Install the nightly build with the `cu128` index: ```bash pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128