From f922c81fc39daa4d18b3b3e99646a4221532bf5f Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Wed, 28 Jan 2026 13:33:44 -0800 Subject: [PATCH] fix: show upgrade message when backend resources don't exist When SQS queue or DynamoDB tables don't exist (e.g., after infrastructure changes in test environment), show a friendly upgrade message instead of confusing AWS errors. Changes: - config.py: Add CLI_UPGRADE_MESSAGE, catch NonExistentQueue error - auth.py: Let upgrade messages pass through without re-wrapping - reservations.py: Catch ResourceNotFoundException for availability table - Bump version to 0.3.6 and MIN_CLI_VERSION to 0.3.6 Co-Authored-By: Claude Opus 4.5 --- cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py | 5 ++++- cli-tools/gpu-dev-cli/gpu_dev_cli/config.py | 18 ++++++++++++++++++ .../gpu-dev-cli/gpu_dev_cli/reservations.py | 9 ++++++++- pyproject.toml | 2 +- terraform-gpu-devservers/lambda.tf | 4 ++-- 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py b/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py index fd9133d9..4e9bf2fb 100644 --- a/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +++ b/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py @@ -3,7 +3,7 @@ import subprocess import re from typing import Dict, Any -from .config import Config +from .config import Config, CLI_UPGRADE_MESSAGE from rich.spinner import Spinner @@ -34,6 +34,9 @@ def authenticate_user(config: Config) -> Dict[str, Any]: } except Exception as e: + # Let upgrade messages pass through without re-wrapping + if CLI_UPGRADE_MESSAGE in str(e): + raise raise RuntimeError(f"AWS authentication failed: {e}") diff --git a/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py b/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py index 331c49ba..2cf73c3b 100644 --- a/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +++ b/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py @@ -3,9 +3,20 @@ import os import json import boto3 +from botocore.exceptions import ClientError from pathlib import Path from typing import Dict, Any, Optional +# Upgrade message shown when backend infrastructure has changed +CLI_UPGRADE_MESSAGE = """ +The GPU Dev service has been updated and requires a newer CLI version. + +Please upgrade: + pip install --upgrade git+https://github.com/pytorch/osdc.git@release + +For more info: https://github.com/pytorch/osdc +""".strip() + class Config: """Zero-config AWS-based configuration""" @@ -101,6 +112,13 @@ def get_queue_url(self) -> str: try: response = self.sqs_client.get_queue_url(QueueName=self.queue_name) return response["QueueUrl"] + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + if error_code == "AWS.SimpleQueueService.NonExistentQueue": + raise RuntimeError(CLI_UPGRADE_MESSAGE) + raise RuntimeError( + f"Cannot access SQS queue {self.queue_name}. Check AWS permissions: {e}" + ) except Exception as e: raise RuntimeError( f"Cannot access SQS queue {self.queue_name}. Check AWS permissions: {e}" diff --git a/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py b/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py index f2d4866b..0bcda513 100644 --- a/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +++ b/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py @@ -16,7 +16,7 @@ from rich.live import Live from rich.spinner import Spinner -from .config import Config +from .config import Config, CLI_UPGRADE_MESSAGE from .name_generator import sanitize_name from . import __version__ @@ -971,6 +971,13 @@ def get_gpu_availability_by_type(self) -> Optional[Dict[str, Dict[str, Any]]]: return availability_info + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + if error_code == "ResourceNotFoundException": + raise RuntimeError(CLI_UPGRADE_MESSAGE) + console.print( + f"[red]❌ Error getting GPU availability: {str(e)}[/red]") + return None except Exception as e: console.print( f"[red]❌ Error getting GPU availability: {str(e)}[/red]") diff --git a/pyproject.toml b/pyproject.toml index 2a964bd1..80c00df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "gpu-dev-cli" -version = "0.3.5" +version = "0.3.6" description = "CLI tool for PyTorch GPU developer server reservations" authors = [{name = "PyTorch Team"}] readme = "cli-tools/gpu-dev-cli/README.md" diff --git a/terraform-gpu-devservers/lambda.tf b/terraform-gpu-devservers/lambda.tf index de79723e..562e1c72 100644 --- a/terraform-gpu-devservers/lambda.tf +++ b/terraform-gpu-devservers/lambda.tf @@ -179,8 +179,8 @@ resource "aws_lambda_function" "reservation_processor" { HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : "" SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : "" SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : "" - LAMBDA_VERSION = "0.3.5" - MIN_CLI_VERSION = "0.3.5" + LAMBDA_VERSION = "0.3.6" + MIN_CLI_VERSION = "0.3.6" DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket }, local.alb_env_vars) }