From 734a84dacb91cc5989f2a8ff1cc62d55ec298e6a Mon Sep 17 00:00:00 2001 From: Max Olender Date: Sun, 12 Apr 2026 23:17:32 -0700 Subject: [PATCH 1/4] rvs: implement SOT artifact caching pipeline and HTTP cache server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the artifact pre-caching pipeline described in #416. Before each validation cycle, RVS now fetches the SOT JSON, resolves artifact URLs (direct URIs and JSONPath-based `sotpath` expressions), and downloads them concurrently into a local cache directory. An HTTP file server then serves the cache to nodes during validation. The server starts once before the main loop so it stays alive across cycles; new files written by each download pass become visible immediately without a restart. Downloads are bounded by a configurable semaphore, respect a per-file timeout, and verify integrity against the SHA-256 checksum advertised by Artifactory in the `x-checksum-sha256` response header. Files already on disk are skipped on subsequent cycles. Artifact URL resolution lives in a new `scenario/resolver.rs` module and is pure (no I/O), making it straightforward to unit test. JSONPath evaluation handles `sotpath` expressions like `$.BoardSKUs[?@.Name == '...'].Components.Software[?@.Component == '...'].Locations[?@.Name == '...'].Location`. The SOT is fetched from NICC via `list_rack_firmware` and matched by the `Name` field against the scenario's `sot_release`. A file-based override on `RvsCtx` keeps the full pipeline exercisable without a live NICC connection. Multi-SOT support (scenarios targeting different releases in the same cycle) is left as a follow-up TODO. The crate is restructured to have a lib target so the artifact, scenario, and context types are shareable across binaries. 
A new `test-artifact-cache` binary wires up the complete pipeline against a local SOT file and serves the resulting cache — useful for manual verification and showing colleagues how the pieces fit together.

Testing
-------

The `test-artifact-cache` binary was run against a real SOT JSON file and a hand-crafted scenario TOML targeting release 1.2.2. The scenario exercises three artifact kinds: an OS image placeholder, a direct-URI artifact, and a `sotpath`-resolved artifact. A fourth large artifact (~1.9 GB NVOS binary) was included to validate concurrent streaming and checksum verification.

The SOT and scenario files are not committed (kept alongside the upstream SOT JSON used for development). To reproduce, supply any SOT JSON and a matching scenario TOML:

`target/debug/test-artifact-cache \`
`--sot <sot.json> \`
`--scenario <scenario.toml> \`
`--cache-dir /tmp/rvs-test-cache \`
`> /tmp/rvs-test.log 2>&1 &`
`tail -f /tmp/rvs-test.log`

After downloads completed the cache directory contained:

    total 3808848
    -rw-r--r-- 1 user root 31K nmx-m-nmx-c.proto
    -rw-r--r-- 1 user root 1.8G nvos.bin
    -rw-r--r-- 1 user root 1.7M nvos_openapi.json
    -rw-r--r-- 1 user root 1.7M os

All checksums passed. The server correctly served all files from `http://localhost:8080/gb200nvl/1.2.2/`.

Signed-off-by: Max Olender --- Cargo.lock | 9 + crates/rvs/Cargo.toml | 15 +- crates/rvs/src/artifact/io.rs | 230 ++++++++++++++++++ crates/rvs/src/artifact/mod.rs | 3 + .../rvs/src/{main.rs => bin/carbide-rvs.rs} | 135 +++++----- crates/rvs/src/bin/test-artifact-cache.rs | 127 ++++++++++ crates/rvs/src/client/io.rs | 33 ++- crates/rvs/src/client/mod.rs | 25 +- crates/rvs/src/config.rs | 9 +- crates/rvs/src/ctx.rs | 16 ++ crates/rvs/src/error.rs | 12 + crates/rvs/src/lib.rs | 26 ++ crates/rvs/src/scenario.rs | 95 -------- crates/rvs/src/scenario/mod.rs | 177 ++++++++++++++ crates/rvs/src/scenario/resolver.rs | 199 +++++++++++++++ 15 files changed, 937 insertions(+), 174 deletions(-) create mode 100644 crates/rvs/src/artifact/io.rs create mode 100644 crates/rvs/src/artifact/mod.rs rename crates/rvs/src/{main.rs => bin/carbide-rvs.rs} (59%) create mode 100644 crates/rvs/src/bin/test-artifact-cache.rs create mode 100644 crates/rvs/src/ctx.rs create mode 100644 crates/rvs/src/lib.rs delete mode 100644 crates/rvs/src/scenario.rs create mode 100644 crates/rvs/src/scenario/mod.rs create mode 100644 crates/rvs/src/scenario/resolver.rs diff --git a/Cargo.lock b/Cargo.lock index 5dc67ddaa4..29ed6c5b3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2425,17 +2425,26 @@ dependencies = [ name = "carbide-rvs" version = "0.0.1" dependencies = [ + "axum", "carbide-rpc", "carbide-tls", "carbide-uuid", + "clap", "figment", + "futures", + "hex", + "jsonpath-rust", "logfmt", + "reqwest 0.13.3", "serde", + "serde_json", + "sha2 0.11.0", "thiserror 2.0.18", "tokio", "tokio-util", "toml 1.1.2+spec-1.1.0", "tonic", + "tower-http", "tracing", "tracing-subscriber", "uuid", diff --git a/crates/rvs/Cargo.toml b/crates/rvs/Cargo.toml index d5002edfdb..4bf8c0e400 100644 --- a/crates/rvs/Cargo.toml +++ b/crates/rvs/Cargo.toml @@ -22,23 +22,32 @@ edition.workspace = true license.workspace = true authors.workspace = true -[[bin]] -name = "carbide-rvs" -path = "src/main.rs" +[lib] +name = 
"carbide_rvs" +path = "src/lib.rs" [dependencies] carbide-uuid = { path = "../uuid" } carbide-rpc = { path = "../rpc" } carbide-tls = { path = "../tls" } +axum = { workspace = true } +clap = { workspace = true, features = ["derive"] } figment = { workspace = true, features = ["toml", "env"] } uuid = { workspace = true, features = ["v4"] } logfmt = { path = "../logfmt" } serde = { features = ["derive"], workspace = true } +serde_json = { workspace = true } +futures = { workspace = true } +hex = { workspace = true } +sha2 = { workspace = true } +jsonpath-rust = "1.0.4" +reqwest = { default-features = false, features = ["rustls", "stream"], workspace = true } tokio = { workspace = true } tokio-util = { workspace = true } toml = { workspace = true } thiserror = { workspace = true } tonic = { workspace = true } +tower-http = { workspace = true, features = ["fs"] } tracing = { workspace = true } tracing-subscriber = { workspace = true, features = ["env-filter"] } diff --git a/crates/rvs/src/artifact/io.rs b/crates/rvs/src/artifact/io.rs new file mode 100644 index 0000000000..51d99d33ce --- /dev/null +++ b/crates/rvs/src/artifact/io.rs @@ -0,0 +1,230 @@ +use std::sync::Arc; +use std::time::Duration; + +use axum::Router; +use futures::StreamExt; +use sha2::{Digest, Sha256}; +use tokio::io::AsyncWriteExt; +use tokio::sync::Semaphore; +use tokio::task::JoinSet; +use tower_http::services::ServeDir; + +use crate::client::RackFirmwareData; +use crate::ctx::RvsCtx; +use crate::error::RvsError; +use crate::rack::Racks; +use crate::scenario; + +/// A resolved artifact ready to be downloaded. +#[allow(dead_code)] +#[derive(Debug)] +pub struct ArtifactDownload { + /// Destination path under cache_dir///. + pub output_path: String, + /// Source URL to download from. + pub url: String, +} + +/// Download and cache all artifacts required for validation. +/// +/// Covers the OS image, direct-URI artifacts, and SOT-resolved artifacts +/// defined in the matched scenarios. 
Does not touch the cache server -- +/// call `start_cache_server` once before the main loop. +pub async fn process_artifacts(racks: &Racks, ctx: &RvsCtx) -> Result<(), RvsError> { + let sot = fetch_sot(racks, ctx).await?; + let downloads = scenario::resolve_artifact_urls(&sot, ctx)?; + download_artifacts(downloads, ctx).await?; + Ok(()) +} + +/// Start the HTTP artifact cache server. +/// +/// Binds once and serves `cache_dir` for the lifetime of the process. +/// New files written by `process_artifacts` become visible immediately +/// without a restart. Call this once before the main validation loop. +pub async fn start_cache_server(ctx: &RvsCtx) -> Result<(), RvsError> { + spawn_cache_server(ctx).await +} + +/// Fetch the SOT JSON for the scenarios loaded in ctx. +/// +/// Uses `ctx.sot_override_path` when set (test binary only), otherwise +/// lists all firmware records from NICC and returns the one whose `Name` +/// field matches the scenario's `sot_release`. +/// +/// TODO[#416]: currently matches on the first scenario's `sot_release` +/// only. When multiple scenarios target different releases, this needs +/// to fetch one SOT per distinct release and route per-scenario. 
+async fn fetch_sot(_racks: &Racks, ctx: &RvsCtx) -> Result { + if let Some(path) = &ctx.sot_override_path { + tracing::info!(path, "artifact: loading SOT from file override"); + let content = std::fs::read_to_string(path) + .map_err(|e| RvsError::InvalidArg(format!("failed to read SOT override: {e}")))?; + let config = serde_json::from_str(&content) + .map_err(|e| RvsError::InvalidArg(format!("invalid SOT JSON: {e}")))?; + return Ok(RackFirmwareData { + id: "override".to_string(), + config, + }); + } + + let sot_release = ctx + .scenarios + .first() + .map(|s| s.rack.sot_release.as_str()) + .ok_or_else(|| RvsError::InvalidArg("fetch_sot: no scenarios loaded".to_string()))?; + + tracing::info!(sot_release, "artifact: fetching SOT from NICC"); + + let records = ctx.nicc.list_rack_firmware().await?; + records + .into_iter() + .find(|r| r.config.get("Name").and_then(|v| v.as_str()) == Some(sot_release)) + .ok_or_else(|| { + RvsError::InvalidArg(format!( + "fetch_sot: no SOT record found for release '{sot_release}'" + )) + }) +} + +/// Download resolved artifacts into cache_dir///. +/// +/// Skips files already present on disk (cache hit). Respects +/// `max_concurrent_downloads` and `download_timeout_secs` from config. +async fn download_artifacts( + artifacts: Vec, + ctx: &RvsCtx, +) -> Result<(), RvsError> { + let cfg = &ctx.cfg.artifact_cache; + let client = reqwest::Client::new(); + let sem = Arc::new(Semaphore::new(cfg.max_concurrent_downloads as usize)); + let timeout = Duration::from_secs(cfg.download_timeout_secs); + let mut set = JoinSet::new(); + + for artifact in artifacts { + let client = client.clone(); + let sem = sem.clone(); + set.spawn(async move { + let _permit = sem.acquire_owned().await.unwrap(); + tokio::time::timeout(timeout, download_one(&client, &artifact)) + .await + .map_err(|_| RvsError::Timeout(format!("download timed out: {}", artifact.url)))? 
+ }); + } + + while let Some(res) = set.join_next().await { + res.map_err(|e| RvsError::InvalidArg(format!("download task panicked: {e}")))??; + } + Ok(()) +} + +/// Download a single artifact to `artifact.output_path`. +/// +/// Creates parent directories as needed. Skips download if the file already +/// exists (cache hit). Streams the response body directly to disk. +async fn download_one( + client: &reqwest::Client, + artifact: &ArtifactDownload, +) -> Result<(), RvsError> { + let path = std::path::Path::new(&artifact.output_path); + + // Cache hit: trust that a non-tmp file at `path` is complete, because we + // only rename into place after a fully streamed body (and checksum, when + // advertised) succeeds. We do NOT re-verify on hit -- a stable URL is + // assumed to map to stable bytes for the lifetime of the cache. + if path.exists() { + tracing::debug!(path = artifact.output_path, "artifact: cache hit, skipping"); + return Ok(()); + } + + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + tracing::info!( + url = artifact.url, + path = artifact.output_path, + "artifact: downloading" + ); + + let response = + client.get(&artifact.url).send().await.map_err(|e| { + RvsError::InvalidArg(format!("download failed for {}: {e}", artifact.url)) + })?; + + if !response.status().is_success() { + return Err(RvsError::InvalidArg(format!( + "download {}: HTTP {}", + artifact.url, + response.status() + ))); + } + + let expected_sha256 = response + .headers() + .get("x-checksum-sha256") + .and_then(|v| v.to_str().ok()) + .map(str::to_lowercase); + + // Stream to a sibling `.partial` file and rename on success, so an + // interrupted download never poisons the cache with a truncated file. + // Append (not `with_extension`) so `foo.bin` and `foo.json` get distinct + // tmp paths instead of colliding on `foo.partial`. 
+ let tmp_path = std::path::PathBuf::from(format!("{}.partial", artifact.output_path)); + let mut file = tokio::fs::File::create(&tmp_path).await?; + let mut hasher = Sha256::new(); + let mut stream = response.bytes_stream(); + while let Some(chunk) = stream.next().await { + let chunk = chunk + .map_err(|e| RvsError::InvalidArg(format!("stream error for {}: {e}", artifact.url)))?; + hasher.update(&chunk); + file.write_all(&chunk).await?; + } + file.flush().await?; + + if let Some(expected) = expected_sha256 { + let actual = hex::encode(hasher.finalize()); + if actual != expected { + let _ = tokio::fs::remove_file(&tmp_path).await; + return Err(RvsError::ChecksumMismatch { + path: artifact.output_path.clone(), + expected, + actual, + }); + } + tracing::info!(path = artifact.output_path, "artifact: checksum OK"); + } + + tokio::fs::rename(&tmp_path, path).await?; + Ok(()) +} + +/// Spawn an HTTP file server that serves the artifact cache directory. +/// +/// Runs in the background via `tokio::spawn`. Nodes pull artifacts from +/// this server (http://:///) +/// during validation setup. +async fn spawn_cache_server(ctx: &RvsCtx) -> Result<(), RvsError> { + let cache_dir = ctx.cfg.artifact_cache.cache_dir.clone(); + let port = ctx.cfg.artifact_cache.serve_port; + let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port)); + + let listener = tokio::net::TcpListener::bind(addr) + .await + .map_err(RvsError::Io)?; + + tracing::info!(port, cache_dir, "artifact: cache server listening"); + + // TODO[#416]: ServeDir returns 404 on directory paths -- add an explicit + // listing endpoint (e.g. GET ///) if nodes or operators + // need to discover available artifacts without knowing filenames in advance. 
+ let app = Router::new().fallback_service(ServeDir::new(&cache_dir)); + + tokio::spawn(async move { + if let Err(e) = axum::serve(listener, app).await { + tracing::error!(error = %e, "artifact: cache server error"); + } + }); + + Ok(()) +} diff --git a/crates/rvs/src/artifact/mod.rs b/crates/rvs/src/artifact/mod.rs new file mode 100644 index 0000000000..e318d9296b --- /dev/null +++ b/crates/rvs/src/artifact/mod.rs @@ -0,0 +1,3 @@ +mod io; + +pub use io::{ArtifactDownload, process_artifacts, start_cache_server}; diff --git a/crates/rvs/src/main.rs b/crates/rvs/src/bin/carbide-rvs.rs similarity index 59% rename from crates/rvs/src/main.rs rename to crates/rvs/src/bin/carbide-rvs.rs index 8e384fc909..0487491f3e 100644 --- a/crates/rvs/src/main.rs +++ b/crates/rvs/src/bin/carbide-rvs.rs @@ -20,12 +20,15 @@ //! External validation orchestrator for NICC. Bridges NICC with test //! frameworks (Benchpress, MPI-based, SLURM-based, etc.) to perform //! partition-aware rack validation. -//! -//! NOTE: This is still a tracer / playground. The abstractions are -//! crystallizing but main.rs is not yet the final shape. use std::path::PathBuf; +use carbide_rvs::config::Config; +use carbide_rvs::ctx::RvsCtx; +use carbide_rvs::error::RvsError; +use carbide_rvs::partitions::Partitions; +use carbide_rvs::{artifact, client, rack, scenario, validation}; +use clap::Parser; use forge_tls::client_config::ClientCert; use rpc::forge_tls_client::{ApiConfig, ForgeClientConfig}; use tokio::signal::unix::{SignalKind, signal}; @@ -35,20 +38,16 @@ use tracing_subscriber::EnvFilter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; -mod client; -mod config; -mod error; -mod partitions; -mod rack; -mod scenario; -mod validation; - -use client::NiccClient; -use config::Config; -use partitions::Partitions; +#[derive(Parser)] +#[command(about = "Rack Validation Service")] +struct Cli { + /// Path to TOML config file. 
Defaults and CARBIDE_RVS__* env vars apply if omitted. + #[arg(long, value_name = "PATH")] + config: Option, +} #[tokio::main] -async fn main() -> Result<(), error::RvsError> { +async fn main() -> Result<(), RvsError> { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy(); @@ -60,50 +59,63 @@ async fn main() -> Result<(), error::RvsError> { tracing::info!("carbide-rvs: Rack Validation Service starting"); + let cli = Cli::parse(); + // Load config: defaults -> optional TOML -> CARBIDE_RVS__* env vars - let config_path = parse_config_path()?; - let cfg = Config::load(config_path.as_deref())?; + let cfg = Config::load(cli.config.as_deref())?; tracing::info!(config = ?cfg, "config loaded"); - // Try loading scenario -- soft fail, this is tracer code - let scenario = match scenario::Scenario::load(std::path::Path::new(&cfg.scenario_config_path)) { - Ok(s) => { - tracing::info!(scenario = ?s, "scenario loaded"); - Some(s) - } - Err(e) => { - tracing::warn!(error = %e, "scenario not loaded, continuing without it"); - None - } - }; - let os_uri = scenario.as_ref().map(|s| s.os.uri.as_str()).unwrap_or(""); + // Load all scenarios -- soft fail per file so a single bad config doesn't block others. 
+ let scenarios: Vec = cfg + .scenario_config_paths + .iter() + .filter_map(|path| { + match scenario::Scenario::load(std::path::Path::new(path)) { + Ok(s) => { + tracing::info!(path, model = %s.rack.model, sot_release = %s.rack.sot_release, "scenario loaded"); + Some(s) + } + Err(e) => { + tracing::warn!(path, error = %e, "scenario not loaded, skipping"); + None + } + } + }) + .collect(); // Build NICC client from config let client_cert = ClientCert { - cert_path: cfg.tls.identity_pemfile_path, - key_path: cfg.tls.identity_keyfile_path, + cert_path: cfg.tls.identity_pemfile_path.clone(), + key_path: cfg.tls.identity_keyfile_path.clone(), }; - let client_config = ForgeClientConfig::new(cfg.tls.root_cafile_path, Some(client_cert)); + let client_config = ForgeClientConfig::new(cfg.tls.root_cafile_path.clone(), Some(client_cert)); let api_config = ApiConfig::new(&cfg.nicc.url, &client_config); - let nicc = NiccClient::new(&api_config); + let nicc = client::NiccClient::new(&api_config); + + let ctx = RvsCtx { + nicc, + scenarios, + cfg, + sot_override_path: None, + }; // TODO[#416]: re-introduce a liveness/health probe (bound to // `cfg.metrics_endpoint`) once RVS runs as a long-lived service with // graceful shutdown and real health checks. For now, "alive" just means - // the process is running -- the current stub probe would only echo 200 - // and buys nothing. + // the process is running -- a stub probe would only echo 200 and buys + // nothing. let cancel_token = CancellationToken::new(); let validation_cancel_token = cancel_token.clone(); tokio::spawn(async move { + let Ok(mut sigint) = signal(SignalKind::interrupt()) else { + return; + }; + let Ok(mut sigterm) = signal(SignalKind::terminate()) else { + return; + }; loop { - let Ok(mut sigint) = signal(SignalKind::interrupt()) else { - break; - }; - let Ok(mut sigterm) = signal(SignalKind::terminate()) else { - break; - }; // Wait for SIGINT or SIGTERM let received_signal = tokio::select! 
{ _ = sigint.recv() => "SIGINT", @@ -121,40 +133,23 @@ async fn main() -> Result<(), error::RvsError> { } }); - run_validation( - &nicc, - os_uri, - cfg.poll_interval_secs, - validation_cancel_token, - ) - .await -} - -/// Parse `--config ` from argv. Returns `None` if the flag is absent. -fn parse_config_path() -> Result, error::RvsError> { - let mut args = std::env::args().skip(1); - while let Some(arg) = args.next() { - if arg == "--config" { - let path = args.next().ok_or_else(|| { - error::RvsError::InvalidArg("--config requires a path argument".to_string()) - })?; - return Ok(Some(PathBuf::from(path))); - } - } - Ok(None) + run_validation(&ctx, validation_cancel_token).await } // Rack validation high-level flow -async fn run_validation( - nicc: &NiccClient, - os_uri: &str, - poll_interval_secs: u64, - cancel_token: CancellationToken, -) -> Result<(), error::RvsError> { +async fn run_validation(ctx: &RvsCtx, cancel_token: CancellationToken) -> Result<(), RvsError> { + artifact::start_cache_server(ctx).await?; + let poll_interval_secs = ctx.cfg.poll_interval_secs; let interval = std::time::Duration::from_secs(poll_interval_secs); loop { - let racks = rack::fetch_racks(nicc).await?; - for job in validation::plan(Partitions::try_from(racks)?, nicc, os_uri).await? { + let racks = rack::fetch_racks(&ctx.nicc).await?; + artifact::process_artifacts(&racks, ctx).await?; + let os_uri = ctx + .scenarios + .first() + .map(|s| s.os.uri.as_str()) + .unwrap_or(""); + for job in validation::plan(Partitions::try_from(racks)?, &ctx.nicc, os_uri).await? { let report = validation::validate_partition(job).await?; validation::submit_report(report).await?; } diff --git a/crates/rvs/src/bin/test-artifact-cache.rs b/crates/rvs/src/bin/test-artifact-cache.rs new file mode 100644 index 0000000000..99a6de7a06 --- /dev/null +++ b/crates/rvs/src/bin/test-artifact-cache.rs @@ -0,0 +1,127 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Functional smoke test for the artifact caching pipeline. +//! +//! Exercises `process_artifacts` end-to-end using a local SOT JSON file and +//! one or more scenario TOMLs, without a live NICC gRPC connection. +//! +//! Usage: +//! cargo run --bin test-artifact-cache -- \ +//! --sot \ +//! --scenario \ +//! --cache-dir +//! +//! To capture logs for inspection, redirect output to a file: +//! target/debug/test-artifact-cache \ +//! --sot \ +//! --scenario \ +//! --cache-dir \ +//! > /tmp/rvs-test.log 2>&1 & +//! tail -f /tmp/rvs-test.log + +use std::path::PathBuf; + +use carbide_rvs::artifact; +use carbide_rvs::client::NiccClient; +use carbide_rvs::config::Config; +use carbide_rvs::ctx::RvsCtx; +use carbide_rvs::error::RvsError; +use carbide_rvs::rack::Racks; +use carbide_rvs::scenario::Scenario; +use clap::Parser; +use rpc::forge_tls_client::{ApiConfig, ForgeClientConfig}; +use tracing::level_filters::LevelFilter; +use tracing_subscriber::EnvFilter; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::util::SubscriberInitExt; + +#[derive(Parser)] +#[command(about = "Smoke-test the artifact cache pipeline without a live NICC connection")] +struct Cli { + /// Path to SOT JSON file (replaces gRPC fetch). + #[arg(long, value_name = "PATH")] + sot: PathBuf, + + /// Path to scenario TOML (repeatable). 
+ #[arg(long, value_name = "PATH")] + scenario: Vec, + + /// Directory to cache downloaded artifacts into. + #[arg(long, value_name = "PATH")] + cache_dir: PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), RvsError> { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + tracing_subscriber::registry() + .with(logfmt::layer()) + .with(env_filter) + .init(); + + let cli = Cli::parse(); + + // Load scenarios from provided paths; hard-fail on any parse error. + let scenarios: Vec = cli + .scenario + .iter() + .map(|p| Scenario::load(p).map_err(RvsError::InvalidArg)) + .collect::>()?; + + if scenarios.is_empty() { + return Err(RvsError::InvalidArg( + "no scenarios provided; pass at least one --scenario ".to_string(), + )); + } + + // Build config with target cache dir; everything else default. + let mut cfg = Config::default(); + cfg.artifact_cache.cache_dir = cli.cache_dir.to_string_lossy().into_owned(); + + // NiccClient is required by RvsCtx but won't be called: fetch_sot + // short-circuits to sot_override_path before touching gRPC. + let client_config = ForgeClientConfig::new("/dev/null".to_string(), None); + let api_config = ApiConfig::new(&cfg.nicc.url, &client_config); + let nicc = NiccClient::new(&api_config); + + let sot_path = cli.sot.to_string_lossy().into_owned(); + let ctx = RvsCtx { + nicc, + scenarios, + cfg, + sot_override_path: Some(sot_path), + }; + + // Empty racks: fetch_sot ignores the racks argument while sot_override_path is set. 
+ let racks = Racks { inner: vec![] }; + + tracing::info!("test-artifact-cache: starting artifact cache run"); + artifact::start_cache_server(&ctx).await?; + artifact::process_artifacts(&racks, &ctx).await?; + tracing::info!( + port = ctx.cfg.artifact_cache.serve_port, + cache_dir = %ctx.cfg.artifact_cache.cache_dir, + "test-artifact-cache: downloads complete, cache server running — press Ctrl+C to stop" + ); + tokio::signal::ctrl_c().await.map_err(RvsError::Io)?; + + Ok(()) +} diff --git a/crates/rvs/src/client/io.rs b/crates/rvs/src/client/io.rs index 3403196dc0..b8998a6e44 100644 --- a/crates/rvs/src/client/io.rs +++ b/crates/rvs/src/client/io.rs @@ -3,7 +3,8 @@ use std::collections::HashMap; use carbide_uuid::machine::MachineId; use rpc::forge::{ GetRackRequest, Instance, InstanceAllocationRequest, InstanceConfig, Label, - MachineMetadataUpdateRequest, MachinesByIdsRequest, Metadata, + MachineMetadataUpdateRequest, MachinesByIdsRequest, Metadata, RackFirmwareGetRequest, + RackFirmwareSearchFilter, }; use rpc::forge_api_client::ForgeApiClient; use rpc::forge_tls_client::ApiConfig; @@ -12,7 +13,7 @@ use rpc::protos::forge::{ instance_operating_system_config, }; -use super::{RackData, TrayData}; +use super::{RackData, RackFirmwareData, TrayData}; use crate::error::RvsError; /// NICC gRPC client wrapper -- translates gRPC responses into IR types. @@ -34,6 +35,34 @@ impl NiccClient { response.rack.into_iter().map(RackData::try_from).collect() } + /// Fetch a rack firmware record (SOT JSON) by ID. + #[allow(dead_code)] + pub async fn get_rack_firmware(&self, firmware_id: &str) -> Result { + let response = self + .inner + .get_rack_firmware(RackFirmwareGetRequest { + id: firmware_id.to_string(), + }) + .await?; + RackFirmwareData::try_from(response) + } + + /// List all rack firmware records (SOT JSON blobs) from NICC. 
+ pub async fn list_rack_firmware(&self) -> Result, RvsError> { + let response = self + .inner + .list_rack_firmware(RackFirmwareSearchFilter { + only_available: false, + rack_hardware_type: None, + }) + .await?; + response + .configs + .into_iter() + .map(RackFirmwareData::try_from) + .collect() + } + /// Update `rv.*` labels on a machine, preserving all non-`rv.*` labels. pub async fn update_rv_labels( &self, diff --git a/crates/rvs/src/client/mod.rs b/crates/rvs/src/client/mod.rs index 839cc91d37..a99b56d038 100644 --- a/crates/rvs/src/client/mod.rs +++ b/crates/rvs/src/client/mod.rs @@ -5,7 +5,7 @@ use carbide_uuid::machine::MachineId; use carbide_uuid::nvlink::NvLinkDomainId; use carbide_uuid::rack::RackId; pub use io::NiccClient; -use rpc::forge::{Machine, Rack}; +use rpc::forge::{Machine, Rack, RackFirmware}; use crate::error::RvsError; @@ -113,3 +113,26 @@ impl TryFrom for RackData { }) } } + +/// SOT JSON blob returned from NICC for a rack firmware/release record. +#[allow(dead_code)] +#[derive(Debug)] +pub struct RackFirmwareData { + /// Firmware record ID. + pub id: String, + /// Parsed SOT JSON -- used for JSONPath artifact resolution. + pub config: serde_json::Value, +} + +impl TryFrom for RackFirmwareData { + type Error = RvsError; + + fn try_from(value: RackFirmware) -> Result { + let config = serde_json::from_str(&value.config_json) + .map_err(|e| RvsError::InvalidArg(format!("invalid SOT JSON: {e}")))?; + Ok(Self { + id: value.id, + config, + }) + } +} diff --git a/crates/rvs/src/config.rs b/crates/rvs/src/config.rs index 8e439ac292..0597f890f3 100644 --- a/crates/rvs/src/config.rs +++ b/crates/rvs/src/config.rs @@ -15,8 +15,8 @@ pub struct Config { pub listen: SocketAddr, /// Prometheus metrics / liveness probe endpoint. pub metrics_endpoint: SocketAddr, - /// Path to the scenario definition TOML. - pub scenario_config_path: String, + /// Paths to scenario definition TOMLs (one per rack model/release). 
+ pub scenario_config_paths: Vec, /// How long to wait between validation poll cycles (seconds). pub poll_interval_secs: u64, /// NICC connection settings. @@ -62,6 +62,8 @@ pub struct ArtifactCacheConfig { pub download_timeout_secs: u64, /// Max parallel artifact downloads. pub max_concurrent_downloads: u32, + /// Port for the HTTP artifact cache server (nodes pull from this). + pub serve_port: u16, } impl Default for Config { @@ -69,7 +71,7 @@ impl Default for Config { Self { listen: "[::]:1089".parse().unwrap(), metrics_endpoint: "[::]:9019".parse().unwrap(), - scenario_config_path: "/etc/forge/rvs/scenario.toml".to_string(), + scenario_config_paths: vec!["/etc/forge/rvs/scenario.toml".to_string()], poll_interval_secs: 30, nicc: NiccConfig::default(), tls: TlsConfig::default(), @@ -103,6 +105,7 @@ impl Default for ArtifactCacheConfig { cache_dir: "/rvs-cache".to_string(), download_timeout_secs: 600, max_concurrent_downloads: 4, + serve_port: 8080, } } } diff --git a/crates/rvs/src/ctx.rs b/crates/rvs/src/ctx.rs new file mode 100644 index 0000000000..ba38fe472a --- /dev/null +++ b/crates/rvs/src/ctx.rs @@ -0,0 +1,16 @@ +use crate::client::NiccClient; +use crate::config::Config; +use crate::scenario::Scenario; + +/// Top-level RVS runtime context -- passed to all major routines. +/// +/// Bundles the NICC client, loaded scenarios, and service config so individual +/// routines don't need to accept each piece separately. +pub struct RvsCtx { + pub nicc: NiccClient, + pub scenarios: Vec, + pub cfg: Config, + /// Dev/test only: load SOT JSON from this file path instead of calling + /// gRPC. Set in `test-artifact-cache`; always `None` in production. + pub sot_override_path: Option, +} diff --git a/crates/rvs/src/error.rs b/crates/rvs/src/error.rs index dd07697cab..48cd651e8a 100644 --- a/crates/rvs/src/error.rs +++ b/crates/rvs/src/error.rs @@ -36,4 +36,16 @@ pub enum RvsError { /// I/O error (e.g. binding a TCP listener). 
#[error("I/O error: {0}")] Io(#[from] std::io::Error), + + /// An operation exceeded its deadline. + #[error("Timeout: {0}")] + Timeout(String), + + /// Downloaded file digest does not match the server-advertised checksum. + #[error("Checksum mismatch for {path}: expected {expected}, got {actual}")] + ChecksumMismatch { + path: String, + expected: String, + actual: String, + }, } diff --git a/crates/rvs/src/lib.rs b/crates/rvs/src/lib.rs new file mode 100644 index 0000000000..68fda6851f --- /dev/null +++ b/crates/rvs/src/lib.rs @@ -0,0 +1,26 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +pub mod artifact; +pub mod client; +pub mod config; +pub mod ctx; +pub mod error; +pub mod partitions; +pub mod rack; +pub mod scenario; +pub mod validation; diff --git a/crates/rvs/src/scenario.rs b/crates/rvs/src/scenario.rs deleted file mode 100644 index 25af51500e..0000000000 --- a/crates/rvs/src/scenario.rs +++ /dev/null @@ -1,95 +0,0 @@ -#![allow(dead_code)] - -use std::path::Path; - -use serde::Deserialize; - -/// Rack model + SOT release this scenario targets. -#[derive(Debug, Deserialize)] -pub struct RackTarget { - pub model: String, - pub sot_release: String, -} - -/// Ephemeral OS image to boot on validation nodes. 
-#[derive(Debug, Deserialize)] -pub struct OsImage { - pub uri: String, -} - -/// Pre-cached artifact -- resolved via direct URI or SOT JSONPath. -#[derive(Debug, Deserialize)] -pub struct Artifact { - pub name: String, - pub output: String, - /// Direct download URL (mutually exclusive with `sotpath`). - // TODO[#416]: enforce exactly one of `uri`/`sotpath` is set - add a - // post-deserialization validation step in Scenario::load or a custom - // Deserialize impl. Currently both can be set (or neither) without error. - pub uri: Option, - /// JSONPath into SOT JSON to resolve download URL. - pub sotpath: Option, -} - -/// Setup step -- runs before tests, aborts validation on failure. -#[derive(Debug, Deserialize)] -pub struct SetupStep { - pub execute: String, -} - -/// Test step -- result recorded independently under `name`. -#[derive(Debug, Deserialize)] -pub struct TestStep { - pub name: String, - pub execute: String, -} - -/// Teardown step -- always runs, regardless of test outcome. -#[derive(Debug, Deserialize)] -pub struct TeardownStep { - pub execute: String, -} - -/// Complete rack validation scenario definition. -#[derive(Debug, Deserialize)] -pub struct Scenario { - pub rack: RackTarget, - pub os: OsImage, - #[serde(default)] - pub artifacts: Vec, - #[serde(default)] - pub setup: Vec, - #[serde(default)] - pub test: Vec, - #[serde(default)] - pub teardown: Vec, -} - -impl Scenario { - /// Parse a scenario from a TOML file on disk. 
- pub fn load(path: &Path) -> Result { - let content = - std::fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?; - toml::from_str(&content).map_err(|e| format!("parse {}: {e}", path.display())) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_load_example_scenario() { - let path = - std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("doc/example_scenario.toml"); - let scenario = Scenario::load(&path).unwrap(); - assert_eq!(scenario.rack.model, "gb200nvl"); - assert_eq!(scenario.rack.sot_release, "1.2.5"); - assert!(!scenario.os.uri.is_empty()); - assert_eq!(scenario.artifacts.len(), 6); - assert_eq!(scenario.setup.len(), 1); - assert_eq!(scenario.test.len(), 2); - assert_eq!(scenario.teardown.len(), 1); - assert_eq!(scenario.test[0].name, "nv_basic"); - } -} diff --git a/crates/rvs/src/scenario/mod.rs b/crates/rvs/src/scenario/mod.rs new file mode 100644 index 0000000000..6259e0a79d --- /dev/null +++ b/crates/rvs/src/scenario/mod.rs @@ -0,0 +1,177 @@ +#![allow(dead_code)] + +mod resolver; + +use std::path::Path; + +pub use resolver::resolve_artifact_urls; +use serde::Deserialize; + +/// Rack model + SOT release this scenario targets. +#[derive(Debug, Deserialize)] +pub struct RackTarget { + pub model: String, + pub sot_release: String, +} + +/// Ephemeral OS image to boot on validation nodes. +#[derive(Debug, Deserialize)] +pub struct OsImage { + pub uri: String, +} + +/// Pre-cached artifact -- resolved via direct URI or SOT JSONPath. +#[derive(Debug, Deserialize)] +pub struct Artifact { + pub name: String, + pub output: String, + /// Direct download URL (mutually exclusive with `sotpath`). + /// + /// Exactly one of `uri`/`sotpath` must be set; enforced in + /// `Scenario::load` after deserialization. + pub uri: Option, + /// JSONPath into SOT JSON to resolve download URL. + pub sotpath: Option, +} + +/// Setup step -- runs before tests, aborts validation on failure. 
+#[derive(Debug, Deserialize)] +pub struct SetupStep { + pub execute: String, +} + +/// Test step -- result recorded independently under `name`. +#[derive(Debug, Deserialize)] +pub struct TestStep { + pub name: String, + pub execute: String, +} + +/// Teardown step -- always runs, regardless of test outcome. +#[derive(Debug, Deserialize)] +pub struct TeardownStep { + pub execute: String, +} + +/// Complete rack validation scenario definition. +#[derive(Debug, Deserialize)] +pub struct Scenario { + pub rack: RackTarget, + pub os: OsImage, + #[serde(default)] + pub artifacts: Vec, + #[serde(default)] + pub setup: Vec, + #[serde(default)] + pub test: Vec, + #[serde(default)] + pub teardown: Vec, +} + +impl Scenario { + /// Parse a scenario from a TOML file on disk. + pub fn load(path: &Path) -> Result { + let content = + std::fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?; + let scenario: Scenario = + toml::from_str(&content).map_err(|e| format!("parse {}: {e}", path.display()))?; + scenario + .validate() + .map_err(|e| format!("validate {}: {e}", path.display()))?; + Ok(scenario) + } + + fn validate(&self) -> Result<(), String> { + for artifact in &self.artifacts { + match (&artifact.uri, &artifact.sotpath) { + (Some(_), Some(_)) => { + return Err(format!( + "artifact '{}': both 'uri' and 'sotpath' set; exactly one required", + artifact.name + )); + } + (None, None) => { + return Err(format!( + "artifact '{}': neither 'uri' nor 'sotpath' set; exactly one required", + artifact.name + )); + } + _ => {} + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_load_example_scenario() { + let path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("doc/example_scenario.toml"); + let scenario = Scenario::load(&path).unwrap(); + assert_eq!(scenario.rack.model, "gb200nvl"); + assert_eq!(scenario.rack.sot_release, "1.2.5"); + assert!(!scenario.os.uri.is_empty()); + assert_eq!(scenario.artifacts.len(), 
6); + assert_eq!(scenario.setup.len(), 1); + assert_eq!(scenario.test.len(), 2); + assert_eq!(scenario.teardown.len(), 1); + assert_eq!(scenario.test[0].name, "nv_basic"); + } + + fn scenario_with_artifacts(artifacts: Vec) -> Scenario { + Scenario { + rack: RackTarget { + model: "gb200nvl".to_string(), + sot_release: "1.2.5".to_string(), + }, + os: OsImage { + uri: "https://example.com/os.img".to_string(), + }, + artifacts, + setup: vec![], + test: vec![], + teardown: vec![], + } + } + + fn artifact(name: &str, uri: Option<&str>, sotpath: Option<&str>) -> Artifact { + Artifact { + name: name.to_string(), + output: format!("{name}.bin"), + uri: uri.map(str::to_string), + sotpath: sotpath.map(str::to_string), + } + } + + #[test] + fn validate_accepts_uri_only() { + let s = scenario_with_artifacts(vec![artifact("a", Some("https://x/y"), None)]); + assert!(s.validate().is_ok()); + } + + #[test] + fn validate_accepts_sotpath_only() { + let s = scenario_with_artifacts(vec![artifact("a", None, Some("$.foo"))]); + assert!(s.validate().is_ok()); + } + + #[test] + fn validate_rejects_both_uri_and_sotpath() { + let s = scenario_with_artifacts(vec![artifact("a", Some("https://x/y"), Some("$.foo"))]); + let err = s.validate().unwrap_err(); + assert!(err.contains("both"), "got: {err}"); + assert!(err.contains("'a'"), "got: {err}"); + } + + #[test] + fn validate_rejects_neither_uri_nor_sotpath() { + let s = scenario_with_artifacts(vec![artifact("a", None, None)]); + let err = s.validate().unwrap_err(); + assert!(err.contains("neither"), "got: {err}"); + assert!(err.contains("'a'"), "got: {err}"); + } +} diff --git a/crates/rvs/src/scenario/resolver.rs b/crates/rvs/src/scenario/resolver.rs new file mode 100644 index 0000000000..0519609b11 --- /dev/null +++ b/crates/rvs/src/scenario/resolver.rs @@ -0,0 +1,199 @@ +use jsonpath_rust::JsonPath; + +use super::Scenario; +use crate::artifact::ArtifactDownload; +use crate::client::RackFirmwareData; +use crate::ctx::RvsCtx; +use 
crate::error::RvsError; + +/// Resolve all artifact download URLs for all scenarios. +/// +/// For each scenario collects: +/// - OS image (`scenario.os.uri`) +/// - Direct-URI artifacts (`artifact.uri`) +/// - SOT-resolved artifacts (`artifact.sotpath`) evaluated via JSONPath +/// +/// The caller is responsible for providing the SOT JSON when sotpath +/// artifacts are present. +pub fn resolve_artifact_urls( + sot: &RackFirmwareData, + ctx: &RvsCtx, +) -> Result, RvsError> { + let mut downloads = vec![]; + + for scenario in &ctx.scenarios { + downloads.extend(resolve_for_scenario( + sot, + scenario, + &ctx.cfg.artifact_cache.cache_dir, + )?); + } + + Ok(downloads) +} + +fn resolve_for_scenario( + sot: &RackFirmwareData, + scenario: &Scenario, + cache_dir: &str, +) -> Result, RvsError> { + let ns = format!("{}/{}", scenario.rack.model, scenario.rack.sot_release); + let mut downloads = vec![]; + + // OS image + downloads.push(ArtifactDownload { + output_path: format!("{cache_dir}/{ns}/os"), + url: scenario.os.uri.clone(), + }); + + // Direct-URI artifacts + for artifact in scenario.artifacts.iter().filter(|a| a.uri.is_some()) { + downloads.push(ArtifactDownload { + output_path: format!("{cache_dir}/{ns}/{}", artifact.output), + url: artifact.uri.clone().unwrap(), + }); + } + + // SOT-resolved artifacts + for artifact in scenario.artifacts.iter().filter(|a| a.sotpath.is_some()) { + let url = eval_sotpath(sot, artifact.sotpath.as_deref().unwrap())?; + downloads.push(ArtifactDownload { + output_path: format!("{cache_dir}/{ns}/{}", artifact.output), + url, + }); + } + + Ok(downloads) +} + +/// Evaluate a JSONPath expression against the SOT config and return the +/// first matching string value (the download URL). 
+fn eval_sotpath(sot: &RackFirmwareData, sotpath: &str) -> Result { + let results = sot + .config + .query_with_path(sotpath) + .map_err(|e| RvsError::InvalidArg(format!("invalid sotpath '{sotpath}': {e}")))?; + + let value = results + .into_iter() + .next() + .ok_or_else(|| RvsError::InvalidArg(format!("sotpath '{sotpath}' matched nothing")))?; + + value.val().as_str().map(|s| s.to_string()).ok_or_else(|| { + RvsError::InvalidArg(format!("sotpath '{sotpath}' did not resolve to a string")) + }) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + fn make_sot(config: serde_json::Value) -> RackFirmwareData { + RackFirmwareData { + id: "test".to_string(), + config, + } + } + + fn make_scenario(model: &str, sot_release: &str) -> Scenario { + Scenario { + rack: super::super::RackTarget { + model: model.to_string(), + sot_release: sot_release.to_string(), + }, + os: super::super::OsImage { + uri: "https://example.com/os.img".to_string(), + }, + artifacts: vec![], + setup: vec![], + test: vec![], + teardown: vec![], + } + } + + // --- eval_sotpath --- + + #[test] + fn eval_sotpath_returns_first_string_match() { + let sot = make_sot(json!({ "firmware": { "url": "https://cdn.example.com/fw.bin" } })); + let url = eval_sotpath(&sot, "$.firmware.url").unwrap(); + assert_eq!(url, "https://cdn.example.com/fw.bin"); + } + + #[test] + fn eval_sotpath_no_match_is_error() { + let sot = make_sot(json!({ "firmware": {} })); + let err = eval_sotpath(&sot, "$.firmware.url").unwrap_err(); + assert!(err.to_string().contains("matched nothing"), "got: {err}"); + } + + #[test] + fn eval_sotpath_non_string_match_is_error() { + let sot = make_sot(json!({ "firmware": { "version": 42 } })); + let err = eval_sotpath(&sot, "$.firmware.version").unwrap_err(); + assert!( + err.to_string().contains("did not resolve to a string"), + "got: {err}" + ); + } + + // --- resolve_for_scenario --- + + #[test] + fn resolve_for_scenario_os_image_only() { + let sot = 
make_sot(json!({})); + let scenario = make_scenario("gb200nvl", "1.2.5"); + let downloads = resolve_for_scenario(&sot, &scenario, "/cache").unwrap(); + assert_eq!(downloads.len(), 1); + assert_eq!(downloads[0].output_path, "/cache/gb200nvl/1.2.5/os"); + assert_eq!(downloads[0].url, "https://example.com/os.img"); + } + + #[test] + fn resolve_for_scenario_direct_uri_artifact() { + let sot = make_sot(json!({})); + let mut scenario = make_scenario("gb200nvl", "1.2.5"); + scenario.artifacts.push(super::super::Artifact { + name: "diag".to_string(), + output: "diag.bin".to_string(), + uri: Some("https://example.com/diag.bin".to_string()), + sotpath: None, + }); + let downloads = resolve_for_scenario(&sot, &scenario, "/cache").unwrap(); + assert_eq!(downloads.len(), 2); + assert_eq!(downloads[1].output_path, "/cache/gb200nvl/1.2.5/diag.bin"); + assert_eq!(downloads[1].url, "https://example.com/diag.bin"); + } + + #[test] + fn resolve_for_scenario_sotpath_artifact() { + let sot = make_sot(json!({ "packages": { "diag": "https://cdn.example.com/diag.bin" } })); + let mut scenario = make_scenario("gb200nvl", "1.2.5"); + scenario.artifacts.push(super::super::Artifact { + name: "diag".to_string(), + output: "diag.bin".to_string(), + uri: None, + sotpath: Some("$.packages.diag".to_string()), + }); + let downloads = resolve_for_scenario(&sot, &scenario, "/cache").unwrap(); + assert_eq!(downloads.len(), 2); + assert_eq!(downloads[1].output_path, "/cache/gb200nvl/1.2.5/diag.bin"); + assert_eq!(downloads[1].url, "https://cdn.example.com/diag.bin"); + } + + #[test] + fn resolve_for_scenario_sotpath_missing_is_error() { + let sot = make_sot(json!({})); + let mut scenario = make_scenario("gb200nvl", "1.2.5"); + scenario.artifacts.push(super::super::Artifact { + name: "diag".to_string(), + output: "diag.bin".to_string(), + uri: None, + sotpath: Some("$.packages.diag".to_string()), + }); + let err = resolve_for_scenario(&sot, &scenario, "/cache").unwrap_err(); + 
assert!(err.to_string().contains("matched nothing"), "got: {err}"); + } +} From 391aad9c31c55c1a27f788d902ac5b1c60c48451 Mon Sep 17 00:00:00 2001 From: Max Olender Date: Wed, 20 May 2026 10:05:36 -0700 Subject: [PATCH 2/4] fixup! dead_code annotations removed Signed-off-by: Max Olender --- crates/rvs/src/artifact/io.rs | 1 - crates/rvs/src/client/io.rs | 3 --- crates/rvs/src/client/mod.rs | 1 - crates/rvs/src/error.rs | 1 - crates/rvs/src/scenario/mod.rs | 2 -- crates/rvs/src/validation/mod.rs | 9 +++++++-- 6 files changed, 7 insertions(+), 10 deletions(-) diff --git a/crates/rvs/src/artifact/io.rs b/crates/rvs/src/artifact/io.rs index 51d99d33ce..29e2ebd694 100644 --- a/crates/rvs/src/artifact/io.rs +++ b/crates/rvs/src/artifact/io.rs @@ -16,7 +16,6 @@ use crate::rack::Racks; use crate::scenario; /// A resolved artifact ready to be downloaded. -#[allow(dead_code)] #[derive(Debug)] pub struct ArtifactDownload { /// Destination path under cache_dir/<model>/<sot_release>/. diff --git a/crates/rvs/src/client/io.rs b/crates/rvs/src/client/io.rs index b8998a6e44..bb846ae357 100644 --- a/crates/rvs/src/client/io.rs +++ b/crates/rvs/src/client/io.rs @@ -36,7 +36,6 @@ impl NiccClient { } /// Fetch a rack firmware record (SOT JSON) by ID. - #[allow(dead_code)] pub async fn get_rack_firmware(&self, firmware_id: &str) -> Result { let response = self .inner @@ -103,7 +102,6 @@ impl NiccClient { } /// Allocate a validation instance on a single machine. - #[allow(dead_code)] /// /// The OS is identified by `os_uri` from the scenario file. Until RVS can /// resolve the URI to a NICC OS image UUID, `os_image_id` is stubbed with @@ -150,7 +148,6 @@ impl NiccClient { } /// Fetch current state of instances by their IDs.
- #[allow(dead_code)] pub async fn get_instances(&self, instance_ids: &[String]) -> Result, RvsError> { let ids = instance_ids .iter() diff --git a/crates/rvs/src/client/mod.rs b/crates/rvs/src/client/mod.rs index a99b56d038..a9a13bc883 100644 --- a/crates/rvs/src/client/mod.rs +++ b/crates/rvs/src/client/mod.rs @@ -115,7 +115,6 @@ impl TryFrom for RackData { } /// SOT JSON blob returned from NICC for a rack firmware/release record. -#[allow(dead_code)] #[derive(Debug)] pub struct RackFirmwareData { /// Firmware record ID. diff --git a/crates/rvs/src/error.rs b/crates/rvs/src/error.rs index 48cd651e8a..087db11f8d 100644 --- a/crates/rvs/src/error.rs +++ b/crates/rvs/src/error.rs @@ -13,7 +13,6 @@ pub enum RvsError { InvalidMachineId(#[from] MachineIdParseError), /// An ID string couldn't be parsed as a UUID-based type. - #[allow(dead_code)] #[error("Failed to parse ID: {0}")] InvalidId(String), diff --git a/crates/rvs/src/scenario/mod.rs b/crates/rvs/src/scenario/mod.rs index 6259e0a79d..db6ab4b9cb 100644 --- a/crates/rvs/src/scenario/mod.rs +++ b/crates/rvs/src/scenario/mod.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - mod resolver; use std::path::Path; diff --git a/crates/rvs/src/validation/mod.rs b/crates/rvs/src/validation/mod.rs index b28c9748ff..64f1effc0e 100644 --- a/crates/rvs/src/validation/mod.rs +++ b/crates/rvs/src/validation/mod.rs @@ -15,7 +15,12 @@ pub struct Report { trays_cnt: u32, } -#[allow(dead_code)] +/// Aggregator for per-partition reports within one validation cycle. +/// +/// TODO[#416]: to be wired when `plan()` starts returning multiple jobs +/// and the top-level loop needs a single object to hand to +/// `submit_report`. The leading underscore in `_inner` marks the field as +/// intentionally unused until then, without resorting to `#[allow(dead_code)]`.
pub struct Reports { - inner: Vec, + _inner: Vec, } From 27ab4da5b1f42a25cd028c12f83f9c2c209a4977 Mon Sep 17 00:00:00 2001 From: Max Olender Date: Wed, 20 May 2026 21:55:39 -0700 Subject: [PATCH 3/4] fixup! attempt to shut up taplo Signed-off-by: Max Olender --- Cargo.toml | 4 ++-- crates/rvs/Cargo.toml | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd38a73119..eaabfd4905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -261,8 +261,8 @@ tower-test = "0.4" [profile.release] debug = "line-tables-only" -debug-assertions = true # Add some extra assurance during development -overflow-checks = true # Add some extra assurance during development +debug-assertions = true # Add some extra assurance during development +overflow-checks = true # Add some extra assurance during development [workspace.lints.clippy] cloned_instead_of_copied = "warn" diff --git a/crates/rvs/Cargo.toml b/crates/rvs/Cargo.toml index 4bf8c0e400..c610ff3652 100644 --- a/crates/rvs/Cargo.toml +++ b/crates/rvs/Cargo.toml @@ -41,7 +41,10 @@ futures = { workspace = true } hex = { workspace = true } sha2 = { workspace = true } jsonpath-rust = "1.0.4" -reqwest = { default-features = false, features = ["rustls", "stream"], workspace = true } +reqwest = { default-features = false, features = [ + "rustls", + "stream", +], workspace = true } tokio = { workspace = true } tokio-util = { workspace = true } toml = { workspace = true } From 006775a6bd511c163f45948af30b39a5294b3926 Mon Sep 17 00:00:00 2001 From: Max Olender Date: Thu, 21 May 2026 00:34:57 -0700 Subject: [PATCH 4/4] fixup! 
xtask deps fix Signed-off-by: Max Olender --- Cargo.toml | 1 + crates/rvs/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index eaabfd4905..83111f4afe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -258,6 +258,7 @@ mockito = "1.7.0" kube = { version = "3.0.0", default-features = false } k8s-openapi = "0.27.0" tower-test = "0.4" +jsonpath-rust = "1.0.4" [profile.release] debug = "line-tables-only" diff --git a/crates/rvs/Cargo.toml b/crates/rvs/Cargo.toml index c610ff3652..bb05c563e8 100644 --- a/crates/rvs/Cargo.toml +++ b/crates/rvs/Cargo.toml @@ -40,7 +40,7 @@ serde_json = { workspace = true } futures = { workspace = true } hex = { workspace = true } sha2 = { workspace = true } -jsonpath-rust = "1.0.4" +jsonpath-rust = { workspace = true } reqwest = { default-features = false, features = [ "rustls", "stream",