From e58d098b9eadb7862917d243882fccdc67e9e81c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 22 Apr 2026 22:04:22 -0400 Subject: [PATCH 1/8] Introduce `sled-agent-scrimlet-reconcilers` crate --- Cargo.lock | 81 +- Cargo.toml | 5 +- dev-tools/ls-apis/tests/api_dependencies.out | 6 +- sled-agent/Cargo.toml | 1 + sled-agent/scrimlet-reconcilers/Cargo.toml | 30 + .../src/dpd_reconciler.rs | 83 ++ sled-agent/scrimlet-reconcilers/src/handle.rs | 318 ++++++++ .../scrimlet-reconcilers/src/handle/tests.rs | 432 +++++++++++ sled-agent/scrimlet-reconcilers/src/lib.rs | 65 ++ .../src/mgd_reconciler.rs | 77 ++ .../src/reconciler_task.rs | 311 ++++++++ .../src/reconciler_task/tests.rs | 723 ++++++++++++++++++ sled-agent/scrimlet-reconcilers/src/status.rs | 141 ++++ .../src/switch_zone_slot.rs | 143 ++++ .../src/switch_zone_underlay_ip.rs | 82 ++ .../src/uplinkd_reconciler.rs | 48 ++ sled-agent/src/bootstrap/early_networking.rs | 2 +- sled-agent/src/services.rs | 2 +- sled-agent/src/sled_agent.rs | 68 +- 19 files changed, 2511 insertions(+), 107 deletions(-) create mode 100644 sled-agent/scrimlet-reconcilers/Cargo.toml create mode 100644 sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/handle.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/handle/tests.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/lib.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/reconciler_task.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/status.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs create mode 100644 sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs diff --git a/Cargo.lock b/Cargo.lock index dd7a93fcd2b..dd4c5cc2943 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,7 +131,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -142,7 +142,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -1703,7 +1703,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -2229,7 +2229,7 @@ source = "git+https://github.com/oxidecomputer/crucible?rev=7103cd3a3d7b0112d294 dependencies = [ "crucible-workspace-hack", "libc", - "num-derive 0.4.2", + "num-derive", "num-traits", "thiserror 2.0.18", ] @@ -3627,7 +3627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5086,7 +5086,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.3", "system-configuration", "tokio", "tower-layer", @@ -5752,7 +5752,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi 0.5.2", "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5829,7 +5829,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6168,12 +6168,12 @@ dependencies = [ [[package]] name = "libscf-sys" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12f02d0eda38e8cc453c5ec5d49945545d8d9eb0e59cb2ce4152ba6518f373e7" +checksum = "d0d7bd6cfd9b5d32738cebd83a1b68060d96b1ca10d88bf9a5cb10dfac0f1cdf" dependencies = [ "libc", - "num-derive 0.3.3", + "num-derive", "num-traits", ] @@ -7953,7 +7953,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -8012,17 +8012,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" -[[package]] -name = "num-derive" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "876a53fff98e03a936a674b29568b0e605f06b29372c2489ff4de23f1949743d" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "num-derive" version = "0.4.2" @@ -9173,6 +9162,7 @@ dependencies = [ "sled-agent-health-monitor", "sled-agent-measurements", "sled-agent-resolvable-files", + "sled-agent-scrimlet-reconcilers", "sled-agent-types", "sled-agent-types-versions", "sled-diagnostics", @@ -11352,9 +11342,9 @@ dependencies = [ [[package]] name = "proptest" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37566cb3fdacef14c0737f9546df7cfeadbfbc9fef10991038bf5015d0c80532" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", @@ -11456,7 +11446,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.37", - "socket2 0.5.10", + "socket2 0.6.3", "thiserror 2.0.18", "tokio", "tracing", @@ -11494,7 +11484,7 @@ dependencies = [ "cfg_aliases 0.2.1", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -12397,7 +12387,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -12500,7 +12490,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -13548,6 +13538,29 @@ dependencies = [ "tufaceous-artifact", ] +[[package]] +name = "sled-agent-scrimlet-reconcilers" +version = "0.1.0" +dependencies = [ + "assert_matches", + "chrono", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=44a949c9bedf4fcd4d280337fa1965b4293c88d1)", + "gateway-client", + "gateway-types", + "httpmock", + "mg-admin-client", + "omicron-common", + "omicron-test-utils", + "omicron-workspace-hack", + "reqwest 0.13.2", + "serde_json", + "sled-agent-types", + "slog", + "slog-error-chain", + "thiserror 2.0.18", + "tokio", +] + [[package]] name = "sled-agent-types" version = "0.1.0" @@ -13951,7 +13964,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" dependencies = [ - "heck 0.5.0", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.117", @@ -13974,7 +13987,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -14588,7 +14601,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.3", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -14608,7 +14621,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -16957,7 +16970,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index c92c34a64ea..8d834d7cc5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -146,6 +146,7 @@ members = [ "sled-agent/bootstrap-agent-lockstep-api", "sled-agent/bootstrap-agent-lockstep-types", "sled-agent/config-reconciler", + "sled-agent/scrimlet-reconcilers", "sled-agent/health-monitor", "sled-agent/measurements", "sled-agent/repo-depot-api", @@ -330,6 +331,7 @@ default-members = [ "sled-agent/bootstrap-agent-lockstep-api", "sled-agent/bootstrap-agent-lockstep-types", "sled-agent/config-reconciler", + "sled-agent/scrimlet-reconcilers", "sled-agent/health-monitor", "sled-agent/measurements", "sled-agent/repo-depot-api", @@ -722,7 +724,7 @@ propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "bc489ddf0f38f75e0c194b86cf6f0de377f68845" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "bc489ddf0f38f75e0c194b86cf6f0de377f68845" } # NOTE: see above! -proptest = "1.7.0" +proptest = "1.11.0" qorb = "0.4.1" quote = "1.0" # Some dependencies still require rand 0.8.x. @@ -786,6 +788,7 @@ sled-agent-client = { path = "clients/sled-agent-client" } sled-agent-config-reconciler = { path = "sled-agent/config-reconciler" } sled-agent-health-monitor = { path = "sled-agent/health-monitor" } sled-agent-measurements = { path = "sled-agent/measurements" } +sled-agent-scrimlet-reconcilers = { path = "sled-agent/scrimlet-reconcilers" } sled-agent-types = { path = "sled-agent/types" } sled-agent-types-versions = { path = "sled-agent/types/versions" } sled-agent-resolvable-files = { path = "sled-agent/resolvable-files" } diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index e2274ddd2d5..7dc27978b54 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -41,7 +41,7 @@ Dendrite DPD (client: dpd-client) consumed by: lldpd (lldp/lldpd) via 2 paths consumed by: mgd (maghemite/mgd) via 1 path consumed by: omicron-nexus (omicron/nexus) via 2 paths - consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path + consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths consumed by: tfportd (dendrite/tfportd) via 2 paths consumed by: wicketd (omicron/wicketd) via 2 paths @@ -52,7 +52,7 @@ Management Gateway Service (client: gateway-client) consumed by: lldpd (lldp/lldpd) via 1 path consumed by: mgd (maghemite/mgd) via 1 path consumed by: omicron-nexus (omicron/nexus) via 4 paths - consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path + consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths consumed by: wicketd (omicron/wicketd) via 3 paths Wicketd Installinator (client: installinator-client) @@ -64,7 +64,7 @@ LLDP daemon (client: lldpd-client) Maghemite MG Admin (client: mg-admin-client) consumed by: omicron-nexus (omicron/nexus) via 2 paths - consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path + consumed by: omicron-sled-agent (omicron/sled-agent) via 2 paths Nexus Internal API (client: nexus-client) consumed by: crucible-pantry (crucible/pantry) via 1 path diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 5f36581aa8a..fdd299705c1 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -100,6 +100,7 @@ sled-agent-measurements.workspace = true sled-agent-types.workspace = true sled-agent-types-versions.workspace = true sled-agent-resolvable-files.workspace = true +sled-agent-scrimlet-reconcilers.workspace = true sled-diagnostics.workspace = true sled-hardware.workspace = true sled-hardware-types.workspace = true diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml new file mode 100644 index 00000000000..7f77880bd3d --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "sled-agent-scrimlet-reconcilers" +version = "0.1.0" +edition.workspace = true +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +dpd-client.workspace = true +gateway-client.workspace = true +gateway-types.workspace = true +mg-admin-client.workspace = true +omicron-common.workspace = true +reqwest.workspace = true +sled-agent-types.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +thiserror.workspace = true +tokio.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +assert_matches.workspace = true +httpmock.workspace = true +omicron-test-utils.workspace = true +serde_json.workspace = true diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs new file mode 100644 index 00000000000..62ca650d39e --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs @@ -0,0 +1,83 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler responsible for configuration of `dpd` within a scrimlet's switch +//! zone. + +use crate::ThisSledSwitchZoneUnderlayIpAddr; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use dpd_client::Client; +use omicron_common::OMICRON_DPD_TAG; +use omicron_common::address::DENDRITE_PORT; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct DpdReconcilerStatus { + pub todo_status: (), +} + +#[derive(Debug)] +pub(crate) struct DpdReconciler { + _client: Client, + _switch_slot: ThisSledSwitchSlot, +} + +impl Reconciler for DpdReconciler { + type Status = DpdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "DpdReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + // Build a custom reqwest client, primarily to set a lower + // `pool_idle_timeout`. Our `RE_RECONCILE_INTERVAL` interval of 30 + // seconds happens to coincide exactly with dropshot's default + // connection timeout of 30 seconds. In early testing, this caused us to + // hit surprisingly + // frequently: dpd would close a connection right as we were trying to + // use it, resulting in spurious "connection closed before message + // completed" or "connection reset by peer" errors. + // + // We choose a much lower `pool_idle_timeout`: 10 seconds is long enough + // to reuse a connection for all the requests made during one + // reconciliation pass, but is short enough we should discard it before + // the server wants to time us out. + let reqwest_client = reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .read_timeout(Duration::from_secs(15)) + .pool_idle_timeout(Duration::from_secs(10)) + .build() + .expect("reqwest parameters are valid"); + + let baseurl = + format!("http://[{switch_zone_underlay_ip}]:{DENDRITE_PORT}"); + + let client = Client::new_with_client( + &baseurl, + reqwest_client, + dpd_client::ClientState { + tag: OMICRON_DPD_TAG.to_owned(), + log: parent_log + .new(slog::o!("component" => "DpdReconcilerClient")), + }, + ); + + Self { _client: client, _switch_slot: switch_slot } + } + + async fn do_reconciliation( + &mut self, + _system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + DpdReconcilerStatus { todo_status: () } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/handle.rs b/sled-agent/scrimlet-reconcilers/src/handle.rs new file mode 100644 index 00000000000..db21451de07 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/handle.rs @@ -0,0 +1,318 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`ScrimletReconcilers`] is the entry point into this crate for `sled-agent`; +//! it provides a suitable handle for `sled-agent`'s "long running tasks", and +//! contains a handle to each of the inner service-specific reconcilers. + +use crate::DetermineSwitchSlotStatus; +use crate::ThisSledSwitchZoneUnderlayIpAddr; +use crate::dpd_reconciler::DpdReconciler; +use crate::mgd_reconciler::MgdReconciler; +use crate::reconciler_task::ReconcilerTaskHandle; +use crate::status::ScrimletReconcilersStatus; +use crate::status::ScrimletStatus; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use crate::uplinkd_reconciler::UplinkdReconciler; +use omicron_common::address::MGS_PORT; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use slog::info; +use std::sync::Arc; +use std::sync::OnceLock; +use tokio::sync::watch; + +/// Information required to enable all the scrimlet reconciler tasks provided +/// by this crate. +/// +#[derive(Debug, Clone)] +pub struct SledAgentNetworkingInfo { + pub system_networking_config_rx: watch::Receiver, + pub switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, +} + +/// Handle to tasks that reconcile network configuration with services within a +/// scrimlet's local switch zone. +/// +/// [`ScrimletReconcilers`] has a two- or three-phase initialization process +/// (depending on whether the sled is a non-scrimlet or a scrimlet) to support +/// being included in `sled-agent`'s set of "long running tasks". +/// [`ScrimletReconcilers::new()`] can be constructed at any time (in +/// particular: very soon after `sled-agent` starts). +/// +/// After a [`ScrimletReconcilers`] has been created, `sled-agent` is +/// responsible for calling +/// [`ScrimletReconcilers::set_sled_agent_networking_info_once()`] exactly one +/// time: this provides the reconcilers with a watch channel to receive the +/// current [`SystemNetworkingConfig`] and the IP address of this sled's switch +/// zone (should it have one). +/// +/// After `sled-agent` has provided those prerequisites, on all sleds, +/// [`ScrimletReconcilers`] spawns a tokio task that waits until both of these +/// have occurred: +/// +/// 1. [`ScrimletReconcilers::set_scrimlet_status()`] is called with +/// [`ScrimletStatus::Scrimlet`]. +/// 2. We successfully contact MGS within our switch zone to determine which +/// switch slot we are. +/// +/// On non-scrimlet sleds, step 1 never happens, so the reconciler tasks are +/// never spawned. +/// +/// Once both of these are satisfied on scrimlets, all reconciliation tasks are +/// spawned and begin running. They will reactivate periodically and in response +/// to any changes to the [`SystemNetworkingConfig`] from sled-agent, and will +/// go inert if [`ScrimletReconcilers::set_scrimlet_status()`] is called with +/// [`ScrimletStatus::NotScrimlet`] (remaining inert until we are told we are a +/// scrimlet again). +pub struct ScrimletReconcilers { + // Sending half of the channel used to communicate to all the reconcilers + // whether we're still a scrimlet. + scrimlet_status_tx: watch::Sender, + + // These once locks hold the second and third phases of initialization + // described in the doc comment above. + determining_switch_slot: + OnceLock>, + running_reconcilers: Arc>, + + parent_log: Logger, + + // Hook for unit tests to modify how we construct an MGS client when + // determining our switch slot. + #[cfg(test)] + override_make_mgs_client: Option gateway_client::Client>>, +} + +impl ScrimletReconcilers { + pub fn new(parent_log: &Logger) -> Self { + // We discard the receiver here, and create new subscribers if and when + // we spawn tasks that need to consume it. + let (scrimlet_status_tx, _scrimlet_status_rx) = + watch::channel(ScrimletStatus::NotScrimlet); + + Self { + scrimlet_status_tx, + determining_switch_slot: OnceLock::new(), + running_reconcilers: Arc::new(OnceLock::new()), + parent_log: parent_log.clone(), + #[cfg(test)] + override_make_mgs_client: None, + } + } + + pub fn status(&self) -> ScrimletReconcilersStatus { + // Do we have running reconcilers? If so, report their status. + if let Some(running) = self.running_reconcilers.get() { + let RunningReconcilers { + dpd_reconciler, + mgd_reconciler, + uplinkd_reconciler, + } = running; + ScrimletReconcilersStatus::Running { + dpd_reconciler: dpd_reconciler.status(), + mgd_reconciler: mgd_reconciler.status(), + uplinkd_reconciler: uplinkd_reconciler.status(), + } + } + // Otherwise, have we started determining our switch slot? + else if let Some(status_rx) = self.determining_switch_slot.get() { + ScrimletReconcilersStatus::DeterminingSwitchSlot( + status_rx.borrow().clone(), + ) + } + // Otherwise, we're still waiting for the networking info. + else { + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + } + } + + /// Set whether this sled is a scrimlet or not. + /// + /// This doesn't change _much_ at runtime, but it can: we may start out "not + /// a scrimlet" and then discover an attached switch later after boot, at + /// which point we become a scrimlet, or we may become "not a scrimlet" if + /// the switch is detached at runtime. + pub fn set_scrimlet_status(&self, status: ScrimletStatus) { + self.scrimlet_status_tx.send_if_modified(|prev| { + if *prev == status { + false + } else { + *prev = status; + true + } + }); + } + + /// Provide the networking information necessary to start the scrimlet + /// reconciler tasks. + /// + /// # Panics + /// + /// This method panics if called more than once; this is considered a + /// programmer error. + /// + /// One of the bits inside `info` is a watch channel: receiving a second + /// channel is a sign of control flow gone very wrong, as all the tasks will + /// already be operating based on the first channel received. + pub fn set_sled_agent_networking_info_once( + &self, + info: SledAgentNetworkingInfo, + ) { + let (determining_switch_slot_tx, determining_switch_slot_rx) = + watch::channel(DetermineSwitchSlotStatus::NotScrimlet); + + // Ensure we're only called once. + if self.determining_switch_slot.set(determining_switch_slot_rx).is_err() + { + panic!( + "set_sled_agent_networking_info_once() called more than \ + once - scrimlet reconcilers are already set up and \ + running based on the initial information provided!" + ); + } + + let mgs_client = self.make_mgs_client(info.switch_zone_underlay_ip); + + // We now know this is the one and only time we've been called; spawn a + // task that waits until we're a scrimlet, then waits until we can + // determine our switch slot, then spawns all the running reconcilers + // (populating `self.running_reconcilers`). + // + // We don't hang on to the join handle from this task; it exits either + // when it populates `self.running_reconcilers`, or when we're dropped + // (because it will exit when `self.scrimlet_status_tx` is closed). + tokio::spawn(determine_switch_slot( + Arc::clone(&self.running_reconcilers), + determining_switch_slot_tx, + self.scrimlet_status_tx.subscribe(), + mgs_client, + info, + self.parent_log.clone(), + )); + } + + #[cfg(test)] + fn make_mgs_client( + &self, + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + ) -> gateway_client::Client { + if let Some(make_client) = &self.override_make_mgs_client { + make_client() + } else { + self.make_real_mgs_client(switch_zone_underlay_ip) + } + } + + #[cfg(not(test))] + fn make_mgs_client( + &self, + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + ) -> gateway_client::Client { + self.make_real_mgs_client(switch_zone_underlay_ip) + } + + fn make_real_mgs_client( + &self, + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + ) -> gateway_client::Client { + let baseurl = format!("http://[{switch_zone_underlay_ip}]:{MGS_PORT}"); + gateway_client::Client::new( + &baseurl, + self.parent_log + .new(slog::o!("component" => "ThisSledSwitchSlotMgsClient")), + ) + } +} + +async fn determine_switch_slot( + running_reconcilers: Arc>, + determining_switch_slot_tx: watch::Sender, + mut scrimlet_status_rx: watch::Receiver, + mgs_client: gateway_client::Client, + networking_info: SledAgentNetworkingInfo, + parent_log: Logger, +) { + let log = parent_log + .new(slog::o!("component" => "ThisSledSwitchSlotDetermination")); + + // Block until either we successfully contact MGS at our switch zone + // underlay IP or the sending half of `scrimlet_status_rx` is closed (which + // means our parent `ScrimletReconcilers` was dropped - this only happens in + // tests). + let this_sled_switch_slot = + match ThisSledSwitchSlot::determine_retrying_forever( + determining_switch_slot_tx, + &mut scrimlet_status_rx, + &mgs_client, + &log, + ) + .await + { + Ok(slot) => slot, + Err(_recv_error) => { + return; + } + }; + info!( + log, "determined this sled's switch slot"; + "slot" => ?this_sled_switch_slot, + ); + + // We know `running_reconcilers` must be unset, because we're the only one + // that sets it, and we're only spawned from + // `set_sled_agent_networking_info_once()` (which itself guarantees it's + // only called once). + running_reconcilers + .set(RunningReconcilers::spawn_all( + scrimlet_status_rx, + networking_info, + this_sled_switch_slot, + &parent_log, + )) + .expect("running reconcilers is only set once"); +} + +#[derive(Debug)] +struct RunningReconcilers { + dpd_reconciler: ReconcilerTaskHandle, + mgd_reconciler: ReconcilerTaskHandle, + uplinkd_reconciler: ReconcilerTaskHandle, +} + +impl RunningReconcilers { + fn spawn_all( + scrimlet_status_rx: watch::Receiver, + networking_info: SledAgentNetworkingInfo, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + let dpd_reconciler = ReconcilerTaskHandle::::spawn( + scrimlet_status_rx.clone(), + networking_info.system_networking_config_rx.clone(), + networking_info.switch_zone_underlay_ip, + this_sled_switch_slot, + parent_log, + ); + let mgd_reconciler = ReconcilerTaskHandle::::spawn( + scrimlet_status_rx.clone(), + networking_info.system_networking_config_rx.clone(), + networking_info.switch_zone_underlay_ip, + this_sled_switch_slot, + parent_log, + ); + let uplinkd_reconciler = + ReconcilerTaskHandle::::spawn( + scrimlet_status_rx, + networking_info.system_networking_config_rx, + networking_info.switch_zone_underlay_ip, + this_sled_switch_slot, + parent_log, + ); + Self { dpd_reconciler, mgd_reconciler, uplinkd_reconciler } + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs new file mode 100644 index 00000000000..9b41295cd9e --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -0,0 +1,432 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::time::Duration; + +use super::*; +use assert_matches::assert_matches; +use httpmock::Mock; +use httpmock::MockServer; +use omicron_test_utils::dev; +use sled_agent_types::early_networking::RackNetworkConfig; + +struct Harness { + handle: ScrimletReconcilers, + networking_config_tx: watch::Sender, + mock_mgs: MockServer, +} + +impl Harness { + fn new(log: &Logger) -> Self { + let (networking_config_tx, _) = + watch::channel(SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56".parse().unwrap(), + infra_ip_first: "192.0.2.10".parse().unwrap(), + infra_ip_last: "192.0.2.100".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + service_zone_nat_entries: None, + }); + + let mock_mgs = MockServer::start(); + let mut handle = ScrimletReconcilers::new(log); + + // Override how `handle` will attempt to contact MGS to point it at our + // mock server instead. + handle.override_make_mgs_client = { + let url = format!("http://{}", mock_mgs.address()); + let client = gateway_client::Client::new_with_client( + &url, + // Tricky bit: we use tokio's paused time in tests below both + // for consistency and test speed, but by default progenitor + // clients have a 15-second connection timeout. That passes + // _instantly_ if time is paused, which doesn't let us establish + // real TCP connections to the httpmock server, causing tests to + // spam connections as fast as possible. Pass a default reqwest + // client, which has no such timeout, allowing connections to + // wait (and succeed or fail as intended). + reqwest::Client::new(), + log.new(slog::o!("component" => "test-mgs-client")), + ); + Some(Box::new(move || client.clone())) + }; + + Self { handle, networking_config_tx, mock_mgs } + } + + fn sled_agent_networking_info(&self) -> SledAgentNetworkingInfo { + SledAgentNetworkingInfo { + system_networking_config_rx: self.networking_config_tx.subscribe(), + switch_zone_underlay_ip: + ThisSledSwitchZoneUnderlayIpAddr::TEST_FAKE, + } + } + + async fn wait_for_task_status(&self, matches: F) + where + F: Fn(&ScrimletReconcilersStatus) -> bool, + { + let mut status = self.handle.status(); + let start = tokio::time::Instant::now(); + while start.elapsed() < Duration::from_secs(10) { + if matches(&status) { + return; + } + tokio::time::sleep(Duration::from_millis(10)).await; + status = self.handle.status(); + } + panic!("timeout waiting for task status (got {status:?})"); + } + + fn mock_mgs_set_switch_slot(&self, slot: u16) -> Mock<'_> { + let body = + serde_json::json!({ "type": "switch", "slot": slot }).to_string(); + self.mock_mgs.mock(|when, then| { + when.method(httpmock::Method::GET).path("/local/switch-id"); + then.status(200) + .header("content-type", "application/json") + .body(body); + }) + } + + fn mock_mgs_set_503(&self) -> Mock<'_> { + self.mock_mgs.mock(|when, then| { + when.method(httpmock::Method::GET).path("/local/switch-id"); + then.status(503).header("content-type", "application/json").body( + serde_json::json!({ + "request_id": "test", + "message": "service unavailable", + }) + .to_string(), + ); + }) + } +} + +#[tokio::test] +#[should_panic( + expected = "set_sled_agent_networking_info_once() called more than once" +)] +async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { + let logctx = dev::test_setup_log( + "calling_set_sled_agent_networking_info_once_multiple_times_panics", + ); + let harness = Harness::new(&logctx.log); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + logctx.cleanup_successful(); +} + +// Happy-path test for non-scrimlets. +#[tokio::test] +async fn non_scrimlet_two_phase_initialization() { + let logctx = dev::test_setup_log("non_scrimlet_two_phase_initialization"); + let harness = Harness::new(&logctx.log); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // terminal status for non-scrimlets + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + logctx.cleanup_successful(); +} + +// Happy-path test for scrimlets where we find out we're a scrimlet after +// getting our networking info. +#[tokio::test(start_paused = true)] +async fn scrimlet_three_phase_initialization_info_then_scrimlet() { + let logctx = dev::test_setup_log( + "scrimlet_three_phase_initialization_info_then_scrimlet", + ); + let harness = Harness::new(&logctx.log); + + // Configure our mock MGS to claim to be switch slot 0. + harness.mock_mgs_set_switch_slot(0); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // becoming a scrimlet should cause us to transition to contacting MGS, but + // this happens in another tokio task so we have to wait for it. + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: None + } + ) + ) + }) + .await; + + // and we should transition through that to `Running` after contacting the + // mock MGS. + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + logctx.cleanup_successful(); +} + +// Happy-path test for scrimlets where we find out we're a scrimlet before +// getting our networking info. +#[tokio::test(start_paused = true)] +async fn scrimlet_three_phase_initialization_scrimlet_then_info() { + let logctx = dev::test_setup_log( + "scrimlet_three_phase_initialization_scrimlet_then_info", + ); + let harness = Harness::new(&logctx.log); + + // Configure our mock MGS to claim to be switch slot 0. + harness.mock_mgs_set_switch_slot(0); + + // Become a scrimlet right away. + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // We're already a scrimlet, so we should go through `ContactingMgs` and + // then to `Running`. + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: None + } + ) + ) + }) + .await; + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + logctx.cleanup_successful(); +} + +// Scrimlet test case where MGS fails the first time we ask then succeeds later. +#[tokio::test(start_paused = true)] +async fn scrimlet_mgs_fails_first_attempt() { + let logctx = dev::test_setup_log("scrimlet_mgs_fails_first_attempt"); + let harness = Harness::new(&logctx.log); + + // Configure our mock MGS to return an HTTP 503. + let mut error_mock = harness.mock_mgs_set_503(); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // We should first see the "waiting to retry" status... + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } + ) if prev_attempt_err.contains("status: 503") + ) + }) + .await; + + // ... then the "contacting MGS" status with the previous error set. + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: Some(err), + } + ) if err.contains("status: 503") + ) + }) + .await; + + // Changing MGS to return success should transition us to `Running`. + error_mock.delete(); + harness.mock_mgs_set_switch_slot(0); + + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + logctx.cleanup_successful(); +} + +// Scrimlet test case where MGS fails, then we become "not a scrimlet", then we +// become a scrimlet again and MGS succeeds. +#[tokio::test(start_paused = true)] +async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { + let logctx = + dev::test_setup_log("scrimlet_mgs_fails_then_we_become_not_a_scrimlet"); + let harness = Harness::new(&logctx.log); + + // Configure our mock MGS to return an HTTP 503. + let mut error_mock = harness.mock_mgs_set_503(); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // We should first see the "waiting to retry" status... + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } + ) if prev_attempt_err.contains("status: 503") + ) + }) + .await; + + // ... then the "contacting MGS" status with the previous error set. + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: Some(err), + } + ) if err.contains("status: 503") + ) + }) + .await; + + // Now sled-agent tells us we're not a scrimlet; we should transition back + // to `NotScrimlet`. + harness.handle.set_scrimlet_status(ScrimletStatus::NotScrimlet); + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ) + }) + .await; + + // Reconfigure MGS to succeed. + error_mock.delete(); + harness.mock_mgs_set_switch_slot(0); + + // Become a scrimlet again - we should transition through `ContactingMgs` + // (with no previous error) then to `Running`. + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: None, + } + ) + ) + }) + .await; + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/lib.rs b/sled-agent/scrimlet-reconcilers/src/lib.rs new file mode 100644 index 00000000000..110fa5a3835 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/lib.rs @@ -0,0 +1,65 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This crate implements long-running reconciler tasks responsible for +//! configuration of services within a scrimlet's switch zone. +//! +//! These tasks _only_ talk to services on the same sled as the sled-agent +//! executing these tasks; we attempt to ensure this at runtime via types like +//! [`ThisSledSwitchZoneUnderlayIpAddr`]. A scrimlet running these tasks should +//! never attempt to talk to another scrimlet's switch zone, and a non-scrimlet +//! running these tasks should never attempt to talk to anything. (Non-scrimlet +//! sleds still create a [`ScrimletReconcilers`] handle, as sled-agent can't +//! easily tell the difference between "not a scrimlet because we're in a +//! different cubby" and "not a scrimlet because we should have a switch but it +//! isn't connected / isn't powered on / etc.", but the reconciliation tasks are +//! only spawned under conditions that only scrimlets can satisfy; see +//! [`ScrimletReconcilers`] for more detail.) +//! +//! These tasks are responsible for applying system-level networking, including: +//! +//! * Configuration of uplink ports within `dpd` +//! * Configuration of NAT entries for system-level services (boundary NTP, +//! Nexus, external DNS - notably _not_ instances) within `dpd` +//! * Configuration of BGP within `mgd` +//! * Configuration of BFD within `mgd` +//! * Configuration of static routes within `mgd` +//! * Configuration of SMF properties for `uplinkd` and `lldpd` +//! +//! The specific configuration that should be applied comes from Nexus (or RSS, +//! at rack setup time) and is sent to `sled-agent` via the bootstore. +//! +//! In the past, responsibility for this configuration was split: sled-agent was +//! responsible for applying an initial config on sled boot (required for cold +//! boot of the rack), and Nexus was responsible for continuously keeping the +//! config in sync afterwards. This had a variety of problems; see +//! . The split is now +//! that Nexus is responsible for maintaining what the configuration should be, +//! and each scrimlet is responsible for applying that configuration to its own +//! switch zone's services; the latter is implemented via this crate. + +mod dpd_reconciler; +mod handle; +mod mgd_reconciler; +mod reconciler_task; +mod status; +mod switch_zone_slot; +mod switch_zone_underlay_ip; +mod uplinkd_reconciler; + +pub use dpd_reconciler::DpdReconcilerStatus; +pub use handle::ScrimletReconcilers; +pub use handle::SledAgentNetworkingInfo; +pub use mgd_reconciler::MgdReconcilerStatus; +pub use status::DetermineSwitchSlotStatus; +pub use status::ReconcilerActivationReason; +pub use status::ReconcilerCurrentStatus; +pub use status::ReconcilerInertReason; +pub use status::ReconcilerRunningStatus; +pub use status::ReconcilerStatus; +pub use status::ReconciliationCompletedStatus; +pub use status::ScrimletReconcilersStatus; +pub use status::ScrimletStatus; +pub use switch_zone_underlay_ip::ThisSledSwitchZoneUnderlayIpAddr; +pub use uplinkd_reconciler::UplinkdReconcilerStatus; diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs new file mode 100644 index 00000000000..b14d4e7db4f --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs @@ -0,0 +1,77 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler responsible for configuration of `mgd` within a scrimlet's switch +//! zone. + +use crate::ThisSledSwitchZoneUnderlayIpAddr; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use mg_admin_client::Client; +use omicron_common::address::MGD_PORT; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct MgdReconcilerStatus { + pub todo_status: (), +} + +#[derive(Debug)] +pub(crate) struct MgdReconciler { + _client: Client, + _switch_slot: ThisSledSwitchSlot, +} + +impl Reconciler for MgdReconciler { + type Status = MgdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "MgdReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + // Build a custom reqwest client, primarily to set a lower + // `pool_idle_timeout`. Our `RE_RECONCILE_INTERVAL` interval of 30 + // seconds happens to coincide exactly with dropshot's default + // connection timeout of 30 seconds. In early testing, this caused us to + // hit surprisingly + // frequently: mgd would close a connection right as we were trying to + // use it, resulting in spurious "connection closed before message + // completed" or "connection reset by peer" errors. + // + // We choose a much lower `pool_idle_timeout`: 10 seconds is long enough + // to reuse a connection for all the requests made during one + // reconciliation pass, but is short enough we should discard it before + // the server wants to time us out. + let reqwest_client = reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .read_timeout(Duration::from_secs(15)) + .pool_idle_timeout(Duration::from_secs(10)) + .build() + .expect("reqwest parameters are valid"); + + let baseurl = format!("http://[{switch_zone_underlay_ip}]:{MGD_PORT}"); + + let client = Client::new_with_client( + &baseurl, + reqwest_client, + parent_log.new(slog::o!("component" => "MgdReconcilerClient")), + ); + + Self { _client: client, _switch_slot: switch_slot } + } + + async fn do_reconciliation( + &mut self, + _system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + MgdReconcilerStatus { todo_status: () } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs new file mode 100644 index 00000000000..7681fc79441 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs @@ -0,0 +1,311 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The general framework for one of the service-specific reconciler tasks +//! implemented in this crate. +//! +//! Each reconciler follows the same structure: +//! +//! 1. If we ever stop being a scrimlet (i.e., the attached switch goes away), +//! go inert until we become a scrimlet again (i.e., the switch reappears). +//! This can happen during sidecar updates if it powers off briefly to reset +//! internal FPGAs, or in a variety of other less common and more rainy-day +//! situations. +//! 2. Periodically or when the `SystemNetworkingConfig` changes, perform +//! service-specific reconciliation. This is provided by implementors of the +//! [`Reconciler`] trait elsewhere in this crate. +//! 3. Report status of this task in an output watch channel, suitable for +//! reporting in the sled-agent inventory. +//! +//! [`ReconcilerTask::run()`] handles 1 and 3, and service-specific +//! implementations of [`Reconciler`] provide 2. + +use crate::ThisSledSwitchZoneUnderlayIpAddr; +use crate::status::ReconcilerActivationReason; +use crate::status::ReconcilerCurrentStatus; +use crate::status::ReconcilerInertReason; +use crate::status::ReconcilerRunningStatus; +use crate::status::ReconcilerStatus; +use crate::status::ReconciliationCompletedStatus; +use crate::status::ScrimletStatus; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use chrono::Utc; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use slog::error; +use slog::info; +use std::convert::Infallible; +use std::time::Duration; +use tokio::sync::watch; +use tokio::sync::watch::error::RecvError; +use tokio::task::JoinHandle; + +/// Trait that should be implemented by the service-specific reconciler tasks +/// elsewhere in this crate. +pub(crate) trait Reconciler: Send + 'static { + type Status: Clone + Send + Sync + 'static; + + const LOGGER_COMPONENT_NAME: &'static str; + const RE_RECONCILE_INTERVAL: Duration; + + /// Construct a new instance of this `Reconciler`. + /// + /// Typically builds a client for the relevant service based on + /// `switch_zone_underlay_ip` and record `switch_slot` for use inside future + /// calls to `do_reconciliation()`. + fn new( + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self; + + /// Perform any required reconciliation based on the current contents of + /// `system_networking_config`. + /// + /// This method is infallible; any errors must be described by + /// `Self::Status`. + fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + log: &Logger, + ) -> impl Future + Send; +} + +#[derive(Debug)] +pub(crate) struct ReconcilerTaskHandle { + status_rx: watch::Receiver>, + + // We never wait on this task: the only way it exits is by panicking (which + // results in a process abort, since we build sled-agent with panic=abort) + // or if the watch channels it's reading are dropped, which itself can only + // happen by a panic elsewhere. + _task: JoinHandle<()>, +} + +impl ReconcilerTaskHandle { + pub(crate) fn spawn( + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + Self::spawn_impl( + scrimlet_status_rx, + system_networking_config_rx, + switch_zone_underlay_ip, + this_sled_switch_slot, + parent_log, + T::new, + ) + } + + // Separate, private function that allows unit tests to customize how `T` is + // constructed. Production passes `T::new` as `inner_constructor`; i.e., + // just call the constructor we know exists from the `Reconciler` trait. + fn spawn_impl( + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + inner_constructor: F, + ) -> Self + where + F: FnOnce( + ThisSledSwitchZoneUnderlayIpAddr, + ThisSledSwitchSlot, + &Logger, + ) -> T + + Send + + Sync + + 'static, + { + let (status_tx, status_rx) = watch::channel(ReconcilerStatus { + current_status: ReconcilerCurrentStatus::Idle, + last_completion: None, + }); + + let log = + parent_log.new(slog::o!("component" => T::LOGGER_COMPONENT_NAME)); + + let mut inner_task = ReconcilerTask { + scrimlet_status_rx, + system_networking_config_rx, + status_tx, + inner: inner_constructor( + switch_zone_underlay_ip, + this_sled_switch_slot, + parent_log, + ), + log, + }; + + let task = tokio::spawn(async move { + match inner_task.run().await { + // `inner_task.run()` runs forever... + Ok(never_returns) => match never_returns {}, + + // ... unless one of its input watch channels has closed. + Err(_recv_error) => { + inner_task.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly, + ); + }); + error!( + inner_task.log, + "exited due to watch channel closure \ + (unexpected except during shutdown in tests)" + ); + } + } + }); + + Self { status_rx, _task: task } + } + + pub(crate) fn status(&self) -> ReconcilerStatus { + self.status_rx.borrow().clone() + } +} + +struct ReconcilerTask { + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + status_tx: watch::Sender>, + inner: T, + log: Logger, +} + +impl ReconcilerTask { + async fn run(&mut self) -> Result { + let mut activation_reason = ReconcilerActivationReason::Startup; + let mut activation_count: u64 = 0; + + loop { + // We know we _were_ a scrimlet at some point, because we determined + // our switch slot by contacting MGS within our own switch zone. But + // it's possible we could become "not a scrimlet" in the future + // (e.g., if the switch disappears out from under us). In such a + // case, block until it comes back. + self.wait_if_this_sled_is_no_longer_a_scrimlet().await?; + + // We _are_ a scrimlet; perform reconciliation. + info!( + self.log, "starting reconciliation attempt"; + "activation_reason" => ?activation_reason, + "activation_count" => activation_count, + ); + + // Snapshot the current networking config so we hold the watch + // channel as little as possible. + let system_networking_config = + self.system_networking_config_rx.borrow_and_update().clone(); + let running_status = + ReconcilerRunningStatus::new(activation_reason); + self.status_tx.send_modify(|status| { + status.current_status = + ReconcilerCurrentStatus::Running(running_status); + }); + + // Actually perform reconciliation. + let status_result = self + .inner + .do_reconciliation(&system_networking_config, &self.log) + .await; + + // Update our output watch channel with the result. + info!( + self.log, "reconciliation attempt complete"; + "activation_reason" => ?activation_reason, + "activation_count" => activation_count, + ); + self.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Idle; + status.last_completion = + Some(Box::new(ReconciliationCompletedStatus { + activation_reason, + completed_at_time: Utc::now(), + ran_for: running_status.elapsed_since_start(), + activation_count, + status: status_result, + })); + }); + activation_count = activation_count.wrapping_add(1); + + // Wait until we should perform reconciliation again: our + // re-reconciliation periodic timer fires or one of our input watch + // channels changes. + // + // All arms are cancel-safe and we do not `.await` within the body + // of any arm, avoiding any opportunity for futurelock. + activation_reason = tokio::select! { + () = tokio::time::sleep(T::RE_RECONCILE_INTERVAL) => { + ReconcilerActivationReason::PeriodicTimer + } + + result = self.system_networking_config_rx.changed() => { + () = result?; + ReconcilerActivationReason::SystemNetworkingConfigChanged + } + + result = self.scrimlet_status_rx.changed() => { + () = result?; + ReconcilerActivationReason::ScrimletStatusChanged + } + }; + } + } + + async fn wait_if_this_sled_is_no_longer_a_scrimlet( + &mut self, + ) -> Result<(), RecvError> { + let mut logged_not_scrimlet = false; + + loop { + let status = *self.scrimlet_status_rx.borrow_and_update(); + match status { + ScrimletStatus::Scrimlet => { + return Ok(()); + } + ScrimletStatus::NotScrimlet => { + if !logged_not_scrimlet { + info!( + self.log, + "not a scrimlet - reconciler going inert" + ); + logged_not_scrimlet = true; + } + self.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::NoLongerAScrimlet, + ); + }); + + // Select over both input channels so we can detect channel + // closure and exit cleanly if either channel goes away. If + // the rack network config changes here, we'll spuriously + // loop around and reread the scrimlet status, but that's no + // big deal. + // + // Both arms are cancel-safe and we do not `.await` within + // the body of any arm, avoiding any opportunity for + // futurelock. + tokio::select! { + result = self.scrimlet_status_rx.changed() => { + () = result?; + } + result = self.system_networking_config_rx.changed() => { + () = result?; + } + } + } + } + } + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs new file mode 100644 index 00000000000..df6b8d2af6e --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs @@ -0,0 +1,723 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use assert_matches::assert_matches; +use sled_agent_types::early_networking::RackNetworkConfig; +use std::mem; +use std::sync::Arc; +use std::sync::Mutex; +use tokio::sync::mpsc; +use tokio::time::Instant; + +struct MockReconciler { + do_reconciliation_calls: Arc>>, + do_reconciliation_results: mpsc::UnboundedReceiver, +} + +impl Reconciler for MockReconciler { + type Status = String; + + const LOGGER_COMPONENT_NAME: &'static str = "MockReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + _switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + _switch_slot: ThisSledSwitchSlot, + _parent_log: &Logger, + ) -> Self { + unimplemented!("not called by tests") + } + + async fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + self.do_reconciliation_calls + .lock() + .unwrap() + .push(system_networking_config.clone()); + self.do_reconciliation_results + .recv() + .await + .expect("test never closes sending side of channel") + } +} + +fn test_system_networking_config_1() -> SystemNetworkingConfig { + SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56".parse().unwrap(), + infra_ip_first: "192.0.2.10".parse().unwrap(), + infra_ip_last: "192.0.2.100".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + service_zone_nat_entries: None, + } +} + +fn test_system_networking_config_2() -> SystemNetworkingConfig { + SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:aabb:ccdd:0200::/56".parse().unwrap(), + infra_ip_first: "192.0.2.20".parse().unwrap(), + infra_ip_last: "192.0.2.200".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + service_zone_nat_entries: None, + } +} + +struct Harness { + task: ReconcilerTaskHandle, + scrimlet_status_tx: watch::Sender, + system_networking_config_tx: watch::Sender, + do_reconciliation_results_tx: mpsc::UnboundedSender, + do_reconciliation_calls: Arc>>, +} + +impl Harness { + const WAIT_FOR_STATUS_CHECK_INTERVAL: Duration = Duration::from_millis(100); + + fn new(log: &Logger) -> Self { + let (scrimlet_status_tx, scrimlet_status_rx) = + watch::channel(ScrimletStatus::Scrimlet); + let (system_networking_config_tx, system_networking_config_rx) = + watch::channel(test_system_networking_config_1()); + + let (do_reconciliation_results_tx, do_reconciliation_results) = + mpsc::unbounded_channel(); + let do_reconciliation_calls = Arc::new(Mutex::new(Vec::new())); + + let task = { + let do_reconciliation_calls = Arc::clone(&do_reconciliation_calls); + ReconcilerTaskHandle::spawn_impl( + scrimlet_status_rx, + system_networking_config_rx, + ThisSledSwitchZoneUnderlayIpAddr::TEST_FAKE, + ThisSledSwitchSlot::TEST_FAKE, + log, + |_ip, _slot, _log| MockReconciler { + do_reconciliation_calls, + do_reconciliation_results, + }, + ) + }; + + Self { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } + } + + fn set_scrimlet_status(&self, status: ScrimletStatus) { + self.scrimlet_status_tx.send_modify(|s| { + *s = status; + }); + } + + async fn wait_for_do_reconciliation_call_count(&self, count: usize) { + let mut last_seen = self.do_reconciliation_calls.lock().unwrap().len(); + let start = Instant::now(); + while start.elapsed() < Duration::from_secs(5) { + if last_seen == count { + return; + } + tokio::time::sleep(Self::WAIT_FOR_STATUS_CHECK_INTERVAL).await; + last_seen = self.do_reconciliation_calls.lock().unwrap().len(); + } + panic!( + "timeout waiting for do_reconciliation call count {count} \ + (got {last_seen})" + ); + } + + async fn wait_for_task_status( + &self, + description: &str, + matches: F, + ) -> ReconcilerStatus + where + F: Fn(&ReconcilerCurrentStatus) -> bool, + { + let mut status = self.task.status(); + let start = Instant::now(); + while start.elapsed() < Duration::from_secs(5) { + if matches(&status.current_status) { + return status; + } + tokio::time::sleep(Self::WAIT_FOR_STATUS_CHECK_INTERVAL).await; + status = self.task.status(); + } + panic!( + "timeout waiting for task status {description} (got {status:?})" + ); + } + + async fn wait_for_task_status_no_longer_a_scrimlet( + &self, + ) -> ReconcilerStatus { + self.wait_for_task_status("Inert(NoLongerAScrimlet)", |status| { + matches!( + status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::NoLongerAScrimlet + ) + ) + }) + .await + } + + async fn wait_for_task_status_idle(&self) -> ReconcilerStatus { + self.wait_for_task_status("Idle", |status| { + matches!(status, ReconcilerCurrentStatus::Idle) + }) + .await + } + + async fn shutdown_cleanly(self) { + let Self { + task, scrimlet_status_tx, do_reconciliation_results_tx, .. + } = self; + + // Dropping this watch channel should cause the task to exit. + mem::drop(scrimlet_status_tx); + + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // Explicitly drop this _after_ the task exits so we're guaranteed + // not to hit the `.expect()` in + // `MockReconciler::do_reconciliation()` above. + mem::drop(do_reconciliation_results_tx); + } +} + +// Test the first activation. +#[tokio::test(start_paused = true)] +async fn first_activation() { + let logctx = omicron_test_utils::dev::test_setup_log("first_activation"); + let harness = Harness::new(&logctx.log); + + // Confirm we start in Idle. + assert_matches!( + harness.task.status().current_status, + ReconcilerCurrentStatus::Idle + ); + + // Task should call do_reconciliation() for the first time. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + + let completion = + status.last_completion.expect("last_completion should be preserved"); + assert_eq!(completion.activation_count, 0); + assert_eq!(completion.status, "first"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::Startup + ); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after completing a reconciliation and reaching the select!, +// setting NotScrimlet causes the task to loop back to +// wait_if_this_sled_is_no_longer_a_scrimlet and go inert — without +// performing another reconciliation. +#[tokio::test(start_paused = true)] +async fn scrimlet_becomes_not_scrimlet_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "scrimlet_becomes_not_scrimlet_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete the first reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Become NotScrimlet → the select! fires with ScrimletStatusChanged, + // the loop goes back to wait_if_this_sled_is_no_longer_a_scrimlet, and the + // task becomes Inert(NoLongerAScrimlet). + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + let status = harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // No additional do_reconciliation call should have happened: the + // ScrimletStatusChanged activation saw NotScrimlet and went inert + // instead of reconciling. + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // last_completion from the prior run should still be present. + let completion = + status.last_completion.expect("last_completion should be preserved"); + assert_eq!(completion.activation_count, 0); + assert_eq!(completion.status, "first"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after the first reconciliation completes, changing the +// SystemNetworkingConfig triggers a second reconciliation with +// activation_reason = SystemNetworkingConfigChanged and the new config. +#[tokio::test(start_paused = true)] +async fn system_networking_config_change_triggers_re_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "system_networking_config_change_triggers_re_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Wait for the first do_reconciliation call (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + + // Complete the first reconciliation. + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Send a new SystemNetworkingConfig. + let first_config = test_system_networking_config_1(); + let second_config = test_system_networking_config_2(); + assert_ne!(first_config, second_config); + harness.system_networking_config_tx.send(second_config.clone()).unwrap(); + + // Wait for the second do_reconciliation call. + harness.wait_for_do_reconciliation_call_count(2).await; + + // The second call should have received the new config. + let received_configs = + harness.do_reconciliation_calls.lock().unwrap().clone(); + assert_eq!(received_configs.len(), 2); + assert_eq!(received_configs[0], first_config); + assert_eq!(received_configs[1], second_config); + + // Status should be Running with SystemNetworkingConfigChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after the first reconciliation completes, the periodic timer +// fires after RE_RECONCILE_INTERVAL and triggers a second +// reconciliation with activation_reason = PeriodicTimer. +#[tokio::test(start_paused = true)] +async fn periodic_timer_triggers_re_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "periodic_timer_triggers_re_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Complete the first reconciliation (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Advance time just short of the interval — no second call yet. + tokio::time::advance( + MockReconciler::RE_RECONCILE_INTERVAL - Duration::from_millis(1), + ) + .await; + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // Advance past the interval — periodic timer fires. + tokio::time::advance(Duration::from_millis(1)).await; + harness.wait_for_do_reconciliation_call_count(2).await; + + // Status should be Running with PeriodicTimer. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::PeriodicTimer + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete and verify. + harness.do_reconciliation_results_tx.send("periodic".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::PeriodicTimer + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status, "periodic"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: if the SystemNetworkingConfig changes while do_reconciliation is +// in-flight, the task should notice when it reaches the select! and +// immediately perform another reconciliation with +// activation_reason = SystemNetworkingConfigChanged using the latest config. +#[tokio::test(start_paused = true)] +async fn config_change_during_inflight_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "config_change_during_inflight_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Wait for the first do_reconciliation call (Startup) to be entered. + harness.wait_for_do_reconciliation_call_count(1).await; + + // While the first reconciliation is still in-flight, change the + // config. The task won't see this until it finishes and hits the + // select!. + harness + .system_networking_config_tx + .send(test_system_networking_config_2()) + .unwrap(); + + // Complete the first reconciliation. + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + + // The task should immediately start a second reconciliation because + // system_networking_config_rx.changed() fires in the select!. Because we + // have time paused, the elapsed time should be exactly one check interval + // of our test harness. + let before = tokio::time::Instant::now(); + harness.wait_for_do_reconciliation_call_count(2).await; + assert_eq!(before.elapsed(), Harness::WAIT_FOR_STATUS_CHECK_INTERVAL); + + // The second call should have received the new config (via + // borrow_and_update()). + let received_configs = + harness.do_reconciliation_calls.lock().unwrap().clone(); + assert_eq!(received_configs[0], test_system_networking_config_1()); + assert_eq!(received_configs[1], test_system_networking_config_2()); + + // Status should be Running with SystemNetworkingConfigChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation and verify. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: full scrimlet status round-trip. Start as scrimlet, complete +// reconciliation #0 (Startup). Set NotScrimlet → task goes inert. Set +// Scrimlet again → reconciliation #1 fires with activation_reason = +// ScrimletStatusChanged and activation_count = 1. +#[tokio::test(start_paused = true)] +async fn scrimlet_status_round_trip() { + let logctx = + omicron_test_utils::dev::test_setup_log("scrimlet_status_round_trip"); + let harness = Harness::new(&logctx.log); + + // First reconciliation (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::Startup + ); + assert_eq!(completion.activation_count, 0); + + // Become NotScrimlet → task should go inert. + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // No additional do_reconciliation call should have happened. + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // Become Scrimlet again → reconciliation #1 fires. + harness.set_scrimlet_status(ScrimletStatus::Scrimlet); + harness.wait_for_do_reconciliation_call_count(2).await; + + // Status should be Running with ScrimletStatusChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::ScrimletStatusChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation and check last_completion. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::ScrimletStatusChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: dropping the system_networking_config sender while the task is in +// the select! (after completing a reconciliation) causes the task to +// exit with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_system_networking_config_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_system_networking_config_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete one reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("done".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Drop the system_networking_config sender. This closes the watch channel, + // which causes the `system_networking_config_rx.changed()` arm in the + // select! to return Err(RecvError), causing the task to exit. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + mem::drop(system_networking_config_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation should have been called exactly once (the initial + // Startup reconciliation); no second call after the channel closed. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 1); + + mem::drop(scrimlet_status_tx); + mem::drop(do_reconciliation_results_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the scrimlet_status sender while the task is in the +// select! (after completing a reconciliation) causes the task to exit +// with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_scrimlet_status_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_scrimlet_status_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete one reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("done".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Destructure the harness so we can drop the scrimlet_status sender + // while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + .. + } = harness; + + // Drop the scrimlet_status sender. This closes the watch channel, + // which causes the `scrimlet_status_rx.changed()` arm in the + // select! to return Err(RecvError), causing the task to exit. + mem::drop(scrimlet_status_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation should have been called exactly once (the initial + // Startup reconciliation); no second call after the channel closed. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 1); + + // Explicitly drop after the task exits so we can't hit the .expect() + // in MockReconciler::do_reconciliation(). + mem::drop(do_reconciliation_results_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the scrimlet_status sender while the task is blocked in +// wait_if_this_sled_is_no_longer_a_scrimlet() waiting for the switch slot +// causes the task to exit with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_scrimlet_status_during_not_scrimlet() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_scrimlet_status_during_not_scrimlet", + ); + let harness = Harness::new(&logctx.log); + + // Set ourselves as "not a scrimlet". + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + + // Wait for the task to reach Inert(NoLongerAScrimlet). + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // Destructure the harness so we can drop the scrimlet_status sender + // while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + // Drop the scrimlet_status sender. This closes the watch channel, + // which causes scrimlet_status_rx.changed().await to return + // Err(RecvError) inside wait_if_this_sled_is_no_longer_a_scrimlet(), + // causing the task to exit. + mem::drop(scrimlet_status_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation() should never have been called. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 0); + + // Explicitly drop after the task exits. + mem::drop(do_reconciliation_results_tx); + mem::drop(system_networking_config_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the system_networking_config sender while the task is blocked +// in wait_if_this_sled_is_no_longer_a_scrimlet() causes the task to exit with +// TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_system_networking_config_during_not_scrimlet() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_system_networking_config_during_not_scrimlet", + ); + let harness = Harness::new(&logctx.log); + + // Set ourselves as "not a scrimlet". + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + + // Wait for the task to reach Inert(NoLongerAScrimlet). + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // Destructure the harness so we can drop the system_networking_config + // sender while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + // Drop the system_networking_config sender. The task is currently blocked + // waiting to become a scrimlet again, but also monitors this channel for + // closure; it should notice and exit. + mem::drop(system_networking_config_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation() should never have been called. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 0); + + // Explicitly drop after the task exits. + mem::drop(do_reconciliation_results_tx); + mem::drop(scrimlet_status_tx); + + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/status.rs b/sled-agent/scrimlet-reconcilers/src/status.rs new file mode 100644 index 00000000000..d433745b4d6 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/status.rs @@ -0,0 +1,141 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for the status and results of the reconcilers in this crate. + +use crate::DpdReconcilerStatus; +use crate::MgdReconcilerStatus; +use crate::UplinkdReconcilerStatus; +use chrono::DateTime; +use chrono::Utc; +use std::time::Duration; +use std::time::Instant; + +/// Whether or not this sled is a scrimlet. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ScrimletStatus { + Scrimlet, + NotScrimlet, +} + +/// Status of attempting to determine this sled's switch slot via MGS within +/// this sled's switch zone. +#[derive(Debug, Clone)] +pub enum DetermineSwitchSlotStatus { + /// We're not attempting to contact MGS because we're not a scrimlet. + NotScrimlet, + + /// We're currently attempting to contact MGS. + /// + /// If this is not the first attempt, `prev_attempt_err` contains the error + /// we encountered the last time. (If the last time succeeded, we'd be + /// done!) + ContactingMgs { prev_attempt_err: Option }, + + /// We're currently idle waiting for a timeout to retry due to a previous + /// failure. + WaitingToRetry { prev_attempt_err: String }, +} + +/// Why a reconciler task has gone inert. +#[derive(Debug, Clone, Copy)] +pub enum ReconcilerInertReason { + /// The reconciler task started when this sled was a scrimlet, but it has + /// since become "not a scrimlet" (e.g., because the attached switch has + /// gone away). + NoLongerAScrimlet, + + /// The reconciler task exited. This is not expected except in tests; the + /// task runs forever as long as sled-agent holds on to the channels used to + /// communicate with it. + TaskExitedUnexpectedly, +} + +/// Why a reconciler task was activated. +#[derive(Debug, Clone, Copy)] +pub enum ReconcilerActivationReason { + /// Each reconciler runs once on startup. + Startup, + /// The task was activated due to its periodic timer firing. + PeriodicTimer, + /// The task was activated in response to a change in the networking config. + SystemNetworkingConfigChanged, + /// The task was activated in response to the sled becoming a scrimlet again + /// (after previously transitioning to "not a scrimlet"). + ScrimletStatusChanged, +} + +#[derive(Debug, Clone)] +pub struct ReconciliationCompletedStatus { + pub activation_reason: ReconcilerActivationReason, + pub completed_at_time: DateTime, + pub ran_for: Duration, + pub activation_count: u64, + pub status: T, +} + +#[derive(Debug, Clone, Copy)] +pub struct ReconcilerRunningStatus { + activation_reason: ReconcilerActivationReason, + started_at_time: DateTime, + started_at_instant: Instant, +} + +impl ReconcilerRunningStatus { + pub(crate) fn new(activation_reason: ReconcilerActivationReason) -> Self { + Self { + activation_reason, + started_at_time: Utc::now(), + started_at_instant: Instant::now(), + } + } + + pub fn activation_reason(&self) -> ReconcilerActivationReason { + self.activation_reason + } + + pub fn started_at(&self) -> DateTime { + self.started_at_time + } + + pub fn elapsed_since_start(&self) -> Duration { + self.started_at_instant.elapsed() + } +} + +#[derive(Debug, Clone)] +pub enum ReconcilerCurrentStatus { + /// The reconciler is inert: it will not or cannot run for some reason. + Inert(ReconcilerInertReason), + /// The reconciler is currently running. + Running(ReconcilerRunningStatus), + /// The reconciler is not currently running. + Idle, +} + +#[derive(Debug, Clone)] +pub struct ReconcilerStatus { + /// Status of the task at this moment. + pub current_status: ReconcilerCurrentStatus, + /// Final status of the most recent activation of this task. + // Box the inner status to avoid clippy complaining about + // `ScrimletReconcilersStatus::Running { ... }` being overly large. + pub last_completion: Option>>, +} + +#[derive(Debug, Clone)] +pub enum ScrimletReconcilersStatus { + /// `sled-agent` has not yet provided underlay networking information. + WaitingForSledAgentNetworkingInfo, + + /// We're attempting to determine our switch slot. + DeterminingSwitchSlot(DetermineSwitchSlotStatus), + + /// We are a scrimlet and the individual reconcilers are running. + Running { + dpd_reconciler: ReconcilerStatus, + mgd_reconciler: ReconcilerStatus, + uplinkd_reconciler: ReconcilerStatus, + }, +} diff --git a/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs b/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs new file mode 100644 index 00000000000..4c9e8f6aa2f --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs @@ -0,0 +1,143 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::DetermineSwitchSlotStatus; +use crate::ScrimletStatus; +use gateway_client::Client; +use gateway_client::ClientInfo; +use gateway_types::component::SpType; +use sled_agent_types::early_networking::SwitchSlot; +use slog::Logger; +use slog::error; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; +use tokio::sync::watch; +use tokio::sync::watch::error::RecvError; + +/// Newtype wrapper around [`SwitchSlot`]. This type is always the physical slot +/// of our own, local switch. +/// +/// This information can only be determined by asking MGS inside our own switch +/// zone. An instance of this type can only be created if we are indeed a +/// scrimlet. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) struct ThisSledSwitchSlot(SwitchSlot); + +impl PartialEq for ThisSledSwitchSlot { + fn eq(&self, other: &SwitchSlot) -> bool { + self.0 == *other + } +} + +impl PartialEq for SwitchSlot { + fn eq(&self, other: &ThisSledSwitchSlot) -> bool { + *self == other.0 + } +} + +impl ThisSledSwitchSlot { + const MGS_RETRY_TIMEOUT: Duration = Duration::from_secs(5); + + #[cfg(test)] + pub(crate) const TEST_FAKE: Self = Self(SwitchSlot::Switch0); + + pub(crate) async fn determine_retrying_forever( + determine_status_tx: watch::Sender, + scrimlet_status_rx: &mut watch::Receiver, + client: &Client, + log: &Logger, + ) -> Result { + loop { + // Wait until we become a scrimlet; there's no point in trying to + // contact our switch zone if it doesn't exist. + loop { + let scrimlet_status = *scrimlet_status_rx.borrow_and_update(); + match scrimlet_status { + ScrimletStatus::Scrimlet => break, + ScrimletStatus::NotScrimlet => { + determine_status_tx.send_modify(|status| { + *status = DetermineSwitchSlotStatus::NotScrimlet; + }); + scrimlet_status_rx.changed().await?; + continue; + } + } + } + + // Update to our status to `ContactingMgs`, and carry forward any + // error from a previous attempt. + determine_status_tx.send_if_modified(|status| match status { + DetermineSwitchSlotStatus::ContactingMgs { .. } => false, + DetermineSwitchSlotStatus::NotScrimlet => { + *status = DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: None, + }; + true + } + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } => { + *status = DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: Some(prev_attempt_err.clone()), + }; + true + } + }); + + // We are a scrimlet - see if we know our own slot yet. + let err = match client + .sp_local_switch_id() + .await + .map(|resp| resp.into_inner()) + { + Ok(identity) => match (identity.type_, identity.slot) { + (SpType::Switch, 0) => { + return Ok(ThisSledSwitchSlot(SwitchSlot::Switch0)); + } + (SpType::Switch, 1) => { + return Ok(ThisSledSwitchSlot(SwitchSlot::Switch1)); + } + (sp_type, sp_slot) => { + // We should never get any other response; if we do, + // something has gone very wrong with MGS. It's not + // likely retrying will fix this, but there isn't + // anything else we can do. + error!( + log, + "failed to determine this sled's switch slot: got \ + unexpected identity; will retry"; + "sp_type" => ?sp_type, + "sp_slot" => sp_slot, + ); + format!( + "received invalid SP type/slot combo from MGS {}: \ + {sp_type:?}/{sp_slot}", + client.baseurl() + ) + } + }, + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, + "failed to determine this sled's switch slot; \ + will retry"; + &err, + ); + err.to_string() + } + }; + + determine_status_tx.send_modify(|status| { + *status = DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err: err, + }; + }); + + // Sleep briefly before retrying. + tokio::time::sleep(Self::MGS_RETRY_TIMEOUT).await; + } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs b/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs new file mode 100644 index 00000000000..99407bcbd95 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs @@ -0,0 +1,82 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use omicron_common::address::get_switch_zone_address; +use sled_agent_types::sled::StartSledAgentRequest; +use std::fmt; +use std::net::IpAddr; +use std::net::Ipv6Addr; + +/// Newtype wrapper around [`Ipv6Addr`]. This type is always the IP address +/// of our own, local switch zone. +/// +/// That switch zone will only exist if we are a scrimlet, but we always +/// know what the IP would be. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct ThisSledSwitchZoneUnderlayIpAddr(Ipv6Addr); + +impl ThisSledSwitchZoneUnderlayIpAddr { + #[cfg(test)] + pub(crate) const TEST_FAKE: Self = Self(Ipv6Addr::LOCALHOST); + + /// Construct a [`ThisSledSwitchZoneUnderlayIpAddr`] from the request to + /// start this sled agent. + /// + /// This function MUST only be called by a `sled-agent` with its own + /// `StartSledAgentRequest`. We can't statically enforce this requirement, + /// but failure to adhere to this convention defeats the entire point of + /// this type: it is supposed to provide a way for `sled-agent` (and related + /// crates, like this one) to statically "tag" an IP address as being the IP + /// of its own, local switch zone. + pub fn from_sled_agent_request(request: &StartSledAgentRequest) -> Self { + ThisSledSwitchZoneUnderlayIpAddr(get_switch_zone_address( + request.body.subnet, + )) + } +} + +// NOTE: We impl `From` only in this direction: constructing a +// `ThisSledSwitchZoneUnderlayIpAddr` must happen only via +// `from_sled_agent_request()`. +impl From for Ipv6Addr { + fn from(value: ThisSledSwitchZoneUnderlayIpAddr) -> Self { + value.0 + } +} + +impl From for IpAddr { + fn from(value: ThisSledSwitchZoneUnderlayIpAddr) -> Self { + value.0.into() + } +} + +impl PartialEq for ThisSledSwitchZoneUnderlayIpAddr { + fn eq(&self, other: &IpAddr) -> bool { + self.0.eq(other) + } +} + +impl PartialEq for IpAddr { + fn eq(&self, other: &ThisSledSwitchZoneUnderlayIpAddr) -> bool { + self.eq(&other.0) + } +} + +impl PartialEq for ThisSledSwitchZoneUnderlayIpAddr { + fn eq(&self, other: &Ipv6Addr) -> bool { + self.0.eq(other) + } +} + +impl PartialEq for Ipv6Addr { + fn eq(&self, other: &ThisSledSwitchZoneUnderlayIpAddr) -> bool { + self.eq(&other.0) + } +} + +impl fmt::Display for ThisSledSwitchZoneUnderlayIpAddr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs new file mode 100644 index 00000000000..613ef7e6cc5 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs @@ -0,0 +1,48 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler for configuration of `uplinkd` within a scrimlet's switch zone. +//! +//! Unlike most reconcilers in this crate, `uplinkd`'s configuration is managed +//! via SMF, not a dropshot server. + +use crate::ThisSledSwitchZoneUnderlayIpAddr; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct UplinkdReconcilerStatus { + pub todo_status: (), +} + +#[derive(Debug)] +pub(crate) struct UplinkdReconciler { + _switch_slot: ThisSledSwitchSlot, +} + +impl Reconciler for UplinkdReconciler { + type Status = UplinkdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "UplinkdReconciler"; + const RE_RECONCILE_INTERVAL: std::time::Duration = Duration::from_secs(30); + + fn new( + _switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + switch_slot: ThisSledSwitchSlot, + _parent_log: &Logger, + ) -> Self { + Self { _switch_slot: switch_slot } + } + + async fn do_reconciliation( + &mut self, + _system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + UplinkdReconcilerStatus { todo_status: () } + } +} diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index 217a7c2e4b5..c6435efeefb 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -4,7 +4,6 @@ //! Network setup required to bring up the control plane -use crate::sled_agent::ThisSledSwitchZoneUnderlayIpAddr; use anyhow::{Context, anyhow}; use dpd_client::Client as DpdClient; use dpd_client::types::{ @@ -39,6 +38,7 @@ use omicron_common::backoff::{ use omicron_ddm_admin_client::DdmError; use oxnet::IpNet; use rdb_types::{Prefix, Prefix4, Prefix6}; +use sled_agent_scrimlet_reconcilers::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::early_networking::{ BfdMode, BgpConfig, BgpPeerConfig, ImportExportPolicy, PortConfig, PortFec, PortSpeed, RouterPeerType, SwitchSlot, UplinkAddress, diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 82d2fc23a4e..543b5bf33fe 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -30,7 +30,6 @@ use crate::config::SidecarRevision; use crate::ddm_reconciler::DdmReconciler; use crate::metrics::MetricsRequestQueue; use crate::profile::*; -use crate::sled_agent::ThisSledSwitchZoneUnderlayIpAddr; use anyhow::anyhow; use camino::{Utf8Path, Utf8PathBuf}; use clickhouse_admin_types::CLICKHOUSE_KEEPER_CONFIG_DIR; @@ -91,6 +90,7 @@ use omicron_uuid_kinds::OmicronZoneUuid; use sled_agent_resolvable_files::{ ZoneImageSourceResolver, ramdisk_file_source, }; +use sled_agent_scrimlet_reconcilers::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::instance::ExternalIpConfig; use sled_agent_types::instance::ExternalIps; use sled_agent_types::inventory::{ diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index e0be3e87daa..a98b6abe512 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -67,6 +67,7 @@ use sled_agent_config_reconciler::{ }; use sled_agent_health_monitor::handle::HealthMonitorHandle; use sled_agent_measurements::MeasurementsHandle; +use sled_agent_scrimlet_reconcilers::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::attached_subnet::AttachedSubnet; use sled_agent_types::attached_subnet::AttachedSubnets; use sled_agent_types::dataset::LocalStorageDatasetDeleteRequest; @@ -1856,70 +1857,3 @@ impl SledAgentFacilities for ReconcilerFacilities { .remove_internal_dns_subnet(prefix); } } - -pub(crate) use self::local_switch_zone_ip::ThisSledSwitchZoneUnderlayIpAddr; - -/// Private module to enforce construction of -/// [`ThisSledSwitchZoneUnderlayIpAddr`] only happens via the constructors we -/// define. -mod local_switch_zone_ip { - use omicron_common::address::get_switch_zone_address; - use sled_agent_types::sled::StartSledAgentRequest; - use std::fmt; - use std::net::IpAddr; - use std::net::Ipv6Addr; - - /// Newtype wrapper around [`Ipv6Addr`]. This type is always the IP address - /// of our own, local switch zone. - /// - /// That switch zone will only exist if we are a scrimlet, but we always - /// know what the IP would be. - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] - pub(crate) struct ThisSledSwitchZoneUnderlayIpAddr(Ipv6Addr); - - impl ThisSledSwitchZoneUnderlayIpAddr { - /// Construct a [`ThisSledSwitchZoneUnderlayIpAddr`] from the request to - /// start this sled agent. - /// - /// This takes a full request object instead of something smaller (like - /// just a sled subnet) to put up a roadblock to accidentally - /// constructing a [`ThisSledSwitchZoneUnderlayIpAddr`] that points to - /// any address other than our own. `sled-agent` has ready access to the - /// subnets and addresses of other sleds, but doesn't have ready access - /// to other sleds' [`StartSledAgentRequest`]s. - pub(crate) fn from_sled_agent_request( - request: &StartSledAgentRequest, - ) -> Self { - ThisSledSwitchZoneUnderlayIpAddr(get_switch_zone_address( - request.body.subnet, - )) - } - } - - // NOTE: We impl `From` only in this direction: constructing a - // `ThisSledSwitchZoneUnderlayIpAddr` must happen only via - // `from_sled_agent_request()`. - impl From for Ipv6Addr { - fn from(value: ThisSledSwitchZoneUnderlayIpAddr) -> Self { - value.0 - } - } - - impl PartialEq for ThisSledSwitchZoneUnderlayIpAddr { - fn eq(&self, other: &IpAddr) -> bool { - self.0.eq(other) - } - } - - impl PartialEq for IpAddr { - fn eq(&self, other: &ThisSledSwitchZoneUnderlayIpAddr) -> bool { - self.eq(&other.0) - } - } - - impl fmt::Display for ThisSledSwitchZoneUnderlayIpAddr { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } - } -} From d726cd21198b097f98af1bcd8f3ae665dd979a9f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 23 Apr 2026 14:24:35 -0400 Subject: [PATCH 2/8] hakari --- workspace-hack/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 87fdb2d53db..1d308dd6d31 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -103,7 +103,7 @@ postgres-types = { version = "0.2.12", default-features = false, features = ["wi ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } -proptest = { version = "1.10.0" } +proptest = { version = "1.11.0" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.5" } rand_chacha-274715c4dabd11b0 = { package = "rand_chacha", version = "0.9.0", default-features = false, features = ["std"] } @@ -250,7 +250,7 @@ postgres-types = { version = "0.2.12", default-features = false, features = ["wi ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } -proptest = { version = "1.10.0" } +proptest = { version = "1.11.0" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.5" } rand_chacha-274715c4dabd11b0 = { package = "rand_chacha", version = "0.9.0", default-features = false, features = ["std"] } From 6f0c567909346c44d1ce7597b20f9fb7f1ca0cbe Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 23 Apr 2026 15:36:26 -0400 Subject: [PATCH 3/8] fix flaky tests --- .../scrimlet-reconcilers/src/handle/tests.rs | 113 ++++++------------ 1 file changed, 37 insertions(+), 76 deletions(-) diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs index 9b41295cd9e..73b5f9c1bee 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -105,6 +105,28 @@ impl Harness { ); }) } + + async fn wait_for_mock_to_be_called(&self, mock: &Mock<'_>) { + // Tricky bit: we use tokio's paused time in tests below both for + // consistency and test speed, but we need to wait for _real_ time for + // httpmock to receive requests. Use `std::time::Instant` here so we + // wait for 10 real seconds for a request to be received. + let start = std::time::Instant::now(); + while start.elapsed() < Duration::from_secs(10) { + if mock.calls_async().await == 0 { + // advance paused time... + tokio::time::sleep(Duration::from_secs(1)).await; + // and also _really_ sleep briefly to avoid spinning 100% CPU + std::thread::sleep(Duration::from_millis(10)); + continue; + } + + // We got at least 1 call; assert and return. + return mock.assert_async().await; + } + + panic!("timed out wait for mock to be called"); + } } #[tokio::test] @@ -164,7 +186,7 @@ async fn scrimlet_three_phase_initialization_info_then_scrimlet() { let harness = Harness::new(&logctx.log); // Configure our mock MGS to claim to be switch slot 0. - harness.mock_mgs_set_switch_slot(0); + let success_mock = harness.mock_mgs_set_switch_slot(0); // initial status assert_matches!( @@ -186,23 +208,8 @@ async fn scrimlet_three_phase_initialization_info_then_scrimlet() { harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); - // becoming a scrimlet should cause us to transition to contacting MGS, but - // this happens in another tokio task so we have to wait for it. - harness - .wait_for_task_status(|status| { - matches!( - status, - ScrimletReconcilersStatus::DeterminingSwitchSlot( - DetermineSwitchSlotStatus::ContactingMgs { - prev_attempt_err: None - } - ) - ) - }) - .await; - - // and we should transition through that to `Running` after contacting the - // mock MGS. + // We should contact MGS, then transition to the running state. + harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) @@ -222,7 +229,7 @@ async fn scrimlet_three_phase_initialization_scrimlet_then_info() { let harness = Harness::new(&logctx.log); // Configure our mock MGS to claim to be switch slot 0. - harness.mock_mgs_set_switch_slot(0); + let success_mock = harness.mock_mgs_set_switch_slot(0); // Become a scrimlet right away. harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); @@ -237,20 +244,8 @@ async fn scrimlet_three_phase_initialization_scrimlet_then_info() { harness.sled_agent_networking_info(), ); - // We're already a scrimlet, so we should go through `ContactingMgs` and - // then to `Running`. - harness - .wait_for_task_status(|status| { - matches!( - status, - ScrimletReconcilersStatus::DeterminingSwitchSlot( - DetermineSwitchSlotStatus::ContactingMgs { - prev_attempt_err: None - } - ) - ) - }) - .await; + // We're already a scrimlet, so we contact MGS then transition to Running. + harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) @@ -289,6 +284,9 @@ async fn scrimlet_mgs_fails_first_attempt() { harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + // Wait for our mock to receive a request and return a 503; we should then + // see the `WaitingToRetry` state with an error. + harness.wait_for_mock_to_be_called(&error_mock).await; // We should first see the "waiting to retry" status... harness .wait_for_task_status(|status| { @@ -303,24 +301,11 @@ async fn scrimlet_mgs_fails_first_attempt() { }) .await; - // ... then the "contacting MGS" status with the previous error set. - harness - .wait_for_task_status(|status| { - matches!( - status, - ScrimletReconcilersStatus::DeterminingSwitchSlot( - DetermineSwitchSlotStatus::ContactingMgs { - prev_attempt_err: Some(err), - } - ) if err.contains("status: 503") - ) - }) - .await; - // Changing MGS to return success should transition us to `Running`. error_mock.delete(); - harness.mock_mgs_set_switch_slot(0); + let success_mock = harness.mock_mgs_set_switch_slot(0); + harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) @@ -361,7 +346,8 @@ async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); - // We should first see the "waiting to retry" status... + // Wait until we've failed to determine our switch slot. + harness.wait_for_mock_to_be_called(&error_mock).await; harness .wait_for_task_status(|status| { matches!( @@ -375,20 +361,6 @@ async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { }) .await; - // ... then the "contacting MGS" status with the previous error set. - harness - .wait_for_task_status(|status| { - matches!( - status, - ScrimletReconcilersStatus::DeterminingSwitchSlot( - DetermineSwitchSlotStatus::ContactingMgs { - prev_attempt_err: Some(err), - } - ) if err.contains("status: 503") - ) - }) - .await; - // Now sled-agent tells us we're not a scrimlet; we should transition back // to `NotScrimlet`. harness.handle.set_scrimlet_status(ScrimletStatus::NotScrimlet); @@ -405,23 +377,12 @@ async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { // Reconfigure MGS to succeed. error_mock.delete(); - harness.mock_mgs_set_switch_slot(0); + let success_mock = harness.mock_mgs_set_switch_slot(0); // Become a scrimlet again - we should transition through `ContactingMgs` // (with no previous error) then to `Running`. harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); - harness - .wait_for_task_status(|status| { - matches!( - status, - ScrimletReconcilersStatus::DeterminingSwitchSlot( - DetermineSwitchSlotStatus::ContactingMgs { - prev_attempt_err: None, - } - ) - ) - }) - .await; + harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) From a518eb8451e182869d2595c2d2ed89c06a398f39 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 23 Apr 2026 18:06:15 -0400 Subject: [PATCH 4/8] don't leave behind log files in should_panic test --- Cargo.lock | 1 + sled-agent/scrimlet-reconcilers/Cargo.toml | 1 + sled-agent/scrimlet-reconcilers/src/handle/tests.rs | 12 +++++++++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dd4c5cc2943..4b8b7c7f25c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13545,6 +13545,7 @@ dependencies = [ "assert_matches", "chrono", "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=44a949c9bedf4fcd4d280337fa1965b4293c88d1)", + "dropshot 0.16.7", "gateway-client", "gateway-types", "httpmock", diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml index 7f77880bd3d..781c806fccd 100644 --- a/sled-agent/scrimlet-reconcilers/Cargo.toml +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -25,6 +25,7 @@ omicron-workspace-hack.workspace = true [dev-dependencies] assert_matches.workspace = true +dropshot.workspace = true httpmock.workspace = true omicron-test-utils.workspace = true serde_json.workspace = true diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs index 73b5f9c1bee..4198d248edd 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -6,6 +6,9 @@ use std::time::Duration; use super::*; use assert_matches::assert_matches; +use dropshot::ConfigLogging; +use dropshot::ConfigLoggingLevel; +use dropshot::test_util::LogContext; use httpmock::Mock; use httpmock::MockServer; use omicron_test_utils::dev; @@ -134,8 +137,13 @@ impl Harness { expected = "set_sled_agent_networking_info_once() called more than once" )] async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { - let logctx = dev::test_setup_log( + // Set up a stderr logger - we're going to panic and won't have the + // opportunity to clean up a file-based one. + let log_config = + ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Trace }; + let logctx = LogContext::new( "calling_set_sled_agent_networking_info_once_multiple_times_panics", + &log_config, ); let harness = Harness::new(&logctx.log); @@ -145,8 +153,6 @@ async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { harness.handle.set_sled_agent_networking_info_once( harness.sled_agent_networking_info(), ); - - logctx.cleanup_successful(); } // Happy-path test for non-scrimlets. From 2b3444de012914dd179bf149fd213b796482d258 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 30 Apr 2026 16:38:57 -0400 Subject: [PATCH 5/8] use ThisSledSwitchZoneUnderlayIpAddr from sled-agent-types --- sled-agent/scrimlet-reconcilers/Cargo.toml | 1 + .../src/dpd_reconciler.rs | 2 +- sled-agent/scrimlet-reconcilers/src/handle.rs | 2 +- sled-agent/scrimlet-reconcilers/src/lib.rs | 5 +- .../src/mgd_reconciler.rs | 2 +- .../src/reconciler_task.rs | 2 +- .../src/switch_zone_underlay_ip.rs | 82 ------------------- .../src/uplinkd_reconciler.rs | 2 +- sled-agent/types/src/sled.rs | 3 + 9 files changed, 12 insertions(+), 89 deletions(-) delete mode 100644 sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml index 781c806fccd..b8653673dc2 100644 --- a/sled-agent/scrimlet-reconcilers/Cargo.toml +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -29,3 +29,4 @@ dropshot.workspace = true httpmock.workspace = true omicron-test-utils.workspace = true serde_json.workspace = true +sled-agent-types = { workspace = true, features = ["testing"] } diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs index 62ca650d39e..85d6c144ecf 100644 --- a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs @@ -5,12 +5,12 @@ //! Reconciler responsible for configuration of `dpd` within a scrimlet's switch //! zone. -use crate::ThisSledSwitchZoneUnderlayIpAddr; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; use dpd_client::Client; use omicron_common::OMICRON_DPD_TAG; use omicron_common::address::DENDRITE_PORT; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; diff --git a/sled-agent/scrimlet-reconcilers/src/handle.rs b/sled-agent/scrimlet-reconcilers/src/handle.rs index db21451de07..5f08d969ce1 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle.rs @@ -7,7 +7,6 @@ //! contains a handle to each of the inner service-specific reconcilers. use crate::DetermineSwitchSlotStatus; -use crate::ThisSledSwitchZoneUnderlayIpAddr; use crate::dpd_reconciler::DpdReconciler; use crate::mgd_reconciler::MgdReconciler; use crate::reconciler_task::ReconcilerTaskHandle; @@ -16,6 +15,7 @@ use crate::status::ScrimletStatus; use crate::switch_zone_slot::ThisSledSwitchSlot; use crate::uplinkd_reconciler::UplinkdReconciler; use omicron_common::address::MGS_PORT; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use slog::info; diff --git a/sled-agent/scrimlet-reconcilers/src/lib.rs b/sled-agent/scrimlet-reconcilers/src/lib.rs index 110fa5a3835..993f6388088 100644 --- a/sled-agent/scrimlet-reconcilers/src/lib.rs +++ b/sled-agent/scrimlet-reconcilers/src/lib.rs @@ -38,6 +38,9 @@ //! that Nexus is responsible for maintaining what the configuration should be, //! and each scrimlet is responsible for applying that configuration to its own //! switch zone's services; the latter is implemented via this crate. +//! +//! [`ThisSledSwitchZoneUnderlayIpAddr`]: +//! sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr mod dpd_reconciler; mod handle; @@ -45,7 +48,6 @@ mod mgd_reconciler; mod reconciler_task; mod status; mod switch_zone_slot; -mod switch_zone_underlay_ip; mod uplinkd_reconciler; pub use dpd_reconciler::DpdReconcilerStatus; @@ -61,5 +63,4 @@ pub use status::ReconcilerStatus; pub use status::ReconciliationCompletedStatus; pub use status::ScrimletReconcilersStatus; pub use status::ScrimletStatus; -pub use switch_zone_underlay_ip::ThisSledSwitchZoneUnderlayIpAddr; pub use uplinkd_reconciler::UplinkdReconcilerStatus; diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs index b14d4e7db4f..3a400ae1970 100644 --- a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs @@ -5,11 +5,11 @@ //! Reconciler responsible for configuration of `mgd` within a scrimlet's switch //! zone. -use crate::ThisSledSwitchZoneUnderlayIpAddr; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; use mg_admin_client::Client; use omicron_common::address::MGD_PORT; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs index 7681fc79441..a15bd7cad17 100644 --- a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs @@ -21,7 +21,6 @@ //! [`ReconcilerTask::run()`] handles 1 and 3, and service-specific //! implementations of [`Reconciler`] provide 2. -use crate::ThisSledSwitchZoneUnderlayIpAddr; use crate::status::ReconcilerActivationReason; use crate::status::ReconcilerCurrentStatus; use crate::status::ReconcilerInertReason; @@ -31,6 +30,7 @@ use crate::status::ReconciliationCompletedStatus; use crate::status::ScrimletStatus; use crate::switch_zone_slot::ThisSledSwitchSlot; use chrono::Utc; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use slog::error; diff --git a/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs b/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs deleted file mode 100644 index 99407bcbd95..00000000000 --- a/sled-agent/scrimlet-reconcilers/src/switch_zone_underlay_ip.rs +++ /dev/null @@ -1,82 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use omicron_common::address::get_switch_zone_address; -use sled_agent_types::sled::StartSledAgentRequest; -use std::fmt; -use std::net::IpAddr; -use std::net::Ipv6Addr; - -/// Newtype wrapper around [`Ipv6Addr`]. This type is always the IP address -/// of our own, local switch zone. -/// -/// That switch zone will only exist if we are a scrimlet, but we always -/// know what the IP would be. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct ThisSledSwitchZoneUnderlayIpAddr(Ipv6Addr); - -impl ThisSledSwitchZoneUnderlayIpAddr { - #[cfg(test)] - pub(crate) const TEST_FAKE: Self = Self(Ipv6Addr::LOCALHOST); - - /// Construct a [`ThisSledSwitchZoneUnderlayIpAddr`] from the request to - /// start this sled agent. - /// - /// This function MUST only be called by a `sled-agent` with its own - /// `StartSledAgentRequest`. We can't statically enforce this requirement, - /// but failure to adhere to this convention defeats the entire point of - /// this type: it is supposed to provide a way for `sled-agent` (and related - /// crates, like this one) to statically "tag" an IP address as being the IP - /// of its own, local switch zone. - pub fn from_sled_agent_request(request: &StartSledAgentRequest) -> Self { - ThisSledSwitchZoneUnderlayIpAddr(get_switch_zone_address( - request.body.subnet, - )) - } -} - -// NOTE: We impl `From` only in this direction: constructing a -// `ThisSledSwitchZoneUnderlayIpAddr` must happen only via -// `from_sled_agent_request()`. -impl From for Ipv6Addr { - fn from(value: ThisSledSwitchZoneUnderlayIpAddr) -> Self { - value.0 - } -} - -impl From for IpAddr { - fn from(value: ThisSledSwitchZoneUnderlayIpAddr) -> Self { - value.0.into() - } -} - -impl PartialEq for ThisSledSwitchZoneUnderlayIpAddr { - fn eq(&self, other: &IpAddr) -> bool { - self.0.eq(other) - } -} - -impl PartialEq for IpAddr { - fn eq(&self, other: &ThisSledSwitchZoneUnderlayIpAddr) -> bool { - self.eq(&other.0) - } -} - -impl PartialEq for ThisSledSwitchZoneUnderlayIpAddr { - fn eq(&self, other: &Ipv6Addr) -> bool { - self.0.eq(other) - } -} - -impl PartialEq for Ipv6Addr { - fn eq(&self, other: &ThisSledSwitchZoneUnderlayIpAddr) -> bool { - self.eq(&other.0) - } -} - -impl fmt::Display for ThisSledSwitchZoneUnderlayIpAddr { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} diff --git a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs index 613ef7e6cc5..bde11b5c301 100644 --- a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs @@ -7,9 +7,9 @@ //! Unlike most reconcilers in this crate, `uplinkd`'s configuration is managed //! via SMF, not a dropshot server. -use crate::ThisSledSwitchZoneUnderlayIpAddr; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; diff --git a/sled-agent/types/src/sled.rs b/sled-agent/types/src/sled.rs index 1435d4ad0b7..20c570251d2 100644 --- a/sled-agent/types/src/sled.rs +++ b/sled-agent/types/src/sled.rs @@ -29,6 +29,9 @@ mod local_switch_zone_ip { pub struct ThisSledSwitchZoneUnderlayIpAddr(Ipv6Addr); impl ThisSledSwitchZoneUnderlayIpAddr { + #[cfg(any(test, feature = "testing"))] + pub const TEST_FAKE: Self = Self(Ipv6Addr::LOCALHOST); + /// Construct a [`ThisSledSwitchZoneUnderlayIpAddr`] from the request to /// start this sled agent. /// From bcaa7150878cff52afb21fa45170028324ff2a61 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 30 Apr 2026 16:39:36 -0400 Subject: [PATCH 6/8] hakari --- Cargo.lock | 65 ++++++++++++--------------------------- workspace-hack/Cargo.toml | 30 ++++++++---------- 2 files changed, 31 insertions(+), 64 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 009288c7248..438157a6ff2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4049,7 +4049,7 @@ dependencies = [ "clap", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "omicron-common", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -4076,7 +4076,7 @@ dependencies = [ "chrono", "daft", "ereport-types", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-types", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -4101,15 +4101,6 @@ dependencies = [ "zerocopy 0.8.40", ] -[[package]] -name = "gateway-ereport-messages" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=ea2f39ccdea124b5affcad0ca17bc5dacf65823a#ea2f39ccdea124b5affcad0ca17bc5dacf65823a" -dependencies = [ - "serde", - "zerocopy 0.8.40", -] - [[package]] name = "gateway-messages" version = "0.1.0" @@ -4128,24 +4119,6 @@ dependencies = [ "zerocopy 0.8.40", ] -[[package]] -name = "gateway-messages" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=ea2f39ccdea124b5affcad0ca17bc5dacf65823a#ea2f39ccdea124b5affcad0ca17bc5dacf65823a" -dependencies = [ - "bitflags 2.11.0", - "hubpack", - "serde", - "serde-big-array", - "serde_repr", - "smoltcp 0.9.1", - "static_assertions", - "strum 0.27.2", - "strum_macros 0.27.2", - "uuid", - "zerocopy 0.8.40", -] - [[package]] name = "gateway-sp-comms" version = "0.1.2" @@ -4156,8 +4129,8 @@ dependencies = [ "base64 0.22.1", "futures", "fxhash", - "gateway-ereport-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-ereport-messages", + "gateway-messages", "hex", "hubpack", "hubtools 0.4.7 (git+https://github.com/oxidecomputer/hubtools.git?rev=2b1ef9b38d75563ea800baa3b17327eec17b1b7a)", @@ -4192,7 +4165,7 @@ dependencies = [ "camino", "dropshot 0.17.0", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-types", "omicron-gateway", "omicron-test-utils", @@ -4219,7 +4192,7 @@ version = "0.1.0" dependencies = [ "daft", "dropshot 0.17.0", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "hex", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -6559,7 +6532,7 @@ dependencies = [ "clap", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "libc", "omicron-gateway", @@ -7197,7 +7170,7 @@ dependencies = [ "expectorate", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "gateway-types", "httpmock", @@ -7325,7 +7298,7 @@ dependencies = [ "dropshot 0.17.0", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "gateway-types", "http", @@ -7691,7 +7664,7 @@ dependencies = [ "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=44a949c9bedf4fcd4d280337fa1965b4293c88d1)", "dropshot 0.17.0", "futures", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "headers", "hickory-resolver 0.25.2", @@ -8501,7 +8474,7 @@ dependencies = [ "futures", "gateway-api", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-sp-comms", "gateway-test-utils", "gateway-types", @@ -8652,7 +8625,7 @@ dependencies = [ "fatfs", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "gateway-types", "headers", @@ -8866,7 +8839,7 @@ dependencies = [ "expectorate", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "gateway-types", "http", @@ -9355,8 +9328,8 @@ dependencies = [ "futures-sink", "futures-task", "futures-util", - "gateway-ereport-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=ea2f39ccdea124b5affcad0ca17bc5dacf65823a)", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=ea2f39ccdea124b5affcad0ca17bc5dacf65823a)", + "gateway-ereport-messages", + "gateway-messages", "generic-array", "getrandom 0.2.17", "getrandom 0.4.1", @@ -9384,7 +9357,6 @@ dependencies = [ "log", "managed", "memchr", - "miniz_oxide", "mio", "newtype-uuid", "nix 0.31.2", @@ -9407,6 +9379,7 @@ dependencies = [ "predicates", "proc-macro2", "proptest", + "quote", "rand 0.8.5", "rand 0.9.2", "rand_chacha 0.3.1", @@ -14175,8 +14148,8 @@ dependencies = [ "clap", "dropshot 0.17.0", "futures", - "gateway-ereport-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-ereport-messages", + "gateway-messages", "gateway-types", "hex", "hubtools 0.4.7 (git+https://github.com/oxidecomputer/hubtools.git?rev=2b1ef9b38d75563ea800baa3b17327eec17b1b7a)", @@ -17007,7 +16980,7 @@ dependencies = [ "fs-err 3.3.0", "futures", "gateway-client", - "gateway-messages 0.1.0 (git+https://github.com/oxidecomputer/management-gateway-service?rev=0d7a8992f914ad6a5947409048779969bbe80e3d)", + "gateway-messages", "gateway-test-utils", "gateway-types", "hex", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 1d308dd6d31..01c0ddb7f3a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -60,8 +60,8 @@ futures-core = { version = "0.3.32" } futures-sink = { version = "0.3.32" } futures-task = { version = "0.3.32", default-features = false, features = ["std"] } futures-util = { version = "0.3.32", features = ["channel", "io", "sink"] } -gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "ea2f39ccdea124b5affcad0ca17bc5dacf65823a", default-features = false, features = ["debug-impls", "serde"] } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "ea2f39ccdea124b5affcad0ca17bc5dacf65823a", features = ["std"] } +gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "0d7a8992f914ad6a5947409048779969bbe80e3d", default-features = false, features = ["debug-impls", "serde"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "0d7a8992f914ad6a5947409048779969bbe80e3d", features = ["std"] } generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2.17", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } @@ -71,9 +71,9 @@ hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.25.2", features = ["serde", "text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.8.1", features = ["full"] } -iddqd = { version = "0.3.17", features = ["daft", "proptest", "schemars08"] } +iddqd = { version = "0.3.18", features = ["daft", "proptest", "schemars08"] } idna = { version = "1.1.0" } -indexmap = { version = "2.13.0", features = ["serde"] } +indexmap = { version = "2.14.0", features = ["serde"] } inout = { version = "0.1.4", default-features = false, features = ["std"] } ipnet = { version = "2.11.0", features = ["serde"] } ipnetwork = { version = "0.21.1", features = ["schemars", "serde"] } @@ -104,6 +104,7 @@ ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } proptest = { version = "1.11.0" } +quote = { version = "1.0.45" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.5" } rand_chacha-274715c4dabd11b0 = { package = "rand_chacha", version = "0.9.0", default-features = false, features = ["std"] } @@ -119,7 +120,7 @@ rustls = { version = "0.23.37", features = ["ring"] } rustls-webpki = { version = "0.103.9", default-features = false, features = ["aws-lc-rs", "ring", "std"] } schemars = { version = "0.8.22", features = ["bytes", "chrono", "semver", "url", "uuid1"] } scopeguard = { version = "1.2.0" } -semver = { version = "1.0.27", features = ["serde"] } +semver = { version = "1.0.28", features = ["serde"] } serde = { version = "1.0.228", features = ["alloc", "derive", "rc"] } serde_core = { version = "1.0.228", features = ["alloc", "rc"] } serde_json = { version = "1.0.149", features = ["alloc", "raw_value", "unbounded_depth"] } @@ -206,8 +207,8 @@ futures-core = { version = "0.3.32" } futures-sink = { version = "0.3.32" } futures-task = { version = "0.3.32", default-features = false, features = ["std"] } futures-util = { version = "0.3.32", features = ["channel", "io", "sink"] } -gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "ea2f39ccdea124b5affcad0ca17bc5dacf65823a", default-features = false, features = ["debug-impls", "serde"] } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "ea2f39ccdea124b5affcad0ca17bc5dacf65823a", features = ["std"] } +gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "0d7a8992f914ad6a5947409048779969bbe80e3d", default-features = false, features = ["debug-impls", "serde"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "0d7a8992f914ad6a5947409048779969bbe80e3d", features = ["std"] } generic-array = { version = "0.14.7", default-features = false, features = ["more_lengths", "zeroize"] } getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2.17", default-features = false, features = ["js", "rdrand", "std"] } group = { version = "0.13.0", default-features = false, features = ["alloc"] } @@ -218,9 +219,9 @@ hex = { version = "0.4.3", features = ["serde"] } hickory-proto = { version = "0.25.2", features = ["serde", "text-parsing"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "1.8.1", features = ["full"] } -iddqd = { version = "0.3.17", features = ["daft", "proptest", "schemars08"] } +iddqd = { version = "0.3.18", features = ["daft", "proptest", "schemars08"] } idna = { version = "1.1.0" } -indexmap = { version = "2.13.0", features = ["serde"] } +indexmap = { version = "2.14.0", features = ["serde"] } inout = { version = "0.1.4", default-features = false, features = ["std"] } ipnet = { version = "2.11.0", features = ["serde"] } ipnetwork = { version = "0.21.1", features = ["schemars", "serde"] } @@ -251,6 +252,7 @@ ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } proptest = { version = "1.11.0" } +quote = { version = "1.0.45" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.5" } rand_chacha-274715c4dabd11b0 = { package = "rand_chacha", version = "0.9.0", default-features = false, features = ["std"] } @@ -266,7 +268,7 @@ rustls = { version = "0.23.37", features = ["ring"] } rustls-webpki = { version = "0.103.9", default-features = false, features = ["aws-lc-rs", "ring", "std"] } schemars = { version = "0.8.22", features = ["bytes", "chrono", "semver", "url", "uuid1"] } scopeguard = { version = "1.2.0" } -semver = { version = "1.0.27", features = ["serde"] } +semver = { version = "1.0.28", features = ["serde"] } serde = { version = "1.0.228", features = ["alloc", "derive", "rc"] } serde_core = { version = "1.0.228", features = ["alloc", "rc"] } serde_json = { version = "1.0.149", features = ["alloc", "raw_value", "unbounded_depth"] } @@ -321,7 +323,6 @@ dof-9fbad63c4bcf4a8f = { package = "dof", version = "0.4.0", default-features = hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } linux-raw-sys = { version = "0.4.15", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "system"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -335,7 +336,6 @@ dof-9fbad63c4bcf4a8f = { package = "dof", version = "0.4.0", default-features = hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } linux-raw-sys = { version = "0.4.15", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "system"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -347,7 +347,6 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en errno = { version = "0.3.14" } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -359,7 +358,6 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en errno = { version = "0.3.14" } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -371,7 +369,6 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en errno = { version = "0.3.14" } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -383,7 +380,6 @@ cookie = { version = "0.18.1", default-features = false, features = ["percent-en errno = { version = "0.3.14" } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -398,7 +394,6 @@ errno = { version = "0.3.14" } getrandom-9fbad63c4bcf4a8f = { package = "getrandom", version = "0.4.1", default-features = false, features = ["std", "sys_rng"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } @@ -415,7 +410,6 @@ errno = { version = "0.3.14" } getrandom-9fbad63c4bcf4a8f = { package = "getrandom", version = "0.4.1", default-features = false, features = ["std", "sys_rng"] } hyper-rustls = { version = "0.27.7", features = ["http2", "ring", "webpki-tokio"] } hyper-util = { version = "0.1.20", features = ["full"] } -miniz_oxide = { version = "0.8.9", default-features = false, features = ["simd", "with-alloc"] } mio = { version = "1.2.0", features = ["net", "os-ext"] } object = { version = "0.37.3", default-features = false, features = ["read", "std"] } rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.44", features = ["fs", "stdio", "system", "termios"] } From 7eeaf1d075b3d7c99eea9ee3961c530a156f5b2f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 5 May 2026 10:32:48 -0400 Subject: [PATCH 7/8] introduce ScrimletReconcilersMode so tests can override addrs --- Cargo.lock | 2 +- sled-agent/scrimlet-reconcilers/Cargo.toml | 4 +- .../src/dpd_reconciler.rs | 41 +--- sled-agent/scrimlet-reconcilers/src/handle.rs | 177 +++++++++++++----- .../scrimlet-reconcilers/src/handle/tests.rs | 30 +-- sled-agent/scrimlet-reconcilers/src/lib.rs | 1 + .../src/mgd_reconciler.rs | 35 +--- .../src/reconciler_task.rs | 28 +-- .../src/reconciler_task/tests.rs | 9 +- .../src/uplinkd_reconciler.rs | 6 +- sled-agent/types/src/sled.rs | 3 - 11 files changed, 169 insertions(+), 167 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa1493e12db..7e70d41ab34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13697,7 +13697,7 @@ version = "0.1.0" dependencies = [ "assert_matches", "chrono", - "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=44a949c9bedf4fcd4d280337fa1965b4293c88d1)", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=d147f0925d14ed6f00eb37cb81f3a1bcbcb3c7f3)", "dropshot 0.17.0", "gateway-client", "gateway-types", diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml index b8653673dc2..97268b99c6e 100644 --- a/sled-agent/scrimlet-reconcilers/Cargo.toml +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -29,4 +29,6 @@ dropshot.workspace = true httpmock.workspace = true omicron-test-utils.workspace = true serde_json.workspace = true -sled-agent-types = { workspace = true, features = ["testing"] } + +[features] +testing = [] diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs index 85d6c144ecf..ce642d72222 100644 --- a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs @@ -5,12 +5,10 @@ //! Reconciler responsible for configuration of `dpd` within a scrimlet's switch //! zone. +use crate::handle::ScrimletReconcilersMode; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; use dpd_client::Client; -use omicron_common::OMICRON_DPD_TAG; -use omicron_common::address::DENDRITE_PORT; -use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; @@ -33,44 +31,11 @@ impl Reconciler for DpdReconciler { const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); fn new( - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + mode: ScrimletReconcilersMode, switch_slot: ThisSledSwitchSlot, parent_log: &Logger, ) -> Self { - // Build a custom reqwest client, primarily to set a lower - // `pool_idle_timeout`. Our `RE_RECONCILE_INTERVAL` interval of 30 - // seconds happens to coincide exactly with dropshot's default - // connection timeout of 30 seconds. In early testing, this caused us to - // hit surprisingly - // frequently: dpd would close a connection right as we were trying to - // use it, resulting in spurious "connection closed before message - // completed" or "connection reset by peer" errors. - // - // We choose a much lower `pool_idle_timeout`: 10 seconds is long enough - // to reuse a connection for all the requests made during one - // reconciliation pass, but is short enough we should discard it before - // the server wants to time us out. - let reqwest_client = reqwest::ClientBuilder::new() - .connect_timeout(Duration::from_secs(15)) - .read_timeout(Duration::from_secs(15)) - .pool_idle_timeout(Duration::from_secs(10)) - .build() - .expect("reqwest parameters are valid"); - - let baseurl = - format!("http://[{switch_zone_underlay_ip}]:{DENDRITE_PORT}"); - - let client = Client::new_with_client( - &baseurl, - reqwest_client, - dpd_client::ClientState { - tag: OMICRON_DPD_TAG.to_owned(), - log: parent_log - .new(slog::o!("component" => "DpdReconcilerClient")), - }, - ); - - Self { _client: client, _switch_slot: switch_slot } + Self { _client: mode.dpd_client(parent_log), _switch_slot: switch_slot } } async fn do_reconciliation( diff --git a/sled-agent/scrimlet-reconcilers/src/handle.rs b/sled-agent/scrimlet-reconcilers/src/handle.rs index 5f08d969ce1..8509109b29f 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle.rs @@ -14,22 +14,146 @@ use crate::status::ScrimletReconcilersStatus; use crate::status::ScrimletStatus; use crate::switch_zone_slot::ThisSledSwitchSlot; use crate::uplinkd_reconciler::UplinkdReconciler; +use omicron_common::address::DENDRITE_PORT; +use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use slog::info; +use std::net::SocketAddr; +use std::net::SocketAddrV6; use std::sync::Arc; use std::sync::OnceLock; +use std::time::Duration; use tokio::sync::watch; +/// Mode in which the scrimlet reconcilers should run. +/// +/// This exists to support tests where we don't have a real switch zone like +/// we expect to have on real hardware. The production sled-agent will always +/// pass `SwitchZone(ip)`; in this mode, we'll run reconcilers that talk to +/// services at the provided IP with their well-known ports and that communicate +/// with SMF within the switch zone. In the `Test { .. }` mode, we'll point the +/// reconcilers at the specified addresses for some services, and won't run the +/// SMF-based reconcilers at all. +#[derive(Debug, Clone, Copy)] +pub enum ScrimletReconcilersMode { + SwitchZone(ThisSledSwitchZoneUnderlayIpAddr), + #[cfg(any(test, feature = "testing"))] + Test { + mgs_addr: SocketAddr, + dpd_addr: SocketAddr, + mgd_addr: SocketAddr, + }, +} + +impl ScrimletReconcilersMode { + // Build a `reqwest` client with different timeout settings depending on + // whether we're expecting to contact a real service or a test one. + fn reqwest_client(&self) -> reqwest::Client { + match self { + ScrimletReconcilersMode::SwitchZone(_) => { + // Build a custom reqwest client, primarily to set a lower + // `pool_idle_timeout`. dropshot's default connection timeout is + // 30 seconds. We want to ensure we don't hit + // in any + // reconcilers that try to re-reconcile on a 30 second interval, + // so we choose a much lower `pool_idle_timeout`: 10 seconds is + // long enough to reuse a connection for all the requests made + // during one reconciliation pass, but is short enough we should + // discard it before the server wants to time us out. + // + // The 15 second connect and read timeout are consistent with + // progenitor's normal defaults. + reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .read_timeout(Duration::from_secs(15)) + .pool_idle_timeout(Duration::from_secs(10)) + .build() + .expect("reqwest parameters are valid") + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { .. } => { + // Some of our tests use tokio's paused time. We want to + // construct a reqwest client that does not specify any timeouts + // at all, allowing it to wait forever; this plays nicely with + // paused time. (Paused time + timeouts cause the timeouts to + // elapse instantly, which doesn't mesh well with establishing + // TCP connections.) + reqwest::Client::new() + } + } + } + + pub(crate) fn mgs_client( + self, + parent_log: &Logger, + ) -> gateway_client::Client { + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), MGS_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { mgs_addr, .. } => mgs_addr, + }; + let baseurl = format!("http://{addr}"); + gateway_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + parent_log + .new(slog::o!("component" => "ThisSledSwitchSlotMgsClient")), + ) + } + + pub(crate) fn dpd_client(self, parent_log: &Logger) -> dpd_client::Client { + use omicron_common::OMICRON_DPD_TAG; + + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), DENDRITE_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { dpd_addr, .. } => dpd_addr, + }; + let baseurl = format!("http://{addr}"); + dpd_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + dpd_client::ClientState { + tag: OMICRON_DPD_TAG.to_owned(), + log: parent_log + .new(slog::o!("component" => "DpdReconcilerClient")), + }, + ) + } + + pub(crate) fn mgd_client( + self, + parent_log: &Logger, + ) -> mg_admin_client::Client { + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), MGD_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { mgd_addr, .. } => mgd_addr, + }; + let baseurl = format!("http://{addr}"); + mg_admin_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + parent_log.new(slog::o!("component" => "MgdReconcilerClient")), + ) + } +} + /// Information required to enable all the scrimlet reconciler tasks provided /// by this crate. -/// #[derive(Debug, Clone)] pub struct SledAgentNetworkingInfo { pub system_networking_config_rx: watch::Receiver, - pub switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + pub mode: ScrimletReconcilersMode, } /// Handle to tasks that reconcile network configuration with services within a @@ -78,11 +202,6 @@ pub struct ScrimletReconcilers { running_reconcilers: Arc>, parent_log: Logger, - - // Hook for unit tests to modify how we construct an MGS client when - // determining our switch slot. - #[cfg(test)] - override_make_mgs_client: Option gateway_client::Client>>, } impl ScrimletReconcilers { @@ -97,8 +216,6 @@ impl ScrimletReconcilers { determining_switch_slot: OnceLock::new(), running_reconcilers: Arc::new(OnceLock::new()), parent_log: parent_log.clone(), - #[cfg(test)] - override_make_mgs_client: None, } } @@ -173,8 +290,6 @@ impl ScrimletReconcilers { ); } - let mgs_client = self.make_mgs_client(info.switch_zone_underlay_ip); - // We now know this is the one and only time we've been called; spawn a // task that waits until we're a scrimlet, then waits until we can // determine our switch slot, then spawns all the running reconcilers @@ -187,43 +302,11 @@ impl ScrimletReconcilers { Arc::clone(&self.running_reconcilers), determining_switch_slot_tx, self.scrimlet_status_tx.subscribe(), - mgs_client, + info.mode.mgs_client(&self.parent_log), info, self.parent_log.clone(), )); } - - #[cfg(test)] - fn make_mgs_client( - &self, - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, - ) -> gateway_client::Client { - if let Some(make_client) = &self.override_make_mgs_client { - make_client() - } else { - self.make_real_mgs_client(switch_zone_underlay_ip) - } - } - - #[cfg(not(test))] - fn make_mgs_client( - &self, - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, - ) -> gateway_client::Client { - self.make_real_mgs_client(switch_zone_underlay_ip) - } - - fn make_real_mgs_client( - &self, - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, - ) -> gateway_client::Client { - let baseurl = format!("http://[{switch_zone_underlay_ip}]:{MGS_PORT}"); - gateway_client::Client::new( - &baseurl, - self.parent_log - .new(slog::o!("component" => "ThisSledSwitchSlotMgsClient")), - ) - } } async fn determine_switch_slot( @@ -291,14 +374,14 @@ impl RunningReconcilers { let dpd_reconciler = ReconcilerTaskHandle::::spawn( scrimlet_status_rx.clone(), networking_info.system_networking_config_rx.clone(), - networking_info.switch_zone_underlay_ip, + networking_info.mode, this_sled_switch_slot, parent_log, ); let mgd_reconciler = ReconcilerTaskHandle::::spawn( scrimlet_status_rx.clone(), networking_info.system_networking_config_rx.clone(), - networking_info.switch_zone_underlay_ip, + networking_info.mode, this_sled_switch_slot, parent_log, ); @@ -306,7 +389,7 @@ impl RunningReconcilers { ReconcilerTaskHandle::::spawn( scrimlet_status_rx, networking_info.system_networking_config_rx, - networking_info.switch_zone_underlay_ip, + networking_info.mode, this_sled_switch_slot, parent_log, ); diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs index 4198d248edd..cf88740a485 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -36,36 +36,20 @@ impl Harness { }); let mock_mgs = MockServer::start(); - let mut handle = ScrimletReconcilers::new(log); - - // Override how `handle` will attempt to contact MGS to point it at our - // mock server instead. - handle.override_make_mgs_client = { - let url = format!("http://{}", mock_mgs.address()); - let client = gateway_client::Client::new_with_client( - &url, - // Tricky bit: we use tokio's paused time in tests below both - // for consistency and test speed, but by default progenitor - // clients have a 15-second connection timeout. That passes - // _instantly_ if time is paused, which doesn't let us establish - // real TCP connections to the httpmock server, causing tests to - // spam connections as fast as possible. Pass a default reqwest - // client, which has no such timeout, allowing connections to - // wait (and succeed or fail as intended). - reqwest::Client::new(), - log.new(slog::o!("component" => "test-mgs-client")), - ); - Some(Box::new(move || client.clone())) - }; + let handle = ScrimletReconcilers::new(log); Self { handle, networking_config_tx, mock_mgs } } fn sled_agent_networking_info(&self) -> SledAgentNetworkingInfo { + let dummy_addr = "0.0.0.0:0".parse().unwrap(); SledAgentNetworkingInfo { system_networking_config_rx: self.networking_config_tx.subscribe(), - switch_zone_underlay_ip: - ThisSledSwitchZoneUnderlayIpAddr::TEST_FAKE, + mode: ScrimletReconcilersMode::Test { + mgs_addr: *self.mock_mgs.address(), + dpd_addr: dummy_addr, + mgd_addr: dummy_addr, + }, } } diff --git a/sled-agent/scrimlet-reconcilers/src/lib.rs b/sled-agent/scrimlet-reconcilers/src/lib.rs index 993f6388088..0a5de4a3fe8 100644 --- a/sled-agent/scrimlet-reconcilers/src/lib.rs +++ b/sled-agent/scrimlet-reconcilers/src/lib.rs @@ -52,6 +52,7 @@ mod uplinkd_reconciler; pub use dpd_reconciler::DpdReconcilerStatus; pub use handle::ScrimletReconcilers; +pub use handle::ScrimletReconcilersMode; pub use handle::SledAgentNetworkingInfo; pub use mgd_reconciler::MgdReconcilerStatus; pub use status::DetermineSwitchSlotStatus; diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs index 3a400ae1970..3d4d5664713 100644 --- a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs @@ -5,11 +5,10 @@ //! Reconciler responsible for configuration of `mgd` within a scrimlet's switch //! zone. +use crate::ScrimletReconcilersMode; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; use mg_admin_client::Client; -use omicron_common::address::MGD_PORT; -use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; @@ -32,39 +31,11 @@ impl Reconciler for MgdReconciler { const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); fn new( - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + mode: ScrimletReconcilersMode, switch_slot: ThisSledSwitchSlot, parent_log: &Logger, ) -> Self { - // Build a custom reqwest client, primarily to set a lower - // `pool_idle_timeout`. Our `RE_RECONCILE_INTERVAL` interval of 30 - // seconds happens to coincide exactly with dropshot's default - // connection timeout of 30 seconds. In early testing, this caused us to - // hit surprisingly - // frequently: mgd would close a connection right as we were trying to - // use it, resulting in spurious "connection closed before message - // completed" or "connection reset by peer" errors. - // - // We choose a much lower `pool_idle_timeout`: 10 seconds is long enough - // to reuse a connection for all the requests made during one - // reconciliation pass, but is short enough we should discard it before - // the server wants to time us out. - let reqwest_client = reqwest::ClientBuilder::new() - .connect_timeout(Duration::from_secs(15)) - .read_timeout(Duration::from_secs(15)) - .pool_idle_timeout(Duration::from_secs(10)) - .build() - .expect("reqwest parameters are valid"); - - let baseurl = format!("http://[{switch_zone_underlay_ip}]:{MGD_PORT}"); - - let client = Client::new_with_client( - &baseurl, - reqwest_client, - parent_log.new(slog::o!("component" => "MgdReconcilerClient")), - ); - - Self { _client: client, _switch_slot: switch_slot } + Self { _client: mode.mgd_client(parent_log), _switch_slot: switch_slot } } async fn do_reconciliation( diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs index a15bd7cad17..dbbb2429d4f 100644 --- a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs @@ -21,6 +21,7 @@ //! [`ReconcilerTask::run()`] handles 1 and 3, and service-specific //! implementations of [`Reconciler`] provide 2. +use crate::handle::ScrimletReconcilersMode; use crate::status::ReconcilerActivationReason; use crate::status::ReconcilerCurrentStatus; use crate::status::ReconcilerInertReason; @@ -30,7 +31,6 @@ use crate::status::ReconciliationCompletedStatus; use crate::status::ScrimletStatus; use crate::switch_zone_slot::ThisSledSwitchSlot; use chrono::Utc; -use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use slog::error; @@ -51,11 +51,11 @@ pub(crate) trait Reconciler: Send + 'static { /// Construct a new instance of this `Reconciler`. /// - /// Typically builds a client for the relevant service based on - /// `switch_zone_underlay_ip` and record `switch_slot` for use inside future - /// calls to `do_reconciliation()`. + /// Typically builds a client for the relevant service based on `mode` and + /// record `switch_slot` for use inside future calls to + /// `do_reconciliation()`. fn new( - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + mode: ScrimletReconcilersMode, switch_slot: ThisSledSwitchSlot, parent_log: &Logger, ) -> Self; @@ -87,14 +87,14 @@ impl ReconcilerTaskHandle { pub(crate) fn spawn( scrimlet_status_rx: watch::Receiver, system_networking_config_rx: watch::Receiver, - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + mode: ScrimletReconcilersMode, this_sled_switch_slot: ThisSledSwitchSlot, parent_log: &Logger, ) -> Self { Self::spawn_impl( scrimlet_status_rx, system_networking_config_rx, - switch_zone_underlay_ip, + mode, this_sled_switch_slot, parent_log, T::new, @@ -107,17 +107,13 @@ impl ReconcilerTaskHandle { fn spawn_impl( scrimlet_status_rx: watch::Receiver, system_networking_config_rx: watch::Receiver, - switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + mode: ScrimletReconcilersMode, this_sled_switch_slot: ThisSledSwitchSlot, parent_log: &Logger, inner_constructor: F, ) -> Self where - F: FnOnce( - ThisSledSwitchZoneUnderlayIpAddr, - ThisSledSwitchSlot, - &Logger, - ) -> T + F: FnOnce(ScrimletReconcilersMode, ThisSledSwitchSlot, &Logger) -> T + Send + Sync + 'static, @@ -134,11 +130,7 @@ impl ReconcilerTaskHandle { scrimlet_status_rx, system_networking_config_rx, status_tx, - inner: inner_constructor( - switch_zone_underlay_ip, - this_sled_switch_slot, - parent_log, - ), + inner: inner_constructor(mode, this_sled_switch_slot, parent_log), log, }; diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs index df6b8d2af6e..ddf94a0f4df 100644 --- a/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs @@ -23,7 +23,7 @@ impl Reconciler for MockReconciler { const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); fn new( - _switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + _mode: ScrimletReconcilersMode, _switch_slot: ThisSledSwitchSlot, _parent_log: &Logger, ) -> Self { @@ -97,10 +97,15 @@ impl Harness { let task = { let do_reconciliation_calls = Arc::clone(&do_reconciliation_calls); + let dummy_addr = "0.0.0.0:0".parse().unwrap(); ReconcilerTaskHandle::spawn_impl( scrimlet_status_rx, system_networking_config_rx, - ThisSledSwitchZoneUnderlayIpAddr::TEST_FAKE, + ScrimletReconcilersMode::Test { + mgs_addr: dummy_addr, + dpd_addr: dummy_addr, + mgd_addr: dummy_addr, + }, ThisSledSwitchSlot::TEST_FAKE, log, |_ip, _slot, _log| MockReconciler { diff --git a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs index bde11b5c301..3f0e9d487e5 100644 --- a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs +++ b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs @@ -7,9 +7,9 @@ //! Unlike most reconcilers in this crate, `uplinkd`'s configuration is managed //! via SMF, not a dropshot server. +use crate::ScrimletReconcilersMode; use crate::reconciler_task::Reconciler; use crate::switch_zone_slot::ThisSledSwitchSlot; -use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; use sled_agent_types::system_networking::SystemNetworkingConfig; use slog::Logger; use std::time::Duration; @@ -31,10 +31,12 @@ impl Reconciler for UplinkdReconciler { const RE_RECONCILE_INTERVAL: std::time::Duration = Duration::from_secs(30); fn new( - _switch_zone_underlay_ip: ThisSledSwitchZoneUnderlayIpAddr, + _mode: ScrimletReconcilersMode, switch_slot: ThisSledSwitchSlot, _parent_log: &Logger, ) -> Self { + // TODO: Remain inert if `mode` is `ScrimletReconcilersMode::Test`, + // since that indicates there's no real zone to connect to. Self { _switch_slot: switch_slot } } diff --git a/sled-agent/types/src/sled.rs b/sled-agent/types/src/sled.rs index 20c570251d2..1435d4ad0b7 100644 --- a/sled-agent/types/src/sled.rs +++ b/sled-agent/types/src/sled.rs @@ -29,9 +29,6 @@ mod local_switch_zone_ip { pub struct ThisSledSwitchZoneUnderlayIpAddr(Ipv6Addr); impl ThisSledSwitchZoneUnderlayIpAddr { - #[cfg(any(test, feature = "testing"))] - pub const TEST_FAKE: Self = Self(Ipv6Addr::LOCALHOST); - /// Construct a [`ThisSledSwitchZoneUnderlayIpAddr`] from the request to /// start this sled agent. /// From 885b6c567aa03e4e8292a0f7ed44e5a5205c32e7 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 5 May 2026 11:32:40 -0400 Subject: [PATCH 8/8] use real MGS for some tests --- Cargo.lock | 2 + sled-agent/scrimlet-reconcilers/Cargo.toml | 2 + .../scrimlet-reconcilers/src/handle/tests.rs | 109 ++++++++++++------ 3 files changed, 80 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e70d41ab34..4cf350b9dc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13700,6 +13700,8 @@ dependencies = [ "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=d147f0925d14ed6f00eb37cb81f3a1bcbcb3c7f3)", "dropshot 0.17.0", "gateway-client", + "gateway-messages", + "gateway-test-utils", "gateway-types", "httpmock", "mg-admin-client", diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml index 97268b99c6e..533574a1772 100644 --- a/sled-agent/scrimlet-reconcilers/Cargo.toml +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -26,6 +26,8 @@ omicron-workspace-hack.workspace = true [dev-dependencies] assert_matches.workspace = true dropshot.workspace = true +gateway-messages.workspace = true +gateway-test-utils.workspace = true httpmock.workspace = true omicron-test-utils.workspace = true serde_json.workspace = true diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs index cf88740a485..54439c1bb10 100644 --- a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use std::net::Ipv6Addr; use std::time::Duration; use super::*; @@ -9,19 +10,49 @@ use assert_matches::assert_matches; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::test_util::LogContext; +use gateway_messages::SpPort; +use gateway_test_utils::setup::GatewayTestContext; use httpmock::Mock; use httpmock::MockServer; use omicron_test_utils::dev; use sled_agent_types::early_networking::RackNetworkConfig; -struct Harness { +// For "happy path" tests, we spin up a real MGS instances (pointed at a +// simulated SP). +// +// For "sad path" tests, we use a mock MGS so we can inject failures. +trait MgsFlavor { + fn address(&self) -> SocketAddr; + async fn teardown(self); +} + +impl MgsFlavor for GatewayTestContext { + fn address(&self) -> SocketAddr { + SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, self.port, 0, 0)) + } + + async fn teardown(self) { + GatewayTestContext::teardown(self).await + } +} + +impl MgsFlavor for MockServer { + fn address(&self) -> SocketAddr { + *MockServer::address(self) + } + + async fn teardown(self) {} +} + +struct Harness { handle: ScrimletReconcilers, networking_config_tx: watch::Sender, - mock_mgs: MockServer, + mgs: T, + logctx: LogContext, } -impl Harness { - fn new(log: &Logger) -> Self { +impl Harness { + fn new_common(logctx: LogContext, mgs: T) -> Self { let (networking_config_tx, _) = watch::channel(SystemNetworkingConfig { rack_network_config: RackNetworkConfig { @@ -35,10 +66,14 @@ impl Harness { service_zone_nat_entries: None, }); - let mock_mgs = MockServer::start(); - let handle = ScrimletReconcilers::new(log); + let handle = ScrimletReconcilers::new(&logctx.log); + + Self { handle, networking_config_tx, mgs, logctx } + } - Self { handle, networking_config_tx, mock_mgs } + async fn teardown(self) { + self.logctx.cleanup_successful(); + self.mgs.teardown().await; } fn sled_agent_networking_info(&self) -> SledAgentNetworkingInfo { @@ -46,7 +81,7 @@ impl Harness { SledAgentNetworkingInfo { system_networking_config_rx: self.networking_config_tx.subscribe(), mode: ScrimletReconcilersMode::Test { - mgs_addr: *self.mock_mgs.address(), + mgs_addr: self.mgs.address(), dpd_addr: dummy_addr, mgd_addr: dummy_addr, }, @@ -59,7 +94,7 @@ impl Harness { { let mut status = self.handle.status(); let start = tokio::time::Instant::now(); - while start.elapsed() < Duration::from_secs(10) { + while start.elapsed() < Duration::from_secs(30) { if matches(&status) { return; } @@ -68,11 +103,28 @@ impl Harness { } panic!("timeout waiting for task status (got {status:?})"); } +} + +impl Harness { + async fn new_real_mgs(logctx: LogContext) -> Self { + let ctx = gateway_test_utils::setup::test_setup( + logctx.test_name(), + SpPort::One, + ) + .await; + Self::new_common(logctx, ctx) + } +} + +impl Harness { + fn new_mock_mgs(logctx: LogContext) -> Self { + Self::new_common(logctx, MockServer::start()) + } fn mock_mgs_set_switch_slot(&self, slot: u16) -> Mock<'_> { let body = serde_json::json!({ "type": "switch", "slot": slot }).to_string(); - self.mock_mgs.mock(|when, then| { + self.mgs.mock(|when, then| { when.method(httpmock::Method::GET).path("/local/switch-id"); then.status(200) .header("content-type", "application/json") @@ -81,7 +133,7 @@ impl Harness { } fn mock_mgs_set_503(&self) -> Mock<'_> { - self.mock_mgs.mock(|when, then| { + self.mgs.mock(|when, then| { when.method(httpmock::Method::GET).path("/local/switch-id"); then.status(503).header("content-type", "application/json").body( serde_json::json!({ @@ -129,7 +181,7 @@ async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { "calling_set_sled_agent_networking_info_once_multiple_times_panics", &log_config, ); - let harness = Harness::new(&logctx.log); + let harness = Harness::new_mock_mgs(logctx); harness.handle.set_sled_agent_networking_info_once( harness.sled_agent_networking_info(), @@ -143,7 +195,7 @@ async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { #[tokio::test] async fn non_scrimlet_two_phase_initialization() { let logctx = dev::test_setup_log("non_scrimlet_two_phase_initialization"); - let harness = Harness::new(&logctx.log); + let harness = Harness::new_mock_mgs(logctx); // initial status assert_matches!( @@ -163,20 +215,17 @@ async fn non_scrimlet_two_phase_initialization() { ) ); - logctx.cleanup_successful(); + harness.teardown().await; } // Happy-path test for scrimlets where we find out we're a scrimlet after // getting our networking info. -#[tokio::test(start_paused = true)] +#[tokio::test] async fn scrimlet_three_phase_initialization_info_then_scrimlet() { let logctx = dev::test_setup_log( "scrimlet_three_phase_initialization_info_then_scrimlet", ); - let harness = Harness::new(&logctx.log); - - // Configure our mock MGS to claim to be switch slot 0. - let success_mock = harness.mock_mgs_set_switch_slot(0); + let harness = Harness::new_real_mgs(logctx).await; // initial status assert_matches!( @@ -198,28 +247,23 @@ async fn scrimlet_three_phase_initialization_info_then_scrimlet() { harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); - // We should contact MGS, then transition to the running state. - harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) }) .await; - logctx.cleanup_successful(); + harness.teardown().await; } // Happy-path test for scrimlets where we find out we're a scrimlet before // getting our networking info. -#[tokio::test(start_paused = true)] +#[tokio::test] async fn scrimlet_three_phase_initialization_scrimlet_then_info() { let logctx = dev::test_setup_log( "scrimlet_three_phase_initialization_scrimlet_then_info", ); - let harness = Harness::new(&logctx.log); - - // Configure our mock MGS to claim to be switch slot 0. - let success_mock = harness.mock_mgs_set_switch_slot(0); + let harness = Harness::new_real_mgs(logctx).await; // Become a scrimlet right away. harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); @@ -235,21 +279,20 @@ async fn scrimlet_three_phase_initialization_scrimlet_then_info() { ); // We're already a scrimlet, so we contact MGS then transition to Running. - harness.wait_for_mock_to_be_called(&success_mock).await; harness .wait_for_task_status(|status| { matches!(status, ScrimletReconcilersStatus::Running { .. }) }) .await; - logctx.cleanup_successful(); + harness.teardown().await; } // Scrimlet test case where MGS fails the first time we ask then succeeds later. #[tokio::test(start_paused = true)] async fn scrimlet_mgs_fails_first_attempt() { let logctx = dev::test_setup_log("scrimlet_mgs_fails_first_attempt"); - let harness = Harness::new(&logctx.log); + let harness = Harness::new_mock_mgs(logctx); // Configure our mock MGS to return an HTTP 503. let mut error_mock = harness.mock_mgs_set_503(); @@ -302,7 +345,7 @@ async fn scrimlet_mgs_fails_first_attempt() { }) .await; - logctx.cleanup_successful(); + harness.teardown().await; } // Scrimlet test case where MGS fails, then we become "not a scrimlet", then we @@ -311,7 +354,7 @@ async fn scrimlet_mgs_fails_first_attempt() { async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { let logctx = dev::test_setup_log("scrimlet_mgs_fails_then_we_become_not_a_scrimlet"); - let harness = Harness::new(&logctx.log); + let harness = Harness::new_mock_mgs(logctx); // Configure our mock MGS to return an HTTP 503. let mut error_mock = harness.mock_mgs_set_503(); @@ -379,5 +422,5 @@ async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { }) .await; - logctx.cleanup_successful(); + harness.teardown().await; }