From 829bd2f00b265b71f49ebae1bb94623efb044ea8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:14:34 -0700 Subject: [PATCH 1/5] Add CRDB schema + DB model for FMD inventory tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three tables for persisting per-sled FMD data: inv_fmd_status — per-sled outcome of FMD collection inv_fmd_host_case — diagnosed cases (event payload as JSONB) inv_fmd_resource — resources affected by cases Bumps SCHEMA_VERSION to 254 with directory schema/crdb/inv-fmd. Adds diesel table! entries, db-model structs, and From impls for the read path. No callers yet — write/read/display follow in subsequent commits. --- nexus/db-model/src/inventory.rs | 120 ++++++++++++++++++++++++++ nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-schema/src/schema.rs | 32 +++++++ schema/crdb/dbinit.sql | 44 +++++++++- schema/crdb/inv-fmd/up01.sql | 10 +++ schema/crdb/inv-fmd/up02.sql | 13 +++ schema/crdb/inv-fmd/up03.sql | 16 ++++ 7 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/inv-fmd/up01.sql create mode 100644 schema/crdb/inv-fmd/up02.sql create mode 100644 schema/crdb/inv-fmd/up03.sql diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 5479994d340..87e6d9c5974 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -34,6 +34,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, + inv_fmd_host_case, inv_fmd_resource, inv_fmd_status, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, @@ -64,6 +65,8 @@ use omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use 
omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::FmdHostCaseKind; +use omicron_uuid_kinds::FmdResourceKind; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -85,6 +88,9 @@ use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; use sled_agent_types::inventory::BootImageHeader; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdHostCase; +use sled_agent_types::inventory::FmdInventoryResult; +use sled_agent_types::inventory::FmdResource; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::ManifestBootInventory; @@ -2128,6 +2134,120 @@ impl InvSvcEnabledNotOnlineParseError { } } +/// One row per (collection, sled) recording the outcome of FMD inventory +/// collection. `error_message` is `NULL` when the daemon was queried +/// successfully (even if it reported zero faults); set when collection +/// failed (e.g. on non-illumos sleds, or when the daemon was unreachable). 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_status)] +pub struct InvFmdStatus { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub error_message: Option, +} + +impl InvFmdStatus { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + result: &FmdInventoryResult, + ) -> Self { + let error_message = match result { + FmdInventoryResult::Available(_) => None, + FmdInventoryResult::Error { error } => Some(error.clone()), + }; + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + error_message, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_host_case)] +pub struct InvFmdHostCase { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub case_id: DbTypedUuid, + pub code: String, + pub url: String, + pub event: Option, +} + +impl InvFmdHostCase { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + case: &FmdHostCase, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + case_id: case.uuid.into(), + code: case.code.clone(), + url: case.url.clone(), + event: case.event.clone(), + } + } +} + +impl From for FmdHostCase { + fn from(row: InvFmdHostCase) -> Self { + Self { + uuid: row.case_id.into(), + code: row.code, + url: row.url, + event: row.event, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_resource)] +pub struct InvFmdResource { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub resource_id: DbTypedUuid, + pub fmri: String, + pub case_id: DbTypedUuid, + pub faulty: bool, + pub unusable: bool, + pub invisible: bool, +} + +impl InvFmdResource { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + resource: &FmdResource, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + 
resource_id: resource.uuid.into(), + fmri: resource.fmri.clone(), + case_id: resource.case_id.into(), + faulty: resource.faulty, + unusable: resource.unusable, + invisible: resource.invisible, + } + } +} + +impl From for FmdResource { + fn from(row: InvFmdResource) -> Self { + Self { + uuid: row.resource_id.into(), + fmri: row.fmri, + case_id: row.case_id.into(), + faulty: row.faulty, + unusable: row.unusable, + invisible: row.invisible, + } + } +} + // See [`sled_agent_types::inventory::SvcEnabledNotOnlineState`]. impl_enum_type!( InvSvcEnabledNotOnlineStateEnum: diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index c927e8cfc64..7d230d2ecd6 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(253, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(254, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(254, "inv-fmd"), KnownVersion::new(253, "delete-nexus-default-allow-firewall-rule"), KnownVersion::new(252, "fm-support-bundle-and-alert-request-comments"), KnownVersion::new(251, "fm-sitrep-next-inv-min-time-started"), diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 0a66531be52..69e48b1bb2c 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1791,6 +1791,38 @@ table! { } } +table! 
{ + inv_fmd_status (inv_collection_id, sled_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + error_message -> Nullable, + } +} + +table! { + inv_fmd_host_case (inv_collection_id, sled_id, case_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + case_id -> Uuid, + code -> Text, + url -> Text, + event -> Nullable, + } +} + +table! { + inv_fmd_resource (inv_collection_id, sled_id, resource_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + resource_id -> Uuid, + fmri -> Text, + case_id -> Uuid, + faulty -> Bool, + unusable -> Bool, + invisible -> Bool, + } +} + table! { inv_sled_agent (inv_collection_id, sled_id) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 42bad504270..46b66f311c5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5131,6 +5131,48 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_svc_enabled_not_online_parse_error PRIMARY KEY (inv_collection_id, sled_id, id) ); +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + -- NULL when FMD data was successfully collected. Set to the error + -- string when FMD collection failed (e.g. on non-illumos sleds, or + -- when the daemon was unreachable). + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). 
+ event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- The case_id pairs with a corresponding row in inv_fmd_host_case + -- under the same (inv_collection_id, sled_id) partition. + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); + /* * Various runtime configuration switches for reconfigurator * @@ -8475,7 +8517,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '253.0.0', NULL) + (TRUE, NOW(), NOW(), '254.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql new file mode 100644 index 00000000000..de8f7c2c448 --- /dev/null +++ b/schema/crdb/inv-fmd/up01.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + -- NULL when FMD data was successfully collected. Set to the error + -- string when FMD collection failed (e.g. on non-illumos sleds, or + -- when the daemon was unreachable). + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id) +); diff --git a/schema/crdb/inv-fmd/up02.sql b/schema/crdb/inv-fmd/up02.sql new file mode 100644 index 00000000000..7907cc8ac9f --- /dev/null +++ b/schema/crdb/inv-fmd/up02.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. 
Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql new file mode 100644 index 00000000000..9bf3e4a7930 --- /dev/null +++ b/schema/crdb/inv-fmd/up03.sql @@ -0,0 +1,16 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- The case_id pairs with a corresponding row in inv_fmd_host_case + -- under the same (inv_collection_id, sled_id) partition. + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); From 00e634707ef002b1d2f9970c5e622dc738377d02 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:20:28 -0700 Subject: [PATCH 2/5] nexus-types/inventory: add fmd field on SledAgent + builder passthrough Wires the fmd field added to sled-agent's Inventory by the parent PR through into Nexus's in-memory inventory representation. The collector builder copies inventory.fmd verbatim. The DB read path will populate it from the inv_fmd_* tables in a follow-on commit; for now, the read path substitutes Available(empty) so existing tests round-trip cleanly. 
--- nexus/db-queries/src/db/datastore/inventory.rs | 4 ++++ nexus/inventory/src/builder.rs | 1 + nexus/types/src/inventory.rs | 2 ++ nexus/types/src/inventory/display.rs | 1 + 4 files changed, 8 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index cd47ca48dcf..f37d666c70a 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -4637,6 +4637,10 @@ impl DataStore { reference_measurements: last_reconciliation_measurements .remove(&sled_id) .unwrap_or_default(), + // Populated by the read path in a follow-on commit. + fmd: sled_agent_types::inventory::FmdInventoryResult::Available( + sled_agent_types::inventory::FmdInventory::default(), + ), }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 08ff5ea2538..fb9e4890a8b 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -678,6 +678,7 @@ impl CollectionBuilder { smf_services_enabled_not_online: inventory .smf_services_enabled_not_online, reference_measurements: inventory.reference_measurements, + fmd: inventory.fmd, }; self.sleds diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 887fb615599..26218753b32 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -35,6 +35,7 @@ use serde_with::serde_as; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventory; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryResult; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types_versions::latest::inventory::FmdInventoryResult; use sled_agent_types_versions::latest::inventory::InventoryDataset; use sled_agent_types_versions::latest::inventory::InventoryDisk; use sled_agent_types_versions::latest::inventory::InventoryZpool; @@ -649,6 +650,7 @@ pub struct 
SledAgent { pub file_source_resolver: OmicronFileSourceResolverInventory, pub smf_services_enabled_not_online: SvcsEnabledNotOnlineResult, pub reference_measurements: IdOrdMap, + pub fmd: FmdInventoryResult, } impl IdOrdItem for SledAgent { diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 0964ba23d92..a410b22c04a 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -625,6 +625,7 @@ fn display_sleds( file_source_resolver, smf_services_enabled_not_online, reference_measurements, + fmd: _, } = sled; writeln!( From cfb3eea40d7eafc5e97a8342e82b2a902d489dc5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:24:19 -0700 Subject: [PATCH 3/5] datastore: write + prune for inv_fmd_* tables Insert one InvFmdStatus row per sled in each inventory collection, plus a row per case and resource when collection succeeded. Wire the three tables into the existing prune transaction so old collections clean up after themselves. --- .../db-queries/src/db/datastore/inventory.rs | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index f37d666c70a..553b7c4c43f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -39,6 +39,9 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::InvConfigReconcilerStatus; use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvFmdHostCase; +use nexus_db_model::InvFmdResource; +use nexus_db_model::InvFmdStatus; use nexus_db_model::InvHostPhase1ActiveSlot; use nexus_db_model::InvHostPhase1FlashHash; use nexus_db_model::InvInternalDns; @@ -456,6 +459,59 @@ impl DataStore { }) .collect(); + // Pull FMD inventory out of all sled agents. 
We always record one + // status row per sled (capturing the success/failure discriminant) + // and, when collection succeeded, a row per case and per resource. + let fmd_status_rows: Vec<_> = collection + .sled_agents + .iter() + .map(|sled_agent| { + InvFmdStatus::new( + collection_id, + sled_agent.sled_id, + &sled_agent.fmd, + ) + }) + .collect(); + let fmd_host_case_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let cases = match &sled_agent.fmd { + sled_agent_types::inventory::FmdInventoryResult::Available( + inv, + ) => Some(&inv.cases), + sled_agent_types::inventory::FmdInventoryResult::Error { + .. + } => None, + }; + cases.into_iter().flatten().map(|case| { + InvFmdHostCase::new(collection_id, sled_agent.sled_id, case) + }) + }) + .collect(); + let fmd_resource_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let resources = match &sled_agent.fmd { + sled_agent_types::inventory::FmdInventoryResult::Available( + inv, + ) => Some(&inv.resources), + sled_agent_types::inventory::FmdInventoryResult::Error { + .. + } => None, + }; + resources.into_iter().flatten().map(|resource| { + InvFmdResource::new( + collection_id, + sled_agent.sled_id, + resource, + ) + }) + }) + .collect(); + // Build up a list of `OmicronSledConfig`s we need to insert. Each sled // has 0-3: // @@ -1430,7 +1486,62 @@ impl DataStore { } } + // Insert FMD status rows (one per sled). + { + use nexus_db_schema::schema::inv_fmd_status::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_status_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_status) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + // Insert FMD host case rows (zero or more per sled). 
+ { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_host_case_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_host_case) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + + // Insert FMD resource rows (zero or more per sled). + { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_resource_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_resource) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } // Insert rows for all the sled config reconciler disk results { @@ -2164,6 +2275,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets: usize, nlast_reconciliation_zone_results: usize, nlast_reconciliation_measurements: usize, + nfmd_status: usize, + nfmd_host_cases: usize, + nfmd_resources: usize, nzone_manifest_zones: usize, nzone_manifest_measurements: usize, nzone_manifest_non_boot: usize, @@ -2204,6 +2318,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2382,6 +2499,31 @@ impl DataStore { .await? }; + // Remove FMD inventory rows. + let nfmd_status = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + diesel::delete(dsl::inv_fmd_status.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? 
+ }; + let nfmd_host_cases = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + diesel::delete(dsl::inv_fmd_host_case.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nfmd_resources = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + diesel::delete(dsl::inv_fmd_resource.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; // Remove rows associated with zone resolver inventory. let nzone_manifest_zones = { @@ -2596,6 +2738,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2647,6 +2792,9 @@ impl DataStore { nlast_reconciliation_zone_results, "nlast_reconciliation_measurements" => nlast_reconciliation_measurements, + "nfmd_status" => nfmd_status, + "nfmd_host_cases" => nfmd_host_cases, + "nfmd_resources" => nfmd_resources, "nzone_manifest_zones" => nzone_manifest_zones, "nzone_manifest_measurements" => nzone_manifest_measurements, "nzone_manifest_non_boot" => nzone_manifest_non_boot, From 8e47394589c68c01ab2d8c50012d3adce7422a9c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:26:58 -0700 Subject: [PATCH 4/5] datastore: read path for FMD inventory Loads inv_fmd_status, inv_fmd_host_case, and inv_fmd_resource for the collection and reconstructs SledAgent.fmd. Status row's NULL error_message indicates Available; non-NULL becomes Error{error}. A missing status row falls back to Available with whatever cases/resources were found (defensive, in case of historical data predating this PR). 
--- .../db-queries/src/db/datastore/inventory.rs | 112 +++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 553b7c4c43f..e4ab70dcb4f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -4199,6 +4199,89 @@ impl DataStore { measurements }; + // Load all FMD inventory rows. We expect at most ~tens of cases or + // resources per sled, so we don't bother paginating. + let mut fmd_status_by_sled: BTreeMap> = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + let rows = dsl::inv_fmd_status + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdStatus::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + rows.into_iter() + .map(|row| (row.sled_id.into(), row.error_message)) + .collect() + }; + + let mut fmd_cases_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + let rows = dsl::inv_fmd_host_case + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdHostCase::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD case: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + + let mut fmd_resources_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + let rows = dsl::inv_fmd_resource + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdResource::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + 
public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD resource: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + // Load all the config reconciler zone results; build a map of maps // keyed by sled ID. let mut last_reconciliation_zone_results = { @@ -4785,10 +4868,31 @@ impl DataStore { reference_measurements: last_reconciliation_measurements .remove(&sled_id) .unwrap_or_default(), - // Populated by the read path in a follow-on commit. - fmd: sled_agent_types::inventory::FmdInventoryResult::Available( - sled_agent_types::inventory::FmdInventory::default(), - ), + fmd: { + use sled_agent_types::inventory::{ + FmdInventory, FmdInventoryResult, + }; + let cases = + fmd_cases_by_sled.remove(&sled_id).unwrap_or_default(); + let resources = fmd_resources_by_sled + .remove(&sled_id) + .unwrap_or_default(); + // The status row's error_message column distinguishes + // Available (NULL) from Error (the message). If no row + // exists at all (i.e. an older collection predates this + // migration), fall back to Available with whatever + // case/resource rows we found, which will normally be + // empty. + match fmd_status_by_sled.remove(&sled_id) { + Some(Some(error)) => { + FmdInventoryResult::Error { error } + } + _ => FmdInventoryResult::Available(FmdInventory { + cases, + resources, + }), + } + }, }; sled_agents .insert_unique(sled_agent) From 39053f794ef695c73d5056c1ea5aee97766d1229 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:35:38 -0700 Subject: [PATCH 5/5] Display wrappers + omdb golden output for FMD inventory Adds Display wrappers for FmdInventoryResult/FmdInventory/FmdHostCase/ FmdResource on the sled-agent types. 
Wires them into nexus/types/src/inventory/display.rs::display_sleds so that `omdb db inventory collections show` (and reconfigurator-cli scripts that print sled inventories) include the FMD section. The FmdHostCase event payload is the FMD nvlist serialized to JSON; we intentionally don't interpret the schema, so it's pretty-printed verbatim under the case heading. Also seeds the representative test inventory (nexus/inventory examples) with a single fault case + resource so the inv_fmd_* tables get rows under test_representative_collection_populates_database. The reconfigurator-cli golden outputs grow a 'fmd:' section accordingly. --- .../tests/output/cmds-example-stdout | 6 + .../output/cmds-mupdate-update-flow-stdout | 6 + .../cmds-nexus-generation-autobump-stdout | 6 + .../tests/output/cmds-target-release-stdout | 6 + .../tests/output/cmds-unsafe-zone-mgs-stdout | 6 + nexus/inventory/src/examples.rs | 28 +++- nexus/types/src/inventory/display.rs | 6 +- .../types/versions/src/impls/inventory.rs | 121 +++++++++++++++++- sled-agent/types/versions/src/latest.rs | 4 + 9 files changed, 184 insertions(+), 5 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index f8cf0bd29be..c4cb0672eba 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1739,6 +1739,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1890,6 +1892,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -2134,6 +2138,8 @@ LEDGERED SLED CONFIG reconciler 
task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 3f7a146fd52..b20a3575a16 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -326,6 +326,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -461,6 +463,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -584,6 +588,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 11ed4e50970..67cc9d20531 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -762,6 +762,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -955,6 +957,8 @@ LEDGERED SLED CONFIG reconciler task status: idle 
(finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1148,6 +1152,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index ea202ca9b46..6ef7ef4621a 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -746,6 +746,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -939,6 +941,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1132,6 +1136,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index e759bbbe11e..0e42aac9d93 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -800,6 +800,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is 
empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -993,6 +995,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1186,6 +1190,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 06cbb4950d1..40bf3842e27 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -1099,6 +1099,32 @@ pub fn sled_agent( result: ConfigReconcilerInventoryResult::Ok, }); + // Synthesize a representative FMD payload: a single faulted resource + // diagnosed by a single case. This keeps the per-table-population test + // happy and gives downstream golden-output tests something to render. 
+ let case_id = omicron_uuid_kinds::FmdHostCaseUuid::new_v4(); + let resource_id = omicron_uuid_kinds::FmdResourceUuid::new_v4(); + let mut fmd_cases = iddqd::IdOrdMap::new(); + fmd_cases.insert_overwrite(sled_agent_types::inventory::FmdHostCase { + uuid: case_id, + code: "PCIEX-8000-DJ".to_string(), + url: "http://illumos.org/msg/PCIEX-8000-DJ".to_string(), + event: Some(serde_json::json!({"class": "fault.io.pci.bus"})), + }); + let mut fmd_resources = iddqd::IdOrdMap::new(); + fmd_resources.insert_overwrite(sled_agent_types::inventory::FmdResource { + uuid: resource_id, + fmri: "dev:////pci@af,0/pci1022,1483@3,5".to_string(), + case_id, + faulty: true, + unusable: false, + invisible: false, + }); + let fmd = FmdInventoryResult::Available(FmdInventory { + cases: fmd_cases, + resources: fmd_resources, + }); + Inventory { baseboard, reservoir_size: ByteCount::from(1024), @@ -1117,6 +1143,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd, } } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index a410b22c04a..f7cd65e585e 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -625,7 +625,7 @@ fn display_sleds( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: _, + fmd, } = sled; writeln!( @@ -915,6 +915,10 @@ fn display_sleds( } } + writeln!(indented, "fmd:")?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + write!(indent2, "{}", fmd.display())?; + f = indented.into_inner(); display_svcs_enabled_not_online(smf_services_enabled_not_online, f)?; } diff --git a/sled-agent/types/versions/src/impls/inventory.rs b/sled-agent/types/versions/src/impls/inventory.rs index e6af2c3cd85..5aedde4ed9b 100644 --- a/sled-agent/types/versions/src/impls/inventory.rs +++ b/sled-agent/types/versions/src/impls/inventory.rs @@ -21,9 +21,10 @@ 
use tufaceous_artifact::{ArtifactHash, KnownArtifactKind};
 
 use crate::latest::inventory::{
     BootImageHeader, BootPartitionContents, BootPartitionDetails,
-    ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
-    HostPhase2DesiredContents, HostPhase2DesiredSlots, ManifestBootInventory,
-    ManifestInventory, ManifestNonBootInventory, MupdateOverrideBootInventory,
+    ConfigReconcilerInventory, ConfigReconcilerInventoryResult, FmdHostCase,
+    FmdInventory, FmdInventoryResult, FmdResource, HostPhase2DesiredContents,
+    HostPhase2DesiredSlots, ManifestBootInventory, ManifestInventory,
+    ManifestNonBootInventory, MupdateOverrideBootInventory,
     MupdateOverrideInventory, MupdateOverrideNonBootInventory,
     NetworkInterface, OmicronFileSourceResolverInventory, OmicronSledConfig,
     OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType,
@@ -910,6 +911,145 @@ impl fmt::Display for SingleMeasurementInventoryDisplay<'_> {
     }
 }
 
+impl FmdInventoryResult {
+    /// Returns a [`fmt::Display`] adapter that renders this FMD collection
+    /// result as human-readable text.
+    pub fn display(&self) -> FmdInventoryResultDisplay<'_> {
+        FmdInventoryResultDisplay { inner: self }
+    }
+}
+
+/// A displayer for [`FmdInventoryResult`].
+///
+/// Renders the collected inventory when one is available, or a single
+/// `FMD collection failed: ...` line when collection failed.
+pub struct FmdInventoryResultDisplay<'a> {
+    inner: &'a FmdInventoryResult,
+}
+
+impl fmt::Display for FmdInventoryResultDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.inner {
+            FmdInventoryResult::Available(inv) => {
+                write!(f, "{}", inv.display())
+            }
+            FmdInventoryResult::Error { error } => {
+                writeln!(f, "FMD collection failed: {error}")
+            }
+        }
+    }
+}
+
+impl FmdInventory {
+    /// Returns a [`fmt::Display`] adapter that lists the cases and resources
+    /// in this inventory.
+    pub fn display(&self) -> FmdInventoryDisplay<'_> {
+        FmdInventoryDisplay { inner: self }
+    }
+}
+
+/// A displayer for [`FmdInventory`].
+///
+/// Prints `no faults reported` when there are neither cases nor resources;
+/// otherwise prints a counted `cases` section followed by a counted
+/// `resources` section, with each entry indented beneath its heading.
+pub struct FmdInventoryDisplay<'a> {
+    inner: &'a FmdInventory,
+}
+
+impl fmt::Display for FmdInventoryDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdInventory { cases, resources } = self.inner;
+        if cases.is_empty() && resources.is_empty() {
+            writeln!(f, "no faults reported")?;
+            return Ok(());
+        }
+        writeln!(f, "cases ({}):", cases.len())?;
+        for case in cases {
+            let mut indent = IndentWriter::new(" ", &mut *f);
+            write!(indent, "{}", case.display())?;
+        }
+        writeln!(f, "resources ({}):", resources.len())?;
+        for resource in resources {
+            let mut indent = IndentWriter::new(" ", &mut *f);
+            write!(indent, "{}", resource.display())?;
+        }
+        Ok(())
+    }
+}
+
+impl FmdHostCase {
+    /// Returns a [`fmt::Display`] adapter that renders this case, including
+    /// its event payload when one is present.
+    pub fn display(&self) -> FmdHostCaseDisplay<'_> {
+        FmdHostCaseDisplay { inner: self }
+    }
+}
+
+/// A displayer for [`FmdHostCase`].
+///
+/// Prints the case UUID and code, then the URL, then (if present) the
+/// event payload pretty-printed as JSON.
+pub struct FmdHostCaseDisplay<'a> {
+    inner: &'a FmdHostCase,
+}
+
+impl fmt::Display for FmdHostCaseDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdHostCase { uuid, code, url, event } = self.inner;
+        writeln!(f, "case {uuid} ({code})")?;
+        writeln!(f, " url: {url}")?;
+        // The event payload is the FMD nvlist serialized to JSON. We
+        // intentionally do not interpret it; round-trip pretty-printing
+        // is enough to make it human-readable.
+        if let Some(event) = event {
+            match serde_json::to_string_pretty(event) {
+                Ok(rendered) => {
+                    writeln!(f, " event:")?;
+                    let mut indent = IndentWriter::new(" ", &mut *f);
+                    writeln!(indent, "{rendered}")?;
+                }
+                // Say that rendering failed, and why, rather than printing
+                // a bare label that reads like an empty payload.
+                Err(err) => {
+                    writeln!(f, " event: <failed to render as JSON: {err}>")?
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl FmdResource {
+    /// Returns a [`fmt::Display`] adapter that renders this resource and its
+    /// fault flags.
+    pub fn display(&self) -> FmdResourceDisplay<'_> {
+        FmdResourceDisplay { inner: self }
+    }
+}
+
+/// A displayer for [`FmdResource`].
+///
+/// Prints the resource UUID with the UUID of its case, then the FMRI, then
+/// the `faulty`, `unusable` and `invisible` flags on one line.
+pub struct FmdResourceDisplay<'a> {
+    inner: &'a FmdResource,
+}
+
+impl fmt::Display for FmdResourceDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdResource { uuid, fmri, case_id, faulty, unusable, invisible } =
+            self.inner;
+        writeln!(f, "resource {uuid} (case {case_id})")?;
+        writeln!(f, " fmri: {fmri}")?;
+        writeln!(
+            f,
+            " faulty: {faulty}, unusable: {unusable}, invisible: {invisible}"
+        )?;
+        Ok(())
+    }
+}
+
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 #[error("unrecognized zpool health value `{0}`")]
 pub struct ZpoolHealthParseError(pub String);
diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs
index 1b8c8277fa5..d6bbd7e0763 100644
--- a/sled-agent/types/versions/src/latest.rs
+++ b/sled-agent/types/versions/src/latest.rs
@@ -187,6 +187,10 @@ pub mod inventory {
     pub use crate::v37::inventory::FmdResource;
     pub use crate::v37::inventory::Inventory;
 
+    pub use crate::impls::inventory::FmdHostCaseDisplay;
+    pub use crate::impls::inventory::FmdInventoryDisplay;
+    pub use crate::impls::inventory::FmdInventoryResultDisplay;
+    pub use crate::impls::inventory::FmdResourceDisplay;
     pub use crate::impls::inventory::ManifestBootInventoryDisplay;
     pub use crate::impls::inventory::ManifestInventoryDisplay;
     pub use crate::impls::inventory::ManifestNonBootInventoryDisplay;