From 36414afae5e35badaa7ef0d1e895202633512e0e Mon Sep 17 00:00:00 2001 From: Jakob Naucke Date: Thu, 9 Apr 2026 15:26:59 +0200 Subject: [PATCH] Quit RV controllers upon TEC recreation to prevent racing of controllers on server side apply. Skip computation on creation if no new TEC exists yet (cannot quit ApprovedImage controller on TEC deletion as its finalizer must still run). Add a test case (includes small refactoring for constants and imports). Fixes: #216 Signed-off-by: Jakob Naucke Assisted-by: Claude --- operator/src/main.rs | 18 ++++- operator/src/reference_values.rs | 21 +++++- tests/trusted_execution_cluster.rs | 113 +++++++++++++++++++++++++---- 3 files changed, 130 insertions(+), 22 deletions(-) diff --git a/operator/src/main.rs b/operator/src/main.rs index e222da30..ee5ae555 100644 --- a/operator/src/main.rs +++ b/operator/src/main.rs @@ -47,6 +47,8 @@ struct ClusterContext { client: Client, /// UID of cluster that watchers are based on uid: Mutex>, + /// Handles for reference value controller tasks, aborted on TEC recreation + rv_handles: Mutex>>, } fn is_installed(status: Option) -> bool { @@ -77,6 +79,12 @@ async fn launch_rv_watchers( warn!("Failed to acquire lock on context UID store"); } if launch_watchers { + // Abort any previously spawned controllers before launching new ones + if let Ok(mut handles) = ctx.rv_handles.lock() { + for handle in handles.drain(..) { + handle.abort(); + } + } info!( "First registration of TrustedExecutionCluster {name} by this operator. \ Launching reference value watchers." @@ -92,8 +100,12 @@ async fn launch_rv_watchers( owner_reference: owner_reference.clone(), pcrs_compute_image, }; - reference_values::launch_rv_image_controller(rv_ctx.clone()).await; - reference_values::launch_rv_job_controller(rv_ctx.clone()).await; + let image_handle = reference_values::launch_rv_image_controller(rv_ctx.clone()).await; + let job_handle = reference_values::launch_rv_job_controller(rv_ctx.clone()).await; + if let Ok(mut handles) = ctx.rv_handles.lock() { + handles.push(image_handle); + handles.push(job_handle); + } } Ok(launch_watchers) } @@ -282,6 +294,7 @@ async fn main() -> Result<()> { let ctx = Arc::new(ClusterContext { client: kube_client, uid: Mutex::new(None), + rv_handles: Mutex::new(Vec::new()), }); Controller::new(cl, watcher::Config::default()) .run(reconcile, controller_error_policy, ctx) @@ -305,6 +318,7 @@ mod tests { ClusterContext { client, uid: Mutex::new(None), + rv_handles: Mutex::new(Vec::new()), } } diff --git a/operator/src/reference_values.rs b/operator/src/reference_values.rs index 311d8971..f4394cd8 100644 --- a/operator/src/reference_values.rs +++ b/operator/src/reference_values.rs @@ -134,7 +134,7 @@ async fn job_reconcile(job: Arc, ctx: Arc) -> Result tokio::task::JoinHandle<()> { let jobs: Api = Api::default_namespaced(ctx.client.clone()); let watcher = watcher::Config { label_selector: Some(format!("{JOB_LABEL_KEY}={PCR_COMMAND_NAME}")), @@ -144,7 +144,7 @@ pub async fn launch_rv_job_controller(ctx: RvContextData) { Controller::new(jobs, watcher) .run(job_reconcile, controller_error_policy, Arc::new(ctx)) .for_each(controller_info), - ); + ) } // Name job by sanitized image name, plus a hash to disambiguate @@ -257,6 +257,19 @@ async fn image_add_reconcile( let kube_client = ctx.client.clone(); let name = image.metadata.name.as_ref().unwrap(); + let clusters: Api = Api::default_namespaced(kube_client.clone()); + let cluster_name = &ctx.owner_reference.name; + if let Err(kube::Error::Api(ae)) = clusters.get(cluster_name).await + && ae.code == 404 + { + info!( + "Image reconciler was registered with TrustedExecutionCluster {cluster_name}, \ + but it did not exist. Creating a new TrustedExecutionCluster will \ + trigger a fresh reconciler. Requeueing..." + ); + return Ok(Action::await_change()); + } + // Adopt the image by adding TEC as owner reference if not already owned let tec_uid = &ctx.owner_reference.uid; let already_owned = image @@ -307,13 +320,13 @@ async fn image_add_reconcile( Ok(action) } -pub async fn launch_rv_image_controller(ctx: RvContextData) { +pub async fn launch_rv_image_controller(ctx: RvContextData) -> tokio::task::JoinHandle<()> { let images: Api = Api::default_namespaced(ctx.client.clone()); tokio::spawn( Controller::new(images, Default::default()) .run(image_reconcile, controller_error_policy, Arc::new(ctx)) .for_each(controller_info), - ); + ) } pub async fn handle_new_image( diff --git a/tests/trusted_execution_cluster.rs b/tests/trusted_execution_cluster.rs index a9056d55..6a42c7bd 100644 --- a/tests/trusted_execution_cluster.rs +++ b/tests/trusted_execution_cluster.rs @@ -1,10 +1,13 @@ // SPDX-FileCopyrightText: Alice Frosi +// SPDX-FileCopyrightText: Jakob Naucke // // SPDX-License-Identifier: MIT +use anyhow::anyhow; use compute_pcrs_lib::{Part, Pcr}; use k8s_openapi::api::apps::v1::Deployment; use k8s_openapi::api::core::v1::{ConfigMap, Secret}; +use kube::api::ObjectMeta; use kube::{Api, api::DeleteParams}; use std::time::Duration; use trusted_cluster_operator_lib::reference_values::ImagePcrs; @@ -14,18 +17,21 @@ use trusted_cluster_operator_lib::{ use trusted_cluster_operator_test_utils::*; const EXPECTED_PCR4: &str = "ff2b357be4a4bc66be796d4e7b2f1f27077dc89b96220aae60b443bcf4672525"; +const TEC_NAME: &str = "trusted-execution-cluster"; +const APPROVED_IMAGE_NAME: &str = "coreos"; +const TRUSTEE_CONFIG_MAP: &str = "trustee-data"; +const RV_JSON_KEY: &str = "reference-values.json"; named_test!( async fn test_trusted_execution_cluster_uninstall() -> anyhow::Result<()> { let test_ctx = setup!().await?; let client = test_ctx.client(); let namespace = test_ctx.namespace(); - let name = "trusted-execution-cluster"; let configmap_api: Api = Api::namespaced(client.clone(), namespace); let tec_api: Api = Api::namespaced(client.clone(), namespace); - let tec = tec_api.get(name).await?; + let tec = tec_api.get(TEC_NAME).await?; let owner_reference = generate_owner_reference(&tec)?; @@ -102,7 +108,7 @@ named_test!( .unwrap_or(false); if !has_approved_condition { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have Approved condition yet" )); } @@ -117,10 +123,10 @@ named_test!( // Delete the cluster cr let api: Api = Api::namespaced(client.clone(), namespace); let dp = DeleteParams::default(); - api.delete(name, &dp).await?; + api.delete(TEC_NAME, &dp).await?; // Wait until it disappears - wait_for_resource_deleted(&api, name, 120, 5).await?; + wait_for_resource_deleted(&api, TEC_NAME, 120, 5).await?; let deployments_api: Api = Api::namespaced(client.clone(), namespace); wait_for_resource_deleted(&deployments_api, "trustee-deployment", 120, 1).await?; @@ -128,7 +134,7 @@ named_test!( wait_for_resource_deleted(&configmap_api, "image-pcrs", 120, 1).await?; let images_api: Api = Api::namespaced(client.clone(), namespace); - wait_for_resource_deleted(&images_api, "coreos", 120, 1).await?; + wait_for_resource_deleted(&images_api, APPROVED_IMAGE_NAME, 120, 1).await?; wait_for_resource_deleted(&machines, &machine_name, 120, 1).await?; wait_for_resource_deleted(&attestation_keys, &ak_name, 120, 1).await?; @@ -168,7 +174,7 @@ async fn test_image_pcrs_configmap_updates() -> anyhow::Result<()> { return Ok(()); } - Err(anyhow::anyhow!("image-pcrs ConfigMap not yet populated with image-pcrs.json data")) + Err(anyhow!("image-pcrs ConfigMap not yet populated with image-pcrs.json data")) } }) .await?; @@ -254,7 +260,7 @@ async fn test_image_disallow() -> anyhow::Result<()> { let namespace = test_ctx.namespace(); let images: Api = Api::namespaced(client.clone(), namespace); - images.delete("coreos", &DeleteParams::default()).await?; + images.delete(APPROVED_IMAGE_NAME, &DeleteParams::default()).await?; let configmap_api: Api = Api::namespaced(client.clone(), namespace); let poller = Poller::new() @@ -264,14 +270,14 @@ async fn test_image_disallow() -> anyhow::Result<()> { poller.poll_async(|| { let api = configmap_api.clone(); async move { - let cm = api.get("trustee-data").await?; + let cm = api.get(TRUSTEE_CONFIG_MAP).await?; if let Some(data) = &cm.data - && let Some(reference_values_json) = data.get("reference-values.json") + && let Some(reference_values_json) = data.get(RV_JSON_KEY) && !reference_values_json.contains(EXPECTED_PCR4) { return Ok(()); } - Err(anyhow::anyhow!("Reference value not yet removed")) + Err(anyhow!("Reference value not yet removed")) } }).await?; @@ -284,10 +290,9 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { let test_ctx = setup!().await?; let client = test_ctx.client(); let namespace = test_ctx.namespace(); - let tec_name = "trusted-execution-cluster"; let tec_api: Api = Api::namespaced(client.clone(), namespace); - let tec = tec_api.get(tec_name).await?; + let tec = tec_api.get(TEC_NAME).await?; let owner_reference = generate_owner_reference(&tec)?; let machine_uuid = uuid::Uuid::new_v4().to_string(); @@ -366,7 +371,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_approved_condition { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have Approved condition yet" )); } @@ -384,7 +389,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_machine_owner_ref { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have owner reference to Machine yet" )); } @@ -403,7 +408,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_ak_owner_ref { - return Err(anyhow::anyhow!( + return Err(anyhow!( "Secret does not have owner reference to AttestationKey yet" )); } @@ -434,3 +439,79 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { Ok(()) } } + +named_test! { +async fn test_approved_image_readoption() -> anyhow::Result<()> { + let test_ctx = setup!().await?; + let client = test_ctx.client(); + let namespace = test_ctx.namespace(); + + let clusters: Api = Api::namespaced(client.clone(), namespace); + let images: Api = Api::namespaced(client.clone(), namespace); + let configmaps: Api = Api::namespaced(client.clone(), namespace); + + let cluster_spec = clusters.get(TEC_NAME).await?.spec; + let image_spec = images.get(APPROVED_IMAGE_NAME).await?.spec; + + test_ctx.info(format!("Deleting TrustedExecuctionCluster {TEC_NAME}")); + clusters.delete(TEC_NAME, &Default::default()).await?; + let removal_poller = Poller::new() + .with_timeout(Duration::from_secs(60)) + .with_interval(Duration::from_secs(5)) + .with_error_message(format!("{TRUSTEE_CONFIG_MAP} configmap not removed")); + removal_poller.poll_async(|| { + let configmaps = configmaps.clone(); + async move { + if configmaps.get(TRUSTEE_CONFIG_MAP).await.is_err() { + return Ok(()); + } + Err(anyhow!("{TRUSTEE_CONFIG_MAP} not yet removed")) + } + }).await?; + test_ctx.info(format!("Configmap {TRUSTEE_CONFIG_MAP} was removed")); + + let image = ApprovedImage { + spec: image_spec, + metadata: ObjectMeta { + name: Some(APPROVED_IMAGE_NAME.to_string()), + ..Default::default() + }, + status: None + }; + let cluster = TrustedExecutionCluster { + spec: cluster_spec, + metadata: ObjectMeta { + name: Some(TEC_NAME.to_string()), + ..Default::default() + }, + status: None + }; + + test_ctx.info("Creating new ApprovedImage and TrustedExecutionCluster"); + images.create(&Default::default(), &image).await?; + // Ensure adoption works even when cluster creation was delayed + tokio::time::sleep(Duration::from_secs(5)).await; + clusters.create(&Default::default(), &cluster).await?; + let regeneration_poller = Poller::new() + .with_timeout(Duration::from_secs(180)) + .with_interval(Duration::from_secs(5)) + .with_error_message("Reference value not regenerated".to_string()); + regeneration_poller.poll_async(|| { + let configmaps = configmaps.clone(); + async move { + let configmap = configmaps.get(TRUSTEE_CONFIG_MAP).await?; + if let Some(data) = &configmap.data + && let Some(json) = data.get(RV_JSON_KEY) + && json.contains(EXPECTED_PCR4) + { + return Ok(()); + } + Err(anyhow!("Reference value not yet regenerated")) + } + }).await?; + test_ctx.info("Reference values regenerated"); + + test_ctx.cleanup().await?; + Ok(()) +} +}