diff --git a/operator/src/main.rs b/operator/src/main.rs index e222da30..ee5ae555 100644 --- a/operator/src/main.rs +++ b/operator/src/main.rs @@ -47,6 +47,8 @@ struct ClusterContext { client: Client, /// UID of cluster that watchers are based on uid: Mutex>, + /// Handles for reference value controller tasks, aborted on TEC recreation + rv_handles: Mutex>>, } fn is_installed(status: Option) -> bool { @@ -77,6 +79,12 @@ async fn launch_rv_watchers( warn!("Failed to acquire lock on context UID store"); } if launch_watchers { + // Abort any previously spawned controllers before launching new ones + if let Ok(mut handles) = ctx.rv_handles.lock() { + for handle in handles.drain(..) { + handle.abort(); + } + } info!( "First registration of TrustedExecutionCluster {name} by this operator. \ Launching reference value watchers." @@ -92,8 +100,12 @@ async fn launch_rv_watchers( owner_reference: owner_reference.clone(), pcrs_compute_image, }; - reference_values::launch_rv_image_controller(rv_ctx.clone()).await; - reference_values::launch_rv_job_controller(rv_ctx.clone()).await; + let image_handle = reference_values::launch_rv_image_controller(rv_ctx.clone()).await; + let job_handle = reference_values::launch_rv_job_controller(rv_ctx.clone()).await; + if let Ok(mut handles) = ctx.rv_handles.lock() { + handles.push(image_handle); + handles.push(job_handle); + } } Ok(launch_watchers) } @@ -282,6 +294,7 @@ async fn main() -> Result<()> { let ctx = Arc::new(ClusterContext { client: kube_client, uid: Mutex::new(None), + rv_handles: Mutex::new(Vec::new()), }); Controller::new(cl, watcher::Config::default()) .run(reconcile, controller_error_policy, ctx) @@ -305,6 +318,7 @@ mod tests { ClusterContext { client, uid: Mutex::new(None), + rv_handles: Mutex::new(Vec::new()), } } diff --git a/operator/src/reference_values.rs b/operator/src/reference_values.rs index 311d8971..f4394cd8 100644 --- a/operator/src/reference_values.rs +++ b/operator/src/reference_values.rs @@ -134,7 +134,7 @@ async fn job_reconcile(job: Arc, ctx: Arc) -> Result tokio::task::JoinHandle<()> { let jobs: Api = Api::default_namespaced(ctx.client.clone()); let watcher = watcher::Config { label_selector: Some(format!("{JOB_LABEL_KEY}={PCR_COMMAND_NAME}")), @@ -144,7 +144,7 @@ pub async fn launch_rv_job_controller(ctx: RvContextData) { Controller::new(jobs, watcher) .run(job_reconcile, controller_error_policy, Arc::new(ctx)) .for_each(controller_info), - ); + ) } // Name job by sanitized image name, plus a hash to disambiguate @@ -257,6 +257,19 @@ async fn image_add_reconcile( let kube_client = ctx.client.clone(); let name = image.metadata.name.as_ref().unwrap(); + let clusters: Api = Api::default_namespaced(kube_client.clone()); + let cluster_name = &ctx.owner_reference.name; + if let Err(kube::Error::Api(ae)) = clusters.get(cluster_name).await + && ae.code == 404 + { + info!( + "Image reconciler was registered with TrustedExecutionCluster {cluster_name}, \ + but it did not exist. Creating a new TrustedExecutionCluster will \ + trigger a fresh reconciler. Requeueing..." + ); + return Ok(Action::await_change()); + } + // Adopt the image by adding TEC as owner reference if not already owned let tec_uid = &ctx.owner_reference.uid; let already_owned = image @@ -307,13 +320,13 @@ async fn image_add_reconcile( Ok(action) } -pub async fn launch_rv_image_controller(ctx: RvContextData) { +pub async fn launch_rv_image_controller(ctx: RvContextData) -> tokio::task::JoinHandle<()> { let images: Api = Api::default_namespaced(ctx.client.clone()); tokio::spawn( Controller::new(images, Default::default()) .run(image_reconcile, controller_error_policy, Arc::new(ctx)) .for_each(controller_info), - ); + ) } pub async fn handle_new_image( diff --git a/tests/trusted_execution_cluster.rs b/tests/trusted_execution_cluster.rs index a9056d55..6a42c7bd 100644 --- a/tests/trusted_execution_cluster.rs +++ b/tests/trusted_execution_cluster.rs @@ -1,10 +1,13 @@ // SPDX-FileCopyrightText: Alice Frosi +// SPDX-FileCopyrightText: Jakob Naucke // // SPDX-License-Identifier: MIT +use anyhow::anyhow; use compute_pcrs_lib::{Part, Pcr}; use k8s_openapi::api::apps::v1::Deployment; use k8s_openapi::api::core::v1::{ConfigMap, Secret}; +use kube::api::ObjectMeta; use kube::{Api, api::DeleteParams}; use std::time::Duration; use trusted_cluster_operator_lib::reference_values::ImagePcrs; @@ -14,18 +17,21 @@ use trusted_cluster_operator_lib::{ use trusted_cluster_operator_test_utils::*; const EXPECTED_PCR4: &str = "ff2b357be4a4bc66be796d4e7b2f1f27077dc89b96220aae60b443bcf4672525"; +const TEC_NAME: &str = "trusted-execution-cluster"; +const APPROVED_IMAGE_NAME: &str = "coreos"; +const TRUSTEE_CONFIG_MAP: &str = "trustee-data"; +const RV_JSON_KEY: &str = "reference-values.json"; named_test!( async fn test_trusted_execution_cluster_uninstall() -> anyhow::Result<()> { let test_ctx = setup!().await?; let client = test_ctx.client(); let namespace = test_ctx.namespace(); - let name = "trusted-execution-cluster"; let configmap_api: Api = Api::namespaced(client.clone(), namespace); let tec_api: Api = Api::namespaced(client.clone(), namespace); - let tec = tec_api.get(name).await?; + let tec = tec_api.get(TEC_NAME).await?; let owner_reference = generate_owner_reference(&tec)?; @@ -102,7 +108,7 @@ named_test!( .unwrap_or(false); if !has_approved_condition { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have Approved condition yet" )); } @@ -117,10 +123,10 @@ named_test!( // Delete the cluster cr let api: Api = Api::namespaced(client.clone(), namespace); let dp = DeleteParams::default(); - api.delete(name, &dp).await?; + api.delete(TEC_NAME, &dp).await?; // Wait until it disappears - wait_for_resource_deleted(&api, name, 120, 5).await?; + wait_for_resource_deleted(&api, TEC_NAME, 120, 5).await?; let deployments_api: Api = Api::namespaced(client.clone(), namespace); wait_for_resource_deleted(&deployments_api, "trustee-deployment", 120, 1).await?; @@ -128,7 +134,7 @@ named_test!( wait_for_resource_deleted(&configmap_api, "image-pcrs", 120, 1).await?; let images_api: Api = Api::namespaced(client.clone(), namespace); - wait_for_resource_deleted(&images_api, "coreos", 120, 1).await?; + wait_for_resource_deleted(&images_api, APPROVED_IMAGE_NAME, 120, 1).await?; wait_for_resource_deleted(&machines, &machine_name, 120, 1).await?; wait_for_resource_deleted(&attestation_keys, &ak_name, 120, 1).await?; @@ -168,7 +174,7 @@ async fn test_image_pcrs_configmap_updates() -> anyhow::Result<()> { return Ok(()); } - Err(anyhow::anyhow!("image-pcrs ConfigMap not yet populated with image-pcrs.json data")) + Err(anyhow!("image-pcrs ConfigMap not yet populated with image-pcrs.json data")) } }) .await?; @@ -254,7 +260,7 @@ async fn test_image_disallow() -> anyhow::Result<()> { let namespace = test_ctx.namespace(); let images: Api = Api::namespaced(client.clone(), namespace); - images.delete("coreos", &DeleteParams::default()).await?; + images.delete(APPROVED_IMAGE_NAME, &DeleteParams::default()).await?; let configmap_api: Api = Api::namespaced(client.clone(), namespace); let poller = Poller::new() @@ -264,14 +270,14 @@ async fn test_image_disallow() -> anyhow::Result<()> { poller.poll_async(|| { let api = configmap_api.clone(); async move { - let cm = api.get("trustee-data").await?; + let cm = api.get(TRUSTEE_CONFIG_MAP).await?; if let Some(data) = &cm.data - && let Some(reference_values_json) = data.get("reference-values.json") + && let Some(reference_values_json) = data.get(RV_JSON_KEY) && !reference_values_json.contains(EXPECTED_PCR4) { return Ok(()); } - Err(anyhow::anyhow!("Reference value not yet removed")) + Err(anyhow!("Reference value not yet removed")) } }).await?; @@ -284,10 +290,9 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { let test_ctx = setup!().await?; let client = test_ctx.client(); let namespace = test_ctx.namespace(); - let tec_name = "trusted-execution-cluster"; let tec_api: Api = Api::namespaced(client.clone(), namespace); - let tec = tec_api.get(tec_name).await?; + let tec = tec_api.get(TEC_NAME).await?; let owner_reference = generate_owner_reference(&tec)?; let machine_uuid = uuid::Uuid::new_v4().to_string(); @@ -366,7 +371,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_approved_condition { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have Approved condition yet" )); } @@ -384,7 +389,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_machine_owner_ref { - return Err(anyhow::anyhow!( + return Err(anyhow!( "AttestationKey does not have owner reference to Machine yet" )); } @@ -403,7 +408,7 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { .unwrap_or(false); if !has_ak_owner_ref { - return Err(anyhow::anyhow!( + return Err(anyhow!( "Secret does not have owner reference to AttestationKey yet" )); } @@ -434,3 +439,79 @@ async fn test_attestation_key_lifecycle() -> anyhow::Result<()> { Ok(()) } } + +named_test! { +async fn test_approved_image_readoption() -> anyhow::Result<()> { + let test_ctx = setup!().await?; + let client = test_ctx.client(); + let namespace = test_ctx.namespace(); + + let clusters: Api = Api::namespaced(client.clone(), namespace); + let images: Api = Api::namespaced(client.clone(), namespace); + let configmaps: Api = Api::namespaced(client.clone(), namespace); + + let cluster_spec = clusters.get(TEC_NAME).await?.spec; + let image_spec = images.get(APPROVED_IMAGE_NAME).await?.spec; + + test_ctx.info(format!("Deleting TrustedExecuctionCluster {TEC_NAME}")); + clusters.delete(TEC_NAME, &Default::default()).await?; + let removal_poller = Poller::new() + .with_timeout(Duration::from_secs(60)) + .with_interval(Duration::from_secs(5)) + .with_error_message(format!("{TRUSTEE_CONFIG_MAP} configmap not removed")); + removal_poller.poll_async(|| { + let configmaps = configmaps.clone(); + async move { + if configmaps.get(TRUSTEE_CONFIG_MAP).await.is_err() { + return Ok(()); + } + Err(anyhow!("{TRUSTEE_CONFIG_MAP} not yet removed")) + } + }).await?; + test_ctx.info(format!("Configmap {TRUSTEE_CONFIG_MAP} was removed")); + + let image = ApprovedImage { + spec: image_spec, + metadata: ObjectMeta { + name: Some(APPROVED_IMAGE_NAME.to_string()), + ..Default::default() + }, + status: None + }; + let cluster = TrustedExecutionCluster { + spec: cluster_spec, + metadata: ObjectMeta { + name: Some(TEC_NAME.to_string()), + ..Default::default() + }, + status: None + }; + + test_ctx.info("Creating new ApprovedImage and TrustedExecutionCluster"); + images.create(&Default::default(), &image).await?; + // Ensure adoption works even when cluster creation was delayed + tokio::time::sleep(Duration::from_secs(5)).await; + clusters.create(&Default::default(), &cluster).await?; + let regeneration_poller = Poller::new() + .with_timeout(Duration::from_secs(180)) + .with_interval(Duration::from_secs(5)) + .with_error_message("Reference value not regenerated".to_string()); + regeneration_poller.poll_async(|| { + let configmaps = configmaps.clone(); + async move { + let configmap = configmaps.get(TRUSTEE_CONFIG_MAP).await?; + if let Some(data) = &configmap.data + && let Some(json) = data.get(RV_JSON_KEY) + && json.contains(EXPECTED_PCR4) + { + return Ok(()); + } + Err(anyhow!("Reference value not yet regenerated")) + } + }).await?; + test_ctx.info("Reference values regenerated"); + + test_ctx.cleanup().await?; + Ok(()) +} +}