diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index ed10ed7c15..9ea69374df 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -2349,6 +2349,15 @@ func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, bool, erro // Great, we've successfully rebooted for the desired config, // let's mark it done! + // Verify extension packages are actually installed before marking as done + // See: https://redhat.atlassian.net/browse/OCPBUGS-65645 + if dn.os.IsCoreOSVariant() { + coreOSDaemon := CoreOSDaemon{dn} + if err := coreOSDaemon.verifyExtensionPackages(state.currentConfig); err != nil { + return missingODC, inDesiredConfig, fmt.Errorf("extension package verification failed: %w", err) + } + } + // Get MCP associated with node pool, err := helpers.GetPrimaryPoolNameForMCN(dn.mcpLister, dn.node) if err != nil { diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 264fc81146..765239ebd5 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -1841,7 +1841,126 @@ func (dn *CoreOSDaemon) applyExtensions(oldConfig, newConfig *mcfgv1.MachineConf // Add "update" to the start of argument list args = append([]string{constants.RPMOSTreeUpdateArg}, args...) logSystem("Applying extensions : %+q", args) - return runRpmOstree(args...) + if err := runRpmOstree(args...); err != nil { + return err + } + + // Verify extension packages are staged before rebooting + // See: https://redhat.atlassian.net/browse/OCPBUGS-65645 + return dn.verifyExtensionsStaged(newConfig) +} + +// verifyExtensionsStaged verifies that all extension packages specified in a +// MachineConfig are staged in the rpm-ostree deployment before rebooting. +// This catches issues early, before the node reboots and potentially fails to boot. +// See: https://redhat.atlassian.net/browse/OCPBUGS-65645 +func (dn *CoreOSDaemon) verifyExtensionsStaged(config *mcfgv1.MachineConfig) error { + // Only verify on RHCOS/SCOS nodes + if !dn.os.IsEL() { + return nil + } + + // Get the list of extensions from the config + extensions := config.Spec.Extensions + if len(extensions) == 0 { + // No extensions to verify + return nil + } + + // Map extensions to actual package names using the existing helper + expectedPackages, err := ctrlcommon.GetPackagesForSupportedExtensions(extensions) + if err != nil { + return fmt.Errorf("failed to get packages for extensions: %w", err) + } + + klog.Infof("Verifying %d extension packages are staged for config %s", len(expectedPackages), config.GetName()) + + // Get the staged deployment + _, staged, err := dn.NodeUpdaterClient.GetBootedAndStagedDeployment() + if err != nil { + return fmt.Errorf("failed to get staged deployment: %w", err) + } + + if staged == nil { + return fmt.Errorf("no staged deployment found after applying extensions") + } + + // Create a set of requested packages in the staged deployment for quick lookup + stagedPackages := sets.New(staged.RequestedPackages...) + + // For testing only + expectedPackages = append(expectedPackages, "sysstat") + + // Verify each expected package is in the staged deployment + var missingPackages []string + for _, pkg := range expectedPackages { + if !stagedPackages.Has(pkg) { + missingPackages = append(missingPackages, pkg) + klog.Warningf("Extension package %s not found in staged deployment", pkg) + } + } + + if len(missingPackages) > 0 { + return fmt.Errorf("the following extension packages are missing from the staged deployment: %v", missingPackages) + } + + klog.Infof("Successfully verified all %d extension packages are staged", len(expectedPackages)) + return nil +} + +// verifyExtensionPackages verifies that all extension packages specified in a +// MachineConfig are actually installed in the RPM database after a node reboot. +// See: https://redhat.atlassian.net/browse/OCPBUGS-65645 +func (dn *CoreOSDaemon) verifyExtensionPackages(config *mcfgv1.MachineConfig) error { + // Only verify on RHCOS/SCOS nodes + if !dn.os.IsEL() { + return nil + } + + // Get the list of extensions from the config + extensions := config.Spec.Extensions + if len(extensions) == 0 { + // No extensions to verify + return nil + } + + // Map extensions to actual package names using the existing helper + expectedPackages, err := ctrlcommon.GetPackagesForSupportedExtensions(extensions) + if err != nil { + return fmt.Errorf("failed to get packages for extensions: %w", err) + } + + klog.Infof("Verifying %d extension packages are installed for config %s", len(expectedPackages), config.GetName()) + + // // For testing only + // expectedPackages = append(expectedPackages, "sysstat") + + // Verify each package is in the RPM database + var missingPackages []string + var exitErr *exec.ExitError + for _, pkg := range expectedPackages { + // Query RPM database directly for installed packages + out, err := exec.Command("rpm", "-q", pkg).CombinedOutput() + if err == nil { + continue + } + // Check if this is exit code 1 (package not installed) vs other errors + if errors.As(err, &exitErr) && exitErr.ExitCode() == 1 { + missingPackages = append(missingPackages, pkg) + klog.Warningf("Extension package %s not found in RPM database", pkg) + continue + } + + // Other errors (execution failure, permission issues, etc.) should fail immediately + return fmt.Errorf("failed to query RPM database for package %q: %v: %s", pkg, err, strings.TrimSpace(string(out))) + } + + if len(missingPackages) > 0 { + return fmt.Errorf("the following extension packages are missing from the RPM database: %v", missingPackages) + } + + klog.Infof("Successfully verified all %d extension packages are installed", len(expectedPackages)) + return nil } // switchKernel updates kernel on host with the kernelType specified in MachineConfig.