diff --git a/cmd/machine-config-server/bootstrap.go b/cmd/machine-config-server/bootstrap.go index 6310275a28..ba981eff68 100644 --- a/cmd/machine-config-server/bootstrap.go +++ b/cmd/machine-config-server/bootstrap.go @@ -22,6 +22,7 @@ var ( serverBaseDir string serverKubeConfig string certificates []string + mcsURL string } ) @@ -30,6 +31,7 @@ func init() { bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.serverBaseDir, "server-basedir", "/etc/mcs/bootstrap", "base directory on the host, relative to which machine-configs and pools can be found.") bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.serverKubeConfig, "bootstrap-kubeconfig", "/etc/kubernetes/kubeconfig", "path to bootstrap kubeconfig served by the bootstrap server.") bootstrapCmd.PersistentFlags().StringArrayVar(&bootstrapOpts.certificates, "bootstrap-certs", []string{}, "a certificate bundle formatted in a string array with the format key=value,key=value") + bootstrapCmd.PersistentFlags().StringVar(&bootstrapOpts.mcsURL, "mcs-url", "", "Base URL for Machine Config Server; Used for status reporting") } func runBootstrapCmd(_ *cobra.Command, _ []string) { @@ -39,10 +41,9 @@ func runBootstrapCmd(_ *cobra.Command, _ []string) { // To help debugging, immediately log version klog.Infof("Version: %+v (%s)", version.Raw, version.Hash) - bs, err := server.NewBootstrapServer(bootstrapOpts.serverBaseDir, bootstrapOpts.serverKubeConfig, bootstrapOpts.certificates) - + bs, err := server.NewBootstrapServer(bootstrapOpts.serverBaseDir, bootstrapOpts.serverKubeConfig, bootstrapOpts.mcsURL, bootstrapOpts.certificates) if err != nil { - klog.Exitf("Machine Config Server exited with error: %v", err) + klog.Fatal(err) } // Read-in bootstrap apiserver file /etc/mcs/bootstrap/api-server/apiserver.yaml. @@ -59,9 +60,12 @@ func runBootstrapCmd(_ *cobra.Command, _ []string) { klog.Infof("Launching bootstrap server with tls min version: %v & cipher suites %v", tlsminversion, tlsciphersuites) tlsConfig := ctrlcommon.GetGoTLSConfig(tlsminversion, tlsciphersuites) + failureReporter := server.NewBootstrapFailureReporter() + failureHandler := server.NewNodeFailureHandler(failureReporter) + apiHandler := server.NewServerAPIHandler(bs) - secureServer := server.NewAPIServer(apiHandler, rootOpts.sport, false, rootOpts.cert, rootOpts.key, tlsConfig) - insecureServer := server.NewAPIServer(apiHandler, rootOpts.isport, true, "", "", tlsConfig) + secureServer := server.NewAPIServer(apiHandler, failureHandler, rootOpts.sport, false, rootOpts.cert, rootOpts.key, tlsConfig) + insecureServer := server.NewAPIServer(apiHandler, failureHandler, rootOpts.isport, true, "", "", tlsConfig) stopCh := make(chan struct{}) go secureServer.Serve() diff --git a/cmd/machine-config-server/start.go b/cmd/machine-config-server/start.go index c05cea586f..60b3cf64df 100644 --- a/cmd/machine-config-server/start.go +++ b/cmd/machine-config-server/start.go @@ -23,6 +23,7 @@ var ( startOpts struct { kubeconfig string apiserverURL string + mcsURL string } ) @@ -30,6 +31,7 @@ func init() { rootCmd.AddCommand(startCmd) startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)") startCmd.PersistentFlags().StringVar(&startOpts.apiserverURL, "apiserver-url", "", "URL for apiserver; Used to generate kubeconfig") + startCmd.PersistentFlags().StringVar(&startOpts.mcsURL, "mcs-url", "", "Base URL for Machine Config Server; Used for status reporting") } @@ -47,7 +49,7 @@ func runStartCmd(_ *cobra.Command, _ []string) { klog.Exitf("--apiserver-url cannot be empty") } - cs, err := server.NewClusterServer(startOpts.kubeconfig, startOpts.apiserverURL) + cs, err := server.NewClusterServer(startOpts.kubeconfig, startOpts.apiserverURL, startOpts.mcsURL) if err != nil { ctrlcommon.WriteTerminationError(err) } @@ -55,9 +57,12 @@ func runStartCmd(_ *cobra.Command, _ []string) { klog.Infof("Launching server with tls min version: %v & cipher suites %v", rootOpts.tlsminversion, rootOpts.tlsciphersuites) tlsConfig := ctrlcommon.GetGoTLSConfig(rootOpts.tlsminversion, rootOpts.tlsciphersuites) + failureReporter := server.NewClusterFailureReporter(cs.GetKubeClient()) + failureHandler := server.NewNodeFailureHandler(failureReporter) + apiHandler := server.NewServerAPIHandler(cs) - secureServer := server.NewAPIServer(apiHandler, rootOpts.sport, false, rootOpts.cert, rootOpts.key, tlsConfig) - insecureServer := server.NewAPIServer(apiHandler, rootOpts.isport, true, "", "", tlsConfig) + secureServer := server.NewAPIServer(apiHandler, failureHandler, rootOpts.sport, false, rootOpts.cert, rootOpts.key, tlsConfig) + insecureServer := server.NewAPIServer(apiHandler, failureHandler, rootOpts.isport, true, "", "", tlsConfig) stopCh := make(chan struct{}) go secureServer.Serve() diff --git a/manifests/bootstrap-pod-v2.yaml b/manifests/bootstrap-pod-v2.yaml index b1c19619ab..78e1c89358 100644 --- a/manifests/bootstrap-pod-v2.yaml +++ b/manifests/bootstrap-pod-v2.yaml @@ -37,6 +37,7 @@ spec: command: ["/usr/bin/machine-config-server"] args: - "bootstrap" + - "--mcs-url={{.MCSURL}}" - "--payload-version={{.ReleaseVersion}}" terminationMessagePolicy: FallbackToLogsOnError volumeMounts: diff --git a/manifests/machineconfigserver/daemonset.yaml b/manifests/machineconfigserver/daemonset.yaml index 2a8809f98f..3fffd4d6d1 100644 --- a/manifests/machineconfigserver/daemonset.yaml +++ b/manifests/machineconfigserver/daemonset.yaml @@ -23,6 +23,7 @@ spec: args: - "start" - "--apiserver-url={{.APIServerURL}}" + - "--mcs-url={{.MCSURL}}" - "--payload-version={{.ReleaseVersion}}" - "--tls-cipher-suites={{join .TLSCipherSuites ","}}" - "--tls-min-version={{.TLSMinVersion}}" diff --git a/pkg/daemon/constants/constants.go b/pkg/daemon/constants/constants.go index 1f3d456505..eb2adf2198 100644 --- a/pkg/daemon/constants/constants.go +++ b/pkg/daemon/constants/constants.go @@ -12,7 +12,8 @@ const ( CurrentImageAnnotationKey = "machineconfiguration.openshift.io/currentImage" // DesiredImageAnnotationKey is used to specify the desired OS image pullspec for a machine DesiredImageAnnotationKey = "machineconfiguration.openshift.io/desiredImage" - + // MachineConfigServerURLAnnotationKey stores the MCS base URL for status reporting + MachineConfigServerURLAnnotationKey = "machineconfiguration.openshift.io/machineConfigServerURL" // CurrentMachineConfigAnnotationKey is used to fetch current MachineConfig for a machine CurrentMachineConfigAnnotationKey = "machineconfiguration.openshift.io/currentConfig" // DesiredMachineConfigAnnotationKey is used to specify the desired MachineConfig for a machine diff --git a/pkg/operator/bootstrap.go b/pkg/operator/bootstrap.go index c0b9bed742..906ad87618 100644 --- a/pkg/operator/bootstrap.go +++ b/pkg/operator/bootstrap.go @@ -147,8 +147,14 @@ func buildSpec(dependencies *BootstrapDependencies, imgs *ctrlcommon.Images, rel templatectrl.DockerRegistryKey: imgs.DockerRegistry, } + ignitionHost, err := getIgnitionHost(&dependencies.Infrastructure.Status) + if err != nil { + return nil, err + } + mcsURL := fmt.Sprintf("https://%s", ignitionHost) + config := getRenderConfig("", dependencies.KubeAPIServerServingCA, spec, - &imgs.RenderConfigImages, dependencies.Infrastructure, nil, nil, "2") + &imgs.RenderConfigImages, dependencies.Infrastructure, nil, nil, mcsURL, "2") return config, nil } diff --git a/pkg/operator/render.go b/pkg/operator/render.go index 1afcd1e3d9..6de141e795 100644 --- a/pkg/operator/render.go +++ b/pkg/operator/render.go @@ -45,6 +45,7 @@ type renderConfig struct { ControllerConfig mcfgv1.ControllerConfigSpec KubeAPIServerServingCA string APIServerURL string + MCSURL string Images *ctrlcommon.RenderConfigImages Infra configv1.Infrastructure Constants map[string]string diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 354bb1f22c..09ccee5408 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -625,6 +625,8 @@ func (optr *Operator) syncRenderConfig(_ *renderConfig, _ *configv1.ClusterOpera return err } + mcsURL := fmt.Sprintf("https://%s", ignitionHost) + pointerConfig, err := ctrlcommon.PointerConfig(ignitionHost, machineConfigServerCABundle) if err != nil { return err @@ -650,7 +652,7 @@ func (optr *Operator) syncRenderConfig(_ *renderConfig, _ *configv1.ClusterOpera optr.setOperatorLogLevel(mcop.Spec.OperatorLogLevel) } - optr.renderConfig = getRenderConfig(optr.namespace, string(kubeAPIServerServingCABytes), spec, &imgs.RenderConfigImages, infra, pointerConfigData, apiServer, fmt.Sprintf("%d", optr.logLevel)) + optr.renderConfig = getRenderConfig(optr.namespace, string(kubeAPIServerServingCABytes), spec, &imgs.RenderConfigImages, infra, pointerConfigData, apiServer, mcsURL, fmt.Sprintf("%d", optr.logLevel)) return nil } @@ -2150,7 +2152,7 @@ func setGVK(obj runtime.Object, scheme *runtime.Scheme) error { return nil } -func getRenderConfig(tnamespace, kubeAPIServerServingCA string, ccSpec *mcfgv1.ControllerConfigSpec, imgs *ctrlcommon.RenderConfigImages, infra *configv1.Infrastructure, pointerConfigData []byte, apiServer *configv1.APIServer, logLevel string) *renderConfig { +func getRenderConfig(tnamespace, kubeAPIServerServingCA string, ccSpec *mcfgv1.ControllerConfigSpec, imgs *ctrlcommon.RenderConfigImages, infra *configv1.Infrastructure, pointerConfigData []byte, apiServer *configv1.APIServer, mcsURL, logLevel string) *renderConfig { tlsMinVersion, tlsCipherSuites := ctrlcommon.GetSecurityProfileCiphersFromAPIServer(apiServer) return &renderConfig{ TargetNamespace: tnamespace, @@ -2160,6 +2162,7 @@ func getRenderConfig(tnamespace, kubeAPIServerServingCA string, ccSpec *mcfgv1.C Images: imgs, KubeAPIServerServingCA: kubeAPIServerServingCA, APIServerURL: infra.Status.APIServerInternalURL, + MCSURL: mcsURL, PointerConfig: string(pointerConfigData), Infra: *infra, TLSMinVersion: tlsMinVersion, diff --git a/pkg/server/api.go b/pkg/server/api.go index 27bc7333c4..cb347c8432 100644 --- a/pkg/server/api.go +++ b/pkg/server/api.go @@ -45,10 +45,11 @@ type APIServer struct { // NewAPIServer initializes a new API server // that runs the Machine Config Server as a // handler. -func NewAPIServer(a *APIHandler, p int, is bool, c, k string, t *tls.Config) *APIServer { +func NewAPIServer(a *APIHandler, failureHandler *NodeFailureHandler, p int, is bool, c, k string, t *tls.Config) *APIServer { mux := http.NewServeMux() mux.Handle("/config/", a) mux.Handle("/healthz", &healthHandler{}) + mux.Handle("/v1/node-failure", failureHandler) // NEW mux.Handle("/", &defaultHandler{}) return &APIServer{ @@ -182,6 +183,71 @@ func (sh *APIHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { type healthHandler struct{} +// NodeFailureHandler handles POST requests to /v1/node-failure +type NodeFailureHandler struct { + reporter FailureReporter +} + +// NewNodeFailureHandler creates a handler for node failure reports +func NewNodeFailureHandler(reporter FailureReporter) *NodeFailureHandler { + return &NodeFailureHandler{ + reporter: reporter, + } +} + +// ServeHTTP processes firstboot failure reports from nodes +func (h *NodeFailureHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + // Only accept POST + if r.Method != http.MethodPost { + w.Header().Set("Content-Length", "0") + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + + // Read and decode JSON body + var report FirstbootFailureReport + decoder := json.NewDecoder(r.Body) + if err := decoder.Decode(&report); err != nil { + klog.Errorf("Failed to decode failure report: %v", err) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(map[string]string{ + "error": "Invalid JSON payload", + }) + return + } + + // Validate required fields + if report.Pool == "" || report.NodeID == "" || report.Stage == "" { + klog.Errorf("Missing required fields in failure report: pool=%s node=%s stage=%s", + report.Pool, report.NodeID, report.Stage) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadRequest) + json.NewEncoder(w).Encode(map[string]string{ + "error": "Missing required fields: pool, nodeID, and stage are required", + }) + return + } + + klog.Infof("Received failure report: pool=%s node=%s stage=%s", + report.Pool, report.NodeID, report.Stage) + + // Report the failure (best-effort, errors are logged but don't fail the request) + ctx := r.Context() + if err := h.reporter.ReportFailure(ctx, &report); err != nil { + // Log error but still return 202 - we don't want the node to retry-loop + klog.Errorf("Failed to process failure report (still returning 202): %v", err) + } + + // Always return 202 Accepted (best-effort delivery) + // Never return 5xx - the node must not retry-loop on reporter errors + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusAccepted) + json.NewEncoder(w).Encode(map[string]string{ + "status": "accepted", + }) +} + type acceptHeaderValue struct { MIMEType string MIMESubtype string diff --git a/pkg/server/api_test.go b/pkg/server/api_test.go index 5e36f55279..8bc99d60dd 100644 --- a/pkg/server/api_test.go +++ b/pkg/server/api_test.go @@ -15,6 +15,7 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/net/http2" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" "github.com/openshift/machine-config-operator/test/helpers" @@ -28,6 +29,10 @@ func (ms *mockServer) GetConfig(pr poolRequest) (*runtime.RawExtension, error) { return ms.GetConfigFn(pr) } +func (ms *mockServer) GetKubeClient() kubernetes.Interface { + return nil +} + type checkResponse func(t *testing.T, response *http.Response) type scenario struct { @@ -717,7 +722,7 @@ func TestAPIServer(t *testing.T) { ms := &mockServer{ GetConfigFn: scenario.serverFunc, } - server := NewAPIServer(NewServerAPIHandler(ms), 0, false, "", "", nil) + server := NewAPIServer(NewServerAPIHandler(ms), nil, 0, false, "", "", nil) server.handler.ServeHTTP(w, scenario.request) resp := w.Result() diff --git a/pkg/server/bootstrap_server.go b/pkg/server/bootstrap_server.go index d8b8cf3d30..123acb3f19 100644 --- a/pkg/server/bootstrap_server.go +++ b/pkg/server/bootstrap_server.go @@ -8,6 +8,7 @@ import ( yaml "github.com/ghodss/yaml" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" clientcmd "k8s.io/client-go/tools/clientcmd/api/v1" "k8s.io/klog/v2" @@ -28,11 +29,18 @@ type bootstrapServer struct { kubeconfigFunc kubeconfigFunc certs []string + + mcsURL string +} + +// GetKubeClient returns nil for bootstrap server (no k8s access during bootstrap) +func (bsc *bootstrapServer) GetKubeClient() kubernetes.Interface { + return nil } // NewBootstrapServer initializes a new Bootstrap server that implements // the Server interface. -func NewBootstrapServer(dir, kubeconfig string, ircerts []string) (Server, error) { +func NewBootstrapServer(dir, kubeconfig, mcsURL string, ircerts []string) (Server, error) { if _, err := os.Stat(kubeconfig); err != nil { return nil, fmt.Errorf("kubeconfig not found at location: %s", kubeconfig) } @@ -40,6 +48,7 @@ func NewBootstrapServer(dir, kubeconfig string, ircerts []string) (Server, error serverBaseDir: dir, kubeconfigFunc: func() ([]byte, []byte, error) { return kubeconfigFromFile(kubeconfig) }, certs: ircerts, + mcsURL: mcsURL, }, nil } @@ -136,7 +145,7 @@ func (bsc *bootstrapServer) GetConfig(cr poolRequest) (*runtime.RawExtension, er addDataAndMaybeAppendToIgnition(cloudProviderCAPath, cc.Spec.CloudProviderCAData, &ignConf) appenders := newAppendersBuilder(nil, bsc.kubeconfigFunc, bsc.certs, bsc.serverBaseDir). - WithNodeAnnotations(currConf, ""). + WithNodeAnnotations(currConf, "", bsc.mcsURL). build() for _, a := range appenders { diff --git a/pkg/server/cluster_server.go b/pkg/server/cluster_server.go index 59c6d13889..cf19fdc596 100644 --- a/pkg/server/cluster_server.go +++ b/pkg/server/cluster_server.go @@ -22,6 +22,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" clientset "k8s.io/client-go/kubernetes" corelisterv1 "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -56,6 +57,7 @@ type clusterServer struct { kubeconfigFunc kubeconfigFunc apiserverURL string + mcsURL string } const minResyncPeriod = 20 * time.Minute @@ -76,7 +78,7 @@ func resyncPeriod() func() time.Duration { // It accepts a kubeConfig, which is not required when it's // run from within a cluster(useful in testing). // It accepts the apiserverURL which is the location of the KubeAPIServer. -func NewClusterServer(kubeConfig, apiserverURL string) (Server, error) { +func NewClusterServer(kubeConfig, apiserverURL, mcsURL string) (Server, error) { clientsBuilder, err := clients.NewBuilder(kubeConfig) if err != nil { return nil, fmt.Errorf("failed to create Kubernetes rest client: %w", err) @@ -123,6 +125,7 @@ func NewClusterServer(kubeConfig, apiserverURL string) (Server, error) { routeclient: routeClient, kubeconfigFunc: func() ([]byte, []byte, error) { return kubeconfigFromSecret(bootstrapTokenDir, apiserverURL, nil) }, apiserverURL: apiserverURL, + mcsURL: mcsURL, }, nil } @@ -187,7 +190,7 @@ func (cs *clusterServer) GetConfig(cr poolRequest) (*runtime.RawExtension, error desiredImage := cs.resolveDesiredImageForPool(mp) appenders := newAppendersBuilder(cr.version, cs.kubeconfigFunc, []string{}, ""). - WithNodeAnnotations(currConf, desiredImage). + WithNodeAnnotations(currConf, desiredImage, cs.mcsURL). WithCustomAppender(appendDesiredOSImage(desiredImage)). build() @@ -369,3 +372,8 @@ func appendDesiredOSImage(desired string) appenderFunc { return nil } } + +// GetKubeClient returns the Kubernetes client for this cluster server +func (cs *clusterServer) GetKubeClient() kubernetes.Interface { + return cs.kubeclient +} diff --git a/pkg/server/failure_reporter.go b/pkg/server/failure_reporter.go new file mode 100644 index 0000000000..aba0f1dab2 --- /dev/null +++ b/pkg/server/failure_reporter.go @@ -0,0 +1,141 @@ +package server + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" +) + +// FirstbootFailureReport represents a failure report from a node during firstboot +type FirstbootFailureReport struct { + Pool string `json:"pool"` // MachineConfigPool name (e.g., "worker", "master") + NodeID string `json:"nodeID"` // Node identifier (hostname or MAC address) + Stage string `json:"stage"` // Failure stage (e.g., "pivot", "kargs", "updateLayeredOS") + ImageURL string `json:"imageURL"` // OS image URL that failed to apply + ErrorMessage string `json:"errorMessage"` // Detailed error message +} + +// FailureReporter defines the interface for reporting firstboot failures +type FailureReporter interface { + // ReportFailure processes a firstboot failure report + // Returns error only for logging; HTTP handler always returns 202 + ReportFailure(ctx context.Context, report *FirstbootFailureReport) error +} + +// clusterFailureReporter creates Kubernetes Events for firstboot failures +type clusterFailureReporter struct { + kubeclient kubernetes.Interface +} + +// NewClusterFailureReporter creates a FailureReporter for cluster mode +func NewClusterFailureReporter(kubeclient kubernetes.Interface) FailureReporter { + return &clusterFailureReporter{ + kubeclient: kubeclient, + } +} + +func (r *clusterFailureReporter) ReportFailure(ctx context.Context, report *FirstbootFailureReport) error { + // Event name is deterministic based on pool + nodeID to enable patching + eventName := fmt.Sprintf("firstboot-failure-%s-%s", report.Pool, sanitizeForEventName(report.NodeID)) + + namespace := ctrlcommon.MCONamespace // "openshift-machine-config-operator" + + // Check if event already exists + existingEvent, err := r.kubeclient.CoreV1().Events(namespace).Get(ctx, eventName, metav1.GetOptions{}) + + now := metav1.NewTime(time.Now()) + + if err == nil { + // Event exists, update it (increment count, update timestamp and message) + existingEvent.Count++ + existingEvent.LastTimestamp = now + existingEvent.Message = formatFailureMessage(report) + + _, updateErr := r.kubeclient.CoreV1().Events(namespace).Update(ctx, existingEvent, metav1.UpdateOptions{}) + if updateErr != nil { + klog.Errorf("Failed to update firstboot failure event %s: %v", eventName, updateErr) + return updateErr + } + klog.Infof("Updated firstboot failure event for pool=%s node=%s (count=%d)", + report.Pool, report.NodeID, existingEvent.Count) + } else { + // Event doesn't exist, create it + event := &corev1.Event{ + ObjectMeta: metav1.ObjectMeta{ + Name: eventName, + Namespace: namespace, + }, + InvolvedObject: corev1.ObjectReference{ + Kind: "MachineConfigPool", + Name: report.Pool, + Namespace: namespace, + }, + Reason: "FirstbootFailed", + Type: corev1.EventTypeWarning, + Message: formatFailureMessage(report), + FirstTimestamp: now, + LastTimestamp: now, + Count: 1, + Source: corev1.EventSource{ + Component: "machine-config-server", + }, + } + + _, createErr := r.kubeclient.CoreV1().Events(namespace).Create(ctx, event, metav1.CreateOptions{}) + if createErr != nil { + klog.Errorf("Failed to create firstboot failure event %s: %v", eventName, createErr) + return createErr + } + klog.Infof("Created firstboot failure event for pool=%s node=%s stage=%s", + report.Pool, report.NodeID, report.Stage) + } + + return nil +} + +// formatFailureMessage creates a human-readable event message +func formatFailureMessage(report *FirstbootFailureReport) string { + return fmt.Sprintf("Node %s failed during firstboot at stage '%s': %s (image: %s)", + report.NodeID, report.Stage, report.ErrorMessage, report.ImageURL) +} + +// sanitizeForEventName converts nodeID to a valid Kubernetes name +// (lowercase alphanumeric, '-', max 253 chars) +func sanitizeForEventName(nodeID string) string { + // Replace invalid characters with '-' + // Kubernetes event names must match: [a-z0-9]([-a-z0-9]*[a-z0-9])? + sanitized := strings.ToLower(nodeID) + sanitized = strings.ReplaceAll(sanitized, ":", "-") + sanitized = strings.ReplaceAll(sanitized, "_", "-") + sanitized = strings.ReplaceAll(sanitized, ".", "-") + + // Truncate if needed + if len(sanitized) > 200 { + sanitized = sanitized[:200] + } + + return sanitized +} + +// bootstrapFailureReporter logs failures (no API access during bootstrap) +type bootstrapFailureReporter struct{} + +// NewBootstrapFailureReporter creates a FailureReporter for bootstrap mode +func NewBootstrapFailureReporter() FailureReporter { + return &bootstrapFailureReporter{} +} + +func (r *bootstrapFailureReporter) ReportFailure(ctx context.Context, report *FirstbootFailureReport) error { + // During bootstrap, we can only log - no Kubernetes API access + klog.Warningf("Firstboot failure (bootstrap mode, log only): pool=%s node=%s stage=%s image=%s error=%s", + report.Pool, report.NodeID, report.Stage, report.ImageURL, report.ErrorMessage) + return nil +} diff --git a/pkg/server/failure_reporter_test.go b/pkg/server/failure_reporter_test.go new file mode 100644 index 0000000000..c604ad2a5d --- /dev/null +++ b/pkg/server/failure_reporter_test.go @@ -0,0 +1,111 @@ +package server + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockFailureReporter for testing +type mockFailureReporter struct { + reports []*FirstbootFailureReport + err error +} + +func (m *mockFailureReporter) ReportFailure(ctx context.Context, report *FirstbootFailureReport) error { + m.reports = append(m.reports, report) + return m.err +} + +func TestNodeFailureHandler_POST(t *testing.T) { + reporter := &mockFailureReporter{} + handler := NewNodeFailureHandler(reporter) + + report := FirstbootFailureReport{ + Pool: "worker", + NodeID: "test-node-1", + Stage: "updateLayeredOS", + ImageURL: "quay.io/test:latest", + ErrorMessage: "Image pull failed", + } + + body, err := json.Marshal(report) + require.NoError(t, err) + + req := httptest.NewRequest(http.MethodPost, "/v1/node-failure", bytes.NewReader(body)) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusAccepted, w.Code) + assert.Equal(t, 1, len(reporter.reports)) + assert.Equal(t, "worker", reporter.reports[0].Pool) + assert.Equal(t, "test-node-1", reporter.reports[0].NodeID) +} + +func TestNodeFailureHandler_MethodNotAllowed(t *testing.T) { + handler := NewNodeFailureHandler(&mockFailureReporter{}) + + req := httptest.NewRequest(http.MethodGet, "/v1/node-failure", nil) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusMethodNotAllowed, w.Code) +} + +func TestNodeFailureHandler_InvalidJSON(t *testing.T) { + handler := NewNodeFailureHandler(&mockFailureReporter{}) + + req := httptest.NewRequest(http.MethodPost, "/v1/node-failure", bytes.NewReader([]byte("invalid json"))) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) +} + +func TestNodeFailureHandler_MissingFields(t *testing.T) { + handler := NewNodeFailureHandler(&mockFailureReporter{}) + + report := FirstbootFailureReport{ + Pool: "worker", + // Missing NodeID and Stage + } + + body, _ := json.Marshal(report) + req := httptest.NewRequest(http.MethodPost, "/v1/node-failure", bytes.NewReader(body)) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) +} + +func TestNodeFailureHandler_ReporterError(t *testing.T) { + // Even if reporter fails, handler should return 202 (best-effort) + reporter := &mockFailureReporter{err: fmt.Errorf("reporter error")} + handler := NewNodeFailureHandler(reporter) + + report := FirstbootFailureReport{ + Pool: "worker", + NodeID: "test", + Stage: "pivot", + } + + body, _ := json.Marshal(report) + req := httptest.NewRequest(http.MethodPost, "/v1/node-failure", bytes.NewReader(body)) + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + // Still returns 202 even though reporter failed + assert.Equal(t, http.StatusAccepted, w.Code) +} diff --git a/pkg/server/server.go b/pkg/server/server.go index ad1792bf92..b9e9536625 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -12,6 +12,7 @@ import ( ign3types "github.com/coreos/ignition/v2/config/v3_5/types" "github.com/vincent-petithory/dataurl" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" @@ -45,6 +46,7 @@ type appenderFunc func(*ign3types.Config, *mcfgv1.MachineConfig) error // machine config server implementations. type Server interface { GetConfig(poolRequest) (*runtime.RawExtension, error) + GetKubeClient() kubernetes.Interface } // appendersBuilder builds a slice of appenderFunc with guaranteed ordering. @@ -70,9 +72,9 @@ func newAppendersBuilder(version *semver.Version, kubeconfigFn kubeconfigFunc, c } // WithNodeAnnotations adds the node annotations appender with the specified config and image. -func (ab *appendersBuilder) WithNodeAnnotations(currMachineConfig, image string) *appendersBuilder { +func (ab *appendersBuilder) WithNodeAnnotations(currMachineConfig, image, mcsURL string) *appendersBuilder { ab.customAppenders = append(ab.customAppenders, func(cfg *ign3types.Config, mc *mcfgv1.MachineConfig) error { - return appendNodeAnnotations(cfg, currMachineConfig, image, mc) + return appendNodeAnnotations(cfg, currMachineConfig, image, mcsURL, mc) }) return ab } @@ -196,8 +198,8 @@ func appendKubeConfig(conf *ign3types.Config, f kubeconfigFunc) error { return nil } -func appendNodeAnnotations(conf *ign3types.Config, currConf, image string, mc *mcfgv1.MachineConfig) error { - anno, err := getNodeAnnotation(currConf, image, mc) +func appendNodeAnnotations(conf *ign3types.Config, currConf, image, mcsURL string, mc *mcfgv1.MachineConfig) error { + anno, err := getNodeAnnotation(currConf, image, mcsURL, mc) if err != nil { return err } @@ -208,7 +210,7 @@ func appendNodeAnnotations(conf *ign3types.Config, currConf, image string, mc *m return nil } -func getNodeAnnotation(conf, image string, mc *mcfgv1.MachineConfig) (string, error) { +func getNodeAnnotation(conf, image, mcsURL string, mc *mcfgv1.MachineConfig) (string, error) { nodeAnnotations := map[string]string{ daemonconsts.CurrentMachineConfigAnnotationKey: conf, daemonconsts.DesiredMachineConfigAnnotationKey: conf, @@ -216,6 +218,10 @@ func getNodeAnnotation(conf, image string, mc *mcfgv1.MachineConfig) (string, er daemonconsts.MachineConfigDaemonStateAnnotationKey: daemonconsts.MachineConfigDaemonStateDone, } + if mcsURL != "" { + nodeAnnotations[daemonconsts.MachineConfigServerURLAnnotationKey] = mcsURL + } + // Determine which image to use: // 1. Pre-built image from MC annotations (install-time hybrid OCL) takes priority // 2. Dynamically resolved image parameter (runtime scaling) diff --git a/pkg/server/server_test.go b/pkg/server/server_test.go index 6a46689989..8c2affda77 100644 --- a/pkg/server/server_test.go +++ b/pkg/server/server_test.go @@ -185,7 +185,7 @@ func TestBootstrapServer(t *testing.T) { if err != nil { t.Fatalf("unexpected error while appending file to ignition: %v", err) } - anno, err := getNodeAnnotation(mp.Status.Configuration.Name, "", mc) + anno, err := getNodeAnnotation(mp.Status.Configuration.Name, "", "https://api-int.test.example.com:22623", mc) if err != nil { t.Fatalf("unexpected error while creating annotations err: %v", err) } @@ -380,7 +380,7 @@ func TestClusterServer(t *testing.T) { if err != nil { t.Fatalf("unexpected error while appending file to ignition: %v", err) } - anno, err := getNodeAnnotation(mp.Status.Configuration.Name, "", mc) + anno, err := getNodeAnnotation(mp.Status.Configuration.Name, "", "https://api-int.test.example.com:22623", mc) if err != nil { t.Fatalf("unexpected error while creating annotations err: %v", err) }