From 3bdaf7b287046a4a866044be8e1b93d6f173d3a1 Mon Sep 17 00:00:00 2001 From: Angel Cervera Roldan <48255007+angelcerveraroldan@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:54:39 +0000 Subject: [PATCH 1/2] WIP: Lookup available zones for GCP --- mantle/cmd/kola/options.go | 2 +- mantle/platform/api/gcloud/api.go | 74 ++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/mantle/cmd/kola/options.go b/mantle/cmd/kola/options.go index 146005c576..d6a53648fd 100644 --- a/mantle/cmd/kola/options.go +++ b/mantle/cmd/kola/options.go @@ -121,7 +121,7 @@ func init() { // gcp-specific options sv(&kola.GCPOptions.Image, "gcp-image", "", "GCP image, full api endpoints names are accepted if resource is in a different project") sv(&kola.GCPOptions.Project, "gcp-project", "fedora-coreos-devel", "GCP project name") - sv(&kola.GCPOptions.Zone, "gcp-zone", "us-central1-a", "GCP zone name") + sv(&kola.GCPOptions.Zone, "gcp-zone", "us-central1-a", "Preferred GCP zone name, if the resources to this zone are depleted, we will fallback to another zone in the same region") sv(&kola.GCPOptions.MachineType, "gcp-machinetype", "", "GCP machine type") sv(&kola.GCPOptions.DiskType, "gcp-disktype", "", "GCP disk type (default pd-ssd)") sv(&kola.GCPOptions.Network, "gcp-network", "default", "GCP network") diff --git a/mantle/platform/api/gcloud/api.go b/mantle/platform/api/gcloud/api.go index cd2f699fd5..dc21e34c6b 100644 --- a/mantle/platform/api/gcloud/api.go +++ b/mantle/platform/api/gcloud/api.go @@ -17,10 +17,13 @@ package gcloud import ( "context" - "google.golang.org/api/option" + "fmt" "net/http" + "regexp" "time" + "google.golang.org/api/option" + "github.com/coreos/pkg/capnslog" "google.golang.org/api/compute/v1" @@ -50,6 +53,68 @@ type API struct { client *http.Client compute *compute.Service options *Options + zones []string +} + +// This regex should match all standard (non-AI) zones +// See: https://docs.cloud.google.com/compute/docs/regions-zones +var standardZoneRegexp = regexp.MustCompile(`^([a-z]+-[a-z]+\d+)-[a-z]$`) + +// zones are in the form "us-central1-a" and the region would be "us-central1" +// See: https://docs.cloud.google.com/compute/docs/regions-zones +func extractRegionFromZone(zone string) (string, error) { + matches := standardZoneRegexp.FindStringSubmatch(zone) + if matches == nil { + return "", fmt.Errorf("zone %q does not match expected format {region}-{letter}", zone) + } + return matches[1], nil +} + +func getAvailableZones(computeService *compute.Service, opts *Options) ([]string, error) { + if opts.MachineType == "" { + return []string{opts.Zone}, nil + } + + list, err := computeService.MachineTypes.AggregatedList(opts.Project). + Filter("name=" + opts.MachineType).Do() + + if err != nil { + return nil, err + } + + targetRegion, err := extractRegionFromZone(opts.Zone) + if err != nil { + return nil, fmt.Errorf("could not extract region from zone %q: %w", opts.Zone, err) + } + + zones := []string{} + for _, scopedList := range list.Items { + // There should be either 1 or 0 MachineTypes + // 0 if this zone does not have the required machine type + // 1 if this zone does have the required machine type + if len(scopedList.MachineTypes) == 0 { + continue + } + if len(scopedList.MachineTypes) > 1 { + plog.Warningf("Unexpected: got %d machine types for filter name=%s", len(scopedList.MachineTypes), opts.MachineType) + continue + } + zone := scopedList.MachineTypes[0].Zone + if region, err := extractRegionFromZone(zone); err == nil && region == targetRegion { + // If the preferred zone can be used, it should be the first zone that we use, + // so we will make add it to the start of the list, rather than the end. + if zone == opts.Zone { + zones = append([]string{zone}, zones...) + } else { + zones = append(zones, zone) + } + } + } + + if len(zones) == 0 { + return zones, fmt.Errorf("no zones in region %s for machine type %s were found", targetRegion, opts.MachineType) + } + return zones, nil } func New(opts *Options) (*API, error) { @@ -83,6 +148,12 @@ func New(opts *Options) (*API, error) { return nil, err } + zones, err := getAvailableZones(computeService, opts) + if err != nil { + plog.Warningf("Failed to discover available zones: %v. Falling back to preferred zone (%s) only.", err, opts.Zone) + zones = []string{opts.Zone} + } + if opts.ServiceAcct == "" { proj, err := computeService.Projects.Get(opts.Project).Do() if err != nil { @@ -95,6 +166,7 @@ func New(opts *Options) (*API, error) { client: client, compute: computeService, options: opts, + zones: zones, } return api, nil From 72a6bcc2f584b79618007863d5ab693d5761555c Mon Sep 17 00:00:00 2001 From: Angel Cervera Roldan <48255007+angelcerveraroldan@users.noreply.github.com> Date: Tue, 21 Apr 2026 09:03:49 +0000 Subject: [PATCH 2/2] gcloud: Iterate zones on GCP Connection failure (WIP) When we fail to connect to a machine with an error that indicates that the zone we are connecting to is not functioning correctly, fallback to a different zone and try again. --- mantle/cmd/kola/options.go | 2 +- mantle/cmd/ore/gcloud/destroy.go | 8 +- mantle/cmd/ore/gcloud/gcloud.go | 2 +- mantle/platform/api/gcloud/api.go | 21 +-- mantle/platform/api/gcloud/compute.go | 149 +++++++++++++++------- mantle/platform/machine/gcloud/cluster.go | 19 +++ mantle/platform/machine/gcloud/machine.go | 5 +- 7 files changed, 148 insertions(+), 58 deletions(-) diff --git a/mantle/cmd/kola/options.go b/mantle/cmd/kola/options.go index d6a53648fd..a3031a927d 100644 --- a/mantle/cmd/kola/options.go +++ b/mantle/cmd/kola/options.go @@ -121,7 +121,7 @@ func init() { // gcp-specific options sv(&kola.GCPOptions.Image, "gcp-image", "", "GCP image, full api endpoints names are accepted if resource is in a different project") sv(&kola.GCPOptions.Project, "gcp-project", "fedora-coreos-devel", "GCP project name") - sv(&kola.GCPOptions.Zone, "gcp-zone", "us-central1-a", "Preferred GCP zone name, if the resources to this zone are depleted, we will fallback to another zone in the same region") + sv(&kola.GCPOptions.PreferredZone, "gcp-zone", "us-central1-a", "Preferred GCP zone name, if the resources to this zone are depleted, we will fallback to another zone in the same region") sv(&kola.GCPOptions.MachineType, "gcp-machinetype", "", "GCP machine type") sv(&kola.GCPOptions.DiskType, "gcp-disktype", "", "GCP disk type (default pd-ssd)") sv(&kola.GCPOptions.Network, "gcp-network", "default", "GCP network") diff --git a/mantle/cmd/ore/gcloud/destroy.go b/mantle/cmd/ore/gcloud/destroy.go index ea87467c3e..aee60e44cf 100644 --- a/mantle/cmd/ore/gcloud/destroy.go +++ b/mantle/cmd/ore/gcloud/destroy.go @@ -18,6 +18,7 @@ import ( "fmt" "os" + "github.com/coreos/coreos-assembler/mantle/platform/machine/gcloud" "github.com/spf13/cobra" ) @@ -54,7 +55,12 @@ func runDestroy(cmd *cobra.Command, args []string) { var count int for _, vm := range vms { - if err := api.TerminateInstance(vm.Name); err != nil { + zone, err := gcloud.ExtractZoneFromInstance(vm) + if err != nil { + fmt.Fprintf(os.Stderr, "Could not get vm's zone from URL %s: %v", vm.Zone, err) + os.Exit(1) + } + if err := api.TerminateInstance(vm.Name, zone); err != nil { fmt.Fprintf(os.Stderr, "Failed destroying vm: %v\n", err) os.Exit(1) } diff --git a/mantle/cmd/ore/gcloud/gcloud.go b/mantle/cmd/ore/gcloud/gcloud.go index 5ae27aa543..e7941fb07d 100644 --- a/mantle/cmd/ore/gcloud/gcloud.go +++ b/mantle/cmd/ore/gcloud/gcloud.go @@ -41,7 +41,7 @@ func init() { sv(&opts.Image, "image", "", "image name") sv(&opts.Project, "project", "fedora-coreos-devel", "project") - sv(&opts.Zone, "zone", "us-central1-a", "zone") + sv(&opts.PreferredZone, "zone", "us-central1-a", "zone") sv(&opts.MachineType, "machinetype", "n1-standard-1", "machine type") sv(&opts.DiskType, "disktype", "pd-ssd", "disk type") sv(&opts.BaseName, "basename", "kola", "instance name prefix") diff --git a/mantle/platform/api/gcloud/api.go b/mantle/platform/api/gcloud/api.go index dc21e34c6b..064b8ef150 100644 --- a/mantle/platform/api/gcloud/api.go +++ b/mantle/platform/api/gcloud/api.go @@ -38,7 +38,7 @@ var ( type Options struct { Image string Project string - Zone string + PreferredZone string MachineType string DiskType string Network string @@ -72,7 +72,7 @@ func extractRegionFromZone(zone string) (string, error) { func getAvailableZones(computeService *compute.Service, opts *Options) ([]string, error) { if opts.MachineType == "" { - return []string{opts.Zone}, nil + return []string{opts.PreferredZone}, nil } list, err := computeService.MachineTypes.AggregatedList(opts.Project). @@ -82,9 +82,9 @@ func getAvailableZones(computeService *compute.Service, opts *Options) ([]string return nil, err } - targetRegion, err := extractRegionFromZone(opts.Zone) + targetRegion, err := extractRegionFromZone(opts.PreferredZone) if err != nil { - return nil, fmt.Errorf("could not extract region from zone %q: %w", opts.Zone, err) + return nil, fmt.Errorf("could not extract region from zone %q: %w", opts.PreferredZone, err) } zones := []string{} @@ -103,7 +103,7 @@ func getAvailableZones(computeService *compute.Service, opts *Options) ([]string if region, err := extractRegionFromZone(zone); err == nil && region == targetRegion { // If the preferred zone can be used, it should be the first zone that we use, // so we will make add it to the start of the list, rather than the end. - if zone == opts.Zone { + if zone == opts.PreferredZone { zones = append([]string{zone}, zones...) } else { zones = append(zones, zone) @@ -150,8 +150,8 @@ func New(opts *Options) (*API, error) { zones, err := getAvailableZones(computeService, opts) if err != nil { - plog.Warningf("Failed to discover available zones: %v. Falling back to preferred zone (%s) only.", err, opts.Zone) - zones = []string{opts.Zone} + plog.Warningf("Failed to discover available zones: %v. Falling back to preferred zone (%s) only.", err, opts.PreferredZone) + zones = []string{opts.PreferredZone} } if opts.ServiceAcct == "" { @@ -177,5 +177,10 @@ func (a *API) Client() *http.Client { } func (a *API) GC(gracePeriod time.Duration) error { - return a.gcInstances(gracePeriod) + for _, zone := range a.zones { + if err := a.gcInstances(gracePeriod, zone); err != nil { + return err + } + } + return nil } diff --git a/mantle/platform/api/gcloud/compute.go b/mantle/platform/api/gcloud/compute.go index 69a6268279..5ffcc1f9fe 100644 --- a/mantle/platform/api/gcloud/compute.go +++ b/mantle/platform/api/gcloud/compute.go @@ -16,7 +16,10 @@ package gcloud import ( "crypto/rand" + "errors" "fmt" + "net/url" + "regexp" "strings" "time" @@ -24,6 +27,7 @@ import ( "github.com/coreos/coreos-assembler/mantle/util" "golang.org/x/crypto/ssh/agent" "google.golang.org/api/compute/v1" + "google.golang.org/api/googleapi" ) func (a *API) vmname() string { @@ -69,7 +73,7 @@ func ParseDisk(spec string, zone string) (*compute.AttachedDisk, error) { } // Taken from: https://github.com/golang/build/blob/master/buildlet/gce.go -func (a *API) mkinstance(userdata, name string, keys []*agent.Key, opts platform.MachineOptions, useServiceAcct bool) (*compute.Instance, error) { +func (a *API) mkinstance(userdata, name, zone string, keys []*agent.Key, opts platform.MachineOptions, useServiceAcct bool) (*compute.Instance, error) { mantle := "mantle" metadataItems := []*compute.MetadataItems{ { @@ -95,7 +99,7 @@ func (a *API) mkinstance(userdata, name string, keys []*agent.Key, opts platform instance := &compute.Instance{ Name: name, - MachineType: instancePrefix + "/zones/" + a.options.Zone + "/machineTypes/" + a.options.MachineType, + MachineType: instancePrefix + "/zones/" + zone + "/machineTypes/" + a.options.MachineType, Metadata: &compute.Metadata{ Items: metadataItems, }, @@ -113,7 +117,7 @@ func (a *API) mkinstance(userdata, name string, keys []*agent.Key, opts platform InitializeParams: &compute.AttachedDiskInitializeParams{ DiskName: name, SourceImage: a.options.Image, - DiskType: "/zones/" + a.options.Zone + "/diskTypes/" + a.options.DiskType, + DiskType: "/zones/" + zone + "/diskTypes/" + a.options.DiskType, DiskSizeGb: 16, }, }, @@ -171,7 +175,7 @@ func (a *API) mkinstance(userdata, name string, keys []*agent.Key, opts platform // attach aditional disk for _, spec := range opts.AdditionalDisks { plog.Debugf("Parsing disk spec %q\n", spec) - disk, err := ParseDisk(spec, a.options.Zone) + disk, err := ParseDisk(spec, zone) if err != nil { return nil, fmt.Errorf("failed to parse spec %q: %w", spec, err) } @@ -180,71 +184,126 @@ func (a *API) mkinstance(userdata, name string, keys []*agent.Key, opts platform return instance, nil } -// CreateInstance creates a Google Compute Engine instance. -func (a *API) CreateInstance(userdata string, keys []*agent.Key, opts platform.MachineOptions, useServiceAcct bool) (*compute.Instance, error) { - name := a.vmname() - inst, err := a.mkinstance(userdata, name, keys, opts, useServiceAcct) - if err != nil { - return nil, fmt.Errorf("failed to create instance %q: %w", name, err) - } +var zoneUnavailableErrorPattern = regexp.MustCompile("is currently unavailable in the .+ zone") - plog.Debugf("Creating instance %q", name) +// isZoneError returns true if the error is due to the zone (i.e. zone has no resources) and false +// if the error is not due to zone issues. +// See: https://docs.cloud.google.com/compute/docs/troubleshooting/troubleshooting-resource-availability +func isZoneError(errorMessage string) bool { + return strings.Contains(errorMessage, "does not have enough resources available") || + zoneUnavailableErrorPattern.MatchString(errorMessage) +} - op, err := a.compute.Instances.Insert(a.options.Project, a.options.Zone, inst).Do() - if err != nil { - return nil, fmt.Errorf("failed to request new GCP instance: %v\n", err) +// isRetriableError returns true if it could be a flake / network error that may be worth retrying +// The error codes that are considered transient are those defined in: https://docs.cloud.google.com/storage/docs/retry-strategy +func isRetriableError(err error) bool { + var urlErr *url.Error + if errors.As(err, &urlErr) { + return true } - doable := a.compute.ZoneOperations.Get(a.options.Project, a.options.Zone, op.Name) - if err := a.NewPending(op.Name, doable).Wait(); err != nil { - return nil, err + // If we got a googleapi error, then we will retry on 408, 429, and 5xx response codes. + var apiErr *googleapi.Error + if errors.As(err, &apiErr) { + return apiErr.Code == 408 || apiErr.Code == 429 || + (apiErr.Code >= 500 && apiErr.Code <= 599) } - err = util.WaitUntilReady(10*time.Minute, 10*time.Second, func() (bool, error) { - var err error - inst, err = a.compute.Instances.Get(a.options.Project, a.options.Zone, name).Do() + return false +} + +// CreateInstance creates a Google Compute Engine instance. It will first attempt to create it in the zone given by a.options.PreferredZone, if +// that fails (due to for example a capacity error) it will fall back to other zones in the same region. +func (a *API) CreateInstance(userdata string, keys []*agent.Key, opts platform.MachineOptions, useServiceAcct bool) (*compute.Instance, error) { + var lastError error + for _, zone := range a.zones { + name := a.vmname() // we need a different name for each try + inst, err := a.mkinstance(userdata, name, zone, keys, opts, useServiceAcct) if err != nil { - return false, err + return nil, fmt.Errorf("failed to create instance %q: %w", name, err) } - return inst.Status == "RUNNING", nil - }) - if err != nil { - return nil, fmt.Errorf("failed getting instance %s details after creation: %v", name, err) - } - plog.Debugf("Created instance %q", name) + plog.Debugf("Creating instance %q in zone %s", name, zone) - return inst, nil + // Lets try to insert the instance, and retry if we get a network issue + var op *compute.Operation + err = util.RetryConditional(3, 10*time.Second, isRetriableError, func() error { + op, err = a.compute.Instances.Insert(a.options.Project, zone, inst).Do() + if err != nil { + return fmt.Errorf("failed to request new GCP instance: %w", err) + } + return nil + }) + if err != nil { + return nil, err + } + + doable := a.compute.ZoneOperations.Get(a.options.Project, zone, op.Name) + if err := a.NewPending(op.Name, doable).Wait(); err != nil { + plog.Warningf("Failed to create instance %q in zone %s: %v", name, zone, err) + lastError = err + // If the error is caused by the zone we chose, then lets continue and try a different zone + if isZoneError(err.Error()) { + continue + } else { + break + } + } + + err = util.WaitUntilReady(10*time.Minute, 10*time.Second, func() (bool, error) { + var err error + inst, err = a.compute.Instances.Get(a.options.Project, zone, name).Do() + if err != nil { + if isRetriableError(err) { + plog.Warningf("error getting instance %s in zone %s, will retry: %v", name, zone, err) + return false, nil + } + return false, err + } + return inst.Status == "RUNNING", nil + }) + + if err != nil { + return nil, fmt.Errorf("failed getting instance %s details after creation: %w", name, err) + } + + plog.Debugf("Created instance %q in zone %s", name, zone) + return inst, nil + } + + return nil, fmt.Errorf("failed to create instance in all zones: %w", lastError) } -func (a *API) TerminateInstance(name string) error { +func (a *API) TerminateInstance(name, zone string) error { plog.Debugf("Terminating instance %q", name) - _, err := a.compute.Instances.Delete(a.options.Project, a.options.Zone, name).Do() + _, err := a.compute.Instances.Delete(a.options.Project, zone, name).Do() return err } func (a *API) ListInstances(prefix string) ([]*compute.Instance, error) { var instances []*compute.Instance - list, err := a.compute.Instances.List(a.options.Project, a.options.Zone).Do() - if err != nil { - return nil, err - } - - for _, inst := range list.Items { - if !strings.HasPrefix(inst.Name, prefix) { - continue + for _, zone := range a.zones { + list, err := a.compute.Instances.List(a.options.Project, zone).Do() + if err != nil { + return nil, err } - instances = append(instances, inst) + for _, inst := range list.Items { + if !strings.HasPrefix(inst.Name, prefix) { + continue + } + + instances = append(instances, inst) + } } return instances, nil } -func (a *API) GetConsoleOutput(name string) (string, error) { - out, err := a.compute.Instances.GetSerialPortOutput(a.options.Project, a.options.Zone, name).Do() +func (a *API) GetConsoleOutput(name, zone string) (string, error) { + out, err := a.compute.Instances.GetSerialPortOutput(a.options.Project, zone, name).Do() if err != nil { return "", fmt.Errorf("failed to retrieve console output for %q: %v", name, err) } @@ -266,10 +325,10 @@ func InstanceIPs(inst *compute.Instance) (intIP, extIP string) { return } -func (a *API) gcInstances(gracePeriod time.Duration) error { +func (a *API) gcInstances(gracePeriod time.Duration, zone string) error { threshold := time.Now().Add(-gracePeriod) - list, err := a.compute.Instances.List(a.options.Project, a.options.Zone).Do() + list, err := a.compute.Instances.List(a.options.Project, zone).Do() if err != nil { return err } @@ -303,7 +362,7 @@ func (a *API) gcInstances(gracePeriod time.Duration) error { continue } - if err := a.TerminateInstance(instance.Name); err != nil { + if err := a.TerminateInstance(instance.Name, zone); err != nil { return fmt.Errorf("couldn't terminate instance %q: %v", instance.Name, err) } } diff --git a/mantle/platform/machine/gcloud/cluster.go b/mantle/platform/machine/gcloud/cluster.go index 8d46821618..22e3746675 100644 --- a/mantle/platform/machine/gcloud/cluster.go +++ b/mantle/platform/machine/gcloud/cluster.go @@ -17,10 +17,13 @@ package gcloud import ( "errors" + "fmt" "os" "path/filepath" + "regexp" "golang.org/x/crypto/ssh/agent" + "google.golang.org/api/compute/v1" "github.com/coreos/coreos-assembler/mantle/platform" "github.com/coreos/coreos-assembler/mantle/platform/api/gcloud" @@ -36,6 +39,16 @@ func (gc *cluster) NewMachine(userdata *conf.UserData) (platform.Machine, error) return gc.NewMachineWithOptions(userdata, platform.MachineOptions{}) } +var zoneFromURLPattern = regexp.MustCompile(`/zones/([^/]+)`) + +func ExtractZoneFromInstance(instance *compute.Instance) (string, error) { + match := zoneFromURLPattern.FindStringSubmatch(instance.Zone) + if match == nil { + return "", fmt.Errorf("could not extract zone from instance zone field: %q", instance.Zone) + } + return match[1], nil +} + func (gc *cluster) NewMachineWithOptions(userdata *conf.UserData, options platform.MachineOptions) (platform.Machine, error) { if err := options.EnsureNoQEMUOnlyOptions("gcp"); err != nil { return nil, err @@ -65,6 +78,11 @@ func (gc *cluster) NewMachineWithOptions(userdata *conf.UserData, options platfo return nil, err } + zone, err := ExtractZoneFromInstance(instance) + if err != nil { + return nil, err + } + intip, extip := gcloud.InstanceIPs(instance) gm := &machine{ @@ -72,6 +90,7 @@ func (gc *cluster) NewMachineWithOptions(userdata *conf.UserData, options platfo name: instance.Name, intIP: intip, extIP: extip, + zone: zone, } gm.dir = filepath.Join(gc.RuntimeConf().OutputDir, gm.ID()) diff --git a/mantle/platform/machine/gcloud/machine.go b/mantle/platform/machine/gcloud/machine.go index bef4e2e12b..8694ed3082 100644 --- a/mantle/platform/machine/gcloud/machine.go +++ b/mantle/platform/machine/gcloud/machine.go @@ -32,6 +32,7 @@ type machine struct { dir string journal *platform.Journal console string + zone string } func (gm *machine) ID() string { @@ -87,7 +88,7 @@ func (gm *machine) Destroy() { plog.Errorf("Error saving console for instance %v: %v", gm.ID(), err) } - if err := gm.gc.flight.api.TerminateInstance(gm.name); err != nil { + if err := gm.gc.flight.api.TerminateInstance(gm.name, gm.zone); err != nil { plog.Errorf("Error terminating instance %v: %v", gm.ID(), err) } @@ -104,7 +105,7 @@ func (gm *machine) ConsoleOutput() string { func (gm *machine) saveConsole() error { var err error - gm.console, err = gm.gc.flight.api.GetConsoleOutput(gm.name) + gm.console, err = gm.gc.flight.api.GetConsoleOutput(gm.name, gm.zone) if err != nil { return err }