diff --git a/pkg/destroy/powervs/power-dhcp.go b/pkg/destroy/powervs/power-dhcp.go index 499e4bdaaa4..d2a07ca9ed2 100644 --- a/pkg/destroy/powervs/power-dhcp.go +++ b/pkg/destroy/powervs/power-dhcp.go @@ -189,11 +189,66 @@ func (o *ClusterUninstaller) listDHCPNetworksByName() ([]string, error) { return result, nil } +// extractNetworkIDFromError extracts network ID from error message if present. +// Error format: "network xxx-xxx-xxxxx still attached to pvm-instances". +func extractNetworkIDFromError(err error) string { + if err == nil { + return "" + } + errStr := err.Error() + // Look for pattern "network still attached" + parts := strings.Split(errStr, "network ") + if len(parts) > 1 { + remaining := parts[1] + // Extract UUID from error message (format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) + spaceIdx := strings.Index(remaining, " ") + if spaceIdx > 0 { + networkID := remaining[:spaceIdx] + // UUID format validation (36 chars with dashes) + if len(networkID) == 36 && strings.Count(networkID, "-") == 4 { + return networkID + } + } + } + return "" +} + +// findNetworkIDByName finds a network ID by matching the network name. +func (o *ClusterUninstaller) findNetworkIDByName(networkName string) string { + if o.networkClient == nil { + return "" + } + networks, err := o.networkClient.GetAll() + if err != nil { + o.Logger.Debugf("Failed to list networks to find ID for %q: %v", networkName, err) + return "" + } + for _, network := range networks.Networks { + if network.Name != nil && *network.Name == networkName { + if network.NetworkID != nil { + return *network.NetworkID + } + } + } + return "" +} + +// isDHCPNetworkAttachedError checks if an error indicates the DHCP network is attached to PVM instances. 
+func isDHCPNetworkAttachedError(err error) bool {
+	if err == nil {
+		return false
+	}
+	errStr := err.Error()
+	return strings.Contains(errStr, "still attached to pvm-instances") ||
+		strings.Contains(errStr, "still attached to") ||
+		strings.Contains(errStr, "pcloudDhcpDeleteBadRequest")
+}
+
 // destroyDHCPNetwork deletes a PowerVS DHCP network.
 func (o *ClusterUninstaller) destroyDHCPNetwork(item cloudResource) error {
 	var err error
-	_, err = o.dhcpClient.Get(item.id)
+	dhcpServer, err := o.dhcpClient.Get(item.id)
 	if err != nil {
 		o.deletePendingItems(item.typeName, []cloudResource{item})
 		o.Logger.Infof("Deleted DHCP Network %q", item.name)
@@ -202,8 +257,46 @@ func (o *ClusterUninstaller) destroyDHCPNetwork(item cloudResource) error {
 
 	o.Logger.Debugf("Deleting DHCP network %q", item.name)
 
+	// Before deleting the DHCP server, check if its network has attached network interfaces
+	// that need to be deleted first. This prevents errors like "network still attached to pvm-instances",
+	// which fail early before subnet network interfaces are deleted in the destroyPowerSubnets() stage.
+	var networkID string
+	if dhcpServer.Network != nil {
+		// Try to find network ID by name
+		if dhcpServer.Network.Name != nil {
+			networkID = o.findNetworkIDByName(*dhcpServer.Network.Name)
+			if networkID != "" {
+				o.Logger.Debugf("Found network ID %s for DHCP subnet %q. 
Checking for network interfaces...", networkID, item.name) + // Try to delete network interfaces from the subnet + if nicErr := o.deleteNetworkInterfaces(networkID); nicErr != nil { + o.Logger.Debugf("Note: Could not delete network interfaces for DHCP subnet %q: %v (will attempt DHCP deletion anyway)", item.name, nicErr) + } + } + } + } + err = o.dhcpClient.Delete(item.id) if err != nil { + // If deletion failed because network is still attached to instances, try deleting network interfaces + if isDHCPNetworkAttachedError(err) { + // Try to extract network ID from error message if we don't have it yet + if networkID == "" { + networkID = extractNetworkIDFromError(err) + } + // If still no network ID, try finding by name again + if networkID == "" && dhcpServer.Network != nil && dhcpServer.Network.Name != nil { + networkID = o.findNetworkIDByName(*dhcpServer.Network.Name) + } + + if networkID != "" { + o.Logger.Debugf("DHCP subnet %q is still attached to instances. Attempting to delete network interfaces from network %s...", item.name, networkID) + if nicErr := o.deleteNetworkInterfaces(networkID); nicErr != nil { + o.Logger.Warnf("Failed to delete network interfaces for DHCP subnet %q: %v", item.name, nicErr) + } + // Return error to trigger retry after NIC deletion + return fmt.Errorf("DHCP server deletion blocked by attached network interfaces: %w", err) + } + o.Logger.Warnf("Could not determine network ID for DHCP subnet %q to delete network interfaces", item.name) + } o.Logger.Infof("Error: o.dhcpClient.Delete: %q", err) return err } @@ -329,31 +422,11 @@ func (o *ClusterUninstaller) destroyDHCPNetworks() error { o.Logger.Fatal("destroyDHCPNetworks: ExponentialBackoffWithContext (list) returns ", err) } - // PowerVS hack: - // We were asked to query for the subnet still existing as a test for the DHCP network to be - // finally destroyed. Even though we can't list it anymore, it is still being destroyed. 
:( - backoff = wait.Backoff{ - Duration: 15 * time.Second, - Factor: 1.1, - Cap: leftInContext(ctx), - Steps: math.MaxInt32} - err = wait.ExponentialBackoffWithContext(ctx, backoff, func(context.Context) (bool, error) { - secondPassList, err2 := o.listPowerSubnets() - if err2 != nil { - return false, err2 - } - if len(secondPassList) == 0 { - // We finally don't see any remaining instances! - return true, nil - } - for _, item := range secondPassList { - o.Logger.Debugf("destroyDHCPNetworks: found %s in second pass", item.name) - } - return false, nil - }) - if err != nil { - o.Logger.Fatal("destroyDHCPNetworks: ExponentialBackoffWithContext (list) returns ", err) - } + // Note: DHCP server subnets will be deleted in the destroyPowerSubnets() stage. + // We no longer wait for them here since: + // 1. Network interfaces are now properly deleted during DHCP deletion + // 2. Subnet deletion happens in a later stage with its own retry logic + // 3. Waiting here was causing timeouts since subnets are deleted in a different stage return nil } diff --git a/pkg/destroy/powervs/power-subnet.go b/pkg/destroy/powervs/power-subnet.go index 5aebc4da2cc..6a26696a68e 100644 --- a/pkg/destroy/powervs/power-subnet.go +++ b/pkg/destroy/powervs/power-subnet.go @@ -2,6 +2,7 @@ package powervs import ( "context" + "errors" "fmt" "math" "strings" @@ -51,6 +52,38 @@ func (o *ClusterUninstaller) listPowerSubnets() (cloudResources, error) { return cloudResources{}.insert(result...), nil } +// deleteNetworkInterfaces deletes all network interfaces attached to a subnet. 
+func (o *ClusterUninstaller) deleteNetworkInterfaces(subnetID string) error { + interfaces, err := o.networkClient.GetAllNetworkInterfaces(subnetID) + if err != nil { + return fmt.Errorf("failed to list network interfaces: %w", err) + } + + var deleteErrs []error + for _, nic := range interfaces.Interfaces { + if nic.ID != nil { + o.Logger.Debugf("Deleting network interface %q from subnet %q", *nic.ID, subnetID) + if err := o.networkClient.DeleteNetworkInterface(subnetID, *nic.ID); err != nil { + o.Logger.Warnf("Failed to delete network interface %q: %v", *nic.ID, err) + deleteErrs = append(deleteErrs, fmt.Errorf("failed to delete network interface %q: %w", *nic.ID, err)) + } + } + } + + return errors.Join(deleteErrs...) +} + +// isNetworkInterfaceError checks if an error indicates network interfaces are blocking deletion. (i.e 409 Conflict). +func isNetworkInterfaceError(err error) bool { + if err == nil { + return false + } + errStr := err.Error() + return strings.Contains(errStr, "one or more network interfaces have an IP allocation") || + strings.Contains(errStr, "status 409") || + strings.Contains(errStr, "409 Conflict") +} + func (o *ClusterUninstaller) deletePowerSubnet(item cloudResource) error { if _, err := o.networkClient.Get(item.id); err != nil { o.deletePendingItems(item.typeName, []cloudResource{item}) @@ -61,6 +94,16 @@ func (o *ClusterUninstaller) deletePowerSubnet(item cloudResource) error { o.Logger.Debugf("Deleting Power Network %q", item.name) if err := o.networkClient.Delete(item.id); err != nil { + // If deletion failed due to attached network interfaces, delete them and retry + if isNetworkInterfaceError(err) { + o.Logger.Debugf("Subnet %q has attached network interfaces. 
Deleting them...", item.name) + if nicErr := o.deleteNetworkInterfaces(item.id); nicErr != nil { + o.Logger.Warnf("Failed to delete network interfaces for subnet %q: %v", item.name, nicErr) + } + // Return error to trigger retry after NIC deletion + return fmt.Errorf("subnet deletion blocked by network interfaces: %w", err) + } + o.Logger.Infof("Error: o.networkClient.Delete: %q", err) return err } diff --git a/pkg/destroy/powervs/powervs.go b/pkg/destroy/powervs/powervs.go index 748461851e8..a62ee0c4dff 100644 --- a/pkg/destroy/powervs/powervs.go +++ b/pkg/destroy/powervs/powervs.go @@ -6,6 +6,7 @@ import ( "fmt" "math" gohttp "net/http" + "strconv" "strings" "sync" "time" @@ -385,6 +386,40 @@ func (o *ClusterUninstaller) newAuthenticator(apikey string) (core.Authenticator return authenticator, nil } +// waitForRetryAfter sleeps for the duration specified in the "Retry-After" header of the response (in seconds). +// returns true if it actually waited. +func (o *ClusterUninstaller) waitForRetryAfter(ctx context.Context, response *core.DetailedResponse) bool { + if response == nil { + return false + } + headers := response.GetHeaders() + + if headers == nil { + return false + } + + retryAfterHeader := headers.Get("Retry-After") + + if retryAfterHeader == "" { + o.Logger.Debugf("waitForRetryAfter: Retry-After header is not present, or could not be retrieved") + return false + } + secs, err := strconv.Atoi(retryAfterHeader) + if err != nil || secs <= 0 { + o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader) + return false + } + d := time.Duration(secs) * time.Second + o.Logger.Debugf("waitForRetryAfter: Rate limited, honoring Retry-After header, waiting for %v", secs) + select { + case <-ctx.Done(): + // Context is done, return to avoid waiting the full duration + return false + case <-time.After(d): + return true + } +} + func (o *ClusterUninstaller) loadSDKServices() error { var ( err error @@ -516,9 +551,21 @@ func (o 
*ClusterUninstaller) loadSDKServices() error { // Get the Zone ID zoneOptions := o.zonesSvc.NewListZonesOptions() - zoneResources, detailedResponse, err := o.zonesSvc.ListZonesWithContext(ctx, zoneOptions) - if err != nil { - return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) + var ( + zoneResources *zonesv1.ListZonesResp + detailedResponse *core.DetailedResponse + ) + for { + zoneResources, detailedResponse, err = o.zonesSvc.ListZonesWithContext(ctx, zoneOptions) + if err != nil { + // check if error is a 429 too many requests + if detailedResponse != nil && detailedResponse.StatusCode == gohttp.StatusTooManyRequests && o.waitForRetryAfter(ctx, detailedResponse) { + o.Logger.Debugf("loadSDKServices: ListZonesWithContext was rate limited, honoring RetryAfter for ListZones...") + continue // we have waited the appropriate amount of time, now retry the request + } + return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) + } + break // no error, reached on successful response, break the loop } for _, zone := range zoneResources.Result { @@ -562,9 +609,20 @@ func (o *ClusterUninstaller) loadSDKServices() error { return fmt.Errorf("failed to parse DNSInstanceCRN: %w", err) } options := o.dnsZonesSvc.NewListDnszonesOptions(dnsCRN.ServiceInstance) - listZonesResponse, detailedResponse, err := o.dnsZonesSvc.ListDnszones(options) - if err != nil { - return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) + var ( + listZonesResponse *dnszonesv1.ListDnszones + detailedResponse *core.DetailedResponse + ) + for { + listZonesResponse, detailedResponse, err = o.dnsZonesSvc.ListDnszones(options) + if err != nil { + if detailedResponse != nil && detailedResponse.StatusCode == gohttp.StatusTooManyRequests && o.waitForRetryAfter(ctx, detailedResponse) { + o.Logger.Debugf("loadSDKServices: ListDnszones was rate limited, 
honoring RetryAfter for ListDnszones...") + continue + } + return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) + } + break } for _, zone := range listZonesResponse.Dnszones { diff --git a/pkg/destroy/powervs/powervs_test.go b/pkg/destroy/powervs/powervs_test.go new file mode 100644 index 00000000000..bdfcfe35124 --- /dev/null +++ b/pkg/destroy/powervs/powervs_test.go @@ -0,0 +1,149 @@ +package powervs + +// powervs_test.go + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/IBM/go-sdk-core/v5/core" + "github.com/IBM/networking-go-sdk/zonesv1" + "github.com/sirupsen/logrus" +) + +func TestWaitForRetryAfter(t *testing.T) { + o := &ClusterUninstaller{ + Logger: logrus.New(), + } + + t.Run("returns false when response is nil", func(t *testing.T) { + ctx := context.Background() + got := o.waitForRetryAfter(ctx, nil) + if got { + t.Error("expected false for nil response") + } + }) + + t.Run("returns false when no Retry-After header", func(t *testing.T) { + ctx := context.Background() + resp := &core.DetailedResponse{ + StatusCode: http.StatusTooManyRequests, + Headers: http.Header{}, + } + got := o.waitForRetryAfter(ctx, resp) + if got { + t.Error("expected false when Retry-After header is absent") + } + }) + + t.Run("waits when valid Retry-After header present", func(t *testing.T) { + ctx := context.Background() + resp := &core.DetailedResponse{ + StatusCode: http.StatusTooManyRequests, + Headers: http.Header{"Retry-After": []string{"1"}}, + } + + start := time.Now() + got := o.waitForRetryAfter(ctx, resp) + elapsed := time.Since(start) + + if !got { + t.Error("expected true when Retry-After header is present") + } + if elapsed < time.Second { + t.Errorf("expected to wait ~1s, only waited %v", elapsed) + } + }) + + t.Run("respects context cancellation", func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 
100*time.Millisecond)
		defer cancel()

		resp := &core.DetailedResponse{}
		resp.Headers = http.Header{"Retry-After": []string{"60"}} // long wait

		start := time.Now()
		got := o.waitForRetryAfter(ctx, resp)
		elapsed := time.Since(start)

		if got {
			t.Error("expected false on ctx cancellation")
		}
		if elapsed > time.Second {
			t.Errorf("should have cancelled quickly, took %v", elapsed)
		}
	})
}

func TestListZonesRetryLogic(t *testing.T) {
	var callCount atomic.Int32

	// Server returns 429 twice, then 200
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		count := callCount.Add(1)
		if count <= 2 {
			w.Header().Set("Retry-After", "1")
			w.WriteHeader(http.StatusTooManyRequests)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		fmt.Fprintln(w, `{"result": [], "result_info": {"page": 1, "per_page": 20, "count": 0, "total_count": 0}}`)
	}))
	defer server.Close()

	o := &ClusterUninstaller{
		Logger:         logrus.New(),
		CISInstanceCRN: "crn:v1:bluemix:public:internet-svcs:global:a/fake:fake::",
		BaseDomain:     "example.com",
	}

	authenticator := &core.NoAuthAuthenticator{}
	var err error
	o.zonesSvc, err = zonesv1.NewZonesV1(&zonesv1.ZonesV1Options{
		Authenticator: authenticator,
		URL:           server.URL,
		Crn:           &o.CISInstanceCRN,
	})
	if err != nil {
		t.Fatalf("failed to create zonesSvc: %v", err)
	}

	ctx := context.Background()
	zoneOptions := o.zonesSvc.NewListZonesOptions()

	var (
		detailedResponse *core.DetailedResponse
		zoneResources    *zonesv1.ListZonesResp
	)

	start := time.Now()
	// This mirrors the exact loop in loadSDKServices
	for {
		zoneResources, detailedResponse, err = o.zonesSvc.ListZonesWithContext(ctx, zoneOptions)
		if err != nil {
			if detailedResponse != nil && detailedResponse.StatusCode == http.StatusTooManyRequests && o.waitForRetryAfter(ctx, detailedResponse) {
				o.Logger.Debugf("rate limited, retrying...")
				continue
			}
			t.Fatalf("unexpected error after retries: %v", err)
		}
		break
	}
	elapsed := time.Since(start)

	if zoneResources == nil {
		t.Error("expected zone resources, got nil")
	}
	if callCount.Load() != 3 {
		t.Errorf("expected 3 calls, got %d", callCount.Load())
	}
	if elapsed < 2*time.Second {
		t.Errorf("expected at least 2s of retry delay, got %v", elapsed)
	}
}