-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Honor retry after #10421
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Honor retry after #10421
Changes from all commits
dc45528
e542202
56d03fe
646b356
c1c68b0
040fe05
b2251b5
d487dc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ package powervs | |
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "math" | ||
| "strings" | ||
|
|
@@ -51,6 +52,38 @@ func (o *ClusterUninstaller) listPowerSubnets() (cloudResources, error) { | |
| return cloudResources{}.insert(result...), nil | ||
| } | ||
|
|
||
| // deleteNetworkInterfaces deletes all network interfaces attached to a subnet. | ||
| func (o *ClusterUninstaller) deleteNetworkInterfaces(subnetID string) error { | ||
| interfaces, err := o.networkClient.GetAllNetworkInterfaces(subnetID) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to list network interfaces: %w", err) | ||
| } | ||
|
|
||
| var deleteErrs []error | ||
| for _, nic := range interfaces.Interfaces { | ||
| if nic.ID != nil { | ||
| o.Logger.Debugf("Deleting network interface %q from subnet %q", *nic.ID, subnetID) | ||
| if err := o.networkClient.DeleteNetworkInterface(subnetID, *nic.ID); err != nil { | ||
| o.Logger.Warnf("Failed to delete network interface %q: %v", *nic.ID, err) | ||
| deleteErrs = append(deleteErrs, fmt.Errorf("failed to delete network interface %q: %w", *nic.ID, err)) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return errors.Join(deleteErrs...) | ||
| } | ||
|
|
||
| // isNetworkInterfaceError checks if an error indicates network interfaces are blocking deletion. (i.e 409 Conflict). | ||
| func isNetworkInterfaceError(err error) bool { | ||
| if err == nil { | ||
| return false | ||
| } | ||
| errStr := err.Error() | ||
| return strings.Contains(errStr, "one or more network interfaces have an IP allocation") || | ||
| strings.Contains(errStr, "status 409") || | ||
| strings.Contains(errStr, "409 Conflict") | ||
| } | ||
|
|
||
| func (o *ClusterUninstaller) deletePowerSubnet(item cloudResource) error { | ||
| if _, err := o.networkClient.Get(item.id); err != nil { | ||
| o.deletePendingItems(item.typeName, []cloudResource{item}) | ||
|
|
@@ -61,6 +94,16 @@ func (o *ClusterUninstaller) deletePowerSubnet(item cloudResource) error { | |
| o.Logger.Debugf("Deleting Power Network %q", item.name) | ||
|
|
||
| if err := o.networkClient.Delete(item.id); err != nil { | ||
| // If deletion failed due to attached network interfaces, delete them and retry | ||
| if isNetworkInterfaceError(err) { | ||
| o.Logger.Debugf("Subnet %q has attached network interfaces. Deleting them...", item.name) | ||
| if nicErr := o.deleteNetworkInterfaces(item.id); nicErr != nil { | ||
| o.Logger.Warnf("Failed to delete network interfaces for subnet %q: %v", item.name, nicErr) | ||
| } | ||
| // Return error to trigger retry after NIC deletion | ||
| return fmt.Errorf("subnet deletion blocked by network interfaces: %w", err) | ||
|
Comment on lines
+100
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
set -euo pipefail
version=$(sed -nE 's/^[[:space:]]*k8s\.io\/apimachinery[[:space:]]+(v[^[:space:]]+).*/\1/p' go.mod | head -n1)
if [[ -z "${version}" ]]; then
echo "Could not determine k8s.io/apimachinery version from go.mod" >&2
exit 1
fi
for path in pkg/util/wait/backoff.go pkg/util/wait/wait.go; do
url="https://raw.githubusercontent.com/kubernetes/apimachinery/${version}/${path}"
if curl -fsSL "$url" | rg -n -C2 'func ExponentialBackoffWithContext|err != nil \|\| ok'; then
exit 0
fi
done
echo "Unable to locate ExponentialBackoffWithContext source for ${version}" >&2
exit 1Repository: openshift/installer Length of output: 740 Fix retry semantics: returning a non-nil error terminates the backoff instead of retrying
🤖 Prompt for AI Agents |
||
| } | ||
|
|
||
| o.Logger.Infof("Error: o.networkClient.Delete: %q", err) | ||
| return err | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ import ( | |
| "fmt" | ||
| "math" | ||
| gohttp "net/http" | ||
| "strconv" | ||
| "strings" | ||
| "sync" | ||
| "time" | ||
|
|
@@ -385,6 +386,40 @@ func (o *ClusterUninstaller) newAuthenticator(apikey string) (core.Authenticator | |
| return authenticator, nil | ||
| } | ||
|
|
||
| // waitForRetryAfter sleeps for the duration specified in the "Retry-After" header of the response (in seconds). | ||
| // returns true if it actually waited. | ||
| func (o *ClusterUninstaller) waitForRetryAfter(ctx context.Context, response *core.DetailedResponse) bool { | ||
| if response == nil { | ||
| return false | ||
| } | ||
| headers := response.GetHeaders() | ||
|
|
||
| if headers == nil { | ||
| return false | ||
| } | ||
|
|
||
| retryAfterHeader := headers.Get("Retry-After") | ||
|
|
||
| if retryAfterHeader == "" { | ||
| o.Logger.Debugf("waitForRetryAfter: Retry-After header is not present, or could not be retrieved") | ||
| return false | ||
| } | ||
| secs, err := strconv.Atoi(retryAfterHeader) | ||
| if err != nil || secs <= 0 { | ||
| o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader) | ||
| return false | ||
| } | ||
| d := time.Duration(secs) * time.Second | ||
| o.Logger.Debugf("waitForRetryAfter: Rate limited, honoring Retry-After header, waiting for %v", secs) | ||
|
Comment on lines
+401
to
+413
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: cat -n pkg/destroy/powervs/powervs.go | sed -n '390,420p'Repository: openshift/installer Length of output: 1218 🌐 Web query:
💡 Result: The RFC 7231 Retry-After header format specification defines it as either an HTTP-date or delta-seconds (number of seconds to delay). ABNF: Retry-After = HTTP-date / delay-seconds delay-seconds = 1*DIGIT HTTP-date is defined in RFC 7231 Section 7.1.1.1 as IMF-fixdate / obs-date (preferred fixed-length format or obsolete formats for compatibility). delta-seconds is a non-negative decimal integer (1 or more digits) representing seconds after the response is received. It matches the general delta-seconds rule used across HTTP/1.1 specs (defined in RFC 7230 Section 3.2.6 or equivalent, but locally in Retry-After context). Examples from RFC: Retry-After: Fri, 31 Dec 1999 23:59:59 GMT Retry-After: 120 (2 minutes) Note: RFC 7231 is obsoleted by RFC 9110 (HTTP Semantics, 2022), but the format remains unchanged. Citations:
🏁 Script executed: # Check if net/http provides ParseTime function
python3 << 'EOF'
import subprocess
result = subprocess.run(['go', 'doc', 'net/http', 'ParseTime'],
capture_output=True, text=True, cwd='.')
print(result.stdout)
print(result.stderr)
EOFRepository: openshift/installer Length of output: 319 🏁 Script executed: head -30 pkg/destroy/powervs/powervs.goRepository: openshift/installer Length of output: 964 🏁 Script executed: rg "gohttp|import" pkg/destroy/powervs/powervs.go | head -40Repository: openshift/installer Length of output: 549 Support the full The Suggested fix retryAfterHeader := headers.Get("Retry-After")
if retryAfterHeader == "" {
o.Logger.Debugf("waitForRetryAfter: Retry-After header is not present, or could not be retrieved")
return false
}
- secs, err := strconv.Atoi(retryAfterHeader)
- if err != nil || secs <= 0 {
- o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader)
- return false
- }
- d := time.Duration(secs) * time.Second
- o.Logger.Debugf("waitForRetryAfter: Rate limited, honoring Retry-After header, waiting for %v", secs)
+ retryAfterHeader = strings.TrimSpace(retryAfterHeader)
+ var d time.Duration
+ if secs, err := strconv.Atoi(retryAfterHeader); err == nil {
+ if secs < 0 {
+ o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader)
+ return false
+ }
+ d = time.Duration(secs) * time.Second
+ } else if t, err := gohttp.ParseTime(retryAfterHeader); err == nil {
+ d = time.Until(t)
+ if d < 0 {
+ d = 0
+ }
+ } else {
+ o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader)
+ return false
+ }
+ o.Logger.Debugf("waitForRetryAfter: Rate limited, honoring Retry-After header, waiting for %v", d)🤖 Prompt for AI Agents
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree need to incorporate something along these lineds There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
retryAfterHeader = strings.TrimSpace(retryAfterHeader)
var d time.Duration
if secs, err := strconv.Atoi(retryAfterHeader); err == nil {
if secs < 0 {
o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader)
return false
}
d = time.Duration(secs) * time.Second
} else if t, err := gohttp.ParseTime(retryAfterHeader); err == nil {
d = time.Until(t)
if d < 0 {
d = 0
}
} else {
o.Logger.Debugf("waitForRetryAfter: Invalid Retry-After header value: %s", retryAfterHeader)
return false
}
o.Logger.Debugf("waitForRetryAfter: Rate limited, honoring Retry-After header, waiting for %v", d)
select {
case <-ctx.Done():
return false
case <-time.After(d):
return true
}Key points vs. the original:
|
||
| select { | ||
| case <-ctx.Done(): | ||
| // Context is done, return to avoid waiting the full duration | ||
| return false | ||
| case <-time.After(d): | ||
| return true | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| func (o *ClusterUninstaller) loadSDKServices() error { | ||
| var ( | ||
| err error | ||
|
|
@@ -516,9 +551,21 @@ func (o *ClusterUninstaller) loadSDKServices() error { | |
|
|
||
| // Get the Zone ID | ||
| zoneOptions := o.zonesSvc.NewListZonesOptions() | ||
| zoneResources, detailedResponse, err := o.zonesSvc.ListZonesWithContext(ctx, zoneOptions) | ||
| if err != nil { | ||
| return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) | ||
| var ( | ||
| zoneResources *zonesv1.ListZonesResp | ||
| detailedResponse *core.DetailedResponse | ||
| ) | ||
| for { | ||
| zoneResources, detailedResponse, err = o.zonesSvc.ListZonesWithContext(ctx, zoneOptions) | ||
| if err != nil { | ||
| // check if error is a 429 too many requests | ||
| if detailedResponse != nil && detailedResponse.StatusCode == gohttp.StatusTooManyRequests && o.waitForRetryAfter(ctx, detailedResponse) { | ||
| o.Logger.Debugf("loadSDKServices: ListZonesWithContext was rate limited, honoring RetryAfter for ListZones...") | ||
| continue // we have waited the appropriate amount of time, now retry the request | ||
| } | ||
| return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) | ||
| } | ||
| break // no error, reached on successful response, break the loop | ||
| } | ||
|
|
||
| for _, zone := range zoneResources.Result { | ||
|
|
@@ -562,9 +609,20 @@ func (o *ClusterUninstaller) loadSDKServices() error { | |
| return fmt.Errorf("failed to parse DNSInstanceCRN: %w", err) | ||
| } | ||
| options := o.dnsZonesSvc.NewListDnszonesOptions(dnsCRN.ServiceInstance) | ||
| listZonesResponse, detailedResponse, err := o.dnsZonesSvc.ListDnszones(options) | ||
| if err != nil { | ||
| return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) | ||
| var ( | ||
| listZonesResponse *dnszonesv1.ListDnszones | ||
| detailedResponse *core.DetailedResponse | ||
| ) | ||
| for { | ||
| listZonesResponse, detailedResponse, err = o.dnsZonesSvc.ListDnszones(options) | ||
| if err != nil { | ||
| if detailedResponse != nil && detailedResponse.StatusCode == gohttp.StatusTooManyRequests && o.waitForRetryAfter(ctx, detailedResponse) { | ||
| o.Logger.Debugf("loadSDKServices: ListDnszones was rate limited, honoring RetryAfter for ListDnszones...") | ||
| continue | ||
| } | ||
| return fmt.Errorf("loadSDKServices: Failed to list Zones: %w and the response is: %s", err, detailedResponse) | ||
| } | ||
| break | ||
| } | ||
|
|
||
| for _, zone := range listZonesResponse.Dnszones { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: openshift/installer
Length of output: 45
🏁 Script executed:
Repository: openshift/installer
Length of output: 132
🏁 Script executed:
Repository: openshift/installer
Length of output: 1179
🏁 Script executed:
Repository: openshift/installer
Length of output: 2773
🏁 Script executed:
Repository: openshift/installer
Length of output: 5369
🏁 Script executed:
Repository: openshift/installer
Length of output: 3405
🏁 Script executed:
Repository: openshift/installer
Length of output: 108
🏁 Script executed:
Repository: openshift/installer
Length of output: 1064
🌐 Web query:
k8s.io/apimachinery/pkg/util/wait ExponentialBackoffWithContext return value semantics bool error💡 Result:
The function ExponentialBackoffWithContext in k8s.io/apimachinery/pkg/util/wait has the signature: func ExponentialBackoffWithContext(ctx context.Context, backoff Backoff, condition ConditionWithContextFunc) error It returns a single error value, not (bool, error). The condition function (ConditionWithContextFunc) is called repeatedly with exponential backoff until: - the condition returns (true, nil): function returns nil - the condition returns (false, err): function returns that err - context is cancelled: returns ctx.Err - backoff.Steps exhausted: returns ErrWaitTimeout There is no bool return value from the function itself; success is indicated by nil error, timeout by ErrWaitTimeout, condition error by the condition's error, or context error by ctx.Err.
Citations:
🏁 Script executed:
Repository: openshift/installer
Length of output: 113
🌐 Web query:
k8s.io/apimachinery/pkg/util/wait ConditionWithContextFunc (false, error) behavior retry💡 Result:
In the k8s.io/apimachinery/pkg/util/wait package, ConditionWithContextFunc is defined as func(context.Context) (done bool, err error). It returns true if the condition is satisfied, or an error if the loop should be aborted. This type was added in v0.22.0. When used in polling functions like ExponentialBackoffWithContext(ctx context.Context, backoff Backoff, condition ConditionWithContextFunc) error (or similar functions like PollImmediateUntilWithContext), the behavior for returning (false, error) is: the error is immediately propagated and returned by the polling function, terminating the retry loop without further attempts. Returning (false, nil) continues retrying according to the backoff policy until success (true, nil), timeout, context cancellation, or max steps. This mirrors the non-context ConditionFunc behavior: errors abort immediately, false continues polling. Context variants respect ctx.Done for cancellation. Example usage pattern from client-go/util/retry: err := wait.ExponentialBackoff(backoff, func (bool, error) { err := fn if err == nil { return true, nil } if retriable(err) { return false, nil } // retry return false, err // abort }) Returning (false, error) aborts retry immediately with that error.
Citations:
Return value exits retry loop immediately instead of triggering retry
Lines 295-296 return an error expecting the retry loop to continue, but
wait.ExponentialBackoffWithContextaborts on(false, error)instead of retrying. The error propagates immediately, bypassing the intended retry after NIC deletion. Use(false, nil)to signal retry continuation.🤖 Prompt for AI Agents