diff --git a/README.md b/README.md index e48d21d..b8ced38 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Accepted values for booleans are: "1", "t", "T", "true", "TRUE", "True", "0", "f | ANKA_CLOUD_CONTROLLER_URL | ✅ | String | Anka Build Cloud's Controller URL. Inlcuding `http[s]` prefix. Port optional | | ANKA_CLOUD_TEMPLATE_ID | ✅* | String | VM Template ID to use. Takes precedence over `ANKA_CLOUD_TEMPLATE_NAME`. **Required if `ANKA_CLOUD_TEMPLATE_NAME` not provided** | | ANKA_CLOUD_TEMPLATE_NAME | ✅* | String | VM Template Name to use. Since template names are not guaranteed to be unique, it is recommended to use `ANKA_CLOUD_TEMPLATE_ID`. **Required if `ANKA_CLOUD_TEMPLATE_ID` not provided** | -| ANKA_CLOUD_DEBUG | ❌ | Boolean | Output Anka Cloud debug info | +| ANKA_CLOUD_DEBUG | ❌ | Boolean | Output Anka Cloud debug info | | ANKA_CLOUD_TEMPLATE_TAG | ❌ | String | Template tag to use | | ANKA_CLOUD_NODE_ID | ❌ | String | Run VM on this specific node | | ANKA_CLOUD_PRIORITY | ❌ | Number | Priority in range 1-10000 (lower is more urgent) | diff --git a/RELEASING.md b/RELEASING.md index 7e7b481..a3347a3 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -4,4 +4,4 @@ 2. Merge into main branch. 3. Create a new release on Github. 4. Build and Watch https://jenkins/job/anka-cloud-gitlab-executor-release/ to see the release build. -5. Update the release description and title if needed. The workflow will attach artifacts to the release. +5. Update the release description and title if needed. The workflow will attach artifacts to the release. \ No newline at end of file diff --git a/internal/ankacloud/client.go b/internal/ankacloud/client.go index cf75358..4c2d906 100644 --- a/internal/ankacloud/client.go +++ b/internal/ankacloud/client.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -18,6 +19,9 @@ import ( const ( defaultMaxIdleConnsPerHost = 20 defaultRequestTimeout = 10 * time.Second + DefaultRetryAttempts = 3 + DefaultRetryInitialDelay = 5 * time.Second + DefaultRetryMaxDelay = 30 * time.Second ) type APIClient struct { @@ -26,6 +30,86 @@ type APIClient struct { CustomHttpHeaders map[string]string } +// RetryConfig holds configuration for retry behavior with exponential backoff +type RetryConfig struct { + MaxAttempts int + InitialDelay time.Duration + MaxDelay time.Duration +} + +// DefaultRetryConfig returns the default retry configuration +func DefaultRetryConfig() RetryConfig { + return RetryConfig{ + MaxAttempts: DefaultRetryAttempts, + InitialDelay: DefaultRetryInitialDelay, + MaxDelay: DefaultRetryMaxDelay, + } +} + +// IsRetryableError checks if an error is retryable (timeout or transient errors) +func IsRetryableError(err error) bool { + if err == nil { + return false + } + // Check for TransientError (which wraps timeout errors) + if errors.Is(err, gitlab.ErrTransient) { + return true + } + // Check for url.Error timeout + var urlErr *url.Error + if errors.As(err, &urlErr) && urlErr.Timeout() { + return true + } + // Also check for common timeout error messages + errStr := err.Error() + return strings.Contains(errStr, "deadline exceeded") || + strings.Contains(errStr, "Client.Timeout") +} + +// WithRetry executes the given operation with retry logic using exponential backoff +func WithRetry[T any](ctx context.Context, config RetryConfig, operation func() (T, error)) (T, error) { + var zero T + var lastErr error + delay := config.InitialDelay + + for attempt := 1; attempt <= config.MaxAttempts; attempt++ { + result, err := operation() + if err == nil { + return result, nil + } + + lastErr = err + + if !IsRetryableError(err) { + return zero, err + } + + if attempt < config.MaxAttempts { + log.Printf("Request timed out (attempt %d/%d), retrying in %v...\n", attempt, config.MaxAttempts, delay) + select { + case <-ctx.Done(): + return zero, ctx.Err() + case <-time.After(delay): + } + // Exponential backoff: double the delay for next attempt, capped at MaxDelay + delay *= 2 + if delay > config.MaxDelay { + delay = config.MaxDelay + } + } + } + + return zero, fmt.Errorf("operation failed after %d attempts: %w", config.MaxAttempts, lastErr) +} + +// WithRetryNoResult executes the given operation with retry logic for operations that don't return a value +func WithRetryNoResult(ctx context.Context, config RetryConfig, operation func() error) error { + _, err := WithRetry(ctx, config, func() (struct{}, error) { + return struct{}{}, operation() + }) + return err +} + func (c *APIClient) parse(body []byte) (response, error) { var r response err := json.Unmarshal(body, &r) diff --git a/internal/ankacloud/client_test.go b/internal/ankacloud/client_test.go index d987415..45ae5e9 100644 --- a/internal/ankacloud/client_test.go +++ b/internal/ankacloud/client_test.go @@ -3,9 +3,14 @@ package ankacloud import ( "context" "encoding/json" + "errors" "net/http" "net/http/httptest" + "net/url" "testing" + "time" + + "github.com/veertuinc/anka-cloud-gitlab-executor/internal/gitlab" ) func TestCustomHeaders(t *testing.T) { @@ -46,3 +51,245 @@ func TestCustomHeaders(t *testing.T) { t.Error(err) } } + +func TestIsRetryableError(t *testing.T) { + tests := []struct { + name string + err error + expected bool + }{ + { + name: "nil error", + err: nil, + expected: false, + }, + { + name: "regular error", + err: errors.New("some error"), + expected: false, + }, + { + name: "transient error", + err: gitlab.TransientError(errors.New("timeout")), + expected: true, + }, + { + name: "error with deadline exceeded", + err: errors.New("context deadline exceeded"), + expected: true, + }, + { + name: "error with Client.Timeout", + err: errors.New("Client.Timeout exceeded while awaiting headers"), + expected: true, + }, + { + name: "wrapped deadline exceeded", + err: errors.New("failed to terminate: context deadline exceeded (Client.Timeout exceeded)"), + expected: true, + }, + { + name: "url.Error with timeout", + err: &url.Error{Op: "Get", URL: "http://test", Err: errors.New("timeout")}, + expected: false, // url.Error.Timeout() returns false for generic errors + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := IsRetryableError(tt.err) + if result != tt.expected { + t.Errorf("IsRetryableError(%v) = %v, expected %v", tt.err, result, tt.expected) + } + }) + } +} + +func TestWithRetry_Success(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 3, + InitialDelay: 10 * time.Millisecond, + MaxDelay: 100 * time.Millisecond, + } + + result, err := WithRetry(context.Background(), config, func() (string, error) { + callCount++ + return "success", nil + }) + + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if result != "success" { + t.Errorf("expected 'success', got %q", result) + } + if callCount != 1 { + t.Errorf("expected 1 call, got %d", callCount) + } +} + +func TestWithRetry_NonRetryableError(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 3, + InitialDelay: 10 * time.Millisecond, + MaxDelay: 100 * time.Millisecond, + } + + _, err := WithRetry(context.Background(), config, func() (string, error) { + callCount++ + return "", errors.New("permanent error") + }) + + if err == nil { + t.Error("expected error, got nil") + } + if callCount != 1 { + t.Errorf("expected 1 call (no retry for non-retryable error), got %d", callCount) + } +} + +func TestWithRetry_RetryableError_EventualSuccess(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 3, + InitialDelay: 10 * time.Millisecond, + MaxDelay: 100 * time.Millisecond, + } + + result, err := WithRetry(context.Background(), config, func() (string, error) { + callCount++ + if callCount < 3 { + return "", gitlab.TransientError(errors.New("timeout")) + } + return "success", nil + }) + + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if result != "success" { + t.Errorf("expected 'success', got %q", result) + } + if callCount != 3 { + t.Errorf("expected 3 calls, got %d", callCount) + } +} + +func TestWithRetry_RetryableError_AllAttemptsFail(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 3, + InitialDelay: 10 * time.Millisecond, + MaxDelay: 100 * time.Millisecond, + } + + _, err := WithRetry(context.Background(), config, func() (string, error) { + callCount++ + return "", gitlab.TransientError(errors.New("timeout")) + }) + + if err == nil { + t.Error("expected error, got nil") + } + if callCount != 3 { + t.Errorf("expected 3 calls, got %d", callCount) + } + if !errors.Is(err, gitlab.ErrTransient) { + t.Errorf("expected error to wrap ErrTransient, got %v", err) + } +} + +func TestWithRetry_ExponentialBackoff(t *testing.T) { + callCount := 0 + var callTimes []time.Time + config := RetryConfig{ + MaxAttempts: 4, + InitialDelay: 50 * time.Millisecond, + MaxDelay: 150 * time.Millisecond, + } + + start := time.Now() + _, _ = WithRetry(context.Background(), config, func() (string, error) { + callTimes = append(callTimes, time.Now()) + callCount++ + return "", gitlab.TransientError(errors.New("timeout")) + }) + + if callCount != 4 { + t.Errorf("expected 4 calls, got %d", callCount) + } + + // Verify exponential backoff timing + // Expected delays: 50ms, 100ms, 150ms (capped) + // Total minimum time: 50 + 100 + 150 = 300ms + elapsed := time.Since(start) + minExpected := 250 * time.Millisecond // Allow some tolerance + if elapsed < minExpected { + t.Errorf("expected at least %v elapsed, got %v", minExpected, elapsed) + } + + // Verify delay between calls increases (with tolerance for timing) + if len(callTimes) >= 3 { + delay1 := callTimes[1].Sub(callTimes[0]) + delay2 := callTimes[2].Sub(callTimes[1]) + // Second delay should be roughly double the first (with tolerance) + if delay2 < delay1 { + t.Errorf("expected exponential backoff: delay2 (%v) should be >= delay1 (%v)", delay2, delay1) + } + } +} + +func TestWithRetry_ContextCancellation(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 5, + InitialDelay: 100 * time.Millisecond, + MaxDelay: 1 * time.Second, + } + + ctx, cancel := context.WithCancel(context.Background()) + + go func() { + time.Sleep(50 * time.Millisecond) + cancel() + }() + + _, err := WithRetry(ctx, config, func() (string, error) { + callCount++ + return "", gitlab.TransientError(errors.New("timeout")) + }) + + if !errors.Is(err, context.Canceled) { + t.Errorf("expected context.Canceled error, got %v", err) + } + // Should have been cancelled before all retries completed + if callCount >= 5 { + t.Errorf("expected fewer than 5 calls due to cancellation, got %d", callCount) + } +} + +func TestWithRetryNoResult(t *testing.T) { + callCount := 0 + config := RetryConfig{ + MaxAttempts: 3, + InitialDelay: 10 * time.Millisecond, + MaxDelay: 100 * time.Millisecond, + } + + err := WithRetryNoResult(context.Background(), config, func() error { + callCount++ + if callCount < 2 { + return gitlab.TransientError(errors.New("timeout")) + } + return nil + }) + + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if callCount != 2 { + t.Errorf("expected 2 calls, got %d", callCount) + } +} diff --git a/internal/ankacloud/controller.go b/internal/ankacloud/controller.go index 7ca079e..9f63a1f 100644 --- a/internal/ankacloud/controller.go +++ b/internal/ankacloud/controller.go @@ -9,7 +9,7 @@ import ( "github.com/veertuinc/anka-cloud-gitlab-executor/internal/log" ) -type controller struct { +type Controller struct { APIClient *APIClient } @@ -63,12 +63,12 @@ type InstanceWrapper struct { Instance *Instance `json:"vm,omitempty"` } -func NewController(apiClient *APIClient) *controller { - return &controller{ +func NewController(apiClient *APIClient) *Controller { + return &Controller{ APIClient: apiClient, } } -func (c *controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, error) { +func (c *Controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, error) { body, err := c.APIClient.Get(ctx, "/api/v1/node", map[string]string{"id": req.Id}) if err != nil { return nil, fmt.Errorf("failed to get node %q: %w", req.Id, err) @@ -87,7 +87,7 @@ func (c *controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, er return &response.Nodes[0], nil } -func (c *controller) GetInstance(ctx context.Context, req GetInstanceRequest) (*Instance, error) { +func (c *Controller) GetInstance(ctx context.Context, req GetInstanceRequest) (*Instance, error) { body, err := c.APIClient.Get(ctx, "/api/v1/vm", map[string]string{"id": req.Id}) if err != nil { return nil, fmt.Errorf("failed to get instance %s: %w", req.Id, err) @@ -102,7 +102,7 @@ func (c *controller) GetInstance(ctx context.Context, req GetInstanceRequest) (* return &response.Instance, nil } -func (c *controller) CreateInstance(ctx context.Context, payload CreateInstanceRequest) (string, error) { +func (c *Controller) CreateInstance(ctx context.Context, payload CreateInstanceRequest) (string, error) { if payload.Priority < 0 || payload.Priority > 10000 { return "", fmt.Errorf("priority must be between 1 and 10000. Got %d", payload.Priority) @@ -122,7 +122,7 @@ func (c *controller) CreateInstance(ctx context.Context, payload CreateInstanceR return response.InstanceIds[0], nil } -func (c *controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceId string) (*Instance, error) { +func (c *Controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceId string) (*Instance, error) { const pollingInterval = 10 * time.Second for { select { @@ -156,7 +156,7 @@ func (c *controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceI } } -func (c *controller) TerminateInstance(ctx context.Context, payload TerminateInstanceRequest) error { +func (c *Controller) TerminateInstance(ctx context.Context, payload TerminateInstanceRequest) error { body, err := c.APIClient.Delete(ctx, "/api/v1/vm", payload) if err != nil { return fmt.Errorf("failed to terminate instance %+v: %w", payload, err) @@ -171,7 +171,14 @@ func (c *controller) TerminateInstance(ctx context.Context, payload TerminateIns return nil } -func (c *controller) GetAllInstances(ctx context.Context) ([]Instance, error) { +// TerminateInstanceWithRetry terminates an instance with automatic retry on timeout errors +func (c *Controller) TerminateInstanceWithRetry(ctx context.Context, payload TerminateInstanceRequest) error { + return WithRetryNoResult(ctx, DefaultRetryConfig(), func() error { + return c.TerminateInstance(ctx, payload) + }) +} + +func (c *Controller) GetAllInstances(ctx context.Context) ([]Instance, error) { body, err := c.APIClient.Get(ctx, "/api/v1/vm", nil) if err != nil { @@ -199,17 +206,22 @@ func (c *controller) GetAllInstances(ctx context.Context) ([]Instance, error) { return instances, nil } -func (c *controller) GetInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) { - instances, err := c.GetAllInstances(ctx) - - if len(instances) == 0 { - return nil, fmt.Errorf("no instances returned from controller: %w", err) - } +func (c *Controller) GetInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) { + return WithRetry(ctx, DefaultRetryConfig(), func() (*Instance, error) { + return c.getInstanceByExternalId(ctx, externalId) + }) +} +func (c *Controller) getInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) { + instances, err := c.GetAllInstances(ctx) if err != nil { return nil, fmt.Errorf("failed to get instance by external id %s: %w", externalId, err) } + if len(instances) == 0 { + return nil, fmt.Errorf("no instances returned from controller") + } + var matchingInstances []*Instance for _, instance := range instances { if instance.ExternalId == externalId { @@ -235,7 +247,7 @@ func (c *controller) GetInstanceByExternalId(ctx context.Context, externalId str externalId, matchingInstances[0].State) } -func (c *controller) GetTemplateIdByName(ctx context.Context, templateName string) (string, error) { +func (c *Controller) GetTemplateIdByName(ctx context.Context, templateName string) (string, error) { body, err := c.APIClient.Get(ctx, "/api/v1/registry/vm", map[string]string{"apiVer": "v1"}) if err != nil { return "", fmt.Errorf("failed to get templates: %w", err) diff --git a/internal/ankacloud/controller_test.go b/internal/ankacloud/controller_test.go index 522636c..9ca3100 100644 --- a/internal/ankacloud/controller_test.go +++ b/internal/ankacloud/controller_test.go @@ -5,6 +5,8 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "strings" + "sync/atomic" "testing" ) @@ -201,3 +203,100 @@ func TestGetInstanceByExternalId_PrioritizesSchedulingOverError(t *testing.T) { t.Errorf("Expected state Scheduling, got %s", instance.State) } } + +func TestGetInstanceByExternalId_NoRetryOnNonRetryableError(t *testing.T) { + // Test that GetInstanceByExternalId does NOT retry on non-retryable errors + var callCount int32 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&callCount, 1) + + // Return a non-retryable error (bad status) + response := response{ + Status: "ERROR", + Message: "instance not found", + } + json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + apiClient := &APIClient{ + ControllerURL: server.URL, + HttpClient: server.Client(), + } + controller := NewController(apiClient) + + _, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/error-test") + if err == nil { + t.Fatal("Expected error, got nil") + } + + // Should only make 1 call - no retries for non-retryable errors + if atomic.LoadInt32(&callCount) != 1 { + t.Errorf("Expected 1 call (no retries for non-retryable error), got %d", callCount) + } +} + +func TestGetInstanceByExternalId_OnlyUsableStates(t *testing.T) { + // Test that only Error/Terminated instances fail explicitly + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + response := getAllInstancesResponse{ + response: response{Status: "OK"}, + Instances: []InstanceWrapper{ + { + Id: "terminated-instance", + ExternalId: "https://gitlab.com/job/terminated", + Instance: &Instance{ + Id: "terminated-instance", + ExternalId: "https://gitlab.com/job/terminated", + State: StateTerminated, + }, + }, + }, + } + json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + apiClient := &APIClient{ + ControllerURL: server.URL, + HttpClient: server.Client(), + } + controller := NewController(apiClient) + + _, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/terminated") + if err == nil { + t.Fatal("Expected error for terminated instance, got nil") + } + + if !strings.Contains(err.Error(), "not in a usable state") { + t.Errorf("Expected error about usable state, got: %v", err) + } +} + +func TestGetInstanceByExternalId_EmptyInstanceList(t *testing.T) { + // Test behavior when no instances exist at all + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + response := getAllInstancesResponse{ + response: response{Status: "OK"}, + Instances: []InstanceWrapper{}, + } + json.NewEncoder(w).Encode(response) + })) + defer server.Close() + + apiClient := &APIClient{ + ControllerURL: server.URL, + HttpClient: server.Client(), + } + controller := NewController(apiClient) + + _, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/any") + if err == nil { + t.Fatal("Expected error for empty instance list, got nil") + } + + if !strings.Contains(err.Error(), "no instances returned") { + t.Errorf("Expected 'no instances returned' error, got: %v", err) + } +} diff --git a/internal/command/cleanup.go b/internal/command/cleanup.go index 398e355..688e9e8 100644 --- a/internal/command/cleanup.go +++ b/internal/command/cleanup.go @@ -49,7 +49,7 @@ func executeCleanup(ctx context.Context, env gitlab.Environment) error { log.Printf("instance id: %s\n", instance.Id) log.Printf("Issuing termination request for instance %s\n", instance.Id) - err = controller.TerminateInstance(ctx, ankacloud.TerminateInstanceRequest{ + err = controller.TerminateInstanceWithRetry(ctx, ankacloud.TerminateInstanceRequest{ Id: instance.Id, }) if err != nil {