diff --git a/README.md b/README.md
index e48d21d..b8ced38 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Accepted values for booleans are: "1", "t", "T", "true", "TRUE", "True", "0", "f
 | ANKA_CLOUD_CONTROLLER_URL | ✅ | String | Anka Build Cloud's Controller URL. Inlcuding `http[s]` prefix. Port optional |
 | ANKA_CLOUD_TEMPLATE_ID | ✅* | String | VM Template ID to use. Takes precedence over `ANKA_CLOUD_TEMPLATE_NAME`. **Required if `ANKA_CLOUD_TEMPLATE_NAME` not provided** |
 | ANKA_CLOUD_TEMPLATE_NAME | ✅* | String | VM Template Name to use. Since template names are not guaranteed to be unique, it is recommended to use `ANKA_CLOUD_TEMPLATE_ID`. **Required if `ANKA_CLOUD_TEMPLATE_ID` not provided** |
-| ANKA_CLOUD_DEBUG |     ❌ | Boolean | Output Anka Cloud debug info |
+| ANKA_CLOUD_DEBUG | ❌ | Boolean | Output Anka Cloud debug info |
 | ANKA_CLOUD_TEMPLATE_TAG | ❌ | String | Template tag to use |
 | ANKA_CLOUD_NODE_ID | ❌ | String | Run VM on this specific node |
 | ANKA_CLOUD_PRIORITY | ❌ | Number | Priority in range 1-10000 (lower is more urgent) |
diff --git a/RELEASING.md b/RELEASING.md
index 7e7b481..a3347a3 100644
--- a/RELEASING.md
+++ b/RELEASING.md
@@ -4,4 +4,4 @@
 2. Merge into main branch.
 3. Create a new release on Github.
 4. Build and Watch https://jenkins/job/anka-cloud-gitlab-executor-release/ to see the release build.
-5. Update the release description and title if needed. The workflow will attach artifacts to the release.
+5. Update the release description and title if needed. The workflow will attach artifacts to the release.
\ No newline at end of file
diff --git a/internal/ankacloud/client.go b/internal/ankacloud/client.go
index cf75358..4c2d906 100644
--- a/internal/ankacloud/client.go
+++ b/internal/ankacloud/client.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -18,6 +19,9 @@ import (
 const (
 	defaultMaxIdleConnsPerHost = 20
 	defaultRequestTimeout      = 10 * time.Second
+	DefaultRetryAttempts       = 3
+	DefaultRetryInitialDelay   = 5 * time.Second
+	DefaultRetryMaxDelay       = 30 * time.Second
 )
 
 type APIClient struct {
@@ -26,6 +30,86 @@ type APIClient struct {
 	CustomHttpHeaders map[string]string
 }
 
+// RetryConfig holds configuration for retry behavior with exponential backoff
+type RetryConfig struct {
+	MaxAttempts  int
+	InitialDelay time.Duration
+	MaxDelay     time.Duration
+}
+
+// DefaultRetryConfig returns the default retry configuration
+func DefaultRetryConfig() RetryConfig {
+	return RetryConfig{
+		MaxAttempts:  DefaultRetryAttempts,
+		InitialDelay: DefaultRetryInitialDelay,
+		MaxDelay:     DefaultRetryMaxDelay,
+	}
+}
+
+// IsRetryableError checks if an error is retryable (timeout or transient errors)
+func IsRetryableError(err error) bool {
+	if err == nil {
+		return false
+	}
+	// Check for TransientError (which wraps timeout errors)
+	if errors.Is(err, gitlab.ErrTransient) {
+		return true
+	}
+	// Check for url.Error timeout
+	var urlErr *url.Error
+	if errors.As(err, &urlErr) && urlErr.Timeout() {
+		return true
+	}
+	// Also check for common timeout error messages
+	errStr := err.Error()
+	return strings.Contains(errStr, "deadline exceeded") ||
+		strings.Contains(errStr, "Client.Timeout")
+}
+
+// WithRetry executes the given operation with retry logic using exponential backoff
+func WithRetry[T any](ctx context.Context, config RetryConfig, operation func() (T, error)) (T, error) {
+	var zero T
+	var lastErr error
+	delay := config.InitialDelay
+
+	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
+		result, err := operation()
+		if err == nil {
+			return result, nil
+		}
+
+		lastErr = err
+
+		if !IsRetryableError(err) {
+			return zero, err
+		}
+
+		if attempt < config.MaxAttempts {
+			log.Printf("Request timed out (attempt %d/%d), retrying in %v...\n", attempt, config.MaxAttempts, delay)
+			select {
+			case <-ctx.Done():
+				return zero, ctx.Err()
+			case <-time.After(delay):
+			}
+			// Exponential backoff: double the delay for next attempt, capped at MaxDelay
+			delay *= 2
+			if delay > config.MaxDelay {
+				delay = config.MaxDelay
+			}
+		}
+	}
+
+	return zero, fmt.Errorf("operation failed after %d attempts: %w", config.MaxAttempts, lastErr)
+}
+
+// WithRetryNoResult executes the given operation with retry logic for operations that don't return a value
+func WithRetryNoResult(ctx context.Context, config RetryConfig, operation func() error) error {
+	_, err := WithRetry(ctx, config, func() (struct{}, error) {
+		return struct{}{}, operation()
+	})
+	return err
+}
+
 func (c *APIClient) parse(body []byte) (response, error) {
 	var r response
 	err := json.Unmarshal(body, &r)
diff --git a/internal/ankacloud/client_test.go b/internal/ankacloud/client_test.go
index d987415..45ae5e9 100644
--- a/internal/ankacloud/client_test.go
+++ b/internal/ankacloud/client_test.go
@@ -3,9 +3,14 @@ package ankacloud
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"net/http"
 	"net/http/httptest"
+	"net/url"
 	"testing"
+	"time"
+
+	"github.com/veertuinc/anka-cloud-gitlab-executor/internal/gitlab"
 )
 
 func TestCustomHeaders(t *testing.T) {
@@ -46,3 +51,245 @@ func TestCustomHeaders(t *testing.T) {
 		t.Error(err)
 	}
 }
+
+func TestIsRetryableError(t *testing.T) {
+	tests := []struct {
+		name     string
+		err      error
+		expected bool
+	}{
+		{
+			name:     "nil error",
+			err:      nil,
+			expected: false,
+		},
+		{
+			name:     "regular error",
+			err:      errors.New("some error"),
+			expected: false,
+		},
+		{
+			name:     "transient error",
+			err:      gitlab.TransientError(errors.New("timeout")),
+			expected: true,
+		},
+		{
+			name:     "error with deadline exceeded",
+			err:      errors.New("context deadline exceeded"),
+			expected: true,
+		},
+		{
+			name:     "error with Client.Timeout",
+			err:      errors.New("Client.Timeout exceeded while awaiting headers"),
+			expected: true,
+		},
+		{
+			name:     "wrapped deadline exceeded",
+			err:      errors.New("failed to terminate: context deadline exceeded (Client.Timeout exceeded)"),
+			expected: true,
+		},
+		{
+			name:     "url.Error with timeout",
+			err:      &url.Error{Op: "Get", URL: "http://test", Err: errors.New("timeout")},
+			expected: false, // url.Error.Timeout() returns false for generic errors
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := IsRetryableError(tt.err)
+			if result != tt.expected {
+				t.Errorf("IsRetryableError(%v) = %v, expected %v", tt.err, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestWithRetry_Success(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  3,
+		InitialDelay: 10 * time.Millisecond,
+		MaxDelay:     100 * time.Millisecond,
+	}
+
+	result, err := WithRetry(context.Background(), config, func() (string, error) {
+		callCount++
+		return "success", nil
+	})
+
+	if err != nil {
+		t.Errorf("expected no error, got %v", err)
+	}
+	if result != "success" {
+		t.Errorf("expected 'success', got %q", result)
+	}
+	if callCount != 1 {
+		t.Errorf("expected 1 call, got %d", callCount)
+	}
+}
+
+func TestWithRetry_NonRetryableError(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  3,
+		InitialDelay: 10 * time.Millisecond,
+		MaxDelay:     100 * time.Millisecond,
+	}
+
+	_, err := WithRetry(context.Background(), config, func() (string, error) {
+		callCount++
+		return "", errors.New("permanent error")
+	})
+
+	if err == nil {
+		t.Error("expected error, got nil")
+	}
+	if callCount != 1 {
+		t.Errorf("expected 1 call (no retry for non-retryable error), got %d", callCount)
+	}
+}
+
+func TestWithRetry_RetryableError_EventualSuccess(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  3,
+		InitialDelay: 10 * time.Millisecond,
+		MaxDelay:     100 * time.Millisecond,
+	}
+
+	result, err := WithRetry(context.Background(), config, func() (string, error) {
+		callCount++
+		if callCount < 3 {
+			return "", gitlab.TransientError(errors.New("timeout"))
+		}
+		return "success", nil
+	})
+
+	if err != nil {
+		t.Errorf("expected no error, got %v", err)
+	}
+	if result != "success" {
+		t.Errorf("expected 'success', got %q", result)
+	}
+	if callCount != 3 {
+		t.Errorf("expected 3 calls, got %d", callCount)
+	}
+}
+
+func TestWithRetry_RetryableError_AllAttemptsFail(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  3,
+		InitialDelay: 10 * time.Millisecond,
+		MaxDelay:     100 * time.Millisecond,
+	}
+
+	_, err := WithRetry(context.Background(), config, func() (string, error) {
+		callCount++
+		return "", gitlab.TransientError(errors.New("timeout"))
+	})
+
+	if err == nil {
+		t.Error("expected error, got nil")
+	}
+	if callCount != 3 {
+		t.Errorf("expected 3 calls, got %d", callCount)
+	}
+	if !errors.Is(err, gitlab.ErrTransient) {
+		t.Errorf("expected error to wrap ErrTransient, got %v", err)
+	}
+}
+
+func TestWithRetry_ExponentialBackoff(t *testing.T) {
+	callCount := 0
+	var callTimes []time.Time
+	config := RetryConfig{
+		MaxAttempts:  4,
+		InitialDelay: 50 * time.Millisecond,
+		MaxDelay:     150 * time.Millisecond,
+	}
+
+	start := time.Now()
+	_, _ = WithRetry(context.Background(), config, func() (string, error) {
+		callTimes = append(callTimes, time.Now())
+		callCount++
+		return "", gitlab.TransientError(errors.New("timeout"))
+	})
+
+	if callCount != 4 {
+		t.Errorf("expected 4 calls, got %d", callCount)
+	}
+
+	// Verify exponential backoff timing
+	// Expected delays: 50ms, 100ms, 150ms (capped)
+	// Total minimum time: 50 + 100 + 150 = 300ms
+	elapsed := time.Since(start)
+	minExpected := 250 * time.Millisecond // Allow some tolerance
+	if elapsed < minExpected {
+		t.Errorf("expected at least %v elapsed, got %v", minExpected, elapsed)
+	}
+
+	// Verify delay between calls increases (with tolerance for timing)
+	if len(callTimes) >= 3 {
+		delay1 := callTimes[1].Sub(callTimes[0])
+		delay2 := callTimes[2].Sub(callTimes[1])
+		// Second delay should be roughly double the first (with tolerance)
+		if delay2 < delay1 {
+			t.Errorf("expected exponential backoff: delay2 (%v) should be >= delay1 (%v)", delay2, delay1)
+		}
+	}
+}
+
+func TestWithRetry_ContextCancellation(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  5,
+		InitialDelay: 100 * time.Millisecond,
+		MaxDelay:     1 * time.Second,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+
+	go func() {
+		time.Sleep(50 * time.Millisecond)
+		cancel()
+	}()
+
+	_, err := WithRetry(ctx, config, func() (string, error) {
+		callCount++
+		return "", gitlab.TransientError(errors.New("timeout"))
+	})
+
+	if !errors.Is(err, context.Canceled) {
+		t.Errorf("expected context.Canceled error, got %v", err)
+	}
+	// Should have been cancelled before all retries completed
+	if callCount >= 5 {
+		t.Errorf("expected fewer than 5 calls due to cancellation, got %d", callCount)
+	}
+}
+
+func TestWithRetryNoResult(t *testing.T) {
+	callCount := 0
+	config := RetryConfig{
+		MaxAttempts:  3,
+		InitialDelay: 10 * time.Millisecond,
+		MaxDelay:     100 * time.Millisecond,
+	}
+
+	err := WithRetryNoResult(context.Background(), config, func() error {
+		callCount++
+		if callCount < 2 {
+			return gitlab.TransientError(errors.New("timeout"))
+		}
+		return nil
+	})
+
+	if err != nil {
+		t.Errorf("expected no error, got %v", err)
+	}
+	if callCount != 2 {
+		t.Errorf("expected 2 calls, got %d", callCount)
+	}
+}
diff --git a/internal/ankacloud/controller.go b/internal/ankacloud/controller.go
index 7ca079e..9f63a1f 100644
--- a/internal/ankacloud/controller.go
+++ b/internal/ankacloud/controller.go
@@ -9,7 +9,7 @@ import (
 	"github.com/veertuinc/anka-cloud-gitlab-executor/internal/log"
 )
 
-type controller struct {
+type Controller struct {
 	APIClient *APIClient
 }
 
@@ -63,12 +63,12 @@ type InstanceWrapper struct {
 	Instance   *Instance `json:"vm,omitempty"`
 }
 
-func NewController(apiClient *APIClient) *controller {
-	return &controller{
+func NewController(apiClient *APIClient) *Controller {
+	return &Controller{
 		APIClient: apiClient,
 	}
 }
-func (c *controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, error) {
+func (c *Controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, error) {
 	body, err := c.APIClient.Get(ctx, "/api/v1/node", map[string]string{"id": req.Id})
 	if err != nil {
 		return nil, fmt.Errorf("failed to get node %q: %w", req.Id, err)
@@ -87,7 +87,7 @@ func (c *controller) GetNode(ctx context.Context, req GetNodeRequest) (*Node, er
 	return &response.Nodes[0], nil
 }
 
-func (c *controller) GetInstance(ctx context.Context, req GetInstanceRequest) (*Instance, error) {
+func (c *Controller) GetInstance(ctx context.Context, req GetInstanceRequest) (*Instance, error) {
 	body, err := c.APIClient.Get(ctx, "/api/v1/vm", map[string]string{"id": req.Id})
 	if err != nil {
 		return nil, fmt.Errorf("failed to get instance %s: %w", req.Id, err)
@@ -102,7 +102,7 @@ func (c *controller) GetInstance(ctx context.Context, req GetInstanceRequest) (*
 	return &response.Instance, nil
 }
 
-func (c *controller) CreateInstance(ctx context.Context, payload CreateInstanceRequest) (string, error) {
+func (c *Controller) CreateInstance(ctx context.Context, payload CreateInstanceRequest) (string, error) {
 
 	if payload.Priority < 0 || payload.Priority > 10000 {
 		return "", fmt.Errorf("priority must be between 1 and 10000. Got %d", payload.Priority)
@@ -122,7 +122,7 @@ func (c *controller) CreateInstance(ctx context.Context, payload CreateInstanceR
 	return response.InstanceIds[0], nil
 }
 
-func (c *controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceId string) (*Instance, error) {
+func (c *Controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceId string) (*Instance, error) {
 	const pollingInterval = 10 * time.Second
 	for {
 		select {
@@ -156,7 +156,7 @@ func (c *controller) WaitForInstanceToBeScheduled(ctx context.Context, instanceI
 	}
 }
 
-func (c *controller) TerminateInstance(ctx context.Context, payload TerminateInstanceRequest) error {
+func (c *Controller) TerminateInstance(ctx context.Context, payload TerminateInstanceRequest) error {
 	body, err := c.APIClient.Delete(ctx, "/api/v1/vm", payload)
 	if err != nil {
 		return fmt.Errorf("failed to terminate instance %+v: %w", payload, err)
@@ -171,7 +171,14 @@ func (c *controller) TerminateInstance(ctx context.Context, payload TerminateIns
 	return nil
 }
 
-func (c *controller) GetAllInstances(ctx context.Context) ([]Instance, error) {
+// TerminateInstanceWithRetry terminates an instance with automatic retry on timeout errors
+func (c *Controller) TerminateInstanceWithRetry(ctx context.Context, payload TerminateInstanceRequest) error {
+	return WithRetryNoResult(ctx, DefaultRetryConfig(), func() error {
+		return c.TerminateInstance(ctx, payload)
+	})
+}
+
+func (c *Controller) GetAllInstances(ctx context.Context) ([]Instance, error) {
 
 	body, err := c.APIClient.Get(ctx, "/api/v1/vm", nil)
 	if err != nil {
@@ -199,17 +206,22 @@ func (c *controller) GetAllInstances(ctx context.Context) ([]Instance, error) {
 	return instances, nil
 }
 
-func (c *controller) GetInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) {
-	instances, err := c.GetAllInstances(ctx)
-
-	if len(instances) == 0 {
-		return nil, fmt.Errorf("no instances returned from controller: %w", err)
-	}
+func (c *Controller) GetInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) {
+	return WithRetry(ctx, DefaultRetryConfig(), func() (*Instance, error) {
+		return c.getInstanceByExternalId(ctx, externalId)
+	})
+}
 
+func (c *Controller) getInstanceByExternalId(ctx context.Context, externalId string) (*Instance, error) {
+	instances, err := c.GetAllInstances(ctx)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get instance by external id %s: %w", externalId, err)
 	}
 
+	if len(instances) == 0 {
+		return nil, fmt.Errorf("no instances returned from controller")
+	}
+
 	var matchingInstances []*Instance
 	for _, instance := range instances {
 		if instance.ExternalId == externalId {
@@ -235,7 +247,7 @@ func (c *controller) GetInstanceByExternalId(ctx context.Context, externalId str
 		externalId, matchingInstances[0].State)
 }
 
-func (c *controller) GetTemplateIdByName(ctx context.Context, templateName string) (string, error) {
+func (c *Controller) GetTemplateIdByName(ctx context.Context, templateName string) (string, error) {
 	body, err := c.APIClient.Get(ctx, "/api/v1/registry/vm", map[string]string{"apiVer": "v1"})
 	if err != nil {
 		return "", fmt.Errorf("failed to get templates: %w", err)
diff --git a/internal/ankacloud/controller_test.go b/internal/ankacloud/controller_test.go
index 522636c..9ca3100 100644
--- a/internal/ankacloud/controller_test.go
+++ b/internal/ankacloud/controller_test.go
@@ -5,6 +5,8 @@ import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"strings"
+	"sync/atomic"
 	"testing"
 )
 
@@ -201,3 +203,100 @@ func TestGetInstanceByExternalId_PrioritizesSchedulingOverError(t *testing.T) {
 		t.Errorf("Expected state Scheduling, got %s", instance.State)
 	}
 }
+
+func TestGetInstanceByExternalId_NoRetryOnNonRetryableError(t *testing.T) {
+	// Test that GetInstanceByExternalId does NOT retry on non-retryable errors
+	var callCount int32
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		atomic.AddInt32(&callCount, 1)
+
+		// Return a non-retryable error (bad status)
+		response := response{
+			Status:  "ERROR",
+			Message: "instance not found",
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	apiClient := &APIClient{
+		ControllerURL: server.URL,
+		HttpClient:    server.Client(),
+	}
+	controller := NewController(apiClient)
+
+	_, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/error-test")
+	if err == nil {
+		t.Fatal("Expected error, got nil")
+	}
+
+	// Should only make 1 call - no retries for non-retryable errors
+	if atomic.LoadInt32(&callCount) != 1 {
+		t.Errorf("Expected 1 call (no retries for non-retryable error), got %d", callCount)
+	}
+}
+
+func TestGetInstanceByExternalId_OnlyUsableStates(t *testing.T) {
+	// Test that only Error/Terminated instances fail explicitly
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		response := getAllInstancesResponse{
+			response: response{Status: "OK"},
+			Instances: []InstanceWrapper{
+				{
+					Id:         "terminated-instance",
+					ExternalId: "https://gitlab.com/job/terminated",
+					Instance: &Instance{
+						Id:         "terminated-instance",
+						ExternalId: "https://gitlab.com/job/terminated",
+						State:      StateTerminated,
+					},
+				},
+			},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	apiClient := &APIClient{
+		ControllerURL: server.URL,
+		HttpClient:    server.Client(),
+	}
+	controller := NewController(apiClient)
+
+	_, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/terminated")
+	if err == nil {
+		t.Fatal("Expected error for terminated instance, got nil")
+	}
+
+	if !strings.Contains(err.Error(), "not in a usable state") {
+		t.Errorf("Expected error about usable state, got: %v", err)
+	}
+}
+
+func TestGetInstanceByExternalId_EmptyInstanceList(t *testing.T) {
+	// Test behavior when no instances exist at all
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		response := getAllInstancesResponse{
+			response:  response{Status: "OK"},
+			Instances: []InstanceWrapper{},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	apiClient := &APIClient{
+		ControllerURL: server.URL,
+		HttpClient:    server.Client(),
+	}
+	controller := NewController(apiClient)
+
+	_, err := controller.GetInstanceByExternalId(context.Background(), "https://gitlab.com/job/any")
+	if err == nil {
+		t.Fatal("Expected error for empty instance list, got nil")
+	}
+
+	if !strings.Contains(err.Error(), "no instances returned") {
+		t.Errorf("Expected 'no instances returned' error, got: %v", err)
+	}
+}
diff --git a/internal/command/cleanup.go b/internal/command/cleanup.go
index 398e355..688e9e8 100644
--- a/internal/command/cleanup.go
+++ b/internal/command/cleanup.go
@@ -49,7 +49,7 @@ func executeCleanup(ctx context.Context, env gitlab.Environment) error {
 	log.Printf("instance id: %s\n", instance.Id)
 
 	log.Printf("Issuing termination request for instance %s\n", instance.Id)
-	err = controller.TerminateInstance(ctx, ankacloud.TerminateInstanceRequest{
+	err = controller.TerminateInstanceWithRetry(ctx, ankacloud.TerminateInstanceRequest{
 		Id: instance.Id,
 	})
 	if err != nil {