From 8d1db590c806e29fe77df8c23e61747967f5d57e Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Mon, 11 May 2026 08:37:52 -0700 Subject: [PATCH 01/10] Initial retry changes --- .../plugins/retries/retry_quota.rb | 12 +- .../lib/aws-sdk-core/plugins/retry_errors.rb | 202 +++++++++++------- 2 files changed, 132 insertions(+), 82 deletions(-) diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb index 63a2db2f5a7..9c3c3832245 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb @@ -8,9 +8,9 @@ module Retries # Used in 'standard' and 'adaptive' retry modes. class RetryQuota INITIAL_RETRY_TOKENS = 500 - RETRY_COST = 5 + RETRY_COST = 14 NO_RETRY_INCREMENT = 1 - TIMEOUT_RETRY_COST = 10 + THROTTLING_RETRY_COST = 10 def initialize(opts = {}) @mutex = Mutex.new @@ -19,13 +19,13 @@ def initialize(opts = {}) end # check if there is sufficient capacity to retry - # and return it. If there is insufficient capacity + # and return it. If there is insufficient capacity # return 0 # @return [Integer] The amount of capacity checked out def checkout_capacity(error_inspector) @mutex.synchronize do - capacity_amount = if error_inspector.networking? - TIMEOUT_RETRY_COST + capacity_amount = if error_inspector.throttling_error? + THROTTLING_RETRY_COST else RETRY_COST end @@ -39,7 +39,7 @@ def checkout_capacity(error_inspector) end # capacity_amount refers to the amount of capacity requested from - # the last retry. It can either be RETRY_COST, TIMEOUT_RETRY_COST, + # the last retry. It can either be RETRY_COST, THROTTLING_RETRY_COST, # or unset. def release(capacity_amount) # Implementation note: The release() method is called for diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index c44a899c5fc..1c0fc47abf8 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -41,32 +41,32 @@ class RetryErrors < Seahorse::Client::Plugin :retry_limit, default: 3, doc_type: Integer, - docstring: <<-DOCS) -The maximum number of times to retry failed requests. Only -~ 500 level server errors and certain ~ 400 level client errors -are retried. Generally, these are throttling errors, data -checksum errors, networking errors, timeout errors, auth errors, -endpoint discovery, and errors from expired credentials. -This option is only used in the `legacy` retry mode. + docstring: <<~DOCS) + The maximum number of times to retry failed requests. Only + ~ 500 level server errors and certain ~ 400 level client errors + are retried. Generally, these are throttling errors, data + checksum errors, networking errors, timeout errors, auth errors, + endpoint discovery, and errors from expired credentials. + This option is only used in the `legacy` retry mode. DOCS option( :retry_max_delay, default: 0, doc_type: Integer, - docstring: <<-DOCS) -The maximum number of seconds to delay between retries (0 for no limit) -used by the default backoff function. This option is only used in the -`legacy` retry mode. + docstring: <<~DOCS) + The maximum number of seconds to delay between retries (0 for no limit) + used by the default backoff function. This option is only used in the + `legacy` retry mode. DOCS option( :retry_base_delay, default: 0.3, doc_type: Float, - docstring: <<-DOCS) -The base delay in seconds used by the default backoff function. This option -is only used in the `legacy` retry mode. + docstring: <<~DOCS) + The base delay in seconds used by the default backoff function. This option + is only used in the `legacy` retry mode. DOCS option( @@ -74,45 +74,43 @@ class RetryErrors < Seahorse::Client::Plugin default: :none, doc_type: Symbol, rbs_type: '(:none | :equal | :full | ^(Integer) -> Integer)', - docstring: <<-DOCS) -A delay randomiser function used by the default backoff function. -Some predefined functions can be referenced by name - :none, :equal, :full, -otherwise a Proc that takes and returns a number. This option is only used -in the `legacy` retry mode. + docstring: <<~DOCS) + A delay randomiser function used by the default backoff function. + Some predefined functions can be referenced by name - :none, :equal, :full, + otherwise a Proc that takes and returns a number. This option is only used + in the `legacy` retry mode. -@see https://www.awsarchitectureblog.com/2015/03/backoff.html + @see https://www.awsarchitectureblog.com/2015/03/backoff.html DOCS option( :retry_backoff, default: DEFAULT_BACKOFF, doc_type: Proc, - docstring: <<-DOCS) -A proc or lambda used for backoff. Defaults to 2**retries * retry_base_delay. -This option is only used in the `legacy` retry mode. + docstring: <<~DOCS) + A proc or lambda used for backoff. Defaults to 2**retries * retry_base_delay. + This option is only used in the `legacy` retry mode. DOCS # END LEGACY OPTIONS option( :retry_mode, - default: 'legacy', + default: 'standard', doc_type: String, rbs_type: '("legacy" | "standard" | "adaptive")', - docstring: <<-DOCS) do |cfg| -Specifies which retry algorithm to use. Values are: + docstring: <<~DOCS) do |cfg| + Specifies which retry algorithm to use. Values are: -* `legacy` - The pre-existing retry behavior. This is default value if - no retry mode is provided. + * `legacy` - The pre-existing retry behavior. -* `standard` - A standardized set of retry rules across the AWS SDKs. - This includes support for retry quotas, which limit the number of - unsuccessful retries a client can make. + * `standard` - A standardized set of retry rules across the AWS SDKs. + This includes support for retry quotas, which limit the number of + unsuccessful retries a client can make. This is default value if + no retry mode is provided. -* `adaptive` - An experimental retry mode that includes all the - functionality of `standard` mode along with automatic client side - throttling. This is a provisional mode that may change behavior - in the future. + * `adaptive` - A retry mode that includes all the functionality of + `standard` mode along with automatic client side throttling. DOCS resolve_retry_mode(cfg) end @@ -121,11 +119,11 @@ class RetryErrors < Seahorse::Client::Plugin :max_attempts, default: 3, doc_type: Integer, - docstring: <<-DOCS) do |cfg| -An integer representing the maximum number attempts that will be made for -a single request, including the initial attempt. For example, -setting this value to 5 will result in a request being retried up to -4 times. Used in `standard` and `adaptive` retry modes. + docstring: <<~DOCS) do |cfg| + An integer representing the maximum number attempts that will be made for + a single request, including the initial attempt. For example, + setting this value to 5 will result in a request being retried up to + 4 times. Used in `standard` and `adaptive` retry modes. DOCS resolve_max_attempts(cfg) end @@ -134,11 +132,11 @@ class RetryErrors < Seahorse::Client::Plugin :adaptive_retry_wait_to_fill, default: true, doc_type: 'Boolean', - docstring: <<-DOCS) do |cfg| -Used only in `adaptive` retry mode. When true, the request will sleep -until there is sufficent client side capacity to retry the request. -When false, the request will raise a `RetryCapacityNotAvailableError` and will -not retry instead of sleeping. + docstring: <<~DOCS) do |cfg| + Used only in `adaptive` retry mode. When true, the request will sleep + until there is sufficent client side capacity to retry the request. + When false, the request will raise a `RetryCapacityNotAvailableError` and will + not retry instead of sleeping. DOCS resolve_adaptive_retry_wait_to_fill(cfg) end @@ -147,10 +145,10 @@ class RetryErrors < Seahorse::Client::Plugin :correct_clock_skew, default: true, doc_type: 'Boolean', - docstring: <<-DOCS) do |cfg| -Used only in `standard` and adaptive retry modes. Specifies whether to apply -a clock skew correction and retry requests with skewed client clocks. - DOCS + docstring: <<~DOCS) do |cfg| + Used only in `standard` and adaptive retry modes. Specifies whether to apply + a clock skew correction and retry requests with skewed client clocks. + DOCS resolve_correct_clock_skew(cfg) end @@ -169,31 +167,35 @@ def self.resolve_retry_mode(cfg) cfg.defaults_mode_config_resolver.resolve(:retry_mode) end - value = ENV['AWS_RETRY_MODE'] || - Aws.shared_config.retry_mode(profile: cfg.profile) || - default_mode_value || - 'legacy' + value = ENV['AWS_RETRY_MODE'] || + Aws.shared_config.retry_mode(profile: cfg.profile) || + default_mode_value || + 'standard' # Raise if provided value is not one of the retry modes if value != 'legacy' && value != 'standard' && value != 'adaptive' raise ArgumentError, - 'Must provide either `legacy`, `standard`, or `adaptive` for '\ - 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' + 'Must provide either `legacy`, `standard`, or `adaptive` for '\ + 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' end value end def self.resolve_max_attempts(cfg) value = (ENV['AWS_MAX_ATTEMPTS']) || - Aws.shared_config.max_attempts(profile: cfg.profile) || - '3' - value = value.to_i - # Raise if provided value is not a positive integer - if value <= 0 - raise ArgumentError, - 'Must provide a positive integer for max_attempts profile '\ - 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + Aws.shared_config.max_attempts(profile: cfg.profile) + if value + value = value.to_i + # Raise if provided value is not a positive integer + if value <= 0 + raise ArgumentError, + 'Must provide a positive integer for max_attempts profile '\ + 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + end + return value end - value + + service_id = cfg.api.metadata['serviceId'] if cfg.respond_to?(:api) + ['DynamoDB', 'DynamoDB Streams'].include?(service_id) ? 4 : 3 end def self.resolve_adaptive_retry_wait_to_fill(cfg) @@ -203,9 +205,9 @@ def self.resolve_adaptive_retry_wait_to_fill(cfg) # Raise if provided value is not true or false if value != 'true' && value != 'false' raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'adaptive_retry_wait_to_fill profile option or for '\ - 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' + 'Must provide either `true` or `false` for '\ + 'adaptive_retry_wait_to_fill profile option or for '\ + 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' end value == 'true' end @@ -217,9 +219,9 @@ def self.resolve_correct_clock_skew(cfg) # Raise if provided value is not true or false if value != 'true' && value != 'false' raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'correct_clock_skew profile option or for '\ - 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' + 'Must provide either `true` or `false` for '\ + 'correct_clock_skew profile option or for '\ + 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' end value == 'true' end @@ -228,6 +230,14 @@ class Handler < Seahorse::Client::Handler # Max backoff (in seconds) MAX_BACKOFF = 20 + # Hard-coded combination of services and operations as having the + # longPoll trait. To be removed when trait is enabled. + LONG_POLLING_OPERATIONS = { + 'SQS' => Set[:receive_message], + 'SFN' => Set[:get_activity_task], + 'SWF' => Set[:poll_for_activity_task, :poll_for_decision_task] + }.freeze + def call(context) context.metadata[:retries] ||= {} config = context.config @@ -260,12 +270,17 @@ def call(context) return response if context.retries >= config.max_attempts - 1 - context.metadata[:retries][:capacity_amount] = - config.retry_quota.checkout_capacity(error_inspector) - return response unless context.metadata[:retries][:capacity_amount] > 0 + capacity_amount = config.retry_quota.checkout_capacity(error_inspector) + context.metadata[:retries][:capacity_amount] = capacity_amount + + return response if capacity_amount <= 0 && !long_polling_operation?(context) - delay = exponential_backoff(context.retries) + service_id = context.config.api.metadata['serviceId'] + delay = backoff(context, error_inspector, service_id) Kernel.sleep(delay) + + return response if capacity_amount <= 0 + retry_request(context, error_inspector) end @@ -311,9 +326,44 @@ def retryable?(context, response, error_inspector) context.http_response.body.respond_to?(:truncate) end - def exponential_backoff(retries) + def long_polling_operation?(context) + return true if context.operation['longPoll'] + + # Hard-coded failback until the trait is enabled + service_id = context.config.api.metadata['serviceId'] + LONG_POLLING_OPERATIONS.include?([service_id, context.operation_name]) + end + + def backoff(context, error_inspector, service_id) + exp_backoff = exponential_backoff(context.retries, error_inspector, service_id) + retry_after = parse_retry_after(context) + return exp_backoff unless retry_after + + backoff_duration = [retry_after, exp_backoff].max + [backoff_duration, exp_backoff + 5].min + end + + def exponential_backoff(retries, error_inspector, service_id) # for a transient error, use backoff - [Kernel.rand * 2**retries, MAX_BACKOFF].min + backoff_scalar = if error_inspector.throttling_error? + 1 + elsif ['DynamoDB', 'DynamoDB Streams'].include?(service_id) + 0.025 + else + 0.05 + end + Kernel.rand * [backoff_scalar * 2**retries, MAX_BACKOFF].min + end + + def parse_retry_after(context) + retry_after = context.http_response.headers['x-amx-retry-after'] + return nil unless retry_after + + Integer(retry_after) / 1000.0 + rescue ArgumentError + context.config.logger&.debug( + "Failed to parse x-amz-retry-after header value: #{retry_after.inspect}" + ) end def retry_request(context, error) From 0d8d988cab78a8d7fd9993c06736a94517d17056 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Mon, 11 May 2026 14:35:51 -0700 Subject: [PATCH 02/10] Update and add retry tests --- .../plugins/retries/retry_quota.rb | 2 +- .../lib/aws-sdk-core/plugins/retry_errors.rb | 4 +- .../aws/plugins/retry_errors_legacy_spec.rb | 4 - .../spec/aws/plugins/retry_errors_spec.rb | 198 +++++++++++++++--- gems/aws-sdk-core/spec/retry_errors_helper.rb | 4 + 5 files changed, 173 insertions(+), 39 deletions(-) diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb index 9c3c3832245..5eb123e1131 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb @@ -10,7 +10,7 @@ class RetryQuota INITIAL_RETRY_TOKENS = 500 RETRY_COST = 14 NO_RETRY_INCREMENT = 1 - THROTTLING_RETRY_COST = 10 + THROTTLING_RETRY_COST = 5 def initialize(opts = {}) @mutex = Mutex.new diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index 1c0fc47abf8..3237c413aae 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -331,7 +331,7 @@ def long_polling_operation?(context) # Hard-coded failback until the trait is enabled service_id = context.config.api.metadata['serviceId'] - LONG_POLLING_OPERATIONS.include?([service_id, context.operation_name]) + LONG_POLLING_OPERATIONS[service_id]&.include?(context.operation_name) end def backoff(context, error_inspector, service_id) @@ -356,7 +356,7 @@ def exponential_backoff(retries, error_inspector, service_id) end def parse_retry_after(context) - retry_after = context.http_response.headers['x-amx-retry-after'] + retry_after = context.http_response.headers['x-amz-retry-after'] return nil unless retry_after Integer(retry_after) / 1000.0 diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb index 067231e8004..d92afdb7005 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb @@ -24,10 +24,6 @@ module Plugins expect(client.config.retry_jitter).to eq(:none) end - it 'defaults config.retry_mode to legacy' do - expect(client.config.retry_mode).to eq('legacy') - end - it 'uses the legacy handler when retry_mode is legacy' do client = RetryErrorsSvc::Client.new(retry_mode: 'legacy', region: 'us-west-2') expect(client.handlers.entries.map(&:handler_class)).to include(RetryErrors::LegacyHandler) diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb index e911e43236d..e7d20b7e1fb 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb @@ -8,6 +8,10 @@ module Plugins describe RetryErrors do let(:client) { RetryErrorsSvc::Client.new(stub_responses: true) } + it 'defaults config.retry_mode to standard' do + expect(client.config.retry_mode).to eq('standard') + end + it 'can configure retry_mode with shared config' do allow_any_instance_of(Aws::SharedConfig) .to receive(:retry_mode).and_return('standard') @@ -120,6 +124,7 @@ module Plugins cfg.add_option(:credentials, credentials) cfg.add_option(:endpoint_cache, cache) cfg.add_option(:api, api) + cfg.add_option(:logger, nil) cfg.add_option(:profile, nil) RetryErrors.new.add_options(cfg) cfg.build! @@ -133,6 +138,8 @@ module Plugins let(:service_error) { RetryErrorsSvc::Errors::ServiceError.new(nil, nil) } + let(:throttling_error) { RetryErrorsSvc::Errors::Throttling.new(nil, nil) } + before(:each) do resp.context.config = config operation.endpoint_discovery = {} @@ -149,15 +156,15 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 495, retries: 2 } + expect: { available_capacity: 486, retries: 2 } } # success ] @@ -168,15 +175,15 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2 } + expect: { available_capacity: 472, retries: 2 } } # failure ] @@ -184,12 +191,12 @@ module Plugins end it 'fails due to retry quota reached after a single retry' do - config.retry_quota.instance_variable_set(:@available_capacity, 5) + config.retry_quota.instance_variable_set(:@available_capacity, 14) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 0, retries: 1, delay: 1 } + expect: { available_capacity: 0, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, @@ -219,23 +226,23 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 485, retries: 3, delay: 4 } + expect: { available_capacity: 458, retries: 3, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4, delay: 8 } + expect: { available_capacity: 444, retries: 4, delay: 0.4 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4 } + expect: { available_capacity: 444, retries: 4 } } ] @@ -244,28 +251,28 @@ module Plugins it 'does not exceed the max backoff time' do config.max_attempts = 5 - stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 3) + stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 0.2) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 485, retries: 3, delay: 3 } + expect: { available_capacity: 458, retries: 3, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4, delay: 3 } + expect: { available_capacity: 444, retries: 4, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4 } + expect: { available_capacity: 444, retries: 4 } } ] @@ -274,20 +281,16 @@ module Plugins it 'fails due to retry quota bucket exhaustion' do config.max_attempts = 5 - config.retry_quota.instance_variable_set(:@available_capacity, 10) + config.retry_quota.instance_variable_set(:@available_capacity, 20) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 5, retries: 1, delay: 1 } + expect: { available_capacity: 6, retries: 1, delay: 0.05 } }, { response: { status_code: 502, error: service_error }, - expect: { available_capacity: 0, retries: 2, delay: 2 } - }, - { - response: { status_code: 503, error: service_error }, - expect: { available_capacity: 0, retries: 2 } + expect: { available_capacity: 6, retries: 1 } } ] @@ -296,20 +299,20 @@ module Plugins it 'recovers after successful responses' do config.max_attempts = 5 - config.retry_quota.instance_variable_set(:@available_capacity, 15) + config.retry_quota.instance_variable_set(:@available_capacity, 30) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 10, retries: 1, delay: 1 } + expect: { available_capacity: 16, retries: 1, delay: 0.05 } }, { response: { status_code: 502, error: service_error }, - expect: { available_capacity: 5, retries: 2, delay: 2 } + expect: { available_capacity: 2, retries: 2, delay: 0.1 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 10, retries: 2 } + expect: { available_capacity: 16, retries: 2 } } ] handle_with_retry(test_case_def) @@ -317,17 +320,148 @@ module Plugins test_case_post_success = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 5, retries: 1, delay: 1 } + expect: { available_capacity: 2, retries: 1, delay: 0.05 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 10, retries: 1 } + expect: { available_capacity: 16, retries: 1 } } ] reset_request handle_with_retry(test_case_post_success) end + it 'retries for throttling errors' do + test_case_def = [ + { + response: { status_code: 400, error: throttling_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + context 'DynamoDB base backoff and increased retries' do + let(:api) do + api = Seahorse::Model::Api.new + api.metadata['serviceId'] = 'DynamoDB' + api + end + + it 'retries errors' do + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 486, retries: 1, delay: 0.025 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 472, retries: 2, delay: 0.05 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 458, retries: 3, delay: 0.1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 458, retries: 3 } + } + ] + + handle_with_retry(test_case_def) + end + end + + # TODO: update with generic long-polling service once trait is supported + context 'long-polling' do + let(:api) do + api = Seahorse::Model::Api.new + api.metadata['serviceId'] = 'SQS' + api + end + + it 'backs off even with depleted token bucket' do + resp.context.operation_name = :receive_message + config.retry_quota.instance_variable_set(:@available_capacity, 0) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 0, delay: 0.05 } + } + ] + + handle_with_retry(test_case_def) + end + end + + context 'x-amz-retry-after' do + it 'honors the header' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '1500' }, + expect: { available_capacity: 486, retries: 1, delay: 1.5 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'delays for at least the exponential backoff duration' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '0' }, + expect: { available_capacity: 486, retries: 1, delay: 0.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'delays for at most 5 plus the exponential backoff duration' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '10000' }, + expect: { available_capacity: 486, retries: 1, delay: 5.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'falls back to exponential backoff for invalid headers' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: 'invalid' }, + expect: { available_capacity: 486, retries: 1, delay: 0.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + end + it 'corrects and retries clock skew errors' do clock_skew_error = RetryErrorsSvc::Errors::RequestTimeTooSkewed .new(nil, nil) diff --git a/gems/aws-sdk-core/spec/retry_errors_helper.rb b/gems/aws-sdk-core/spec/retry_errors_helper.rb index f91cabc5b3b..08ec27a3c0a 100644 --- a/gems/aws-sdk-core/spec/retry_errors_helper.rb +++ b/gems/aws-sdk-core/spec/retry_errors_helper.rb @@ -119,6 +119,10 @@ def setup_next_response(test_case) resp.context.http_response.headers['date'] = Time.now.utc + response[:clock_skew] end + if response[:retry_after] + resp.context.http_response.headers['x-amz-retry-after'] = response[:retry_after].to_s + end + if response[:endpoint_discovery] allow(resp.context.operation).to receive(:endpoint_discovery).and_return(true) end From e869d9dea8e738dd6d58d9d158d71043383656c6 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Mon, 11 May 2026 14:59:32 -0700 Subject: [PATCH 03/10] Update more tests --- gems/aws-sdk-core/spec/aws/client_spec.rb | 2 +- .../spec/aws/plugins/retries/retry_quota_spec.rb | 8 ++++---- gems/aws-sdk-s3/spec/client_spec.rb | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gems/aws-sdk-core/spec/aws/client_spec.rb b/gems/aws-sdk-core/spec/aws/client_spec.rb index 116ceab0edb..d9c1398620d 100644 --- a/gems/aws-sdk-core/spec/aws/client_spec.rb +++ b/gems/aws-sdk-core/spec/aws/client_spec.rb @@ -65,7 +65,7 @@ module Aws end expect(e).to be_kind_of(Errors::NoSuchEndpointError) - expect(e.context.retries).to be(3) # updated to retry based on customer request + expect(e.context.retries).to be(2) # updated to retry based on customer request expect(e.message).to include('us-east-1') expect(e.message).to include('us-west-1') expect(e.message).to include('cn-north-1') diff --git a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb index 09956e8af23..611ed02eb89 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb @@ -18,7 +18,7 @@ module Plugins end describe '#checkout_capacity' do - let(:error) { double('ErrorInspector', networking?: false) } + let(:error) { double('ErrorInspector', throttling_error?: false) } it 'returns the requested capacity when available' do initial_capacity = retry_quota.instance_variable_get(:@available_capacity) @@ -30,11 +30,11 @@ module Plugins .to eq(initial_capacity - checked_out_capacity) end - it 'checks out the timeout cost when the error is a networking error' do - error = double('ErrorInspector', networking?: true) + it 'checks out the timeout cost when the error is a throttling error' do + error = double('ErrorInspector', throttling_error?: true) checked_out_capacity = retry_quota.checkout_capacity(error) - expect(checked_out_capacity).to eq(Retries::RetryQuota::TIMEOUT_RETRY_COST) + expect(checked_out_capacity).to eq(Retries::RetryQuota::THROTTLING_RETRY_COST) end it 'returns 0 when there is insufficient capacity' do diff --git a/gems/aws-sdk-s3/spec/client_spec.rb b/gems/aws-sdk-s3/spec/client_spec.rb index 7ca10759590..45edd9b45ac 100644 --- a/gems/aws-sdk-s3/spec/client_spec.rb +++ b/gems/aws-sdk-s3/spec/client_spec.rb @@ -796,7 +796,7 @@ module S3 key: 'key' }.merge(params)) expect(resp.error).to be_kind_of(S3::Errors::InternalError) - expect(resp.context.retries).to eq(3) + expect(resp.context.retries).to eq(2) expect(resp.data).to be(nil) end @@ -815,7 +815,7 @@ module S3 key: 'key' }.merge(params)) expect(resp.error).to be_kind_of(Seahorse::Client::NetworkingError) - expect(resp.context.retries).to eq(3) + expect(resp.context.retries).to eq(2) expect(resp.data).to be(nil) end end From e834f9dc8712137d7aaac804a9d29740880bdae9 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 09:31:43 -0700 Subject: [PATCH 04/10] Add 2026 retries gate --- .../plugins/retries/retry_quota.rb | 13 +- .../lib/aws-sdk-core/plugins/retry_errors.rb | 65 ++++-- gems/aws-sdk-core/spec/aws/client_spec.rb | 4 +- .../aws/plugins/retries/retry_quota_spec.rb | 68 ++++-- .../aws/plugins/retry_errors_legacy_spec.rb | 7 + .../spec/aws/plugins/retry_errors_spec.rb | 204 +++++++++++++++++- gems/aws-sdk-s3/spec/client_spec.rb | 7 +- 7 files changed, 325 insertions(+), 43 deletions(-) diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb index 5eb123e1131..b48cc06e507 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb @@ -9,8 +9,10 @@ module Retries class RetryQuota INITIAL_RETRY_TOKENS = 500 RETRY_COST = 14 + LEGACY_RETRY_COST = 5 # TODO: Remove when new retries become default NO_RETRY_INCREMENT = 1 THROTTLING_RETRY_COST = 5 + TIMEOUT_RETRY_COST = 10 # TODO: Remove when new retries become default def initialize(opts = {}) @mutex = Mutex.new @@ -24,10 +26,11 @@ def initialize(opts = {}) # @return [Integer] The amount of capacity checked out def checkout_capacity(error_inspector) @mutex.synchronize do - capacity_amount = if error_inspector.throttling_error? - THROTTLING_RETRY_COST + # TODO: Remove gate and keep only the new_retries branch + capacity_amount = if RetryErrors.new_retries? + error_inspector.throttling_error? ? THROTTLING_RETRY_COST : RETRY_COST else - RETRY_COST + error_inspector.networking? ? TIMEOUT_RETRY_COST : LEGACY_RETRY_COST end # unable to acquire capacity @@ -39,8 +42,8 @@ def checkout_capacity(error_inspector) end # capacity_amount refers to the amount of capacity requested from - # the last retry. It can either be RETRY_COST, THROTTLING_RETRY_COST, - # or unset. + # the last retry. It can either be RETRY_COST, + # THROTTLING_RETRY_COST/TIMEOUT_RETRY_COST, or unset. def release(capacity_amount) # Implementation note: The release() method is called for # every API call. In the common case where the request is diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index 3237c413aae..8ee80b9bf20 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -10,6 +10,18 @@ module Aws module Plugins # @api private class RetryErrors < Seahorse::Client::Plugin + # TODO: Remove this gate and hardcode new retry behavior once + # AWS_NEW_RETRIES_2026 is enabled by default, which includes: + # - Default retry_mode to 'standard' + # - Default max_attempts to 4 for DynamoDB + # - Remove the old retries branch in Handler#call + # - Remove the old retries branch in #exponential_backoff + # - Remove LEGACY_RETRY_COST and TIMEOUT_RETRY_COST from RetryQuota + # @api private + def self.new_retries? + ENV.fetch('AWS_NEW_RETRIES_2026', 'false').downcase == 'true' + end + # BEGIN LEGACY OPTIONS EQUAL_JITTER = ->(delay) { (delay / 2) + Kernel.rand(0..(delay / 2)) } FULL_JITTER = ->(delay) { Kernel.rand(0..delay) } @@ -170,7 +182,7 @@ def self.resolve_retry_mode(cfg) value = ENV['AWS_RETRY_MODE'] || Aws.shared_config.retry_mode(profile: cfg.profile) || default_mode_value || - 'standard' + (new_retries? ? 'standard' : 'legacy') # TODO: default to 'standard' when new retries become default # Raise if provided value is not one of the retry modes if value != 'legacy' && value != 'standard' && value != 'adaptive' raise ArgumentError, @@ -194,8 +206,13 @@ def self.resolve_max_attempts(cfg) return value end - service_id = cfg.api.metadata['serviceId'] if cfg.respond_to?(:api) - ['DynamoDB', 'DynamoDB Streams'].include?(service_id) ? 4 : 3 + # TODO: Remove gate and keep only the new retries branch + if RetryErrors.new_retries? + service_id = cfg.api.metadata['serviceId'] if cfg.respond_to?(:api) + ['DynamoDB', 'DynamoDB Streams'].include?(service_id) ? 4 : 3 + else + 3 + end end def self.resolve_adaptive_retry_wait_to_fill(cfg) @@ -273,13 +290,21 @@ def call(context) capacity_amount = config.retry_quota.checkout_capacity(error_inspector) context.metadata[:retries][:capacity_amount] = capacity_amount - return response if capacity_amount <= 0 && !long_polling_operation?(context) + # TODO: Remove gate and keep only the new retries branch + if RetryErrors.new_retries? + return response if capacity_amount <= 0 && !long_polling_operation?(context) - service_id = context.config.api.metadata['serviceId'] - delay = backoff(context, error_inspector, service_id) - Kernel.sleep(delay) + service_id = context.config.api.metadata['serviceId'] + delay = backoff(context, error_inspector, service_id) + Kernel.sleep(delay) - return response if capacity_amount <= 0 + return response if capacity_amount <= 0 + else + return response unless capacity_amount > 0 + + delay = exponential_backoff(context.retries) + Kernel.sleep(delay) + end retry_request(context, error_inspector) end @@ -343,16 +368,20 @@ def backoff(context, error_inspector, service_id) [backoff_duration, exp_backoff + 5].min end - def exponential_backoff(retries, error_inspector, service_id) - # for a transient error, use backoff - backoff_scalar = if error_inspector.throttling_error? - 1 - elsif ['DynamoDB', 'DynamoDB Streams'].include?(service_id) - 0.025 - else - 0.05 - end - Kernel.rand * [backoff_scalar * 2**retries, MAX_BACKOFF].min + # TODO: Remove gate, remove default nil params, keep only new retries branch + def exponential_backoff(retries, error_inspector = nil, service_id = nil) + if RetryErrors.new_retries? + backoff_scalar = if error_inspector.throttling_error? + 1 + elsif ['DynamoDB', 'DynamoDB Streams'].include?(service_id) + 0.025 + else + 0.05 + end + Kernel.rand * [backoff_scalar * 2**retries, MAX_BACKOFF].min + else + [Kernel.rand * 2**retries, MAX_BACKOFF].min + end end def parse_retry_after(context) diff --git a/gems/aws-sdk-core/spec/aws/client_spec.rb b/gems/aws-sdk-core/spec/aws/client_spec.rb index d9c1398620d..3c93a9c4b1c 100644 --- a/gems/aws-sdk-core/spec/aws/client_spec.rb +++ b/gems/aws-sdk-core/spec/aws/client_spec.rb @@ -46,7 +46,9 @@ module Aws end end + # TODO: Update retries to 2 and remove stub when new retries become default it 'raises a helpful error on possible incorrect regions' do + allow(Aws::Plugins::RetryErrors).to receive(:new_retries?).and_return(false) # simulate an error from connecting to an unknown endpoint stub_request(:any, /.*/). @@ -65,7 +67,7 @@ module Aws end expect(e).to be_kind_of(Errors::NoSuchEndpointError) - expect(e.context.retries).to be(2) # updated to retry based on customer request + expect(e.context.retries).to be(3) expect(e.message).to include('us-east-1') expect(e.message).to include('us-west-1') expect(e.message).to include('cn-north-1') diff --git a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb index 611ed02eb89..7d07a8b5aed 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb @@ -17,30 +17,66 @@ module Plugins end end - describe '#checkout_capacity' do - let(:error) { double('ErrorInspector', throttling_error?: false) } + context 'new retries' do + before { allow(RetryErrors).to receive(:new_retries?).and_return(true) } - it 'returns the requested capacity when available' do - initial_capacity = retry_quota.instance_variable_get(:@available_capacity) + describe '#checkout_capacity' do + let(:error) { double('ErrorInspector', throttling_error?: false) } - checked_out_capacity = retry_quota.checkout_capacity(error) - expect(checked_out_capacity).to eq(Retries::RetryQuota::RETRY_COST) + it 'returns the requested capacity when available' do + initial_capacity = retry_quota.instance_variable_get(:@available_capacity) - expect(retry_quota.instance_variable_get(:@available_capacity)) - .to eq(initial_capacity - checked_out_capacity) - end + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::RETRY_COST) + + expect(retry_quota.instance_variable_get(:@available_capacity)) + .to eq(initial_capacity - checked_out_capacity) + end + + it 'checks out the throttling cost when the error is a throttling error' do + error = double('ErrorInspector', throttling_error?: true) - it 'checks out the timeout cost when the error is a throttling error' do - error = double('ErrorInspector', throttling_error?: true) + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::THROTTLING_RETRY_COST) + end - checked_out_capacity = retry_quota.checkout_capacity(error) - expect(checked_out_capacity).to eq(Retries::RetryQuota::THROTTLING_RETRY_COST) + it 'returns 0 when there is insufficient capacity' do + retry_quota.instance_variable_set(:@available_capacity, 1) + + expect(retry_quota.checkout_capacity(error)).to eq(0) + end end + end + + # TODO: Remove this context when new retries become default + context 'old retries' do + before { allow(RetryErrors).to receive(:new_retries?).and_return(false) } + + describe '#checkout_capacity' do + let(:error) { double('ErrorInspector', networking?: false) } + + it 'returns the requested capacity when available' do + initial_capacity = retry_quota.instance_variable_get(:@available_capacity) + + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::LEGACY_RETRY_COST) + + expect(retry_quota.instance_variable_get(:@available_capacity)) + .to eq(initial_capacity - checked_out_capacity) + end + + it 'checks out the timeout cost when the error is a networking error' do + error = double('ErrorInspector', networking?: true) + + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::TIMEOUT_RETRY_COST) + end - it 'returns 0 when there is insufficient capacity' do - retry_quota.instance_variable_set(:@available_capacity, 1) + it 'returns 0 when there is insufficient capacity' do + retry_quota.instance_variable_set(:@available_capacity, 1) - expect(retry_quota.checkout_capacity(error)).to eq(0) + expect(retry_quota.checkout_capacity(error)).to eq(0) + end end end diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb index d92afdb7005..7e8a91b7902 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb @@ -24,6 +24,13 @@ module Plugins expect(client.config.retry_jitter).to eq(:none) end + # TODO: Remove when new retries become default + it 'defaults config.retry_mode to legacy' do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + client = RetryErrorsSvc::Client.new(stub_responses: true) + expect(client.config.retry_mode).to eq('legacy') + end + it 'uses the legacy handler when retry_mode is legacy' do client = RetryErrorsSvc::Client.new(retry_mode: 'legacy', region: 'us-west-2') expect(client.handlers.entries.map(&:handler_class)).to include(RetryErrors::LegacyHandler) diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb index e7d20b7e1fb..9cb05c92602 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb @@ -8,10 +8,18 @@ module Plugins describe RetryErrors do let(:client) { RetryErrorsSvc::Client.new(stub_responses: true) } - it 'defaults config.retry_mode to standard' do + it 'defaults config.retry_mode to standard when new retries enabled' do + allow(RetryErrors).to receive(:new_retries?).and_return(true) + client = RetryErrorsSvc::Client.new(stub_responses: true) expect(client.config.retry_mode).to eq('standard') end + it 'defaults config.retry_mode to legacy when new retries disabled' do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + client = RetryErrorsSvc::Client.new(stub_responses: true) + expect(client.config.retry_mode).to eq('legacy') + end + it 'can configure retry_mode with shared config' do allow_any_instance_of(Aws::SharedConfig) .to receive(:retry_mode).and_return('standard') @@ -148,6 +156,7 @@ module Plugins context 'standard mode' do before(:each) do + allow(RetryErrors).to receive(:new_retries?).and_return(true) config.retry_mode = 'standard' allow(Kernel).to receive(:rand).and_return(1) end @@ -354,6 +363,7 @@ module Plugins end it 'retries errors' do + config.max_attempts = 4 test_case_def = [ { response: { status_code: 500, error: service_error }, @@ -518,6 +528,198 @@ module Plugins end + # TODO: Remove this context when new retries become default + context 'standard mode (old retries)' do + before(:each) do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + config.retry_mode = 'standard' + allow(Kernel).to receive(:rand).and_return(1) + end + + it 'retry eventually succeeds' do + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 495, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to max attempts reached' do + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to retry quota reached after a single retry' do + config.retry_quota.instance_variable_set(:@available_capacity, 5) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'does not retry if the retry quota is 0' do + config.retry_quota.instance_variable_set(:@available_capacity, 0) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 0 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'uses exponential backoff timing' do + config.max_attempts = 5 + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 485, retries: 3, delay: 4 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4, delay: 8 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'does not exceed the max backoff time' do + config.max_attempts = 5 + stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 3) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 485, retries: 3, delay: 3 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4, delay: 3 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to retry quota bucket exhaustion' do + config.max_attempts = 5 + config.retry_quota.instance_variable_set(:@available_capacity, 10) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 5, retries: 1, delay: 1 } + }, + { + response: { status_code: 502, error: service_error }, + expect: { available_capacity: 0, retries: 2, delay: 2 } + }, + { + response: { status_code: 503, error: service_error }, + expect: { available_capacity: 0, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'recovers after successful responses' do + config.max_attempts = 5 + config.retry_quota.instance_variable_set(:@available_capacity, 15) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 10, retries: 1, delay: 1 } + }, + { + response: { status_code: 502, error: service_error }, + expect: { available_capacity: 5, retries: 2, delay: 2 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 10, retries: 2 } + } + ] + handle_with_retry(test_case_def) + + test_case_post_success = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 5, retries: 1, delay: 1 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 10, retries: 1 } + } + ] + reset_request + handle_with_retry(test_case_post_success) + end + end + context 'adaptive mode' do before(:each) do config.retry_mode = 'adaptive' diff --git a/gems/aws-sdk-s3/spec/client_spec.rb b/gems/aws-sdk-s3/spec/client_spec.rb index 45edd9b45ac..a4f7095d64b 100644 --- a/gems/aws-sdk-s3/spec/client_spec.rb +++ b/gems/aws-sdk-s3/spec/client_spec.rb @@ -763,7 +763,10 @@ module S3 end end + # TODO: Update retries to 2 and remove stub when new retries become default describe "200 errors response handling" do + before { allow(Aws::Plugins::RetryErrors).to receive(:new_retries?).and_return(false) } + { complete_multipart_upload: { upload_id: 'upload-id' }, copy_object: { copy_source: 'bucket/key' }, @@ -796,7 +799,7 @@ module S3 key: 'key' }.merge(params)) expect(resp.error).to be_kind_of(S3::Errors::InternalError) - expect(resp.context.retries).to eq(2) + expect(resp.context.retries).to eq(3) expect(resp.data).to be(nil) end @@ -815,7 +818,7 @@ module S3 key: 'key' }.merge(params)) expect(resp.error).to be_kind_of(Seahorse::Client::NetworkingError) - expect(resp.context.retries).to eq(2) + expect(resp.context.retries).to eq(3) expect(resp.data).to be(nil) end end From 9e4909507b6347d336c82ae4ad57285b4aae898a Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 09:44:23 -0700 Subject: [PATCH 05/10] Add changelog and bump min core --- build_tools/services.rb | 4 ++-- gems/aws-sdk-core/CHANGELOG.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/build_tools/services.rb b/build_tools/services.rb index 8bf91b2eb42..501e022a77c 100644 --- a/build_tools/services.rb +++ b/build_tools/services.rb @@ -9,10 +9,10 @@ class ServiceEnumerator MANIFEST_PATH = File.expand_path('../../services.json', __FILE__) # Minimum `aws-sdk-core` version for new gem builds - MINIMUM_CORE_VERSION = "3.244.0" + MINIMUM_CORE_VERSION = "3.248.0" # Minimum `aws-sdk-core` version for new S3 gem builds - MINIMUM_CORE_VERSION_S3 = "3.244.0" + MINIMUM_CORE_VERSION_S3 = "3.248.0" EVENTSTREAM_PLUGIN = "Aws::Plugins::EventStreamConfiguration" diff --git a/gems/aws-sdk-core/CHANGELOG.md b/gems/aws-sdk-core/CHANGELOG.md index 161ec602f37..6cb1b591c60 100644 --- a/gems/aws-sdk-core/CHANGELOG.md +++ b/gems/aws-sdk-core/CHANGELOG.md @@ -1,6 +1,8 @@ Unreleased Changes ------------------ +* Feature - Add new retry behavior behind `AWS_NEW_RETRIES_2026` environment variable. When enabled, defaults to `standard` retry mode, service-specific tuning for DynamoDB, long-polling backoff support, and `x-amz-retry-after` header support. + 3.246.0 (2026-04-23) ------------------ From 17ca319dc4c9eb558afb094a41c0a4b948f3a022 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 09:48:26 -0700 Subject: [PATCH 06/10] Fix changelog --- gems/aws-sdk-core/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gems/aws-sdk-core/CHANGELOG.md b/gems/aws-sdk-core/CHANGELOG.md index 75d3562172b..6416a9d98ca 100644 --- a/gems/aws-sdk-core/CHANGELOG.md +++ b/gems/aws-sdk-core/CHANGELOG.md @@ -3,7 +3,7 @@ Unreleased Changes * Feature - Add YJIT & ZJIT tracking to user agent. * Issue - Fix error messaging in SSO OIDC. -* * Feature - Add new retry behavior behind `AWS_NEW_RETRIES_2026` environment variable. When enabled, defaults to `standard` retry mode, service-specific tuning for DynamoDB, long-polling backoff support, and `x-amz-retry-after` header support. +* Feature - Add new retry behavior behind `AWS_NEW_RETRIES_2026` environment variable. When enabled, defaults to `standard` retry mode, service-specific tuning for DynamoDB, long-polling backoff support, and `x-amz-retry-after` header support. 3.246.0 (2026-04-23) ------------------ From 3f9005ab7650790ed5dfa2d7c29af5ebb81e6b9c Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 10:10:50 -0700 Subject: [PATCH 07/10] Fix default and tighten retry header value check --- .../lib/aws-sdk-core/plugins/retry_errors.rb | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index 8ee80b9bf20..f5384a90926 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -108,7 +108,7 @@ def self.new_retries? option( :retry_mode, - default: 'standard', + default: 'legacy', # TODO: Change to 'standard' when new retries become default doc_type: String, rbs_type: '("legacy" | "standard" | "adaptive")', docstring: <<~DOCS) do |cfg| @@ -388,11 +388,14 @@ def parse_retry_after(context) retry_after = context.http_response.headers['x-amz-retry-after'] return nil unless retry_after - Integer(retry_after) / 1000.0 - rescue ArgumentError - context.config.logger&.debug( - "Failed to parse x-amz-retry-after header value: #{retry_after.inspect}" - ) + unless retry_after.match?(/\A\d+\z/) + context.config.logger&.debug( + "Failed to parse x-amz-retry-after header value: #{retry_after.inspect}" + ) + return nil + end + + retry_after.to_i / 1000.0 end def retry_request(context, error) From 09bb84782a4392f92a1a608ad3bbbe532ac1f5f7 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 10:44:26 -0700 Subject: [PATCH 08/10] Update retry mode docstring --- gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index f5384a90926..9426e6c9016 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -114,12 +114,12 @@ def self.new_retries? docstring: <<~DOCS) do |cfg| Specifies which retry algorithm to use. Values are: - * `legacy` - The pre-existing retry behavior. + * `legacy` - The pre-existing retry behavior. This is the default + value if no retry mode is provided. * `standard` - A standardized set of retry rules across the AWS SDKs. This includes support for retry quotas, which limit the number of - unsuccessful retries a client can make. This is default value if - no retry mode is provided. + unsuccessful retries a client can make. * `adaptive` - A retry mode that includes all the functionality of `standard` mode along with automatic client side throttling. From 87fcd9eae7edd27cbaaa3116afd1fd97bde41792 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 12:41:14 -0700 Subject: [PATCH 09/10] Revert min core to test SDK build --- build_tools/services.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/services.rb b/build_tools/services.rb index 501e022a77c..a78ed7ce825 100644 --- a/build_tools/services.rb +++ b/build_tools/services.rb @@ -9,10 +9,10 @@ class ServiceEnumerator MANIFEST_PATH = File.expand_path('../../services.json', __FILE__) # Minimum `aws-sdk-core` version for new gem builds - MINIMUM_CORE_VERSION = "3.248.0" + MINIMUM_CORE_VERSION = "3.247.0" # Minimum `aws-sdk-core` version for new S3 gem builds - MINIMUM_CORE_VERSION_S3 = "3.248.0" + MINIMUM_CORE_VERSION_S3 = "3.247.0" EVENTSTREAM_PLUGIN = "Aws::Plugins::EventStreamConfiguration" From 51f180f0a1e6c16d0a38b4aa1f2fe5d415d1b476 Mon Sep 17 00:00:00 2001 From: Richard Wang Date: Tue, 12 May 2026 13:02:52 -0700 Subject: [PATCH 10/10] PR feedback --- gems/aws-sdk-core/CHANGELOG.md | 2 +- .../lib/aws-sdk-core/plugins/retry_errors.rb | 153 +++++++++--------- 2 files changed, 82 insertions(+), 73 deletions(-) diff --git a/gems/aws-sdk-core/CHANGELOG.md b/gems/aws-sdk-core/CHANGELOG.md index 6416a9d98ca..5afa1e0b12e 100644 --- a/gems/aws-sdk-core/CHANGELOG.md +++ b/gems/aws-sdk-core/CHANGELOG.md @@ -3,7 +3,7 @@ Unreleased Changes * Feature - Add YJIT & ZJIT tracking to user agent. * Issue - Fix error messaging in SSO OIDC. -* Feature - Add new retry behavior behind `AWS_NEW_RETRIES_2026` environment variable. When enabled, defaults to `standard` retry mode, service-specific tuning for DynamoDB, long-polling backoff support, and `x-amz-retry-after` header support. +* Feature - Add `AWS_NEW_RETRIES_2026` environment variable to opt-in to updated `standard` retry mode with reduced backoff intervals. 3.246.0 (2026-04-23) ------------------ diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index 9426e6c9016..19212b769d4 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -10,18 +10,6 @@ module Aws module Plugins # @api private class RetryErrors < Seahorse::Client::Plugin - # TODO: Remove this gate and hardcode new retry behavior once - # AWS_NEW_RETRIES_2026 is enabled by default, which includes: - # - Default retry_mode to 'standard' - # - Default max_attempts to 4 for DynamoDB - # - Remove the old retries branch in Handler#call - # - Remove the old retries branch in #exponential_backoff - # - Remove LEGACY_RETRY_COST and TIMEOUT_RETRY_COST from RetryQuota - # @api private - def self.new_retries? - ENV.fetch('AWS_NEW_RETRIES_2026', 'false').downcase == 'true' - end - # BEGIN LEGACY OPTIONS EQUAL_JITTER = ->(delay) { (delay / 2) + Kernel.rand(0..(delay / 2)) } FULL_JITTER = ->(delay) { Kernel.rand(0..delay) } @@ -158,7 +146,7 @@ def self.new_retries? default: true, doc_type: 'Boolean', docstring: <<~DOCS) do |cfg| - Used only in `standard` and adaptive retry modes. Specifies whether to apply + Used only in `standard` and `adaptive` retry modes. Specifies whether to apply a clock skew correction and retry requests with skewed client clocks. DOCS resolve_correct_clock_skew(cfg) @@ -173,74 +161,91 @@ def self.new_retries? # @api private undocumented option(:clock_skew) { Retries::ClockSkew.new } - def self.resolve_retry_mode(cfg) - default_mode_value = - if cfg.respond_to?(:defaults_mode_config_resolver) - cfg.defaults_mode_config_resolver.resolve(:retry_mode) - end - - value = ENV['AWS_RETRY_MODE'] || - Aws.shared_config.retry_mode(profile: cfg.profile) || - default_mode_value || - (new_retries? ? 'standard' : 'legacy') # TODO: default to 'standard' when new retries become default - # Raise if provided value is not one of the retry modes - if value != 'legacy' && value != 'standard' && value != 'adaptive' - raise ArgumentError, - 'Must provide either `legacy`, `standard`, or `adaptive` for '\ - 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' + DYNAMODB_SERVICES = Set['DynamoDB', 'DynamoDB Streams'].freeze + + class << self + # TODO: Remove this gate and hardcode new retry behavior once + # AWS_NEW_RETRIES_2026 is enabled by default, which includes: + # - Default retry_mode to 'standard' + # - Default max_attempts to 4 for DynamoDB + # - Remove the old retries branch in Handler#call + # - Remove the old retries branch in #exponential_backoff + # - Remove LEGACY_RETRY_COST and TIMEOUT_RETRY_COST from RetryQuota + def new_retries? + ENV.fetch('AWS_NEW_RETRIES_2026', 'false').downcase == 'true' end - value - end - def self.resolve_max_attempts(cfg) - value = (ENV['AWS_MAX_ATTEMPTS']) || - Aws.shared_config.max_attempts(profile: cfg.profile) - if value - value = value.to_i - # Raise if provided value is not a positive integer - if value <= 0 + def resolve_retry_mode(cfg) + default_mode_value = + if cfg.respond_to?(:defaults_mode_config_resolver) + cfg.defaults_mode_config_resolver.resolve(:retry_mode) + end + + value = ENV['AWS_RETRY_MODE'] || + Aws.shared_config.retry_mode(profile: cfg.profile) || + default_mode_value || + (new_retries? ? 'standard' : 'legacy') # TODO: default to 'standard' when new retries become default + # Raise if provided value is not one of the retry modes + if value != 'legacy' && value != 'standard' && value != 'adaptive' raise ArgumentError, - 'Must provide a positive integer for max_attempts profile '\ - 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + 'Must provide either `legacy`, `standard`, or `adaptive` for '\ + 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' + end + value + end + + def resolve_max_attempts(cfg) + value = (ENV['AWS_MAX_ATTEMPTS']) || + Aws.shared_config.max_attempts(profile: cfg.profile) + if value + value = value.to_i + # Raise if provided value is not a positive integer + if value <= 0 + raise ArgumentError, + 'Must provide a positive integer for max_attempts profile '\ + 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + end + return value end - return value + + default_max_attempts(cfg) end - # TODO: Remove gate and keep only the new retries branch - if RetryErrors.new_retries? + def default_max_attempts(cfg) + # TODO: Remove gate and keep only the new retries branch + return 3 unless new_retries? + service_id = cfg.api.metadata['serviceId'] if cfg.respond_to?(:api) - ['DynamoDB', 'DynamoDB Streams'].include?(service_id) ? 4 : 3 - else - 3 + DYNAMODB_SERVICES.include?(service_id) ? 4 : 3 end - end - def self.resolve_adaptive_retry_wait_to_fill(cfg) - value = ENV['AWS_ADAPTIVE_RETRY_WAIT_TO_FILL'] || - Aws.shared_config.adaptive_retry_wait_to_fill(profile: cfg.profile) || - 'true' - # Raise if provided value is not true or false - if value != 'true' && value != 'false' - raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'adaptive_retry_wait_to_fill profile option or for '\ - 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' + def resolve_adaptive_retry_wait_to_fill(cfg) + value = ENV['AWS_ADAPTIVE_RETRY_WAIT_TO_FILL'] || + Aws.shared_config.adaptive_retry_wait_to_fill(profile: cfg.profile) || + 'true' + # Raise if provided value is not true or false + if value != 'true' && value != 'false' + raise ArgumentError, + 'Must provide either `true` or `false` for '\ + 'adaptive_retry_wait_to_fill profile option or for '\ + 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' + end + value == 'true' end - value == 'true' - end - def self.resolve_correct_clock_skew(cfg) - value = ENV['AWS_CORRECT_CLOCK_SKEW'] || - Aws.shared_config.correct_clock_skew(profile: cfg.profile) || - 'true' - # Raise if provided value is not true or false - if value != 'true' && value != 'false' - raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'correct_clock_skew profile option or for '\ - 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' + def resolve_correct_clock_skew(cfg) + value = ENV['AWS_CORRECT_CLOCK_SKEW'] || + Aws.shared_config.correct_clock_skew(profile: cfg.profile) || + 'true' + # Raise if provided value is not true or false + if value != 'true' && value != 'false' + raise ArgumentError, + 'Must provide either `true` or `false` for '\ + 'correct_clock_skew profile option or for '\ + 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' + end + value == 'true' end - value == 'true' end class Handler < Seahorse::Client::Handler @@ -291,7 +296,7 @@ def call(context) context.metadata[:retries][:capacity_amount] = capacity_amount # TODO: Remove gate and keep only the new retries branch - if RetryErrors.new_retries? + if new_retries? return response if capacity_amount <= 0 && !long_polling_operation?(context) service_id = context.config.api.metadata['serviceId'] @@ -311,6 +316,10 @@ def call(context) private + def new_retries? + RetryErrors.new_retries? + end + def with_metric(retry_mode, &block) Aws::Plugins::UserAgent.metric("RETRY_MODE_#{retry_mode.upcase}", &block) end @@ -370,10 +379,10 @@ def backoff(context, error_inspector, service_id) # TODO: Remove gate, remove default nil params, keep only new retries branch def exponential_backoff(retries, error_inspector = nil, service_id = nil) - if RetryErrors.new_retries? + if new_retries? backoff_scalar = if error_inspector.throttling_error? 1 - elsif ['DynamoDB', 'DynamoDB Streams'].include?(service_id) + elsif DYNAMODB_SERVICES.include?(service_id) 0.025 else 0.05