diff --git a/gems/aws-sdk-core/CHANGELOG.md b/gems/aws-sdk-core/CHANGELOG.md index 1b5999804e0..5afa1e0b12e 100644 --- a/gems/aws-sdk-core/CHANGELOG.md +++ b/gems/aws-sdk-core/CHANGELOG.md @@ -1,7 +1,9 @@ Unreleased Changes ------------------ + * Feature - Add YJIT & ZJIT tracking to user agent. * Issue - Fix error messaging in SSO OIDC. +* Feature - Add `AWS_NEW_RETRIES_2026` environment variable to opt-in to updated `standard` retry mode with reduced backoff intervals. 3.246.0 (2026-04-23) ------------------ diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb index 63a2db2f5a7..b48cc06e507 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retries/retry_quota.rb @@ -8,9 +8,11 @@ module Retries # Used in 'standard' and 'adaptive' retry modes. class RetryQuota INITIAL_RETRY_TOKENS = 500 - RETRY_COST = 5 + RETRY_COST = 14 + LEGACY_RETRY_COST = 5 # TODO: Remove when new retries become default NO_RETRY_INCREMENT = 1 - TIMEOUT_RETRY_COST = 10 + THROTTLING_RETRY_COST = 5 + TIMEOUT_RETRY_COST = 10 # TODO: Remove when new retries become default def initialize(opts = {}) @mutex = Mutex.new @@ -19,15 +21,16 @@ def initialize(opts = {}) end # check if there is sufficient capacity to retry - # and return it. If there is insufficient capacity + # and return it. If there is insufficient capacity # return 0 # @return [Integer] The amount of capacity checked out def checkout_capacity(error_inspector) @mutex.synchronize do - capacity_amount = if error_inspector.networking? - TIMEOUT_RETRY_COST + # TODO: Remove gate and keep only the new_retries branch + capacity_amount = if RetryErrors.new_retries? + error_inspector.throttling_error? ? THROTTLING_RETRY_COST : RETRY_COST else - RETRY_COST + error_inspector.networking? ? 
TIMEOUT_RETRY_COST : LEGACY_RETRY_COST end # unable to acquire capacity @@ -39,8 +42,8 @@ def checkout_capacity(error_inspector) end # capacity_amount refers to the amount of capacity requested from - # the last retry. It can either be RETRY_COST, TIMEOUT_RETRY_COST, - # or unset. + # the last retry. It can either be RETRY_COST, + # THROTTLING_RETRY_COST/TIMEOUT_RETRY_COST, or unset. def release(capacity_amount) # Implementation note: The release() method is called for # every API call. In the common case where the request is diff --git a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb index c44a899c5fc..19212b769d4 100644 --- a/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb +++ b/gems/aws-sdk-core/lib/aws-sdk-core/plugins/retry_errors.rb @@ -41,32 +41,32 @@ class RetryErrors < Seahorse::Client::Plugin :retry_limit, default: 3, doc_type: Integer, - docstring: <<-DOCS) -The maximum number of times to retry failed requests. Only -~ 500 level server errors and certain ~ 400 level client errors -are retried. Generally, these are throttling errors, data -checksum errors, networking errors, timeout errors, auth errors, -endpoint discovery, and errors from expired credentials. -This option is only used in the `legacy` retry mode. + docstring: <<~DOCS) + The maximum number of times to retry failed requests. Only + ~ 500 level server errors and certain ~ 400 level client errors + are retried. Generally, these are throttling errors, data + checksum errors, networking errors, timeout errors, auth errors, + endpoint discovery, and errors from expired credentials. + This option is only used in the `legacy` retry mode. DOCS option( :retry_max_delay, default: 0, doc_type: Integer, - docstring: <<-DOCS) -The maximum number of seconds to delay between retries (0 for no limit) -used by the default backoff function. This option is only used in the -`legacy` retry mode. 
+ docstring: <<~DOCS) + The maximum number of seconds to delay between retries (0 for no limit) + used by the default backoff function. This option is only used in the + `legacy` retry mode. DOCS option( :retry_base_delay, default: 0.3, doc_type: Float, - docstring: <<-DOCS) -The base delay in seconds used by the default backoff function. This option -is only used in the `legacy` retry mode. + docstring: <<~DOCS) + The base delay in seconds used by the default backoff function. This option + is only used in the `legacy` retry mode. DOCS option( @@ -74,45 +74,43 @@ class RetryErrors < Seahorse::Client::Plugin default: :none, doc_type: Symbol, rbs_type: '(:none | :equal | :full | ^(Integer) -> Integer)', - docstring: <<-DOCS) -A delay randomiser function used by the default backoff function. -Some predefined functions can be referenced by name - :none, :equal, :full, -otherwise a Proc that takes and returns a number. This option is only used -in the `legacy` retry mode. + docstring: <<~DOCS) + A delay randomiser function used by the default backoff function. + Some predefined functions can be referenced by name - :none, :equal, :full, + otherwise a Proc that takes and returns a number. This option is only used + in the `legacy` retry mode. -@see https://www.awsarchitectureblog.com/2015/03/backoff.html + @see https://www.awsarchitectureblog.com/2015/03/backoff.html DOCS option( :retry_backoff, default: DEFAULT_BACKOFF, doc_type: Proc, - docstring: <<-DOCS) -A proc or lambda used for backoff. Defaults to 2**retries * retry_base_delay. -This option is only used in the `legacy` retry mode. + docstring: <<~DOCS) + A proc or lambda used for backoff. Defaults to 2**retries * retry_base_delay. + This option is only used in the `legacy` retry mode. 
DOCS # END LEGACY OPTIONS option( :retry_mode, - default: 'legacy', + default: 'legacy', # TODO: Change to 'standard' when new retries become default doc_type: String, rbs_type: '("legacy" | "standard" | "adaptive")', - docstring: <<-DOCS) do |cfg| -Specifies which retry algorithm to use. Values are: + docstring: <<~DOCS) do |cfg| + Specifies which retry algorithm to use. Values are: -* `legacy` - The pre-existing retry behavior. This is default value if - no retry mode is provided. + * `legacy` - The pre-existing retry behavior. This is the default + value if no retry mode is provided. -* `standard` - A standardized set of retry rules across the AWS SDKs. - This includes support for retry quotas, which limit the number of - unsuccessful retries a client can make. + * `standard` - A standardized set of retry rules across the AWS SDKs. + This includes support for retry quotas, which limit the number of + unsuccessful retries a client can make. -* `adaptive` - An experimental retry mode that includes all the - functionality of `standard` mode along with automatic client side - throttling. This is a provisional mode that may change behavior - in the future. + * `adaptive` - A retry mode that includes all the functionality of + `standard` mode along with automatic client side throttling. DOCS resolve_retry_mode(cfg) end @@ -121,11 +119,11 @@ class RetryErrors < Seahorse::Client::Plugin :max_attempts, default: 3, doc_type: Integer, - docstring: <<-DOCS) do |cfg| -An integer representing the maximum number attempts that will be made for -a single request, including the initial attempt. For example, -setting this value to 5 will result in a request being retried up to -4 times. Used in `standard` and `adaptive` retry modes. + docstring: <<~DOCS) do |cfg| + An integer representing the maximum number attempts that will be made for + a single request, including the initial attempt. For example, + setting this value to 5 will result in a request being retried up to + 4 times. 
Used in `standard` and `adaptive` retry modes. DOCS resolve_max_attempts(cfg) end @@ -134,11 +132,11 @@ class RetryErrors < Seahorse::Client::Plugin :adaptive_retry_wait_to_fill, default: true, doc_type: 'Boolean', - docstring: <<-DOCS) do |cfg| -Used only in `adaptive` retry mode. When true, the request will sleep -until there is sufficent client side capacity to retry the request. -When false, the request will raise a `RetryCapacityNotAvailableError` and will -not retry instead of sleeping. + docstring: <<~DOCS) do |cfg| + Used only in `adaptive` retry mode. When true, the request will sleep + until there is sufficient client side capacity to retry the request. + When false, the request will raise a `RetryCapacityNotAvailableError` and will + not retry instead of sleeping. DOCS resolve_adaptive_retry_wait_to_fill(cfg) end @@ -147,10 +145,10 @@ class RetryErrors < Seahorse::Client::Plugin :correct_clock_skew, default: true, doc_type: 'Boolean', - docstring: <<-DOCS) do |cfg| -Used only in `standard` and adaptive retry modes. Specifies whether to apply -a clock skew correction and retry requests with skewed client clocks. - DOCS + docstring: <<~DOCS) do |cfg| + Used only in `standard` and `adaptive` retry modes. Specifies whether to apply + a clock skew correction and retry requests with skewed client clocks. 
+ DOCS resolve_correct_clock_skew(cfg) end @@ -163,71 +161,105 @@ class RetryErrors < Seahorse::Client::Plugin # @api private undocumented option(:clock_skew) { Retries::ClockSkew.new } - def self.resolve_retry_mode(cfg) - default_mode_value = - if cfg.respond_to?(:defaults_mode_config_resolver) - cfg.defaults_mode_config_resolver.resolve(:retry_mode) - end + DYNAMODB_SERVICES = Set['DynamoDB', 'DynamoDB Streams'].freeze + + class << self + # TODO: Remove this gate and hardcode new retry behavior once + # AWS_NEW_RETRIES_2026 is enabled by default, which includes: + # - Default retry_mode to 'standard' + # - Default max_attempts to 4 for DynamoDB + # - Remove the old retries branch in Handler#call + # - Remove the old retries branch in #exponential_backoff + # - Remove LEGACY_RETRY_COST and TIMEOUT_RETRY_COST from RetryQuota + def new_retries? + ENV.fetch('AWS_NEW_RETRIES_2026', 'false').downcase == 'true' + end + + def resolve_retry_mode(cfg) + default_mode_value = + if cfg.respond_to?(:defaults_mode_config_resolver) + cfg.defaults_mode_config_resolver.resolve(:retry_mode) + end value = ENV['AWS_RETRY_MODE'] || Aws.shared_config.retry_mode(profile: cfg.profile) || default_mode_value || - 'legacy' - # Raise if provided value is not one of the retry modes - if value != 'legacy' && value != 'standard' && value != 'adaptive' - raise ArgumentError, - 'Must provide either `legacy`, `standard`, or `adaptive` for '\ - 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' + (new_retries? ? 
'standard' : 'legacy') # TODO: default to 'standard' when new retries become default + # Raise if provided value is not one of the retry modes + if value != 'legacy' && value != 'standard' && value != 'adaptive' + raise ArgumentError, + 'Must provide either `legacy`, `standard`, or `adaptive` for '\ + 'retry_mode profile option or for ENV[\'AWS_RETRY_MODE\']' + end + value end - value - end - def self.resolve_max_attempts(cfg) - value = (ENV['AWS_MAX_ATTEMPTS']) || - Aws.shared_config.max_attempts(profile: cfg.profile) || - '3' - value = value.to_i - # Raise if provided value is not a positive integer - if value <= 0 - raise ArgumentError, - 'Must provide a positive integer for max_attempts profile '\ - 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + def resolve_max_attempts(cfg) + value = (ENV['AWS_MAX_ATTEMPTS']) || + Aws.shared_config.max_attempts(profile: cfg.profile) + if value + value = value.to_i + # Raise if provided value is not a positive integer + if value <= 0 + raise ArgumentError, + 'Must provide a positive integer for max_attempts profile '\ + 'option or for ENV[\'AWS_MAX_ATTEMPTS\']' + end + return value + end + + default_max_attempts(cfg) end - value - end - def self.resolve_adaptive_retry_wait_to_fill(cfg) - value = ENV['AWS_ADAPTIVE_RETRY_WAIT_TO_FILL'] || - Aws.shared_config.adaptive_retry_wait_to_fill(profile: cfg.profile) || - 'true' - # Raise if provided value is not true or false - if value != 'true' && value != 'false' - raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'adaptive_retry_wait_to_fill profile option or for '\ - 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' + def default_max_attempts(cfg) + # TODO: Remove gate and keep only the new retries branch + return 3 unless new_retries? + + service_id = cfg.api.metadata['serviceId'] if cfg.respond_to?(:api) + DYNAMODB_SERVICES.include?(service_id) ? 
4 : 3 + end + + def resolve_adaptive_retry_wait_to_fill(cfg) + value = ENV['AWS_ADAPTIVE_RETRY_WAIT_TO_FILL'] || + Aws.shared_config.adaptive_retry_wait_to_fill(profile: cfg.profile) || + 'true' + # Raise if provided value is not true or false + if value != 'true' && value != 'false' + raise ArgumentError, + 'Must provide either `true` or `false` for '\ + 'adaptive_retry_wait_to_fill profile option or for '\ + 'ENV[\'AWS_ADAPTIVE_RETRY_WAIT_TO_FILL\']' + end + value == 'true' end - value == 'true' - end - def self.resolve_correct_clock_skew(cfg) - value = ENV['AWS_CORRECT_CLOCK_SKEW'] || - Aws.shared_config.correct_clock_skew(profile: cfg.profile) || - 'true' - # Raise if provided value is not true or false - if value != 'true' && value != 'false' - raise ArgumentError, - 'Must provide either `true` or `false` for '\ - 'correct_clock_skew profile option or for '\ - 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' + def resolve_correct_clock_skew(cfg) + value = ENV['AWS_CORRECT_CLOCK_SKEW'] || + Aws.shared_config.correct_clock_skew(profile: cfg.profile) || + 'true' + # Raise if provided value is not true or false + if value != 'true' && value != 'false' + raise ArgumentError, + 'Must provide either `true` or `false` for '\ + 'correct_clock_skew profile option or for '\ + 'ENV[\'AWS_CORRECT_CLOCK_SKEW\']' + end + value == 'true' end - value == 'true' end class Handler < Seahorse::Client::Handler # Max backoff (in seconds) MAX_BACKOFF = 20 + # Hard-coded combination of services and operations as having the + # longPoll trait. To be removed when trait is enabled. 
+ LONG_POLLING_OPERATIONS = { + 'SQS' => Set[:receive_message], + 'SFN' => Set[:get_activity_task], + 'SWF' => Set[:poll_for_activity_task, :poll_for_decision_task] + }.freeze + def call(context) context.metadata[:retries] ||= {} config = context.config @@ -260,17 +292,34 @@ def call(context) return response if context.retries >= config.max_attempts - 1 - context.metadata[:retries][:capacity_amount] = - config.retry_quota.checkout_capacity(error_inspector) - return response unless context.metadata[:retries][:capacity_amount] > 0 + capacity_amount = config.retry_quota.checkout_capacity(error_inspector) + context.metadata[:retries][:capacity_amount] = capacity_amount + + # TODO: Remove gate and keep only the new retries branch + if new_retries? + return response if capacity_amount <= 0 && !long_polling_operation?(context) + + service_id = context.config.api.metadata['serviceId'] + delay = backoff(context, error_inspector, service_id) + Kernel.sleep(delay) + + return response if capacity_amount <= 0 + else + return response unless capacity_amount > 0 + + delay = exponential_backoff(context.retries) + Kernel.sleep(delay) + end - delay = exponential_backoff(context.retries) - Kernel.sleep(delay) retry_request(context, error_inspector) end private + def new_retries? + RetryErrors.new_retries? 
+ end + def with_metric(retry_mode, &block) + Aws::Plugins::UserAgent.metric("RETRY_MODE_#{retry_mode.upcase}", &block) + end @@ -311,9 +360,51 @@ def retryable?(context, response, error_inspector) context.http_response.body.respond_to?(:truncate) end - def exponential_backoff(retries) - # for a transient error, use backoff - [Kernel.rand * 2**retries, MAX_BACKOFF].min + def long_polling_operation?(context) + return true if context.operation['longPoll'] + + # Hard-coded fallback until the trait is enabled + service_id = context.config.api.metadata['serviceId'] + LONG_POLLING_OPERATIONS[service_id]&.include?(context.operation_name) + end + + def backoff(context, error_inspector, service_id) + exp_backoff = exponential_backoff(context.retries, error_inspector, service_id) + retry_after = parse_retry_after(context) + return exp_backoff unless retry_after + + backoff_duration = [retry_after, exp_backoff].max + [backoff_duration, exp_backoff + 5].min + end + + # TODO: Remove gate, remove default nil params, keep only new retries branch + def exponential_backoff(retries, error_inspector = nil, service_id = nil) + if new_retries? + backoff_scalar = if error_inspector.throttling_error? 
+ 1 + elsif DYNAMODB_SERVICES.include?(service_id) + 0.025 + else + 0.05 + end + Kernel.rand * [backoff_scalar * 2**retries, MAX_BACKOFF].min + else + [Kernel.rand * 2**retries, MAX_BACKOFF].min + end + end + + def parse_retry_after(context) + retry_after = context.http_response.headers['x-amz-retry-after'] + return nil unless retry_after + + unless retry_after.match?(/\A\d+\z/) + context.config.logger&.debug( + "Failed to parse x-amz-retry-after header value: #{retry_after.inspect}" + ) + return nil + end + + retry_after.to_i / 1000.0 end def retry_request(context, error) diff --git a/gems/aws-sdk-core/spec/aws/client_spec.rb b/gems/aws-sdk-core/spec/aws/client_spec.rb index 116ceab0edb..3c93a9c4b1c 100644 --- a/gems/aws-sdk-core/spec/aws/client_spec.rb +++ b/gems/aws-sdk-core/spec/aws/client_spec.rb @@ -46,7 +46,9 @@ module Aws end end + # TODO: Update retries to 2 and remove stub when new retries become default it 'raises a helpful error on possible incorrect regions' do + allow(Aws::Plugins::RetryErrors).to receive(:new_retries?).and_return(false) # simulate an error from connecting to an unknown endpoint stub_request(:any, /.*/). 
@@ -65,7 +67,7 @@ module Aws end expect(e).to be_kind_of(Errors::NoSuchEndpointError) - expect(e.context.retries).to be(3) # updated to retry based on customer request + expect(e.context.retries).to be(3) expect(e.message).to include('us-east-1') expect(e.message).to include('us-west-1') expect(e.message).to include('cn-north-1') diff --git a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb index 09956e8af23..7d07a8b5aed 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retries/retry_quota_spec.rb @@ -17,30 +17,66 @@ module Plugins end end - describe '#checkout_capacity' do - let(:error) { double('ErrorInspector', networking?: false) } + context 'new retries' do + before { allow(RetryErrors).to receive(:new_retries?).and_return(true) } - it 'returns the requested capacity when available' do - initial_capacity = retry_quota.instance_variable_get(:@available_capacity) + describe '#checkout_capacity' do + let(:error) { double('ErrorInspector', throttling_error?: false) } - checked_out_capacity = retry_quota.checkout_capacity(error) - expect(checked_out_capacity).to eq(Retries::RetryQuota::RETRY_COST) + it 'returns the requested capacity when available' do + initial_capacity = retry_quota.instance_variable_get(:@available_capacity) - expect(retry_quota.instance_variable_get(:@available_capacity)) - .to eq(initial_capacity - checked_out_capacity) - end + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::RETRY_COST) + + expect(retry_quota.instance_variable_get(:@available_capacity)) + .to eq(initial_capacity - checked_out_capacity) + end + + it 'checks out the throttling cost when the error is a throttling error' do + error = double('ErrorInspector', throttling_error?: true) - it 'checks out the timeout cost when the error is a networking error' do - error = 
double('ErrorInspector', networking?: true) + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::THROTTLING_RETRY_COST) + end - checked_out_capacity = retry_quota.checkout_capacity(error) - expect(checked_out_capacity).to eq(Retries::RetryQuota::TIMEOUT_RETRY_COST) + it 'returns 0 when there is insufficient capacity' do + retry_quota.instance_variable_set(:@available_capacity, 1) + + expect(retry_quota.checkout_capacity(error)).to eq(0) + end end + end + + # TODO: Remove this context when new retries become default + context 'old retries' do + before { allow(RetryErrors).to receive(:new_retries?).and_return(false) } + + describe '#checkout_capacity' do + let(:error) { double('ErrorInspector', networking?: false) } + + it 'returns the requested capacity when available' do + initial_capacity = retry_quota.instance_variable_get(:@available_capacity) + + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::LEGACY_RETRY_COST) + + expect(retry_quota.instance_variable_get(:@available_capacity)) + .to eq(initial_capacity - checked_out_capacity) + end + + it 'checks out the timeout cost when the error is a networking error' do + error = double('ErrorInspector', networking?: true) + + checked_out_capacity = retry_quota.checkout_capacity(error) + expect(checked_out_capacity).to eq(Retries::RetryQuota::TIMEOUT_RETRY_COST) + end - it 'returns 0 when there is insufficient capacity' do - retry_quota.instance_variable_set(:@available_capacity, 1) + it 'returns 0 when there is insufficient capacity' do + retry_quota.instance_variable_set(:@available_capacity, 1) - expect(retry_quota.checkout_capacity(error)).to eq(0) + expect(retry_quota.checkout_capacity(error)).to eq(0) + end end end diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb index 
067231e8004..7e8a91b7902 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_legacy_spec.rb @@ -24,7 +24,10 @@ module Plugins expect(client.config.retry_jitter).to eq(:none) end + # TODO: Remove when new retries become default it 'defaults config.retry_mode to legacy' do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + client = RetryErrorsSvc::Client.new(stub_responses: true) expect(client.config.retry_mode).to eq('legacy') end diff --git a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb index e911e43236d..9cb05c92602 100644 --- a/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb +++ b/gems/aws-sdk-core/spec/aws/plugins/retry_errors_spec.rb @@ -8,6 +8,18 @@ module Plugins describe RetryErrors do let(:client) { RetryErrorsSvc::Client.new(stub_responses: true) } + it 'defaults config.retry_mode to standard when new retries enabled' do + allow(RetryErrors).to receive(:new_retries?).and_return(true) + client = RetryErrorsSvc::Client.new(stub_responses: true) + expect(client.config.retry_mode).to eq('standard') + end + + it 'defaults config.retry_mode to legacy when new retries disabled' do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + client = RetryErrorsSvc::Client.new(stub_responses: true) + expect(client.config.retry_mode).to eq('legacy') + end + it 'can configure retry_mode with shared config' do allow_any_instance_of(Aws::SharedConfig) .to receive(:retry_mode).and_return('standard') @@ -120,6 +132,7 @@ module Plugins cfg.add_option(:credentials, credentials) cfg.add_option(:endpoint_cache, cache) cfg.add_option(:api, api) + cfg.add_option(:logger, nil) cfg.add_option(:profile, nil) RetryErrors.new.add_options(cfg) cfg.build! 
@@ -133,6 +146,8 @@ module Plugins let(:service_error) { RetryErrorsSvc::Errors::ServiceError.new(nil, nil) } + let(:throttling_error) { RetryErrorsSvc::Errors::Throttling.new(nil, nil) } + before(:each) do resp.context.config = config operation.endpoint_discovery = {} @@ -141,6 +156,7 @@ module Plugins context 'standard mode' do before(:each) do + allow(RetryErrors).to receive(:new_retries?).and_return(true) config.retry_mode = 'standard' allow(Kernel).to receive(:rand).and_return(1) end @@ -149,15 +165,15 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 495, retries: 2 } + expect: { available_capacity: 486, retries: 2 } } # success ] @@ -168,15 +184,15 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2 } + expect: { available_capacity: 472, retries: 2 } } # failure ] @@ -184,12 +200,12 @@ module Plugins end it 'fails due to retry quota reached after a single retry' do - config.retry_quota.instance_variable_set(:@available_capacity, 5) + config.retry_quota.instance_variable_set(:@available_capacity, 14) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { 
available_capacity: 0, retries: 1, delay: 1 } + expect: { available_capacity: 0, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, @@ -219,23 +235,23 @@ module Plugins test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 485, retries: 3, delay: 4 } + expect: { available_capacity: 458, retries: 3, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4, delay: 8 } + expect: { available_capacity: 444, retries: 4, delay: 0.4 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4 } + expect: { available_capacity: 444, retries: 4 } } ] @@ -244,28 +260,28 @@ module Plugins it 'does not exceed the max backoff time' do config.max_attempts = 5 - stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 3) + stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 0.2) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 495, retries: 1, delay: 1 } + expect: { available_capacity: 486, retries: 1, delay: 0.05 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 490, retries: 2, delay: 2 } + expect: { available_capacity: 472, retries: 2, delay: 0.1 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 485, retries: 3, delay: 3 } + expect: { available_capacity: 458, retries: 3, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 
480, retries: 4, delay: 3 } + expect: { available_capacity: 444, retries: 4, delay: 0.2 } }, { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 480, retries: 4 } + expect: { available_capacity: 444, retries: 4 } } ] @@ -274,20 +290,16 @@ module Plugins it 'fails due to retry quota bucket exhaustion' do config.max_attempts = 5 - config.retry_quota.instance_variable_set(:@available_capacity, 10) + config.retry_quota.instance_variable_set(:@available_capacity, 20) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 5, retries: 1, delay: 1 } + expect: { available_capacity: 6, retries: 1, delay: 0.05 } }, { response: { status_code: 502, error: service_error }, - expect: { available_capacity: 0, retries: 2, delay: 2 } - }, - { - response: { status_code: 503, error: service_error }, - expect: { available_capacity: 0, retries: 2 } + expect: { available_capacity: 6, retries: 1 } } ] @@ -296,20 +308,20 @@ module Plugins it 'recovers after successful responses' do config.max_attempts = 5 - config.retry_quota.instance_variable_set(:@available_capacity, 15) + config.retry_quota.instance_variable_set(:@available_capacity, 30) test_case_def = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 10, retries: 1, delay: 1 } + expect: { available_capacity: 16, retries: 1, delay: 0.05 } }, { response: { status_code: 502, error: service_error }, - expect: { available_capacity: 5, retries: 2, delay: 2 } + expect: { available_capacity: 2, retries: 2, delay: 0.1 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 10, retries: 2 } + expect: { available_capacity: 16, retries: 2 } } ] handle_with_retry(test_case_def) @@ -317,17 +329,149 @@ module Plugins test_case_post_success = [ { response: { status_code: 500, error: service_error }, - expect: { available_capacity: 5, retries: 1, delay: 1 } + expect: { available_capacity: 2, retries: 
1, delay: 0.05 } }, { response: { status_code: 200, error: nil }, - expect: { available_capacity: 10, retries: 1 } + expect: { available_capacity: 16, retries: 1 } } ] reset_request handle_with_retry(test_case_post_success) end + it 'retries for throttling errors' do + test_case_def = [ + { + response: { status_code: 400, error: throttling_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + context 'DynamoDB base backoff and increased retries' do + let(:api) do + api = Seahorse::Model::Api.new + api.metadata['serviceId'] = 'DynamoDB' + api + end + + it 'retries errors' do + config.max_attempts = 4 + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 486, retries: 1, delay: 0.025 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 472, retries: 2, delay: 0.05 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 458, retries: 3, delay: 0.1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 458, retries: 3 } + } + ] + + handle_with_retry(test_case_def) + end + end + + # TODO: update with generic long-polling service once trait is supported + context 'long-polling' do + let(:api) do + api = Seahorse::Model::Api.new + api.metadata['serviceId'] = 'SQS' + api + end + + it 'backs off even with depleted token bucket' do + resp.context.operation_name = :receive_message + config.retry_quota.instance_variable_set(:@available_capacity, 0) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 0, delay: 0.05 } + } + ] + + handle_with_retry(test_case_def) + end + end + + context 'x-amz-retry-after' do + it 'honors the header' do + 
test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '1500' }, + expect: { available_capacity: 486, retries: 1, delay: 1.5 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'delays for at least the exponential backoff duration' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '0' }, + expect: { available_capacity: 486, retries: 1, delay: 0.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'delays for at most 5 plus the exponential backoff duration' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: '10000' }, + expect: { available_capacity: 486, retries: 1, delay: 5.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'falls back to exponential backoff for invalid headers' do + test_case_def = [ + { + response: { status_code: 500, error: service_error, retry_after: 'invalid' }, + expect: { available_capacity: 486, retries: 1, delay: 0.05 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 500, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + end + it 'corrects and retries clock skew errors' do clock_skew_error = RetryErrorsSvc::Errors::RequestTimeTooSkewed .new(nil, nil) @@ -384,6 +528,198 @@ module Plugins end + # TODO: Remove this context when new retries become default + context 'standard mode (old retries)' do + before(:each) do + allow(RetryErrors).to receive(:new_retries?).and_return(false) + config.retry_mode = 'standard' + allow(Kernel).to receive(:rand).and_return(1) + end + + it 'retry eventually succeeds' do + test_case_def = [ + { + 
response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 495, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to max attempts reached' do + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to retry quota reached after a single retry' do + config.retry_quota.instance_variable_set(:@available_capacity, 5) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 1 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'does not retry if the retry quota is 0' do + config.retry_quota.instance_variable_set(:@available_capacity, 0) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 0, retries: 0 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'uses exponential backoff timing' do + config.max_attempts = 5 + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: 
service_error }, + expect: { available_capacity: 485, retries: 3, delay: 4 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4, delay: 8 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'does not exceed the max backoff time' do + config.max_attempts = 5 + stub_const('Aws::Plugins::RetryErrors::Handler::MAX_BACKOFF', 3) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 495, retries: 1, delay: 1 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 490, retries: 2, delay: 2 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 485, retries: 3, delay: 3 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4, delay: 3 } + }, + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 480, retries: 4 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'fails due to retry quota bucket exhaustion' do + config.max_attempts = 5 + config.retry_quota.instance_variable_set(:@available_capacity, 10) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 5, retries: 1, delay: 1 } + }, + { + response: { status_code: 502, error: service_error }, + expect: { available_capacity: 0, retries: 2, delay: 2 } + }, + { + response: { status_code: 503, error: service_error }, + expect: { available_capacity: 0, retries: 2 } + } + ] + + handle_with_retry(test_case_def) + end + + it 'recovers after successful responses' do + config.max_attempts = 5 + config.retry_quota.instance_variable_set(:@available_capacity, 15) + + test_case_def = [ + { + response: { status_code: 500, error: service_error }, + 
expect: { available_capacity: 10, retries: 1, delay: 1 } + }, + { + response: { status_code: 502, error: service_error }, + expect: { available_capacity: 5, retries: 2, delay: 2 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 10, retries: 2 } + } + ] + handle_with_retry(test_case_def) + + test_case_post_success = [ + { + response: { status_code: 500, error: service_error }, + expect: { available_capacity: 5, retries: 1, delay: 1 } + }, + { + response: { status_code: 200, error: nil }, + expect: { available_capacity: 10, retries: 1 } + } + ] + reset_request + handle_with_retry(test_case_post_success) + end + end + context 'adaptive mode' do before(:each) do config.retry_mode = 'adaptive' diff --git a/gems/aws-sdk-core/spec/retry_errors_helper.rb b/gems/aws-sdk-core/spec/retry_errors_helper.rb index f91cabc5b3b..08ec27a3c0a 100644 --- a/gems/aws-sdk-core/spec/retry_errors_helper.rb +++ b/gems/aws-sdk-core/spec/retry_errors_helper.rb @@ -119,6 +119,10 @@ def setup_next_response(test_case) resp.context.http_response.headers['date'] = Time.now.utc + response[:clock_skew] end + if response[:retry_after] + resp.context.http_response.headers['x-amz-retry-after'] = response[:retry_after].to_s + end + if response[:endpoint_discovery] allow(resp.context.operation).to receive(:endpoint_discovery).and_return(true) end diff --git a/gems/aws-sdk-s3/spec/client_spec.rb b/gems/aws-sdk-s3/spec/client_spec.rb index 7ca10759590..a4f7095d64b 100644 --- a/gems/aws-sdk-s3/spec/client_spec.rb +++ b/gems/aws-sdk-s3/spec/client_spec.rb @@ -763,7 +763,10 @@ module S3 end end + # TODO: Update retries to 2 and remove stub when new retries become default describe "200 errors response handling" do + before { allow(Aws::Plugins::RetryErrors).to receive(:new_retries?).and_return(false) } + { complete_multipart_upload: { upload_id: 'upload-id' }, copy_object: { copy_source: 'bucket/key' },