From 7b29d4073e1cb1856ec70318ab07cefa52042dea Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Mon, 2 Feb 2026 13:58:12 -0700 Subject: [PATCH 01/16] Add fail action 6, will fallback to serving stale if retry attempts are exhausted --- doc/admin-guide/files/records.yaml.en.rst | 10 +- include/proxy/http/HttpConfig.h | 1 + include/proxy/http/HttpTransact.h | 6 + src/proxy/http/HttpCacheSM.cc | 18 +- src/proxy/http/HttpConfig.cc | 4 +- src/proxy/http/HttpSM.cc | 28 ++- src/proxy/http/HttpTransact.cc | 122 +++++++++--- src/records/RecordsConfig.cc | 4 +- .../cache/cache-read-retry-stale.test.py | 29 +++ .../replay/cache-read-retry-stale.replay.yaml | 187 ++++++++++++++++++ 10 files changed, 368 insertions(+), 41 deletions(-) create mode 100644 tests/gold_tests/cache/cache-read-retry-stale.test.py create mode 100644 tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index e7057658d30..bfd5940c9c8 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -2892,7 +2892,7 @@ Dynamic Content & Content Negotiation The number of times to attempt a cache open write upon failure to get a write lock. This config is ignored when :ts:cv:`proxy.config.http.cache.open_write_fail_action` is - set to ``5`` or :ts:cv:`proxy.config.http.cache.max_open_write_retry_timeout` is set to gt ``0``. + set to ``5`` or ``6``, or when :ts:cv:`proxy.config.http.cache.max_open_write_retry_timeout` is set to gt ``0``. .. ts:cv:: CONFIG proxy.config.http.cache.max_open_write_retry_timeout INT 0 :reloadable: @@ -2901,7 +2901,7 @@ Dynamic Content & Content Negotiation A timeout for attempting a cache open write upon failure to get a write lock. This config is ignored when :ts:cv:`proxy.config.http.cache.open_write_fail_action` is - set to ``5``. + set to ``5`` or ``6``. .. ts:cv:: CONFIG proxy.config.http.cache.open_write_fail_action INT 0 :reloadable: @@ -2931,6 +2931,12 @@ Dynamic Content & Content Negotiation Make sure to configure the :ref:`admin-config-read-while-writer` feature correctly. Note that this option may result in CACHE_LOOKUP_COMPLETE HOOK being called back more than once. + ``6`` Retry Cache Read on a Cache Write Lock failure (same as ``5``), but if + read retries are exhausted and a stale cached object exists, serve the + stale content if allowed. This combines the request collapsing behavior + of ``5`` with the stale-serving fallback of ``2``. If stale is not + returnable (e.g., due to ``Cache-Control: must-revalidate``), go to + origin server. ===== ====================================================================== Customizable User Response Pages diff --git a/include/proxy/http/HttpConfig.h b/include/proxy/http/HttpConfig.h index 85cb7ea433c..234a930e883 100644 --- a/include/proxy/http/HttpConfig.h +++ b/include/proxy/http/HttpConfig.h @@ -371,6 +371,7 @@ enum class CacheOpenWriteFailAction_t { ERROR_ON_MISS_STALE_ON_REVALIDATE = 0x03, ERROR_ON_MISS_OR_REVALIDATE = 0x04, READ_RETRY = 0x05, + READ_RETRY_STALE_ON_REVALIDATE = 0x06, TOTAL_TYPES }; diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 856fbe991d0..1d91d5d60a1 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -703,6 +703,10 @@ class HttpTransact /// configuration. bool is_cacheable_due_to_negative_caching_configuration = false; + /// Set when stale content is served due to cache write lock failure. + /// Used to correctly attribute statistics and VIA strings. + bool serving_stale_due_to_write_lock = false; + MgmtByte cache_open_write_fail_action = 0; HttpConfigParams *http_config_param = nullptr; @@ -998,9 +1002,11 @@ class HttpTransact static void HandleCacheOpenReadHitFreshness(State *s); static void HandleCacheOpenReadHit(State *s); static void HandleCacheOpenReadMiss(State *s); + static void HandleCacheOpenReadMissGoToOrigin(State *s); static void set_cache_prepare_write_action_for_new_request(State *s); static void build_response_from_cache(State *s, HTTPWarningCode warning_code); static void handle_cache_write_lock(State *s); + static void handle_cache_write_lock_go_to_origin(State *s); static void HandleResponse(State *s); static void HandleUpdateCachedObject(State *s); static void HandleUpdateCachedObjectContinue(State *s); diff --git a/src/proxy/http/HttpCacheSM.cc b/src/proxy/http/HttpCacheSM.cc index 075e81694fc..7893fda4b1a 100644 --- a/src/proxy/http/HttpCacheSM.cc +++ b/src/proxy/http/HttpCacheSM.cc @@ -43,6 +43,14 @@ namespace { DbgCtl dbg_ctl_http_cache{"http_cache"}; + +// Helper to check if cache_open_write_fail_action has READ_RETRY behavior +inline bool +is_read_retry_action(MgmtByte action) +{ + return action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE); +} } // end anonymous namespace //// @@ -215,12 +223,11 @@ HttpCacheSM::state_cache_open_write(int event, void *data) break; case CACHE_EVENT_OPEN_WRITE_FAILED: { - if (master_sm->t_state.txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (is_read_retry_action(master_sm->t_state.txn_conf->cache_open_write_fail_action)) { // fall back to open_read_tries - // Note that when CacheOpenWriteFailAction_t::READ_RETRY is configured, max_cache_open_write_retries + // Note that when READ_RETRY actions are configured, max_cache_open_write_retries // is automatically ignored. Make sure to not disable max_cache_open_read_retries - // with CacheOpenWriteFailAction_t::READ_RETRY as this results in proxy'ing to origin + // with READ_RETRY actions as this results in proxy'ing to origin // without write retries in both a cache miss or a cache refresh scenario. if (write_retry_done()) { @@ -264,8 +271,7 @@ HttpCacheSM::state_cache_open_write(int event, void *data) _read_retry_event = nullptr; } - if (master_sm->t_state.txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (is_read_retry_action(master_sm->t_state.txn_conf->cache_open_write_fail_action)) { Dbg(dbg_ctl_http_cache, "[%" PRId64 "] [state_cache_open_write] cache open write failure %d. " "falling back to read retry...", diff --git a/src/proxy/http/HttpConfig.cc b/src/proxy/http/HttpConfig.cc index 3b7352b1fc6..19e5b1d4d3d 100644 --- a/src/proxy/http/HttpConfig.cc +++ b/src/proxy/http/HttpConfig.cc @@ -1384,7 +1384,9 @@ HttpConfig::reconfigure() params->disallow_post_100_continue = INT_TO_BOOL(m_master.disallow_post_100_continue); params->oride.cache_open_write_fail_action = m_master.oride.cache_open_write_fail_action; - if (params->oride.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (params->oride.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + params->oride.cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { if (params->oride.max_cache_open_read_retries <= 0 || params->oride.max_cache_open_write_retries <= 0) { Warning("Invalid config, cache_open_write_fail_action (%d), max_cache_open_read_retries (%" PRIu64 "), " "max_cache_open_write_retries (%" PRIu64 ")", diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index d9cfb662923..0bb4bdfa4db 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -2537,9 +2537,11 @@ HttpSM::state_cache_open_write(int event, void *data) case CACHE_EVENT_OPEN_READ: if (!t_state.cache_info.object_read) { t_state.cache_open_write_fail_action = t_state.txn_conf->cache_open_write_fail_action; - // Note that CACHE_LOOKUP_COMPLETE may be invoked more than once - // if CacheOpenWriteFailAction_t::READ_RETRY is configured - ink_assert(t_state.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)); + // READ_RETRY mode: write lock failed, no stale object available. + // CACHE_LOOKUP_COMPLETE will fire from HandleCacheOpenReadMiss with MISS result. + ink_assert(t_state.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + t_state.cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)); t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; break; @@ -2558,8 +2560,9 @@ HttpSM::state_cache_open_write(int event, void *data) t_state.source = HttpTransact::Source_t::CACHE; // clear up CacheLookupResult_t::MISS, let Freshness function decide // hit status - t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; - t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; + t_state.cache_open_write_fail_action = t_state.txn_conf->cache_open_write_fail_action; + t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; + t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; break; case HTTP_TUNNEL_EVENT_DONE: @@ -2663,7 +2666,20 @@ HttpSM::state_cache_open_read(int event, void *data) ink_assert(t_state.transact_return_point == nullptr); t_state.transact_return_point = HttpTransact::HandleCacheOpenRead; - setup_cache_lookup_complete_api(); + + // For READ_RETRY actions (5 and 6), skip the CACHE_LOOKUP_COMPLETE hook now. + // The hook will fire later with the final result: HIT if stale content is found + // during retry, or MISS if nothing is found (from HandleCacheOpenReadMiss). + // This ensures plugins see only the final cache lookup result, avoiding issues + // like stats double-counting and duplicate hook registrations. + if (t_state.txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + t_state.txn_conf->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { + SMDbg(dbg_ctl_http, "READ_RETRY configured, deferring CACHE_LOOKUP_COMPLETE hook"); + call_transact_and_set_next_state(nullptr); + } else { + setup_cache_lookup_complete_api(); + } break; default: diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 0f5e689222d..c024693fa25 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2591,6 +2591,13 @@ HttpTransact::HandleCacheOpenReadHitFreshness(State *s) } } + // If we're serving stale due to write lock failure, override cache_lookup_result to HIT_STALE + // so plugins see the accurate status. what_is_document_freshness returns FRESH to bypass + // revalidation, but the content is actually stale. + if (s->serving_stale_due_to_write_lock) { + s->cache_lookup_result = HttpTransact::CacheLookupResult_t::HIT_STALE; + } + ink_assert(s->cache_lookup_result != HttpTransact::CacheLookupResult_t::MISS); if (s->cache_lookup_result == HttpTransact::CacheLookupResult_t::HIT_STALE) { SET_VIA_STRING(VIA_DETAIL_CACHE_LOOKUP, VIA_DETAIL_MISS_EXPIRED); @@ -2787,7 +2794,9 @@ HttpTransact::HandleCacheOpenReadHit(State *s) // proxy.config.http.cache.ignore_server_no_cache is set to 0 (i.e don't ignore no cache -- the default setting) // // But, we only do this if we're not in an API updating the cached object (see TSHttpTxnUpdateCachedObject) - if ((((s->cache_lookup_result == CacheLookupResult_t::HIT_STALE) || + // Also skip revalidation if we're serving stale due to write lock failure - we've already decided + // to serve the stale content to avoid contention, so don't try to revalidate. + if ((((s->cache_lookup_result == CacheLookupResult_t::HIT_STALE && !s->serving_stale_due_to_write_lock) || ((obj->response_get()->get_cooked_cc_mask() & MIME_COOKED_MASK_CC_NO_CACHE) && !s->cache_control.ignore_server_no_cache)) && (s->api_update_cached_object != HttpTransact::UpdateCachedObject_t::CONTINUE))) { needs_revalidate = true; @@ -2942,14 +2951,26 @@ HttpTransact::HandleCacheOpenReadHit(State *s) HttpCacheSM &cache_sm = s->state_machine->get_cache_sm(); TxnDbg(dbg_ctl_http_trans, "CacheOpenRead --- HIT-FRESH read while write %d", cache_sm.is_readwhilewrite_inprogress()); - if (cache_sm.is_readwhilewrite_inprogress()) + if (cache_sm.is_readwhilewrite_inprogress()) { SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_RWW_HIT); + } + + // If serving stale due to write lock failure (actions 2, 3, or 6), adjust VIA to reflect stale serving. + // This ensures correct statistics attribution (cache_hit_stale_served instead of cache_hit_fresh). + if (s->serving_stale_due_to_write_lock) { + TxnDbg(dbg_ctl_http_trans, "Serving stale due to write lock failure, adjusting VIA for statistics"); + SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); + SET_VIA_STRING(VIA_SERVER_RESULT, VIA_SERVER_ERROR); + } if (s->cache_lookup_result == CacheLookupResult_t::HIT_WARNING) { build_response_from_cache(s, HTTPWarningCode::HERUISTIC_EXPIRATION); } else if (s->cache_lookup_result == CacheLookupResult_t::HIT_STALE) { ink_assert(server_up == false); build_response_from_cache(s, HTTPWarningCode::REVALIDATION_FAILED); + } else if (s->serving_stale_due_to_write_lock) { + // Serving stale due to write lock failure - no Warning header (deprecated per RFC 9111) + build_response_from_cache(s, HTTPWarningCode::NONE); } else { build_response_from_cache(s, HTTPWarningCode::NONE); } @@ -3166,7 +3187,9 @@ HttpTransact::handle_cache_write_lock(State *s) // Write failed and read retry triggered // Clean up server_request and re-initiate // Cache Lookup - ink_assert(s->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)); + ink_assert(s->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + s->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)); s->cache_info.write_status = CacheWriteStatus_t::LOCK_MISS; StateMachineAction_t next; next = StateMachineAction_t::CACHE_LOOKUP; @@ -3207,21 +3230,44 @@ HttpTransact::handle_cache_write_lock(State *s) } if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - TxnDbg(dbg_ctl_http_error, "calling hdr_info.server_request.destroy"); + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: destroying server_request before cache hit handling"); s->hdr_info.server_request.destroy(); HandleCacheOpenReadHitFreshness(s); } else { - StateMachineAction_t next; - next = how_to_open_connection(s); - if (next == StateMachineAction_t::ORIGIN_SERVER_OPEN || next == StateMachineAction_t::ORIGIN_SERVER_RAW_OPEN) { - s->next_action = next; - TRANSACT_RETURN(next, nullptr); - } else { - // hehe! - s->next_action = next; - ink_assert(s->next_action == StateMachineAction_t::DNS_LOOKUP); - return; + // For action 5/6, the CACHE_LOOKUP_COMPLETE hook was deferred in state_cache_open_read. + // Since we're not in READ_RETRY (write lock succeeded or failed without retry), fire + // the deferred hook now with MISS before going to origin. + if (s->txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + s->txn_conf->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { + TxnDbg(dbg_ctl_http_trans, "Action 5/6 configured, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, handle_cache_write_lock_go_to_origin); } + handle_cache_write_lock_go_to_origin(s); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : handle_cache_write_lock_go_to_origin +// Description: Continuation after CACHE_LOOKUP_COMPLETE hook for handle_cache_write_lock. +// Goes to origin server when write lock succeeded or failed without retry. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::handle_cache_write_lock_go_to_origin(State *s) +{ + TxnDbg(dbg_ctl_http_trans, "handle_cache_write_lock_go_to_origin - proceeding to origin"); + + StateMachineAction_t next; + next = how_to_open_connection(s); + if (next == StateMachineAction_t::ORIGIN_SERVER_OPEN || next == StateMachineAction_t::ORIGIN_SERVER_RAW_OPEN) { + s->next_action = next; + TRANSACT_RETURN(next, nullptr); + } else { + s->next_action = next; + ink_assert(s->next_action == StateMachineAction_t::DNS_LOOKUP); + return; } } @@ -3270,21 +3316,41 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response s->cache_info.action = CacheAction_t::NO_ACTION; - } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { + } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->serving_stale_due_to_write_lock) { // We've looped back around due to failing to read during READ_RETRY mode. // Don't attempt another cache write - just proxy to origin without caching. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); + // Fire the deferred CACHE_LOOKUP_COMPLETE hook with MISS result so plugins + // see the final cache lookup status before we go to origin. + // NOTE: Only fire if serving_stale_due_to_write_lock is false. If true, we already + // found a stale object and the hook was fired from HandleCacheOpenReadHitFreshness. + // We're here because authentication (MUST_PROXY) requires going to origin anyway. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); + s->cache_info.action = CacheAction_t::NO_ACTION; + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { + // READ_RETRY with serving_stale_due_to_write_lock = true means we found stale content + // but can't serve it (e.g., MUST_PROXY auth). Hook already fired, just go to origin. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY stale found but requires origin, bypassing cache"); s->cache_info.action = CacheAction_t::NO_ACTION; } else { HttpTransact::set_cache_prepare_write_action_for_new_request(s); } - /////////////////////////////////////////////////////////////// - // a normal miss would try to fetch the document from the // - // origin server, unless the origin server isn't resolvable, // - // but if "CacheControl: only-if-cached" is set, then we are // - // supposed to send a 504 (GATEWAY TIMEOUT) response. // - /////////////////////////////////////////////////////////////// + // Proceed to origin server (handles DNS lookup, parent proxy, etc.) + HandleCacheOpenReadMissGoToOrigin(s); +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : HandleCacheOpenReadMissGoToOrigin +// Description: Contains the "go to origin" logic for cache miss cases. +// Handles DNS lookup, parent proxy selection, and request building. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::HandleCacheOpenReadMissGoToOrigin(State *s) +{ + TxnDbg(dbg_ctl_http_trans, "HandleCacheOpenReadMissGoToOrigin - proceeding to origin"); HTTPHdr *h = &s->hdr_info.client_request; @@ -3327,8 +3393,6 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) build_error_response(s, HTTPStatus::GATEWAY_TIMEOUT, "Not Cached", "cache#not_in_cache"); s->next_action = StateMachineAction_t::SEND_ERROR_CACHE_NOOP; } - - return; } void @@ -7300,9 +7364,17 @@ HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTP uint32_t cc_mask, cooked_cc_mask; uint32_t os_specifies_revalidate; + // This check works for STALE_ON_REVALIDATE(0x2), ERROR_ON_MISS_STALE_ON_REVALIDATE(0x3), and + // READ_RETRY_STALE_ON_REVALIDATE(0x6). + // We return FRESH (not STALE) intentionally to bypass the revalidation code path in + // HandleCacheOpenReadHit. Returning STALE would trigger origin server contact for revalidation, + // but for write lock failure scenarios we want to serve the stale content directly without + // revalidation. The serving_stale_due_to_write_lock flag tracks that we're actually serving + // stale content, so VIA strings and statistics can be correctly attributed. if (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) { if (is_stale_cache_response_returnable(s)) { - TxnDbg(dbg_ctl_http_match, "cache_serve_stale_on_write_lock_fail, return FRESH"); + TxnDbg(dbg_ctl_http_match, "cache_serve_stale_on_write_lock_fail, return FRESH to bypass revalidation"); + s->serving_stale_due_to_write_lock = true; return (Freshness_t::FRESH); } } diff --git a/src/records/RecordsConfig.cc b/src/records/RecordsConfig.cc index 88526c08ad3..ce94cdff93e 100644 --- a/src/records/RecordsConfig.cc +++ b/src/records/RecordsConfig.cc @@ -610,13 +610,15 @@ static constexpr RecordElement RecordsConfig[] = , {RECT_CONFIG, "proxy.config.http.cache.max_open_write_retry_timeout", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , - // # open_write_fail_action has 3 options: + // # open_write_fail_action options: // # // # 0 - default. disable cache and goto origin // # 1 - return error if cache miss // # 2 - serve stale until proxy.config.http.cache.max_stale_age, then goto origin, if revalidate // # 3 - return error if cache miss or serve stale until proxy.config.http.cache.max_stale_age, then goto origin, if revalidate // # 4 - return error if cache miss or if revalidate + // # 5 - retry cache read (read-while-writer) on write lock failure, goto origin if retries exhausted + // # 6 - retry cache read on write lock failure, if retries exhausted serve stale if allowed, otherwise goto origin {RECT_CONFIG, "proxy.config.http.cache.open_write_fail_action", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , // # when_to_revalidate has 4 options: diff --git a/tests/gold_tests/cache/cache-read-retry-stale.test.py b/tests/gold_tests/cache/cache-read-retry-stale.test.py new file mode 100644 index 00000000000..d9dc3be03be --- /dev/null +++ b/tests/gold_tests/cache/cache-read-retry-stale.test.py @@ -0,0 +1,29 @@ +''' +Test cache_open_write_fail_action = 6 (READ_RETRY with stale fallback) +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Test.Summary = ''' +Test cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) to verify: +1. READ_RETRY behavior (same as action 5) +2. Stale fallback when read retries are exhausted +3. System does not crash under write lock contention with stale objects +''' + +Test.ContinueOnFail = True + +Test.ATSReplayTest(replay_file="replay/cache-read-retry-stale.replay.yaml") diff --git a/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml new file mode 100644 index 00000000000..de28fe96484 --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml @@ -0,0 +1,187 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This replay file tests cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) +# +# This is a smoke test that verifies: +# 1. Action 6 is accepted by the configuration system +# 2. Basic caching works correctly with action 6 enabled +# 3. The system does not crash +# +# The stale fallback behavior (serving stale on read retry exhaustion) is +# difficult to reliably test due to timing sensitivity. This test focuses +# on verifying the feature is functional without causing instability. +# + +meta: + version: "1.0" + +autest: + description: 'Test cache_open_write_fail_action = 6 (READ_RETRY with stale fallback) smoke test' + dns: + name: 'dns-read-retry-stale' + + server: + name: 'origin-read-retry-stale' + + client: + name: 'client-read-retry-stale' + + ats: + name: 'ts-read-retry-stale' + process_config: + enable_cache: true + + records_config: + proxy.config.diags.debug.enabled: 1 + proxy.config.diags.debug.tags: 'http_cache' + # Enable READ_RETRY_STALE_ON_REVALIDATE mode (action 6) + proxy.config.http.cache.open_write_fail_action: 6 + # Configure retry parameters + proxy.config.http.cache.max_open_write_retries: 1 + proxy.config.http.cache.max_open_write_retry_timeout: 0 + proxy.config.http.cache.max_open_read_retries: 3 + proxy.config.http.cache.open_read_retry_time: 100 + proxy.config.cache.enable_read_while_writer: 1 + # Allow serving stale content + proxy.config.http.cache.max_stale_age: 300 + + remap_config: + - from: "http://example.com/" + to: "http://backend.example.com:{SERVER_HTTP_PORT}/" + + log_validation: + traffic_out: + # Should NOT contain crash indicators + excludes: + - expression: "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" + description: "Verify ATS does not crash with READ_RETRY_STALE mode enabled" + +sessions: + ############################################################################# + # First session: Cache a normal object + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /test-object + headers: + fields: + - [uuid, cache-prime] + - [Host, example.com] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 100] + - [Cache-Control, "max-age=300"] + - [X-Response, cached-content] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'cached-content', as: equal}] + + ############################################################################# + # Second session: Verify cache hit (action 6 doesn't break basic caching) + ############################################################################# + - transactions: + - client-request: + delay: 500ms + method: "GET" + version: "1.1" + url: /test-object + headers: + fields: + - [uuid, cache-hit] + - [Host, example.com] + + # Should NOT reach origin (cache hit) + server-response: + status: 500 + reason: Should Not Reach + headers: + fields: + - [Content-Length, 100] + - [X-Response, should-not-reach] + + # Should serve from cache + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'cached-content', as: equal}] + + ############################################################################# + # Third session: Test a different uncached object + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /another-object + headers: + fields: + - [uuid, another-cache] + - [Host, example.com] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 50] + - [Cache-Control, "max-age=300"] + - [X-Response, another-content] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'another-content', as: equal}] + + ############################################################################# + # Fourth session: Verify second object cached + ############################################################################# + - transactions: + - client-request: + delay: 500ms + method: "GET" + version: "1.1" + url: /another-object + headers: + fields: + - [uuid, another-hit] + - [Host, example.com] + + server-response: + status: 500 + reason: Should Not Reach + headers: + fields: + - [Content-Length, 100] + - [X-Response, should-not-reach] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'another-content', as: equal}] From 5da38a58f2767811255f35c3004dedb60d06444c Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Tue, 3 Feb 2026 15:03:33 -0700 Subject: [PATCH 02/16] Update src/proxy/http/HttpTransact.cc yes, this is a good fix Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/proxy/http/HttpTransact.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index c024693fa25..9e22380a819 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2965,7 +2965,7 @@ HttpTransact::HandleCacheOpenReadHit(State *s) if (s->cache_lookup_result == CacheLookupResult_t::HIT_WARNING) { build_response_from_cache(s, HTTPWarningCode::HERUISTIC_EXPIRATION); - } else if (s->cache_lookup_result == CacheLookupResult_t::HIT_STALE) { + } else if (s->cache_lookup_result == CacheLookupResult_t::HIT_STALE && !s->serving_stale_due_to_write_lock) { ink_assert(server_up == false); build_response_from_cache(s, HTTPWarningCode::REVALIDATION_FAILED); } else if (s->serving_stale_due_to_write_lock) { From 8e340d3bd2c19582b4c9682058cfeae50aa64c5b Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Tue, 3 Feb 2026 15:04:34 -0700 Subject: [PATCH 03/16] Update doc/admin-guide/files/records.yaml.en.rst Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- doc/admin-guide/files/records.yaml.en.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index bfd5940c9c8..17ce3cb9c47 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -2929,8 +2929,8 @@ Dynamic Content & Content Negotiation with :ts:cv:`proxy.config.cache.enable_read_while_writer` configuration allows to collapse concurrent requests without a need for any plugin. Make sure to configure the :ref:`admin-config-read-while-writer` feature - correctly. Note that this option may result in CACHE_LOOKUP_COMPLETE HOOK - being called back more than once. + correctly. With this option, CACHE_LOOKUP_COMPLETE HOOK is deferred for + read retries so that plugins see only the final cache lookup result. ``6`` Retry Cache Read on a Cache Write Lock failure (same as ``5``), but if read retries are exhausted and a stale cached object exists, serve the stale content if allowed. This combines the request collapsing behavior From c631cddd55909e2c1d95280bd835217d18b2d02d Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Tue, 3 Feb 2026 15:51:21 -0700 Subject: [PATCH 04/16] review fixes --- src/proxy/http/HttpTransact.cc | 12 ++++++++++++ .../gold_tests/cache/cache-read-retry-stale.test.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 9e22380a819..17339d22065 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3337,6 +3337,18 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) HttpTransact::set_cache_prepare_write_action_for_new_request(s); } + // If action 5/6 configured, we set NO_ACTION, and we're not in READ_RETRY, + // fire the deferred CACHE_LOOKUP_COMPLETE hook now with MISS. + // We won't enter READ_RETRY since we're not caching, so fire the hook now. + if (s->cache_info.action == CacheAction_t::NO_ACTION && s->cache_info.write_lock_state != CacheWriteLock_t::READ_RETRY && + (s->txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + s->txn_conf->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE))) { + TxnDbg(dbg_ctl_http_trans, "Action 5/6 NO_ACTION path, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + } + // Proceed to origin server (handles DNS lookup, parent proxy, etc.) HandleCacheOpenReadMissGoToOrigin(s); } diff --git a/tests/gold_tests/cache/cache-read-retry-stale.test.py b/tests/gold_tests/cache/cache-read-retry-stale.test.py index d9dc3be03be..1407fee532d 100644 --- a/tests/gold_tests/cache/cache-read-retry-stale.test.py +++ b/tests/gold_tests/cache/cache-read-retry-stale.test.py @@ -18,10 +18,14 @@ # limitations under the License. Test.Summary = ''' -Test cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) to verify: -1. READ_RETRY behavior (same as action 5) -2. Stale fallback when read retries are exhausted -3. System does not crash under write lock contention with stale objects +Smoke test for cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) to verify: +1. Action 6 is accepted by the configuration system +2. Basic caching works correctly with action 6 enabled +3. The system does not crash + +Note: The stale fallback behavior (serving stale on read retry exhaustion) is +difficult to reliably test due to timing sensitivity. This test focuses on +verifying the feature is functional without causing instability. ''' Test.ContinueOnFail = True From e0fc675d0f2cca3db0d7f81fc04236e0d8e065ca Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Wed, 4 Feb 2026 10:39:14 -0700 Subject: [PATCH 05/16] Removing old setup based on config and just using a hook deferred variable to keep track of when to fire --- include/proxy/http/HttpTransact.h | 5 +++ src/proxy/http/HttpSM.cc | 1 + src/proxy/http/HttpTransact.cc | 58 +++++++++++++++++-------------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 1d91d5d60a1..1e89d3379d8 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -707,6 +707,11 @@ class HttpTransact /// Used to correctly attribute statistics and VIA strings. bool serving_stale_due_to_write_lock = false; + /// Set when CACHE_LOOKUP_COMPLETE hook is deferred for action 5/6. + /// The hook will fire later with the final result once we know if + /// stale content will be served or if we're going to origin. + bool cache_lookup_complete_deferred = false; + MgmtByte cache_open_write_fail_action = 0; HttpConfigParams *http_config_param = nullptr; diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index 0bb4bdfa4db..8c339a35c91 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -2676,6 +2676,7 @@ HttpSM::state_cache_open_read(int event, void *data) t_state.txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { SMDbg(dbg_ctl_http, "READ_RETRY configured, deferring CACHE_LOOKUP_COMPLETE hook"); + t_state.cache_lookup_complete_deferred = true; call_transact_and_set_next_state(nullptr); } else { setup_cache_lookup_complete_api(); diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 17339d22065..efd5e5f1fa2 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2604,6 +2604,9 @@ HttpTransact::HandleCacheOpenReadHitFreshness(State *s) SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); } + // Clear the deferred flag if set (for READ_RETRY cases that found stale content) + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HttpTransact::HandleCacheOpenReadHit); } @@ -3232,16 +3235,27 @@ HttpTransact::handle_cache_write_lock(State *s) if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { TxnDbg(dbg_ctl_http_trans, "READ_RETRY: destroying server_request before cache hit handling"); s->hdr_info.server_request.destroy(); - HandleCacheOpenReadHitFreshness(s); + + // For READ_RETRY with stale object, set up state and serve stale. + // The CACHE_LOOKUP_COMPLETE hook already fired with HIT_STALE during the initial + // cache hit (before revalidation was attempted), so we skip the hook here to + // avoid double-firing. Call what_is_document_freshness to set up + // serving_stale_due_to_write_lock, then go directly to HandleCacheOpenReadHit. + CacheHTTPInfo *obj = s->cache_info.object_read; + if (obj != nullptr) { + what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get()); + s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; + HandleCacheOpenReadHit(s); + } else { + HandleCacheOpenReadMiss(s); + } } else { - // For action 5/6, the CACHE_LOOKUP_COMPLETE hook was deferred in state_cache_open_read. - // Since we're not in READ_RETRY (write lock succeeded or failed without retry), fire - // the deferred hook now with MISS before going to origin. - if (s->txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || - s->txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { - TxnDbg(dbg_ctl_http_trans, "Action 5/6 configured, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); - s->cache_lookup_result = CacheLookupResult_t::MISS; + // If the CACHE_LOOKUP_COMPLETE hook was deferred, fire it now with MISS + // since we're not in READ_RETRY (write lock succeeded or failed without retry). + if (s->cache_lookup_complete_deferred) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS"); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, handle_cache_write_lock_go_to_origin); } handle_cache_write_lock_go_to_origin(s); @@ -3319,15 +3333,8 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->serving_stale_due_to_write_lock) { // We've looped back around due to failing to read during READ_RETRY mode. // Don't attempt another cache write - just proxy to origin without caching. - // Fire the deferred CACHE_LOOKUP_COMPLETE hook with MISS result so plugins - // see the final cache lookup status before we go to origin. - // NOTE: Only fire if serving_stale_due_to_write_lock is false. If true, we already - // found a stale object and the hook was fired from HandleCacheOpenReadHitFreshness. - // We're here because authentication (MUST_PROXY) requires going to origin anyway. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); - s->cache_info.action = CacheAction_t::NO_ACTION; - s->cache_lookup_result = CacheLookupResult_t::MISS; - TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); + s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { // READ_RETRY with serving_stale_due_to_write_lock = true means we found stale content // but can't serve it (e.g., MUST_PROXY auth). Hook already fired, just go to origin. @@ -3337,15 +3344,12 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) HttpTransact::set_cache_prepare_write_action_for_new_request(s); } - // If action 5/6 configured, we set NO_ACTION, and we're not in READ_RETRY, - // fire the deferred CACHE_LOOKUP_COMPLETE hook now with MISS. - // We won't enter READ_RETRY since we're not caching, so fire the hook now. - if (s->cache_info.action == CacheAction_t::NO_ACTION && s->cache_info.write_lock_state != CacheWriteLock_t::READ_RETRY && - (s->txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || - s->txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE))) { - TxnDbg(dbg_ctl_http_trans, "Action 5/6 NO_ACTION path, firing deferred CACHE_LOOKUP_COMPLETE with MISS"); - s->cache_lookup_result = CacheLookupResult_t::MISS; + // If the CACHE_LOOKUP_COMPLETE hook was deferred, fire it now with MISS. + // This covers all deferred cases: NO_ACTION, PREPARE_TO_WRITE, READ_RETRY without stale, etc. + if (s->cache_lookup_complete_deferred) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS"); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); } From 8550e920716a6ca021543e297cf2675de4f1b143 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Wed, 4 Feb 2026 12:57:30 -0700 Subject: [PATCH 06/16] More fixes --- include/proxy/http/HttpTransact.h | 1 + src/proxy/http/HttpTransact.cc | 91 +++++++++++++++++++++++-------- 2 files changed, 69 insertions(+), 23 deletions(-) diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 1e89d3379d8..229217eea5b 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -1012,6 +1012,7 @@ class HttpTransact static void build_response_from_cache(State *s, HTTPWarningCode warning_code); static void handle_cache_write_lock(State *s); static void handle_cache_write_lock_go_to_origin(State *s); + static void handle_cache_write_lock_go_to_origin_continue(State *s); static void HandleResponse(State *s); static void HandleUpdateCachedObject(State *s); static void HandleUpdateCachedObjectContinue(State *s); diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index efd5e5f1fa2..a58701e4e8b 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3236,36 +3236,48 @@ HttpTransact::handle_cache_write_lock(State *s) TxnDbg(dbg_ctl_http_trans, "READ_RETRY: destroying server_request before cache hit handling"); s->hdr_info.server_request.destroy(); - // For READ_RETRY with stale object, set up state and serve stale. - // The CACHE_LOOKUP_COMPLETE hook already fired with HIT_STALE during the initial - // cache hit (before revalidation was attempted), so we skip the hook here to - // avoid double-firing. Call what_is_document_freshness to set up - // serving_stale_due_to_write_lock, then go directly to HandleCacheOpenReadHit. + // For READ_RETRY with cached object, determine if we can serve stale. + // If the initial lookup was a MISS, cache_lookup_complete_deferred is true and we + // need to fire the hook here with the final result. If the initial lookup was a + // HIT_STALE (revalidation case), the hook already fired and deferred is false. CacheHTTPInfo *obj = s->cache_info.object_read; if (obj != nullptr) { + // Call what_is_document_freshness to check if stale can be served. + // For action 6 (READ_RETRY_STALE_ON_REVALIDATE), this sets serving_stale_due_to_write_lock = true. + // For action 5 (READ_RETRY), serving_stale_due_to_write_lock remains false because + // the bitwise check (action & STALE_ON_REVALIDATE) fails: 0x05 & 0x02 = 0. what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get()); - s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; - HandleCacheOpenReadHit(s); + + if (s->serving_stale_due_to_write_lock) { + // Action 6: Serve stale content + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: serving stale content (action 6)"); + s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; + + // Fire the deferred CACHE_LOOKUP_COMPLETE hook with HIT_STALE if needed + if (s->cache_lookup_complete_deferred) { + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); + } + HandleCacheOpenReadHit(s); + } else { + // Action 5: Cannot serve stale, go to origin without caching. + // Set NO_ACTION to prevent infinite revalidation loop. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: stale found but not serving (action 5), going to origin"); + s->cache_info.action = CacheAction_t::NO_ACTION; + handle_cache_write_lock_go_to_origin(s); + } } else { HandleCacheOpenReadMiss(s); } } else { - // If the CACHE_LOOKUP_COMPLETE hook was deferred, fire it now with MISS - // since we're not in READ_RETRY (write lock succeeded or failed without retry). - if (s->cache_lookup_complete_deferred) { - TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS"); - s->cache_lookup_complete_deferred = false; - s->cache_lookup_result = CacheLookupResult_t::MISS; - TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, handle_cache_write_lock_go_to_origin); - } handle_cache_write_lock_go_to_origin(s); } } /////////////////////////////////////////////////////////////////////////////// // Name : handle_cache_write_lock_go_to_origin -// Description: Continuation after CACHE_LOOKUP_COMPLETE hook for handle_cache_write_lock. -// Goes to origin server when write lock succeeded or failed without retry. +// Description: Helper to proceed to origin server from handle_cache_write_lock. +// Called when write lock succeeded or failed without READ_RETRY. // /////////////////////////////////////////////////////////////////////////////// void @@ -3273,6 +3285,27 @@ HttpTransact::handle_cache_write_lock_go_to_origin(State *s) { TxnDbg(dbg_ctl_http_trans, "handle_cache_write_lock_go_to_origin - proceeding to origin"); + // If the CACHE_LOOKUP_COMPLETE hook was deferred (for action 5/6), fire it now. + // We're going to origin without READ_RETRY, so MISS is the final result. + if (s->cache_lookup_complete_deferred) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS before origin"); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, handle_cache_write_lock_go_to_origin_continue); + } + + handle_cache_write_lock_go_to_origin_continue(s); +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : handle_cache_write_lock_go_to_origin_continue +// Description: Continuation of handle_cache_write_lock_go_to_origin after +// the deferred CACHE_LOOKUP_COMPLETE hook fires. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::handle_cache_write_lock_go_to_origin_continue(State *s) +{ StateMachineAction_t next; next = how_to_open_connection(s); if (next == StateMachineAction_t::ORIGIN_SERVER_OPEN || next == StateMachineAction_t::ORIGIN_SERVER_RAW_OPEN) { @@ -3344,13 +3377,25 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) HttpTransact::set_cache_prepare_write_action_for_new_request(s); } - // If the CACHE_LOOKUP_COMPLETE hook was deferred, fire it now with MISS. - // This covers all deferred cases: NO_ACTION, PREPARE_TO_WRITE, READ_RETRY without stale, etc. + // If the CACHE_LOOKUP_COMPLETE hook was deferred, decide whether to fire it now. + // Only fire when we're at a terminal state for cache lookup: + // - NO_ACTION: Not attempting write lock, MISS is the final result + // - READ_RETRY: Already retried and still missed, MISS is the final result + // + // Don't fire yet if action == PREPARE_TO_WRITE and write_lock_state != READ_RETRY, + // because we're about to attempt the write lock and may find content on retry. + // In that case, the hook will fire later: + // - If READ_RETRY finds HIT: HandleCacheOpenReadHitFreshness fires with HIT + // - If READ_RETRY finds MISS: We come back here with write_lock_state == READ_RETRY if (s->cache_lookup_complete_deferred) { - TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS"); - s->cache_lookup_complete_deferred = false; - s->cache_lookup_result = CacheLookupResult_t::MISS; - TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + if (s->cache_info.action == CacheAction_t::NO_ACTION || s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS (action=%d, write_lock_state=%d)", + static_cast(s->cache_info.action), static_cast(s->cache_info.write_lock_state)); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + } + TxnDbg(dbg_ctl_http_trans, "Deferring CACHE_LOOKUP_COMPLETE hook - waiting for write lock attempt"); } // Proceed to origin server (handles DNS lookup, parent proxy, etc.) From a225f1b42b1dda6860fafc14f3e4d74b3b2252c5 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Wed, 4 Feb 2026 17:22:48 -0700 Subject: [PATCH 07/16] more fixes --- src/proxy/http/HttpTransact.cc | 68 +++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index a58701e4e8b..f4be387fceb 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3236,35 +3236,55 @@ HttpTransact::handle_cache_write_lock(State *s) TxnDbg(dbg_ctl_http_trans, "READ_RETRY: destroying server_request before cache hit handling"); s->hdr_info.server_request.destroy(); - // For READ_RETRY with cached object, determine if we can serve stale. + // For READ_RETRY with cached object, evaluate actual freshness to decide the path. // If the initial lookup was a MISS, cache_lookup_complete_deferred is true and we // need to fire the hook here with the final result. If the initial lookup was a // HIT_STALE (revalidation case), the hook already fired and deferred is false. CacheHTTPInfo *obj = s->cache_info.object_read; if (obj != nullptr) { - // Call what_is_document_freshness to check if stale can be served. - // For action 6 (READ_RETRY_STALE_ON_REVALIDATE), this sets serving_stale_due_to_write_lock = true. - // For action 5 (READ_RETRY), serving_stale_due_to_write_lock remains false because - // the bitwise check (action & STALE_ON_REVALIDATE) fails: 0x05 & 0x02 = 0. - what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get()); - - if (s->serving_stale_due_to_write_lock) { - // Action 6: Serve stale content - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: serving stale content (action 6)"); - s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; - - // Fire the deferred CACHE_LOOKUP_COMPLETE hook with HIT_STALE if needed + // Evaluate actual document freshness - don't let STALE_ON_REVALIDATE short-circuit this + // by temporarily clearing the flag. We need to know if the object is truly fresh or stale. + MgmtByte saved_action = s->cache_open_write_fail_action; + s->cache_open_write_fail_action = static_cast(CacheOpenWriteFailAction_t::READ_RETRY); + Freshness_t freshness = what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get()); + s->cache_open_write_fail_action = saved_action; + + if (freshness == Freshness_t::FRESH || freshness == Freshness_t::WARNING) { + // Object is fresh - serve it from cache for both action 5 and 6. + // This is the main benefit of request collapsing: we found a valid cached object. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: found fresh object, serving from cache"); + s->cache_lookup_result = + (freshness == Freshness_t::FRESH) ? CacheLookupResult_t::HIT_FRESH : CacheLookupResult_t::HIT_WARNING; + if (s->cache_lookup_complete_deferred) { s->cache_lookup_complete_deferred = false; TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); } HandleCacheOpenReadHit(s); } else { - // Action 5: Cannot serve stale, go to origin without caching. - // Set NO_ACTION to prevent infinite revalidation loop. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: stale found but not serving (action 5), going to origin"); - s->cache_info.action = CacheAction_t::NO_ACTION; - handle_cache_write_lock_go_to_origin(s); + // Object is stale. Decision depends on action type. + bool can_serve_stale = + (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) && + is_stale_cache_response_returnable(s); + + if (can_serve_stale) { + // Action 6: Serve stale content without revalidation + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, serving stale (action 6)"); + s->serving_stale_due_to_write_lock = true; + s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; + + if (s->cache_lookup_complete_deferred) { + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); + } + HandleCacheOpenReadHit(s); + } else { + // Action 5: Object is stale, go to origin for revalidation. + // Set NO_ACTION to prevent cache write attempts. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, going to origin (action 5)"); + s->cache_info.action = CacheAction_t::NO_ACTION; + handle_cache_write_lock_go_to_origin(s); + } } } else { HandleCacheOpenReadMiss(s); @@ -3363,15 +3383,11 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response s->cache_info.action = CacheAction_t::NO_ACTION; - } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->serving_stale_due_to_write_lock) { - // We've looped back around due to failing to read during READ_RETRY mode. - // Don't attempt another cache write - just proxy to origin without caching. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); - s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - // READ_RETRY with serving_stale_due_to_write_lock = true means we found stale content - // but can't serve it (e.g., MUST_PROXY auth). Hook already fired, just go to origin. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY stale found but requires origin, bypassing cache"); + // READ_RETRY cache read failed (no object found). If there was a stale object, + // handle_cache_write_lock would have called HandleCacheOpenReadHit, not us. + // Don't attempt cache write - just proxy to origin without caching. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); s->cache_info.action = CacheAction_t::NO_ACTION; } else { HttpTransact::set_cache_prepare_write_action_for_new_request(s); From 4ebb070f265db5584d4e6f22b72555c0e44a783a Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Thu, 5 Feb 2026 10:27:08 -0700 Subject: [PATCH 08/16] Address PR comments --- src/proxy/http/HttpTransact.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index f4be387fceb..8cff1f7ca7e 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2962,6 +2962,7 @@ HttpTransact::HandleCacheOpenReadHit(State *s) // This ensures correct statistics attribution (cache_hit_stale_served instead of cache_hit_fresh). if (s->serving_stale_due_to_write_lock) { TxnDbg(dbg_ctl_http_trans, "Serving stale due to write lock failure, adjusting VIA for statistics"); + SET_VIA_STRING(VIA_DETAIL_CACHE_LOOKUP, VIA_DETAIL_MISS_EXPIRED); SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); SET_VIA_STRING(VIA_SERVER_RESULT, VIA_SERVER_ERROR); } @@ -3242,6 +3243,13 @@ HttpTransact::handle_cache_write_lock(State *s) // HIT_STALE (revalidation case), the hook already fired and deferred is false. CacheHTTPInfo *obj = s->cache_info.object_read; if (obj != nullptr) { + // Restore request/response times from cached object for freshness calculations and Age header. + // Similar to HandleCacheOpenReadHitFreshness, handle clock skew by capping times. + s->request_sent_time = obj->request_sent_time_get(); + s->response_received_time = obj->response_received_time_get(); + s->request_sent_time = std::min(s->client_request_time, s->request_sent_time); + s->response_received_time = std::min(s->client_request_time, s->response_received_time); + // Evaluate actual document freshness - don't let STALE_ON_REVALIDATE short-circuit this // by temporarily clearing the flag. We need to know if the object is truly fresh or stale. MgmtByte saved_action = s->cache_open_write_fail_action; @@ -3401,7 +3409,7 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) // Don't fire yet if action == PREPARE_TO_WRITE and write_lock_state != READ_RETRY, // because we're about to attempt the write lock and may find content on retry. // In that case, the hook will fire later: - // - If READ_RETRY finds HIT: HandleCacheOpenReadHitFreshness fires with HIT + // - If READ_RETRY finds HIT: handle_cache_write_lock fires CACHE_LOOKUP_COMPLETE with HIT // - If READ_RETRY finds MISS: We come back here with write_lock_state == READ_RETRY if (s->cache_lookup_complete_deferred) { if (s->cache_info.action == CacheAction_t::NO_ACTION || s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { From 4df65d7f1a29541a27b3d8919e5def458acc83bd Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Thu, 5 Feb 2026 14:55:01 -0700 Subject: [PATCH 09/16] more review fixes --- include/proxy/http/HttpTransact.h | 3 ++- src/proxy/http/HttpTransact.cc | 32 ++++++++++++++++--------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 229217eea5b..75c5fdb4cfb 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -1105,7 +1105,8 @@ class HttpTransact static void handle_response_keep_alive_headers(State *s, HTTPVersion ver, HTTPHdr *heads); static int get_max_age(HTTPHdr *response); static int calculate_document_freshness_limit(State *s, HTTPHdr *response, time_t response_date, bool *heuristic); - static Freshness_t what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response); + static Freshness_t what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response, + bool evaluate_actual_freshness = false); static Authentication_t AuthenticationNeeded(const OverridableHttpConfigParams *p, HTTPHdr *client_request, HTTPHdr *obj_response); static void handle_parent_down(State *s); diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 8cff1f7ca7e..f62d0c22daa 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2960,11 +2960,11 @@ HttpTransact::HandleCacheOpenReadHit(State *s) // If serving stale due to write lock failure (actions 2, 3, or 6), adjust VIA to reflect stale serving. // This ensures correct statistics attribution (cache_hit_stale_served instead of cache_hit_fresh). + // Note: VIA_SERVER_RESULT is left as default (space) since the origin server was never contacted. if (s->serving_stale_due_to_write_lock) { TxnDbg(dbg_ctl_http_trans, "Serving stale due to write lock failure, adjusting VIA for statistics"); SET_VIA_STRING(VIA_DETAIL_CACHE_LOOKUP, VIA_DETAIL_MISS_EXPIRED); SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); - SET_VIA_STRING(VIA_SERVER_RESULT, VIA_SERVER_ERROR); } if (s->cache_lookup_result == CacheLookupResult_t::HIT_WARNING) { @@ -3234,9 +3234,6 @@ HttpTransact::handle_cache_write_lock(State *s) } if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: destroying server_request before cache hit handling"); - s->hdr_info.server_request.destroy(); - // For READ_RETRY with cached object, evaluate actual freshness to decide the path. // If the initial lookup was a MISS, cache_lookup_complete_deferred is true and we // need to fire the hook here with the final result. If the initial lookup was a @@ -3250,17 +3247,16 @@ HttpTransact::handle_cache_write_lock(State *s) s->request_sent_time = std::min(s->client_request_time, s->request_sent_time); s->response_received_time = std::min(s->client_request_time, s->response_received_time); - // Evaluate actual document freshness - don't let STALE_ON_REVALIDATE short-circuit this - // by temporarily clearing the flag. We need to know if the object is truly fresh or stale. - MgmtByte saved_action = s->cache_open_write_fail_action; - s->cache_open_write_fail_action = static_cast(CacheOpenWriteFailAction_t::READ_RETRY); - Freshness_t freshness = what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get()); - s->cache_open_write_fail_action = saved_action; + // Evaluate actual document freshness. Pass true to skip the STALE_ON_REVALIDATE + // short-circuit so we get the real freshness, not the "return FRESH to bypass revalidation" result. + Freshness_t freshness = what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get(), true); if (freshness == Freshness_t::FRESH || freshness == Freshness_t::WARNING) { // Object is fresh - serve it from cache for both action 5 and 6. // This is the main benefit of request collapsing: we found a valid cached object. + // Destroy server_request since we're serving from cache. TxnDbg(dbg_ctl_http_trans, "READ_RETRY: found fresh object, serving from cache"); + s->hdr_info.server_request.destroy(); s->cache_lookup_result = (freshness == Freshness_t::FRESH) ? CacheLookupResult_t::HIT_FRESH : CacheLookupResult_t::HIT_WARNING; @@ -3276,8 +3272,10 @@ HttpTransact::handle_cache_write_lock(State *s) is_stale_cache_response_returnable(s); if (can_serve_stale) { - // Action 6: Serve stale content without revalidation + // Action 6: Serve stale content without revalidation. + // Destroy server_request since we're serving from cache. TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, serving stale (action 6)"); + s->hdr_info.server_request.destroy(); s->serving_stale_due_to_write_lock = true; s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; @@ -3288,8 +3286,9 @@ HttpTransact::handle_cache_write_lock(State *s) HandleCacheOpenReadHit(s); } else { // Action 5: Object is stale, go to origin for revalidation. - // Set NO_ACTION to prevent cache write attempts. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, going to origin (action 5)"); + // Keep server_request intact - it has conditional headers (If-Modified-Since, If-None-Match). + // Set NO_ACTION since we can't write to cache (no write lock). + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, going to origin with conditional request (action 5)"); s->cache_info.action = CacheAction_t::NO_ACTION; handle_cache_write_lock_go_to_origin(s); } @@ -7440,7 +7439,8 @@ HttpTransact::calculate_document_freshness_limit(State *s, HTTPHdr *response, ti // ////////////////////////////////////////////////////////////////////////////// HttpTransact::Freshness_t -HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response) +HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response, + bool evaluate_actual_freshness) { bool heuristic, do_revalidate = false; int age_limit; @@ -7456,7 +7456,9 @@ HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTP // but for write lock failure scenarios we want to serve the stale content directly without // revalidation. The serving_stale_due_to_write_lock flag tracks that we're actually serving // stale content, so VIA strings and statistics can be correctly attributed. - if (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) { + // When evaluate_actual_freshness is true, skip this short-circuit to get the real freshness. + if (!evaluate_actual_freshness && + (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE))) { if (is_stale_cache_response_returnable(s)) { TxnDbg(dbg_ctl_http_match, "cache_serve_stale_on_write_lock_fail, return FRESH to bypass revalidation"); s->serving_stale_due_to_write_lock = true; From d747e4ee12acc972b81f9d60355f2083777a32d0 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Thu, 5 Feb 2026 17:53:13 -0700 Subject: [PATCH 10/16] Fix action 6, previously it was getting a write fail, and then just immediately falling back to serving stale. Now it stores of a copy of the object ptr that was found stale so it can follow the same retry path as 5 which if that fails then it will serve the stale object --- include/proxy/http/HttpTransact.h | 1 + src/proxy/http/HttpTransact.cc | 54 +++--- .../cache/cache-write-lock-contention.test.py | 175 ++++++++++++++++++ 3 files changed, 202 insertions(+), 28 deletions(-) create mode 100644 tests/gold_tests/cache/cache-write-lock-contention.test.py diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 75c5fdb4cfb..0e05c1e14b8 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -487,6 +487,7 @@ class HttpTransact HTTPInfo transform_store; CacheDirectives directives; HTTPInfo *object_read = nullptr; + HTTPInfo *stale_fallback = nullptr; // Saved stale object for action 6 fallback during retry CacheWriteLock_t write_lock_state = CacheWriteLock_t::INIT; int lookup_count = 0; SquidHitMissCode hit_miss_code = SQUID_MISS_NONE; diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index f62d0c22daa..999f8b8baea 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3266,32 +3266,16 @@ HttpTransact::handle_cache_write_lock(State *s) } HandleCacheOpenReadHit(s); } else { - // Object is stale. Decision depends on action type. - bool can_serve_stale = - (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) && - is_stale_cache_response_returnable(s); - - if (can_serve_stale) { - // Action 6: Serve stale content without revalidation. - // Destroy server_request since we're serving from cache. - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, serving stale (action 6)"); - s->hdr_info.server_request.destroy(); - s->serving_stale_due_to_write_lock = true; - s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; - - if (s->cache_lookup_complete_deferred) { - s->cache_lookup_complete_deferred = false; - TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); - } - HandleCacheOpenReadHit(s); - } else { - // Action 5: Object is stale, go to origin for revalidation. - // Keep server_request intact - it has conditional headers (If-Modified-Since, If-None-Match). - // Set NO_ACTION since we can't write to cache (no write lock). - TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, going to origin with conditional request (action 5)"); - s->cache_info.action = CacheAction_t::NO_ACTION; - handle_cache_write_lock_go_to_origin(s); + // Object is stale. Save it as potential fallback, then trigger actual cache retry. + // HandleCacheOpenReadMiss will serve stale fallback (action 6) or go to origin (action 5). + if (is_stale_cache_response_returnable(s)) { + s->cache_info.stale_fallback = s->cache_info.object_read; } + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, triggering actual cache retry"); + s->cache_info.object_read = nullptr; + s->cache_info.write_status = CacheWriteStatus_t::LOCK_MISS; + s->hdr_info.server_request.destroy(); + TRANSACT_RETURN(StateMachineAction_t::CACHE_LOOKUP, nullptr); } } else { HandleCacheOpenReadMiss(s); @@ -3391,9 +3375,23 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - // READ_RETRY cache read failed (no object found). If there was a stale object, - // handle_cache_write_lock would have called HandleCacheOpenReadHit, not us. - // Don't attempt cache write - just proxy to origin without caching. + // READ_RETRY cache read failed (no fresh object found). + // Check if we have a stale fallback (saved from action 6 revalidation case). + bool is_action_6 = + (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) != 0; + + if (is_action_6 && s->cache_info.stale_fallback != nullptr) { + // Action 6: Serve stale fallback after retries exhausted. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: retries exhausted, serving stale (action 6)"); + s->cache_info.object_read = s->cache_info.stale_fallback; + s->cache_info.stale_fallback = nullptr; + s->serving_stale_due_to_write_lock = true; + s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; + s->cache_lookup_complete_deferred = false; + HandleCacheOpenReadHit(s); + return; + } + // Action 5 or no stale fallback: proceed to origin without caching. TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); s->cache_info.action = CacheAction_t::NO_ACTION; } else { diff --git a/tests/gold_tests/cache/cache-write-lock-contention.test.py b/tests/gold_tests/cache/cache-write-lock-contention.test.py new file mode 100644 index 00000000000..87ed723ecb0 --- /dev/null +++ b/tests/gold_tests/cache/cache-write-lock-contention.test.py @@ -0,0 +1,175 @@ +''' +Test cache write lock contention behavior with parallel requests for actions 5 and 6. + +This test issues multiple parallel curl requests to trigger actual cache write lock +contention. It is skipped by default because the behavior is timing-sensitive and +not suitable for CI environments. + +To run this test manually: + export RUN_CACHE_CONTENTION_TEST=1 + cd build/tests + ./autest.sh --sandbox /tmp/sbcursor --clean=none -f cache-write-lock-contention +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +Test.Summary = ''' +Test cache write lock contention with parallel requests (actions 5 and 6). + +This test is SKIPPED by default. Set RUN_CACHE_CONTENTION_TEST=1 to run it. +''' + +Test.ContinueOnFail = True + +Test.SkipUnless( + Condition( + lambda: os.environ.get('RUN_CACHE_CONTENTION_TEST', '').lower() in ('1', 'true', 'yes'), + "Set RUN_CACHE_CONTENTION_TEST=1 to run this timing-sensitive test", True)) + + +def make_parallel_curl(url, count, stagger_delay=None): + """ + Generate a parallel curl command string. + + Args: + url: The URL to request + count: Number of parallel requests + stagger_delay: If set, add this delay between requests (e.g., "0.1") + If None, all requests fire simultaneously + """ + parts = ['('] + for i in range(1, count + 1): + if i > 1 and stagger_delay: + parts.append(f'sleep {stagger_delay} && ') + parts.append( + f'{{curl}} -s -o /dev/null -w "req{i}: %{{{{http_code}}}}\\n" "{url}" ' + f'-H "Host: example.com" -H "X-Request: req{i}" & ') + parts.append('wait)') + return ''.join(parts) + + +class ContentionTest: + """ + Base class for cache write lock contention tests. + + Tests parallel requests to trigger cache write lock contention and verify + that the configured fail_action behaves correctly. + """ + + def __init__(self, name, action, description, stale_scenario=False): + self.name = name + self.action = action + self.description = description + self.stale_scenario = stale_scenario + self._setup() + + def _setup(self): + origin_delay = 5 if self.stale_scenario else 3 + self.server = Test.MakeOriginServer(f"server_{self.name}", delay=origin_delay) + + max_age = 1 if self.stale_scenario else 300 + self.server.addResponse( + "sessionlog.json", { + "headers": f"GET /test-{self.name} HTTP/1.1\r\nHost: example.com\r\n\r\n", + "timestamp": "1469733493.993", + "body": "" + }, { + "headers": + f"HTTP/1.1 200 OK\r\n" + f"Content-Length: 100\r\n" + f"Cache-Control: max-age={max_age}\r\n" + f"X-Origin: {self.name}\r\n" + f"Connection: close\r\n\r\n", + "timestamp": "1469733493.993", + "body": "X" * 100 + }) + + self.ts = Test.MakeATSProcess(f"ts_{self.name}", enable_cache=True) + + read_retries = 2 if self.stale_scenario else 10 + retry_time = 100 if self.stale_scenario else 500 + + self.ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', + 'proxy.config.http.cache.open_write_fail_action': self.action, + 'proxy.config.http.cache.max_open_write_retries': 1, + 'proxy.config.http.cache.max_open_write_retry_timeout': 0, + 'proxy.config.http.cache.max_open_read_retries': read_retries, + 'proxy.config.http.cache.open_read_retry_time': retry_time, + 'proxy.config.cache.enable_read_while_writer': 1, + 'proxy.config.http.cache.max_stale_age': 300, + }) + self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') + + def run(self): + url_path = f"/test-{self.name}" + url = f"http://127.0.0.1:{self.ts.Variables.port}{url_path}" + + if self.stale_scenario: + tr = Test.AddTestRun(f"{self.description} - Prime cache") + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts) + tr.MakeCurlCommand(f'-s -D - "{url}" -H "Host: example.com"', ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(f"{self.description} - Wait for stale") + tr.Processes.Default.Command = "sleep 3" + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(self.description) + if not self.stale_scenario: + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts) + + count = 8 if self.stale_scenario else 5 + stagger = None if self.stale_scenario else "0.1" + tr.MakeCurlCommandMulti(make_parallel_curl(url, count, stagger), ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(f"{self.description} - Verify cache") + tr.MakeCurlCommand(f'-s -D - "{url}" -H "Host: example.com"', ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression("200 OK", "Cache serves 200") + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + self.ts.Disk.traffic_out.Content = Testers.ExcludesExpression("FATAL|ink_release_assert|ink_abort", "No crashes") + + if self.stale_scenario and self.action == 6: + self.ts.Disk.traffic_out.Content += Testers.ContainsExpression( + "serving stale \\(action 6\\)|object stale, serving stale", "Stale fallback triggered") + else: + self.ts.Disk.traffic_out.Content += Testers.ContainsExpression( + "cache open read failure.*retrying|read while write", "Cache contention occurred") + + +# Action 5: READ_RETRY - retries cache reads, goes to origin if exhausted +ContentionTest("a5_fresh", 5, "Action 5 - fresh contention").run() +ContentionTest("a5_stale", 5, "Action 5 - stale revalidation", stale_scenario=True).run() + +# Action 6: READ_RETRY_STALE_ON_REVALIDATE - like action 5, but serves stale if available +ContentionTest("a6_fresh", 6, "Action 6 - fresh contention").run() +ContentionTest("a6_stale", 6, "Action 6 - stale fallback", stale_scenario=True).run() From e907664264180947300768f1315b9011169389bf Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 6 Feb 2026 13:32:43 -0700 Subject: [PATCH 11/16] Revise documentation for cache write lock contention test moving instructions below license --- .../cache/cache-write-lock-contention.test.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/gold_tests/cache/cache-write-lock-contention.test.py b/tests/gold_tests/cache/cache-write-lock-contention.test.py index 87ed723ecb0..c6998cfb7f1 100644 --- a/tests/gold_tests/cache/cache-write-lock-contention.test.py +++ b/tests/gold_tests/cache/cache-write-lock-contention.test.py @@ -1,15 +1,3 @@ -''' -Test cache write lock contention behavior with parallel requests for actions 5 and 6. - -This test issues multiple parallel curl requests to trigger actual cache write lock -contention. It is skipped by default because the behavior is timing-sensitive and -not suitable for CI environments. - -To run this test manually: - export RUN_CACHE_CONTENTION_TEST=1 - cd build/tests - ./autest.sh --sandbox /tmp/sbcursor --clean=none -f cache-write-lock-contention -''' # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -26,6 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +''' +Test cache write lock contention behavior with parallel requests for actions 5 and 6. + +This test issues multiple parallel curl requests to trigger actual cache write lock +contention. It is skipped by default because the behavior is timing-sensitive and +not suitable for CI environments. + +To run this test manually: + export RUN_CACHE_CONTENTION_TEST=1 + cd build/tests + ./autest.sh --sandbox /tmp/sbcursor --clean=none -f cache-write-lock-contention +''' + import os Test.Summary = ''' From b372261a8d063a7105599acf8049ff71e5f45885 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 6 Feb 2026 13:34:42 -0700 Subject: [PATCH 12/16] Clean up comments in cache-read-retry-stale test Removed unnecessary comment about test case. --- tests/gold_tests/cache/cache-read-retry-stale.test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/gold_tests/cache/cache-read-retry-stale.test.py b/tests/gold_tests/cache/cache-read-retry-stale.test.py index 1407fee532d..8ec31e2aa0c 100644 --- a/tests/gold_tests/cache/cache-read-retry-stale.test.py +++ b/tests/gold_tests/cache/cache-read-retry-stale.test.py @@ -1,6 +1,3 @@ -''' -Test cache_open_write_fail_action = 6 (READ_RETRY with stale fallback) -''' # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information From 37980431f09fe5aaacfaf66ff85c944a99250489 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 6 Feb 2026 13:44:36 -0700 Subject: [PATCH 13/16] formatting --- tests/gold_tests/cache/cache-write-lock-contention.test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/gold_tests/cache/cache-write-lock-contention.test.py b/tests/gold_tests/cache/cache-write-lock-contention.test.py index c6998cfb7f1..7e8ebc9ad1c 100644 --- a/tests/gold_tests/cache/cache-write-lock-contention.test.py +++ b/tests/gold_tests/cache/cache-write-lock-contention.test.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - ''' Test cache write lock contention behavior with parallel requests for actions 5 and 6. From 992ffa12b873c932ef12217aa3d5cc9da470ac6f Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Sun, 8 Feb 2026 20:00:08 -0700 Subject: [PATCH 14/16] One more fix for plugin hooks --- src/proxy/http/HttpTransact.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 999f8b8baea..b6de1b77378 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3387,7 +3387,11 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) s->cache_info.stale_fallback = nullptr; s->serving_stale_due_to_write_lock = true; s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; - s->cache_lookup_complete_deferred = false; + + if (s->cache_lookup_complete_deferred) { + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); + } HandleCacheOpenReadHit(s); return; } From babdd353c5164f50c42c203a75c5a3e0b12e38c4 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Sun, 8 Feb 2026 20:19:31 -0700 Subject: [PATCH 15/16] fix issue where stale was not being reset, so when a fresh object is found it may be reported stale --- src/proxy/http/HttpTransact.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index b6de1b77378..f889600af74 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3254,6 +3254,9 @@ HttpTransact::handle_cache_write_lock(State *s) if (freshness == Freshness_t::FRESH || freshness == Freshness_t::WARNING) { // Object is fresh - serve it from cache for both action 5 and 6. // This is the main benefit of request collapsing: we found a valid cached object. + // Clear stale-serving flag in case it was set during initial stale short-circuit. + s->serving_stale_due_to_write_lock = false; + // Destroy server_request since we're serving from cache. TxnDbg(dbg_ctl_http_trans, "READ_RETRY: found fresh object, serving from cache"); s->hdr_info.server_request.destroy(); From 9ae358fdd4fa9e0b18e72bd1ba596f83456332aa Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Sun, 8 Feb 2026 20:28:01 -0700 Subject: [PATCH 16/16] cleanup of stale_fallback --- src/proxy/http/HttpTransact.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index f889600af74..d35f0401b09 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3254,8 +3254,9 @@ HttpTransact::handle_cache_write_lock(State *s) if (freshness == Freshness_t::FRESH || freshness == Freshness_t::WARNING) { // Object is fresh - serve it from cache for both action 5 and 6. // This is the main benefit of request collapsing: we found a valid cached object. - // Clear stale-serving flag in case it was set during initial stale short-circuit. + // Clear stale-related state in case it was set during initial stale short-circuit. s->serving_stale_due_to_write_lock = false; + s->cache_info.stale_fallback = nullptr; // Destroy server_request since we're serving from cache. TxnDbg(dbg_ctl_http_trans, "READ_RETRY: found fresh object, serving from cache"); @@ -3400,7 +3401,8 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) } // Action 5 or no stale fallback: proceed to origin without caching. TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); - s->cache_info.action = CacheAction_t::NO_ACTION; + s->cache_info.stale_fallback = nullptr; // Clear unused fallback + s->cache_info.action = CacheAction_t::NO_ACTION; } else { HttpTransact::set_cache_prepare_write_action_for_new_request(s); }