diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index e7057658d30..17ce3cb9c47 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -2892,7 +2892,7 @@ Dynamic Content & Content Negotiation The number of times to attempt a cache open write upon failure to get a write lock. This config is ignored when :ts:cv:`proxy.config.http.cache.open_write_fail_action` is - set to ``5`` or :ts:cv:`proxy.config.http.cache.max_open_write_retry_timeout` is set to gt ``0``. + set to ``5`` or ``6``, or when :ts:cv:`proxy.config.http.cache.max_open_write_retry_timeout` is set to gt ``0``. .. ts:cv:: CONFIG proxy.config.http.cache.max_open_write_retry_timeout INT 0 :reloadable: @@ -2901,7 +2901,7 @@ Dynamic Content & Content Negotiation A timeout for attempting a cache open write upon failure to get a write lock. This config is ignored when :ts:cv:`proxy.config.http.cache.open_write_fail_action` is - set to ``5``. + set to ``5`` or ``6``. .. ts:cv:: CONFIG proxy.config.http.cache.open_write_fail_action INT 0 :reloadable: @@ -2929,8 +2929,14 @@ Dynamic Content & Content Negotiation with :ts:cv:`proxy.config.cache.enable_read_while_writer` configuration allows to collapse concurrent requests without a need for any plugin. Make sure to configure the :ref:`admin-config-read-while-writer` feature - correctly. Note that this option may result in CACHE_LOOKUP_COMPLETE HOOK - being called back more than once. + correctly. With this option, CACHE_LOOKUP_COMPLETE HOOK is deferred for + read retries so that plugins see only the final cache lookup result. + ``6`` Retry Cache Read on a Cache Write Lock failure (same as ``5``), but if + read retries are exhausted and a stale cached object exists, serve the + stale content if allowed. This combines the request collapsing behavior + of ``5`` with the stale-serving fallback of ``2``. If stale is not + returnable (e.g., due to ``Cache-Control: must-revalidate``), go to + origin server. ===== ====================================================================== Customizable User Response Pages diff --git a/include/proxy/http/HttpConfig.h b/include/proxy/http/HttpConfig.h index 85cb7ea433c..234a930e883 100644 --- a/include/proxy/http/HttpConfig.h +++ b/include/proxy/http/HttpConfig.h @@ -371,6 +371,7 @@ enum class CacheOpenWriteFailAction_t { ERROR_ON_MISS_STALE_ON_REVALIDATE = 0x03, ERROR_ON_MISS_OR_REVALIDATE = 0x04, READ_RETRY = 0x05, + READ_RETRY_STALE_ON_REVALIDATE = 0x06, TOTAL_TYPES }; diff --git a/include/proxy/http/HttpTransact.h b/include/proxy/http/HttpTransact.h index 856fbe991d0..0e05c1e14b8 100644 --- a/include/proxy/http/HttpTransact.h +++ b/include/proxy/http/HttpTransact.h @@ -487,6 +487,7 @@ class HttpTransact HTTPInfo transform_store; CacheDirectives directives; HTTPInfo *object_read = nullptr; + HTTPInfo *stale_fallback = nullptr; // Saved stale object for action 6 fallback during retry CacheWriteLock_t write_lock_state = CacheWriteLock_t::INIT; int lookup_count = 0; SquidHitMissCode hit_miss_code = SQUID_MISS_NONE; @@ -703,6 +704,15 @@ class HttpTransact /// configuration. bool is_cacheable_due_to_negative_caching_configuration = false; + /// Set when stale content is served due to cache write lock failure. + /// Used to correctly attribute statistics and VIA strings. + bool serving_stale_due_to_write_lock = false; + + /// Set when CACHE_LOOKUP_COMPLETE hook is deferred for action 5/6. + /// The hook will fire later with the final result once we know if + /// stale content will be served or if we're going to origin. + bool cache_lookup_complete_deferred = false; + MgmtByte cache_open_write_fail_action = 0; HttpConfigParams *http_config_param = nullptr; @@ -998,9 +1008,12 @@ class HttpTransact static void HandleCacheOpenReadHitFreshness(State *s); static void HandleCacheOpenReadHit(State *s); static void HandleCacheOpenReadMiss(State *s); + static void HandleCacheOpenReadMissGoToOrigin(State *s); static void set_cache_prepare_write_action_for_new_request(State *s); static void build_response_from_cache(State *s, HTTPWarningCode warning_code); static void handle_cache_write_lock(State *s); + static void handle_cache_write_lock_go_to_origin(State *s); + static void handle_cache_write_lock_go_to_origin_continue(State *s); static void HandleResponse(State *s); static void HandleUpdateCachedObject(State *s); static void HandleUpdateCachedObjectContinue(State *s); @@ -1093,7 +1106,8 @@ class HttpTransact static void handle_response_keep_alive_headers(State *s, HTTPVersion ver, HTTPHdr *heads); static int get_max_age(HTTPHdr *response); static int calculate_document_freshness_limit(State *s, HTTPHdr *response, time_t response_date, bool *heuristic); - static Freshness_t what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response); + static Freshness_t what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response, + bool evaluate_actual_freshness = false); static Authentication_t AuthenticationNeeded(const OverridableHttpConfigParams *p, HTTPHdr *client_request, HTTPHdr *obj_response); static void handle_parent_down(State *s); diff --git a/src/proxy/http/HttpCacheSM.cc b/src/proxy/http/HttpCacheSM.cc index 075e81694fc..7893fda4b1a 100644 --- a/src/proxy/http/HttpCacheSM.cc +++ b/src/proxy/http/HttpCacheSM.cc @@ -43,6 +43,14 @@ namespace { DbgCtl dbg_ctl_http_cache{"http_cache"}; + +// Helper to check if cache_open_write_fail_action has READ_RETRY behavior +inline bool +is_read_retry_action(MgmtByte action) +{ + return action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE); +} } // end anonymous namespace //// @@ -215,12 +223,11 @@ HttpCacheSM::state_cache_open_write(int event, void *data) break; case CACHE_EVENT_OPEN_WRITE_FAILED: { - if (master_sm->t_state.txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (is_read_retry_action(master_sm->t_state.txn_conf->cache_open_write_fail_action)) { // fall back to open_read_tries - // Note that when CacheOpenWriteFailAction_t::READ_RETRY is configured, max_cache_open_write_retries + // Note that when READ_RETRY actions are configured, max_cache_open_write_retries // is automatically ignored. Make sure to not disable max_cache_open_read_retries - // with CacheOpenWriteFailAction_t::READ_RETRY as this results in proxy'ing to origin + // with READ_RETRY actions as this results in proxy'ing to origin // without write retries in both a cache miss or a cache refresh scenario. if (write_retry_done()) { @@ -264,8 +271,7 @@ HttpCacheSM::state_cache_open_write(int event, void *data) _read_retry_event = nullptr; } - if (master_sm->t_state.txn_conf->cache_open_write_fail_action == - static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (is_read_retry_action(master_sm->t_state.txn_conf->cache_open_write_fail_action)) { Dbg(dbg_ctl_http_cache, "[%" PRId64 "] [state_cache_open_write] cache open write failure %d. " "falling back to read retry...", diff --git a/src/proxy/http/HttpConfig.cc b/src/proxy/http/HttpConfig.cc index 3b7352b1fc6..19e5b1d4d3d 100644 --- a/src/proxy/http/HttpConfig.cc +++ b/src/proxy/http/HttpConfig.cc @@ -1384,7 +1384,9 @@ HttpConfig::reconfigure() params->disallow_post_100_continue = INT_TO_BOOL(m_master.disallow_post_100_continue); params->oride.cache_open_write_fail_action = m_master.oride.cache_open_write_fail_action; - if (params->oride.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)) { + if (params->oride.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + params->oride.cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { if (params->oride.max_cache_open_read_retries <= 0 || params->oride.max_cache_open_write_retries <= 0) { Warning("Invalid config, cache_open_write_fail_action (%d), max_cache_open_read_retries (%" PRIu64 "), " "max_cache_open_write_retries (%" PRIu64 ")", diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc index d9cfb662923..8c339a35c91 100644 --- a/src/proxy/http/HttpSM.cc +++ b/src/proxy/http/HttpSM.cc @@ -2537,9 +2537,11 @@ HttpSM::state_cache_open_write(int event, void *data) case CACHE_EVENT_OPEN_READ: if (!t_state.cache_info.object_read) { t_state.cache_open_write_fail_action = t_state.txn_conf->cache_open_write_fail_action; - // Note that CACHE_LOOKUP_COMPLETE may be invoked more than once - // if CacheOpenWriteFailAction_t::READ_RETRY is configured - ink_assert(t_state.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)); + // READ_RETRY mode: write lock failed, no stale object available. + // CACHE_LOOKUP_COMPLETE will fire from HandleCacheOpenReadMiss with MISS result. + ink_assert(t_state.cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + t_state.cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)); t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; break; @@ -2558,8 +2560,9 @@ HttpSM::state_cache_open_write(int event, void *data) t_state.source = HttpTransact::Source_t::CACHE; // clear up CacheLookupResult_t::MISS, let Freshness function decide // hit status - t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; - t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; + t_state.cache_open_write_fail_action = t_state.txn_conf->cache_open_write_fail_action; + t_state.cache_lookup_result = HttpTransact::CacheLookupResult_t::NONE; + t_state.cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::READ_RETRY; break; case HTTP_TUNNEL_EVENT_DONE: @@ -2663,7 +2666,21 @@ HttpSM::state_cache_open_read(int event, void *data) ink_assert(t_state.transact_return_point == nullptr); t_state.transact_return_point = HttpTransact::HandleCacheOpenRead; - setup_cache_lookup_complete_api(); + + // For READ_RETRY actions (5 and 6), skip the CACHE_LOOKUP_COMPLETE hook now. + // The hook will fire later with the final result: HIT if stale content is found + // during retry, or MISS if nothing is found (from HandleCacheOpenReadMiss). + // This ensures plugins see only the final cache lookup result, avoiding issues + // like stats double-counting and duplicate hook registrations. + if (t_state.txn_conf->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + t_state.txn_conf->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)) { + SMDbg(dbg_ctl_http, "READ_RETRY configured, deferring CACHE_LOOKUP_COMPLETE hook"); + t_state.cache_lookup_complete_deferred = true; + call_transact_and_set_next_state(nullptr); + } else { + setup_cache_lookup_complete_api(); + } break; default: diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 0f5e689222d..d35f0401b09 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -2591,12 +2591,22 @@ HttpTransact::HandleCacheOpenReadHitFreshness(State *s) } } + // If we're serving stale due to write lock failure, override cache_lookup_result to HIT_STALE + // so plugins see the accurate status. what_is_document_freshness returns FRESH to bypass + // revalidation, but the content is actually stale. + if (s->serving_stale_due_to_write_lock) { + s->cache_lookup_result = HttpTransact::CacheLookupResult_t::HIT_STALE; + } + ink_assert(s->cache_lookup_result != HttpTransact::CacheLookupResult_t::MISS); if (s->cache_lookup_result == HttpTransact::CacheLookupResult_t::HIT_STALE) { SET_VIA_STRING(VIA_DETAIL_CACHE_LOOKUP, VIA_DETAIL_MISS_EXPIRED); SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); } + // Clear the deferred flag if set (for READ_RETRY cases that found stale content) + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HttpTransact::HandleCacheOpenReadHit); } @@ -2787,7 +2797,9 @@ HttpTransact::HandleCacheOpenReadHit(State *s) // proxy.config.http.cache.ignore_server_no_cache is set to 0 (i.e don't ignore no cache -- the default setting) // // But, we only do this if we're not in an API updating the cached object (see TSHttpTxnUpdateCachedObject) - if ((((s->cache_lookup_result == CacheLookupResult_t::HIT_STALE) || + // Also skip revalidation if we're serving stale due to write lock failure - we've already decided + // to serve the stale content to avoid contention, so don't try to revalidate. + if ((((s->cache_lookup_result == CacheLookupResult_t::HIT_STALE && !s->serving_stale_due_to_write_lock) || ((obj->response_get()->get_cooked_cc_mask() & MIME_COOKED_MASK_CC_NO_CACHE) && !s->cache_control.ignore_server_no_cache)) && (s->api_update_cached_object != HttpTransact::UpdateCachedObject_t::CONTINUE))) { needs_revalidate = true; @@ -2942,14 +2954,27 @@ HttpTransact::HandleCacheOpenReadHit(State *s) HttpCacheSM &cache_sm = s->state_machine->get_cache_sm(); TxnDbg(dbg_ctl_http_trans, "CacheOpenRead --- HIT-FRESH read while write %d", cache_sm.is_readwhilewrite_inprogress()); - if (cache_sm.is_readwhilewrite_inprogress()) + if (cache_sm.is_readwhilewrite_inprogress()) { SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_RWW_HIT); + } + + // If serving stale due to write lock failure (actions 2, 3, or 6), adjust VIA to reflect stale serving. + // This ensures correct statistics attribution (cache_hit_stale_served instead of cache_hit_fresh). + // Note: VIA_SERVER_RESULT is left as default (space) since the origin server was never contacted. + if (s->serving_stale_due_to_write_lock) { + TxnDbg(dbg_ctl_http_trans, "Serving stale due to write lock failure, adjusting VIA for statistics"); + SET_VIA_STRING(VIA_DETAIL_CACHE_LOOKUP, VIA_DETAIL_MISS_EXPIRED); + SET_VIA_STRING(VIA_CACHE_RESULT, VIA_IN_CACHE_STALE); + } if (s->cache_lookup_result == CacheLookupResult_t::HIT_WARNING) { build_response_from_cache(s, HTTPWarningCode::HERUISTIC_EXPIRATION); - } else if (s->cache_lookup_result == CacheLookupResult_t::HIT_STALE) { + } else if (s->cache_lookup_result == CacheLookupResult_t::HIT_STALE && !s->serving_stale_due_to_write_lock) { ink_assert(server_up == false); build_response_from_cache(s, HTTPWarningCode::REVALIDATION_FAILED); + } else if (s->serving_stale_due_to_write_lock) { + // Serving stale due to write lock failure - no Warning header (deprecated per RFC 9111) + build_response_from_cache(s, HTTPWarningCode::NONE); } else { build_response_from_cache(s, HTTPWarningCode::NONE); } @@ -3166,7 +3191,9 @@ HttpTransact::handle_cache_write_lock(State *s) // Write failed and read retry triggered // Clean up server_request and re-initiate // Cache Lookup - ink_assert(s->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY)); + ink_assert(s->cache_open_write_fail_action == static_cast(CacheOpenWriteFailAction_t::READ_RETRY) || + s->cache_open_write_fail_action == + static_cast(CacheOpenWriteFailAction_t::READ_RETRY_STALE_ON_REVALIDATE)); s->cache_info.write_status = CacheWriteStatus_t::LOCK_MISS; StateMachineAction_t next; next = StateMachineAction_t::CACHE_LOOKUP; @@ -3207,21 +3234,102 @@ HttpTransact::handle_cache_write_lock(State *s) } if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - TxnDbg(dbg_ctl_http_error, "calling hdr_info.server_request.destroy"); - s->hdr_info.server_request.destroy(); - HandleCacheOpenReadHitFreshness(s); - } else { - StateMachineAction_t next; - next = how_to_open_connection(s); - if (next == StateMachineAction_t::ORIGIN_SERVER_OPEN || next == StateMachineAction_t::ORIGIN_SERVER_RAW_OPEN) { - s->next_action = next; - TRANSACT_RETURN(next, nullptr); + // For READ_RETRY with cached object, evaluate actual freshness to decide the path. + // If the initial lookup was a MISS, cache_lookup_complete_deferred is true and we + // need to fire the hook here with the final result. If the initial lookup was a + // HIT_STALE (revalidation case), the hook already fired and deferred is false. + CacheHTTPInfo *obj = s->cache_info.object_read; + if (obj != nullptr) { + // Restore request/response times from cached object for freshness calculations and Age header. + // Similar to HandleCacheOpenReadHitFreshness, handle clock skew by capping times. + s->request_sent_time = obj->request_sent_time_get(); + s->response_received_time = obj->response_received_time_get(); + s->request_sent_time = std::min(s->client_request_time, s->request_sent_time); + s->response_received_time = std::min(s->client_request_time, s->response_received_time); + + // Evaluate actual document freshness. Pass true to skip the STALE_ON_REVALIDATE + // short-circuit so we get the real freshness, not the "return FRESH to bypass revalidation" result. + Freshness_t freshness = what_is_document_freshness(s, &s->hdr_info.client_request, obj->response_get(), true); + + if (freshness == Freshness_t::FRESH || freshness == Freshness_t::WARNING) { + // Object is fresh - serve it from cache for both action 5 and 6. + // This is the main benefit of request collapsing: we found a valid cached object. + // Clear stale-related state in case it was set during initial stale short-circuit. + s->serving_stale_due_to_write_lock = false; + s->cache_info.stale_fallback = nullptr; + + // Destroy server_request since we're serving from cache. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: found fresh object, serving from cache"); + s->hdr_info.server_request.destroy(); + s->cache_lookup_result = + (freshness == Freshness_t::FRESH) ? CacheLookupResult_t::HIT_FRESH : CacheLookupResult_t::HIT_WARNING; + + if (s->cache_lookup_complete_deferred) { + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); + } + HandleCacheOpenReadHit(s); + } else { + // Object is stale. Save it as potential fallback, then trigger actual cache retry. + // HandleCacheOpenReadMiss will serve stale fallback (action 6) or go to origin (action 5). + if (is_stale_cache_response_returnable(s)) { + s->cache_info.stale_fallback = s->cache_info.object_read; + } + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: object stale, triggering actual cache retry"); + s->cache_info.object_read = nullptr; + s->cache_info.write_status = CacheWriteStatus_t::LOCK_MISS; + s->hdr_info.server_request.destroy(); + TRANSACT_RETURN(StateMachineAction_t::CACHE_LOOKUP, nullptr); + } } else { - // hehe! - s->next_action = next; - ink_assert(s->next_action == StateMachineAction_t::DNS_LOOKUP); - return; + HandleCacheOpenReadMiss(s); } + } else { + handle_cache_write_lock_go_to_origin(s); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : handle_cache_write_lock_go_to_origin +// Description: Helper to proceed to origin server from handle_cache_write_lock. +// Called when write lock succeeded or failed without READ_RETRY. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::handle_cache_write_lock_go_to_origin(State *s) +{ + TxnDbg(dbg_ctl_http_trans, "handle_cache_write_lock_go_to_origin - proceeding to origin"); + + // If the CACHE_LOOKUP_COMPLETE hook was deferred (for action 5/6), fire it now. + // We're going to origin without READ_RETRY, so MISS is the final result. + if (s->cache_lookup_complete_deferred) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS before origin"); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, handle_cache_write_lock_go_to_origin_continue); + } + + handle_cache_write_lock_go_to_origin_continue(s); +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : handle_cache_write_lock_go_to_origin_continue +// Description: Continuation of handle_cache_write_lock_go_to_origin after +// the deferred CACHE_LOOKUP_COMPLETE hook fires. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::handle_cache_write_lock_go_to_origin_continue(State *s) +{ + StateMachineAction_t next; + next = how_to_open_connection(s); + if (next == StateMachineAction_t::ORIGIN_SERVER_OPEN || next == StateMachineAction_t::ORIGIN_SERVER_RAW_OPEN) { + s->next_action = next; + TRANSACT_RETURN(next, nullptr); + } else { + s->next_action = next; + ink_assert(s->next_action == StateMachineAction_t::DNS_LOOKUP); + return; } } @@ -3271,20 +3379,69 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { - // We've looped back around due to failing to read during READ_RETRY mode. - // Don't attempt another cache write - just proxy to origin without caching. + // READ_RETRY cache read failed (no fresh object found). + // Check if we have a stale fallback (saved from action 6 revalidation case). + bool is_action_6 = + (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) != 0; + + if (is_action_6 && s->cache_info.stale_fallback != nullptr) { + // Action 6: Serve stale fallback after retries exhausted. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY: retries exhausted, serving stale (action 6)"); + s->cache_info.object_read = s->cache_info.stale_fallback; + s->cache_info.stale_fallback = nullptr; + s->serving_stale_due_to_write_lock = true; + s->cache_lookup_result = CacheLookupResult_t::HIT_STALE; + + if (s->cache_lookup_complete_deferred) { + s->cache_lookup_complete_deferred = false; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadHit); + } + HandleCacheOpenReadHit(s); + return; + } + // Action 5 or no stale fallback: proceed to origin without caching. TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); - s->cache_info.action = CacheAction_t::NO_ACTION; + s->cache_info.stale_fallback = nullptr; // Clear unused fallback + s->cache_info.action = CacheAction_t::NO_ACTION; } else { HttpTransact::set_cache_prepare_write_action_for_new_request(s); } - /////////////////////////////////////////////////////////////// - // a normal miss would try to fetch the document from the // - // origin server, unless the origin server isn't resolvable, // - // but if "CacheControl: only-if-cached" is set, then we are // - // supposed to send a 504 (GATEWAY TIMEOUT) response. // - /////////////////////////////////////////////////////////////// + // If the CACHE_LOOKUP_COMPLETE hook was deferred, decide whether to fire it now. + // Only fire when we're at a terminal state for cache lookup: + // - NO_ACTION: Not attempting write lock, MISS is the final result + // - READ_RETRY: Already retried and still missed, MISS is the final result + // + // Don't fire yet if action == PREPARE_TO_WRITE and write_lock_state != READ_RETRY, + // because we're about to attempt the write lock and may find content on retry. + // In that case, the hook will fire later: + // - If READ_RETRY finds HIT: handle_cache_write_lock fires CACHE_LOOKUP_COMPLETE with HIT + // - If READ_RETRY finds MISS: We come back here with write_lock_state == READ_RETRY + if (s->cache_lookup_complete_deferred) { + if (s->cache_info.action == CacheAction_t::NO_ACTION || s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { + TxnDbg(dbg_ctl_http_trans, "Firing deferred CACHE_LOOKUP_COMPLETE with MISS (action=%d, write_lock_state=%d)", + static_cast(s->cache_info.action), static_cast(s->cache_info.write_lock_state)); + s->cache_lookup_complete_deferred = false; + s->cache_lookup_result = CacheLookupResult_t::MISS; + TRANSACT_RETURN(StateMachineAction_t::API_CACHE_LOOKUP_COMPLETE, HandleCacheOpenReadMissGoToOrigin); + } + TxnDbg(dbg_ctl_http_trans, "Deferring CACHE_LOOKUP_COMPLETE hook - waiting for write lock attempt"); + } + + // Proceed to origin server (handles DNS lookup, parent proxy, etc.) + HandleCacheOpenReadMissGoToOrigin(s); +} + +/////////////////////////////////////////////////////////////////////////////// +// Name : HandleCacheOpenReadMissGoToOrigin +// Description: Contains the "go to origin" logic for cache miss cases. +// Handles DNS lookup, parent proxy selection, and request building. +// +/////////////////////////////////////////////////////////////////////////////// +void +HttpTransact::HandleCacheOpenReadMissGoToOrigin(State *s) +{ + TxnDbg(dbg_ctl_http_trans, "HandleCacheOpenReadMissGoToOrigin - proceeding to origin"); HTTPHdr *h = &s->hdr_info.client_request; @@ -3327,8 +3484,6 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) build_error_response(s, HTTPStatus::GATEWAY_TIMEOUT, "Not Cached", "cache#not_in_cache"); s->next_action = StateMachineAction_t::SEND_ERROR_CACHE_NOOP; } - - return; } void @@ -7291,7 +7446,8 @@ HttpTransact::calculate_document_freshness_limit(State *s, HTTPHdr *response, ti // ////////////////////////////////////////////////////////////////////////////// HttpTransact::Freshness_t -HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response) +HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTPHdr *cached_obj_response, + bool evaluate_actual_freshness) { bool heuristic, do_revalidate = false; int age_limit; @@ -7300,9 +7456,19 @@ HttpTransact::what_is_document_freshness(State *s, HTTPHdr *client_request, HTTP uint32_t cc_mask, cooked_cc_mask; uint32_t os_specifies_revalidate; - if (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE)) { + // This check works for STALE_ON_REVALIDATE(0x2), ERROR_ON_MISS_STALE_ON_REVALIDATE(0x3), and + // READ_RETRY_STALE_ON_REVALIDATE(0x6). + // We return FRESH (not STALE) intentionally to bypass the revalidation code path in + // HandleCacheOpenReadHit. Returning STALE would trigger origin server contact for revalidation, + // but for write lock failure scenarios we want to serve the stale content directly without + // revalidation. The serving_stale_due_to_write_lock flag tracks that we're actually serving + // stale content, so VIA strings and statistics can be correctly attributed. + // When evaluate_actual_freshness is true, skip this short-circuit to get the real freshness. + if (!evaluate_actual_freshness && + (s->cache_open_write_fail_action & static_cast(CacheOpenWriteFailAction_t::STALE_ON_REVALIDATE))) { if (is_stale_cache_response_returnable(s)) { - TxnDbg(dbg_ctl_http_match, "cache_serve_stale_on_write_lock_fail, return FRESH"); + TxnDbg(dbg_ctl_http_match, "cache_serve_stale_on_write_lock_fail, return FRESH to bypass revalidation"); + s->serving_stale_due_to_write_lock = true; return (Freshness_t::FRESH); } } diff --git a/src/records/RecordsConfig.cc b/src/records/RecordsConfig.cc index 88526c08ad3..ce94cdff93e 100644 --- a/src/records/RecordsConfig.cc +++ b/src/records/RecordsConfig.cc @@ -610,13 +610,15 @@ static constexpr RecordElement RecordsConfig[] = , {RECT_CONFIG, "proxy.config.http.cache.max_open_write_retry_timeout", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , - // # open_write_fail_action has 3 options: + // # open_write_fail_action options: // # // # 0 - default. disable cache and goto origin // # 1 - return error if cache miss // # 2 - serve stale until proxy.config.http.cache.max_stale_age, then goto origin, if revalidate // # 3 - return error if cache miss or serve stale until proxy.config.http.cache.max_stale_age, then goto origin, if revalidate // # 4 - return error if cache miss or if revalidate + // # 5 - retry cache read (read-while-writer) on write lock failure, goto origin if retries exhausted + // # 6 - retry cache read on write lock failure, if retries exhausted serve stale if allowed, otherwise goto origin {RECT_CONFIG, "proxy.config.http.cache.open_write_fail_action", RECD_INT, "0", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , // # when_to_revalidate has 4 options: diff --git a/tests/gold_tests/cache/cache-read-retry-stale.test.py b/tests/gold_tests/cache/cache-read-retry-stale.test.py new file mode 100644 index 00000000000..8ec31e2aa0c --- /dev/null +++ b/tests/gold_tests/cache/cache-read-retry-stale.test.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Test.Summary = ''' +Smoke test for cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) to verify: +1. Action 6 is accepted by the configuration system +2. Basic caching works correctly with action 6 enabled +3. The system does not crash + +Note: The stale fallback behavior (serving stale on read retry exhaustion) is +difficult to reliably test due to timing sensitivity. This test focuses on +verifying the feature is functional without causing instability. +''' + +Test.ContinueOnFail = True + +Test.ATSReplayTest(replay_file="replay/cache-read-retry-stale.replay.yaml") diff --git a/tests/gold_tests/cache/cache-write-lock-contention.test.py b/tests/gold_tests/cache/cache-write-lock-contention.test.py new file mode 100644 index 00000000000..7e8ebc9ad1c --- /dev/null +++ b/tests/gold_tests/cache/cache-write-lock-contention.test.py @@ -0,0 +1,175 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Test cache write lock contention behavior with parallel requests for actions 5 and 6. + +This test issues multiple parallel curl requests to trigger actual cache write lock +contention. It is skipped by default because the behavior is timing-sensitive and +not suitable for CI environments. + +To run this test manually: + export RUN_CACHE_CONTENTION_TEST=1 + cd build/tests + ./autest.sh --sandbox /tmp/sbcursor --clean=none -f cache-write-lock-contention +''' + +import os + +Test.Summary = ''' +Test cache write lock contention with parallel requests (actions 5 and 6). + +This test is SKIPPED by default. Set RUN_CACHE_CONTENTION_TEST=1 to run it. +''' + +Test.ContinueOnFail = True + +Test.SkipUnless( + Condition( + lambda: os.environ.get('RUN_CACHE_CONTENTION_TEST', '').lower() in ('1', 'true', 'yes'), + "Set RUN_CACHE_CONTENTION_TEST=1 to run this timing-sensitive test", True)) + + +def make_parallel_curl(url, count, stagger_delay=None): + """ + Generate a parallel curl command string. + + Args: + url: The URL to request + count: Number of parallel requests + stagger_delay: If set, add this delay between requests (e.g., "0.1") + If None, all requests fire simultaneously + """ + parts = ['('] + for i in range(1, count + 1): + if i > 1 and stagger_delay: + parts.append(f'sleep {stagger_delay} && ') + parts.append( + f'{{curl}} -s -o /dev/null -w "req{i}: %{{{{http_code}}}}\\n" "{url}" ' + f'-H "Host: example.com" -H "X-Request: req{i}" & ') + parts.append('wait)') + return ''.join(parts) + + +class ContentionTest: + """ + Base class for cache write lock contention tests. + + Tests parallel requests to trigger cache write lock contention and verify + that the configured fail_action behaves correctly. + """ + + def __init__(self, name, action, description, stale_scenario=False): + self.name = name + self.action = action + self.description = description + self.stale_scenario = stale_scenario + self._setup() + + def _setup(self): + origin_delay = 5 if self.stale_scenario else 3 + self.server = Test.MakeOriginServer(f"server_{self.name}", delay=origin_delay) + + max_age = 1 if self.stale_scenario else 300 + self.server.addResponse( + "sessionlog.json", { + "headers": f"GET /test-{self.name} HTTP/1.1\r\nHost: example.com\r\n\r\n", + "timestamp": "1469733493.993", + "body": "" + }, { + "headers": + f"HTTP/1.1 200 OK\r\n" + f"Content-Length: 100\r\n" + f"Cache-Control: max-age={max_age}\r\n" + f"X-Origin: {self.name}\r\n" + f"Connection: close\r\n\r\n", + "timestamp": "1469733493.993", + "body": "X" * 100 + }) + + self.ts = Test.MakeATSProcess(f"ts_{self.name}", enable_cache=True) + + read_retries = 2 if self.stale_scenario else 10 + retry_time = 100 if self.stale_scenario else 500 + + self.ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', + 'proxy.config.http.cache.open_write_fail_action': self.action, + 'proxy.config.http.cache.max_open_write_retries': 1, + 'proxy.config.http.cache.max_open_write_retry_timeout': 0, + 'proxy.config.http.cache.max_open_read_retries': read_retries, + 'proxy.config.http.cache.open_read_retry_time': retry_time, + 'proxy.config.cache.enable_read_while_writer': 1, + 'proxy.config.http.cache.max_stale_age': 300, + }) + self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') + + def run(self): + url_path = f"/test-{self.name}" + url = f"http://127.0.0.1:{self.ts.Variables.port}{url_path}" + + if self.stale_scenario: + tr = Test.AddTestRun(f"{self.description} - Prime cache") + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts) + tr.MakeCurlCommand(f'-s -D - "{url}" -H "Host: example.com"', ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(f"{self.description} - Wait for stale") + tr.Processes.Default.Command = "sleep 3" + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(self.description) + if not self.stale_scenario: + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts) + + count = 8 if self.stale_scenario else 5 + stagger = None if self.stale_scenario else "0.1" + tr.MakeCurlCommandMulti(make_parallel_curl(url, count, stagger), ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + tr = Test.AddTestRun(f"{self.description} - Verify cache") + tr.MakeCurlCommand(f'-s -D - "{url}" -H "Host: example.com"', ts=self.ts) + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression("200 OK", "Cache serves 200") + tr.StillRunningAfter = self.ts + tr.StillRunningAfter = self.server + + self.ts.Disk.traffic_out.Content = Testers.ExcludesExpression("FATAL|ink_release_assert|ink_abort", "No crashes") + + if self.stale_scenario and self.action == 6: + self.ts.Disk.traffic_out.Content += Testers.ContainsExpression( + "serving stale \\(action 6\\)|object stale, serving stale", "Stale fallback triggered") + else: + self.ts.Disk.traffic_out.Content += Testers.ContainsExpression( + "cache open read failure.*retrying|read while write", "Cache contention occurred") + + +# Action 5: READ_RETRY - retries cache reads, goes to origin if exhausted +ContentionTest("a5_fresh", 5, "Action 5 - fresh contention").run() +ContentionTest("a5_stale", 5, "Action 5 - stale revalidation", stale_scenario=True).run() + +# Action 6: READ_RETRY_STALE_ON_REVALIDATE - like action 5, but serves stale if available +ContentionTest("a6_fresh", 6, "Action 6 - fresh contention").run() +ContentionTest("a6_stale", 6, "Action 6 - stale fallback", stale_scenario=True).run() diff --git a/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml new file mode 100644 index 00000000000..de28fe96484 --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-read-retry-stale.replay.yaml @@ -0,0 +1,187 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This replay file tests cache_open_write_fail_action = 6 (READ_RETRY_STALE_ON_REVALIDATE) +# +# This is a smoke test that verifies: +# 1. Action 6 is accepted by the configuration system +# 2. Basic caching works correctly with action 6 enabled +# 3. The system does not crash +# +# The stale fallback behavior (serving stale on read retry exhaustion) is +# difficult to reliably test due to timing sensitivity. This test focuses +# on verifying the feature is functional without causing instability. +# + +meta: + version: "1.0" + +autest: + description: 'Test cache_open_write_fail_action = 6 (READ_RETRY with stale fallback) smoke test' + dns: + name: 'dns-read-retry-stale' + + server: + name: 'origin-read-retry-stale' + + client: + name: 'client-read-retry-stale' + + ats: + name: 'ts-read-retry-stale' + process_config: + enable_cache: true + + records_config: + proxy.config.diags.debug.enabled: 1 + proxy.config.diags.debug.tags: 'http_cache' + # Enable READ_RETRY_STALE_ON_REVALIDATE mode (action 6) + proxy.config.http.cache.open_write_fail_action: 6 + # Configure retry parameters + proxy.config.http.cache.max_open_write_retries: 1 + proxy.config.http.cache.max_open_write_retry_timeout: 0 + proxy.config.http.cache.max_open_read_retries: 3 + proxy.config.http.cache.open_read_retry_time: 100 + proxy.config.cache.enable_read_while_writer: 1 + # Allow serving stale content + proxy.config.http.cache.max_stale_age: 300 + + remap_config: + - from: "http://example.com/" + to: "http://backend.example.com:{SERVER_HTTP_PORT}/" + + log_validation: + traffic_out: + # Should NOT contain crash indicators + excludes: + - expression: "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" + description: "Verify ATS does not crash with READ_RETRY_STALE mode enabled" + +sessions: + ############################################################################# + # First session: Cache a normal object + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /test-object + headers: + fields: + - [uuid, cache-prime] + - [Host, example.com] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 100] + - [Cache-Control, "max-age=300"] + - [X-Response, cached-content] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'cached-content', as: equal}] + + ############################################################################# + # Second session: Verify cache hit (action 6 doesn't break basic caching) + ############################################################################# + - transactions: + - client-request: + delay: 500ms + method: "GET" + version: "1.1" + url: /test-object + headers: + fields: + - [uuid, cache-hit] + - [Host, example.com] + + # Should NOT reach origin (cache hit) + server-response: + status: 500 + reason: Should Not Reach + headers: + fields: + - [Content-Length, 100] + - [X-Response, should-not-reach] + + # Should serve from cache + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'cached-content', as: equal}] + + ############################################################################# + # Third session: Test a different uncached object + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /another-object + headers: + fields: + - [uuid, another-cache] + - [Host, example.com] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 50] + - [Cache-Control, "max-age=300"] + - [X-Response, another-content] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'another-content', as: equal}] + + ############################################################################# + # Fourth session: Verify second object cached + ############################################################################# + - transactions: + - client-request: + delay: 500ms + method: "GET" + version: "1.1" + url: /another-object + headers: + fields: + - [uuid, another-hit] + - [Host, example.com] + + server-response: + status: 500 + reason: Should Not Reach + headers: + fields: + - [Content-Length, 100] + - [X-Response, should-not-reach] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'another-content', as: equal}]