From bce023fbea0cfb8df754810b640a5f911f9483a2 Mon Sep 17 00:00:00 2001 From: Chris Burroughs Date: Tue, 6 Jan 2026 13:25:30 -0500 Subject: [PATCH] cache temporary redirects with explicit caching directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 9111 §3 - Storing Responses in Caches, describes the conditions under which a cache may store a response. Notably all final (aka not 1xx) status codes are *potentially* cacheable, subject to the other listed conditions. In brief a response must have either an explicit header (such as a `max-age` directive) OR have a heuristically cacheable status code. Which status codes are heuristically cacheable is helpfully summarized in RFC 9110 §15.1 - Overview of Status Codes: 200, 203, 204, 206, 300, 301, 308, 404, 405, 410, 414, and 501) CacheController has historically maintained a more conservative list of cacheable_status_codes that are a subset of the heuristically cacheable ones. This diff makes related changes: * Temporary redirects are now cached by default. Besides being correct in the general case, temporary redirects are used by several Python Registries as discussed in #384. * Because temporary redirects are not heuristically cacheable, add a check to only cache them when explicit headers are present. This means that regardless of which status_codes are passed to CacheController, the HTTP specification is still followed. * As defensive future proofing, make sure the response is final before much else is done. fixes #384 --- cachecontrol/controller.py | 64 ++++++++++++++++++++++++++++++++----- cachecontrol/heuristics.py | 31 +++++++++--------- tests/test_cache_control.py | 34 +++++++++++++++++++- 3 files changed, 105 insertions(+), 24 deletions(-) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index c7dd8a17..d7dc02cb 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -18,6 +18,7 @@ from requests.structures import CaseInsensitiveDict +from cachecontrol.heuristics import HEURISTICALLY_CACHEABLE_STATUSES from cachecontrol.cache import DictCache, SeparateBodyBaseCache from cachecontrol.serialize import Serializer @@ -33,6 +34,17 @@ URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") +NEVER_CACHE_STATUSES = ( + # Per https://www.rfc-editor.org/rfc/rfc9111.html#section-3 the status + # code must be final + 100, + 101, + # From httplib2: Don't cache 206's since we aren't going to + # handle byte range requests + 206, +) + + PERMANENT_REDIRECT_STATUSES = (301, 308) @@ -60,7 +72,20 @@ def __init__( self.cache = DictCache() if cache is None else cache self.cache_etags = cache_etags self.serializer = serializer or Serializer() - self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308) + # Per https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1 all + # all final response codes are potentially cacheable, subject to the + # other conditions. CacheController conservatively only caches common + # ones codes. For example, even with a max-age set, a 500 Internal + # Server Error will not be cached + self.cacheable_status_codes = status_codes or ( + 200, + 203, + 300, + 301, + 302, + 307, + 308, + ) @classmethod def _urlnorm(cls, uri: str) -> str: @@ -128,12 +153,12 @@ def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | Non except IndexError: if required: logger.debug( - "Missing value for cache-control " "directive: %s", + "Missing value for cache-control directive: %s", directive, ) except ValueError: logger.debug( - "Invalid value for cache-control directive " "%s, must be %s", + "Invalid value for cache-control directive %s, must be %s", directive, typ.__name__, ) @@ -343,12 +368,11 @@ def cache_response( else: response = response_or_ref - # From httplib2: Don't cache 206's since we aren't going to - # handle byte range requests - cacheable_status_codes = status_codes or self.cacheable_status_codes - if response.status not in cacheable_status_codes: + if response.status in NEVER_CACHE_STATUSES: logger.debug( - "Status code %s not in %s", response.status, cacheable_status_codes + "Status code %s in never cache set %s", + response.status, + NEVER_CACHE_STATUSES, ) return @@ -378,6 +402,30 @@ def cache_response( cc_req = self.parse_cache_control(request.headers) cc = self.parse_cache_control(response_headers) + # "at least one of the following" from + # https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1 + has_explicit_freshness = ( + "public" in cc or "expires" in response_headers or "max-age" in cc + # NOTE: s-maxage is also listed in the RFC section, but + # cache_response() doesn't currently express the concept of shared + # vs private caching + ) + cacheable_status_codes = status_codes or self.cacheable_status_codes + if response.status not in cacheable_status_codes: + logger.debug( + "Status code %s not in %s", response.status, cacheable_status_codes + ) + return + if ( + response.status not in HEURISTICALLY_CACHEABLE_STATUSES + and not has_explicit_freshness + ): + logger.debug( + "Status code %s is not heuristically cacheable and no explicit caching headers are set", + response.status, + ) + return + assert request.url is not None cache_url = self.cache_url(request.url) logger.debug('Updating cache with response from "%s"', cache_url) diff --git a/cachecontrol/heuristics.py b/cachecontrol/heuristics.py index d95e78a9..a7fd39aa 100644 --- a/cachecontrol/heuristics.py +++ b/cachecontrol/heuristics.py @@ -14,6 +14,21 @@ TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" +# Concise list from https://www.rfc-editor.org/rfc/rfc9110#section-15.1 +HEURISTICALLY_CACHEABLE_STATUSES = { + 200, + 203, + 204, + 206, + 300, + 301, + 404, + 405, + 410, + 414, + 501, +} + def expire_after(delta: timedelta, date: datetime | None = None) -> datetime: date = date or datetime.now(timezone.utc) @@ -107,20 +122,6 @@ class LastModified(BaseHeuristic): Unlike mozilla we limit this to 24-hr. """ - cacheable_by_default_statuses = { - 200, - 203, - 204, - 206, - 300, - 301, - 404, - 405, - 410, - 414, - 501, - } - def update_headers(self, resp: HTTPResponse) -> dict[str, str]: headers: Mapping[str, str] = resp.headers @@ -130,7 +131,7 @@ def update_headers(self, resp: HTTPResponse) -> dict[str, str]: if "cache-control" in headers and headers["cache-control"] != "public": return {} - if resp.status not in self.cacheable_by_default_statuses: + if resp.status not in HEURISTICALLY_CACHEABLE_STATUSES: return {} if "date" not in headers or "last-modified" not in headers: diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 40cac5b5..d341a6f2 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -39,6 +39,14 @@ def cc(self): # Cache controller fixture return CacheController(Mock(), serializer=Mock()) + @pytest.mark.parametrize("status_code", [100, 101]) + def test_no_cache_without_final(self, cc, status_code): + now = time.strftime(TIME_FMT, time.gmtime()) + resp = self.resp({"cache-control": "max-age=3600", "date": now}) + resp.status = status_code + cc.cache_response(Mock(), resp) + assert not cc.cache.set.called + def test_no_cache_non_20x_response(self, cc): # No caching without some extra headers, so we add them now = time.strftime(TIME_FMT, time.gmtime()) @@ -47,7 +55,7 @@ def test_no_cache_non_20x_response(self, cc): no_cache_codes = [201, 400, 500] for code in no_cache_codes: resp.status = code - cc.cache_response(Mock(), resp) + cc.cache_response(self.req(), resp) assert not cc.cache.set.called # this should work b/c the resp is 20x @@ -71,6 +79,17 @@ def test_no_cache_with_wrong_sized_body(self, cc): assert not cc.cache.set.called + @pytest.mark.parametrize("status_code", [302, 307]) + def test_no_cache_non_heuristic_status_without_explicit_freshness( + self, cc, status_code + ): + now = time.strftime(TIME_FMT, time.gmtime()) + # NOTE: Only 'date' header, NO "hard" cache-control/expires/public headers + resp = self.resp({"date": now}) + resp.status = status_code + cc.cache_response(self.req(), resp) + assert not cc.cache.set.called + def test_cache_response_no_cache_control(self, cc): resp = self.resp() cc.cache_response(self.req(), resp) @@ -85,6 +104,19 @@ def test_cache_response_cache_max_age(self, cc): cc.serializer.dumps.assert_called_with(req, resp, None) cc.cache.set.assert_called_with(self.url, ANY, expires=3600) + def test_cache_response_explicit_non_heuristic(self, cc): + # meets the requirements of + # https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1 + # *without* being heuristically cacheable + now = time.strftime(TIME_FMT, time.gmtime()) + resp = self.resp({"cache-control": "max-age=3600", "date": now}) + resp.status = 302 + + req = self.req() + cc.cache_response(req, resp) + cc.serializer.dumps.assert_called_with(req, resp, None) + cc.cache.set.assert_called_with(self.url, ANY, expires=3600) + def test_cache_response_cache_max_age_with_invalid_value_not_cached(self, cc): now = time.strftime(TIME_FMT, time.gmtime()) # Not a valid header; this would be from a misconfigured server