From 13b90322b279cf80924788b850208c04b5af7569 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 8 May 2026 16:44:10 -0500 Subject: [PATCH] Surface real failures in paginated waterdata responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in _walk_pages (and the parallel get_stats_data pagination loop) caused silent or misleading failures when a paginated request was interrupted mid-walk: 1. The except handler called _error_body(resp) — but when client.request() itself raised (ConnectionError, Timeout, etc.), `resp` still pointed at the *previous successful page*. The "incomplete" log message therefore described the wrong request, and on a non-JSON 200 body would itself raise inside resp.json() and bury the original failure. 2. No status-code check was performed on paginated responses. The initial request guards `if resp.status_code != 200`, but the loop doesn't. A 5xx body that didn't include "numberReturned" was silently turned into an empty DataFrame by _get_resp_data, the `next` link wasn't found, and the loop quietly exited — handing the user truncated data with no error logged anywhere. This affects every paginated waterdata getter (get_daily, get_continuous, get_monitoring_locations, …). Fix: - Guard the loop body with `if resp.status_code != 200: raise RuntimeError(_error_body(resp))`, mirroring the initial request. - Capture the exception with `as e` and log `e` instead of touching the stale `resp`. Same change in get_stats_data. Behavior preserved: on failure the loop still logs and returns whatever pages were collected ("best effort"). The change is purely to make the failure observable and the log message accurate. Two new tests cover (a) network-exception mid-pagination and (b) 5xx mid-pagination, asserting that the actual failure surfaces in the error log. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/utils.py | 22 +++++--- tests/waterdata_utils_test.py | 87 ++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2faaeb42..548d4878 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. + **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..7756e101 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -635,11 +635,18 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) + # Mirror the initial-request check; otherwise a 5xx body + # without "numberReturned" silently yields an empty frame + # and the loop quietly stops. + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + # Report the *actual* failure — not _error_body(resp) on a + # stale prior-page response (which would describe the wrong + # request and can itself raise on non-JSON bodies). + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) @@ -1099,14 +1106,15 @@ def get_stats_data( params=args, headers=headers, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + logger.error("Request incomplete: %s", e) logger.warning( - "Request failed for URL: %s. Data download interrupted.", resp.url + "Request failed for URL: %s. Data download interrupted.", url ) next_token = None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 8151ab37..3008be15 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -86,6 +86,93 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" +def _resp_ok(features): + """Build a 200-OK mock response carrying the given features list.""" + resp = mock.MagicMock() + resp.json.return_value = { + "numberReturned": len(features), + "features": features, + "links": [{"rel": "next", "href": "https://example.com/page2"}] + if features + else [], + } + resp.headers = {} + resp.links = {"next": {"url": "https://example.com/page2"}} if features else {} + resp.status_code = 200 + resp.url = "https://example.com/page1" + return resp + + +def _walk_pages_with_failure(failure_resp_or_exc): + """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" + resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) + + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = resp1 + if isinstance(failure_resp_or_exc, BaseException): + mock_client.request.side_effect = failure_resp_or_exc + else: + mock_client.request.return_value = failure_resp_or_exc + + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + return _walk_pages(geopd=False, req=mock_req, client=mock_client) + + +def test_walk_pages_logs_actual_exception_when_request_raises(caplog): + """When client.request() itself raises (e.g. a connection error), the + logged failure must reflect that exception — not _error_body() on the + stale prior-page response, which described the wrong request and could + itself crash on non-JSON bodies.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) + + # First page's data is preserved (best-effort behavior). + assert list(df["val"]) == ["a"] + # Logged error mentions the actual ConnectionError, not a stale page body. + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + assert any("boom" in m for m in error_messages), error_messages + + +def test_walk_pages_surfaces_5xx_mid_pagination(caplog): + """A non-200 response on a paginated page must be detected; previously + the loop happily called _get_resp_data() on the 5xx body, which — + lacking 'numberReturned' — silently produced an empty DataFrame and + the loop quietly stopped with no error logged.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + page2_500 = mock.MagicMock() + page2_500.status_code = 503 + page2_500.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + page2_500.url = "https://example.com/page2" + + df, _ = _walk_pages_with_failure(page2_500) + + assert list(df["val"]) == ["a"] + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + # The 5xx is now visible in the error log (it would previously have + # been silently swallowed because _get_resp_data returned an empty + # frame and the loop stopped quietly). + assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), ( + error_messages + ) + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the