diff --git a/NEWS.md b/NEWS.md index 2faaeb42..548d4878 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. + **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..7756e101 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -635,11 +635,18 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) + # Mirror the initial-request check; otherwise a 5xx body + # without "numberReturned" silently yields an empty frame + # and the loop quietly stops. + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + # Report the *actual* failure — not _error_body(resp) on a + # stale prior-page response (which would describe the wrong + # request and can itself raise on non-JSON bodies). + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) @@ -1099,14 +1106,15 @@ def get_stats_data( params=args, headers=headers, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + logger.error("Request incomplete: %s", e) logger.warning( - "Request failed for URL: %s. Data download interrupted.", resp.url + "Request failed for URL: %s. Data download interrupted.", url ) next_token = None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 8151ab37..3008be15 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -86,6 +86,93 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" +def _resp_ok(features): + """Build a 200-OK mock response carrying the given features list.""" + resp = mock.MagicMock() + resp.json.return_value = { + "numberReturned": len(features), + "features": features, + "links": [{"rel": "next", "href": "https://example.com/page2"}] + if features + else [], + } + resp.headers = {} + resp.links = {"next": {"url": "https://example.com/page2"}} if features else {} + resp.status_code = 200 + resp.url = "https://example.com/page1" + return resp + + +def _walk_pages_with_failure(failure_resp_or_exc): + """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" + resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) + + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = resp1 + if isinstance(failure_resp_or_exc, BaseException): + mock_client.request.side_effect = failure_resp_or_exc + else: + mock_client.request.return_value = failure_resp_or_exc + + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + return _walk_pages(geopd=False, req=mock_req, client=mock_client) + + +def test_walk_pages_logs_actual_exception_when_request_raises(caplog): + """When client.request() itself raises (e.g. a connection error), the + logged failure must reflect that exception — not _error_body() on the + stale prior-page response, which described the wrong request and could + itself crash on non-JSON bodies.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) + + # First page's data is preserved (best-effort behavior). + assert list(df["val"]) == ["a"] + # Logged error mentions the actual ConnectionError, not a stale page body. + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + assert any("boom" in m for m in error_messages), error_messages + + +def test_walk_pages_surfaces_5xx_mid_pagination(caplog): + """A non-200 response on a paginated page must be detected; previously + the loop happily called _get_resp_data() on the 5xx body, which — + lacking 'numberReturned' — silently produced an empty DataFrame and + the loop quietly stopped with no error logged.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + page2_500 = mock.MagicMock() + page2_500.status_code = 503 + page2_500.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + page2_500.url = "https://example.com/page2" + + df, _ = _walk_pages_with_failure(page2_500) + + assert list(df["val"]) == ["a"] + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + # The 5xx is now visible in the error log (it would previously have + # been silently swallowed because _get_resp_data returned an empty + # frame and the loop stopped quietly). + assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), ( + error_messages + ) + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the