Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved.

**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.

**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
Expand Down
22 changes: 15 additions & 7 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,11 +635,18 @@ def _walk_pages(
headers=headers,
data=content if method == "POST" else None,
)
# Mirror the initial-request check; otherwise a 5xx body
# without "numberReturned" silently yields an empty frame
# and the loop quietly stops.
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
dfs.append(_get_resp_data(resp, geopd=geopd))
curr_url = _next_req_url(resp)
except Exception: # noqa: BLE001
error_text = _error_body(resp)
logger.error("Request incomplete. %s", error_text)
except Exception as e: # noqa: BLE001
# Report the *actual* failure — not _error_body(resp) on a
# stale prior-page response (which would describe the wrong
# request and can itself raise on non-JSON bodies).
logger.error("Request incomplete: %s", e)
logger.warning(
"Request failed for URL: %s. Data download interrupted.", curr_url
)
Expand Down Expand Up @@ -1099,14 +1106,15 @@ def get_stats_data(
params=args,
headers=headers,
)
if resp.status_code != 200:
raise RuntimeError(_error_body(resp))
body = resp.json()
all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
next_token = body["next"]
except Exception: # noqa: BLE001
error_text = _error_body(resp)
logger.error("Request incomplete. %s", error_text)
except Exception as e: # noqa: BLE001
logger.error("Request incomplete: %s", e)
logger.warning(
"Request failed for URL: %s. Data download interrupted.", resp.url
"Request failed for URL: %s. Data download interrupted.", url
)
next_token = None

Expand Down
87 changes: 87 additions & 0 deletions tests/waterdata_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,93 @@ def test_walk_pages_multiple_mocked():
assert mock_client.request.call_args[0][1] == "https://example.com/page2"


def _resp_ok(features):
"""Build a 200-OK mock response carrying the given features list."""
resp = mock.MagicMock()
resp.json.return_value = {
"numberReturned": len(features),
"features": features,
"links": [{"rel": "next", "href": "https://example.com/page2"}]
if features
else [],
}
resp.headers = {}
resp.links = {"next": {"url": "https://example.com/page2"}} if features else {}
resp.status_code = 200
resp.url = "https://example.com/page1"
return resp


def _walk_pages_with_failure(failure_resp_or_exc):
    """Run _walk_pages where page 1 succeeds and page 2 fails as given.

    ``failure_resp_or_exc`` is either an exception instance (raised by the
    mocked ``client.request``) or a mock response returned by it.
    """
    first_page = _resp_ok([{"id": "1", "properties": {"val": "a"}}])

    client = mock.MagicMock(spec=requests.Session)
    # Page 1 arrives via client.send(); page 2 via client.request().
    client.send.return_value = first_page
    if isinstance(failure_resp_or_exc, BaseException):
        client.request.side_effect = failure_resp_or_exc
    else:
        client.request.return_value = failure_resp_or_exc

    prepared = mock.MagicMock(spec=requests.PreparedRequest)
    prepared.method = "GET"
    prepared.headers = {}
    prepared.url = "https://example.com/page1"

    return _walk_pages(geopd=False, req=prepared, client=client)


def test_walk_pages_logs_actual_exception_when_request_raises(caplog):
    """When client.request() itself raises (e.g. a connection error), the
    logged failure must reflect that exception — not _error_body() on the
    stale prior-page response, which described the wrong request and could
    itself crash on non-JSON bodies."""
    import logging

    caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")

    df, _ = _walk_pages_with_failure(requests.ConnectionError("boom"))

    # Best-effort behavior: the successful first page's data survives.
    assert list(df["val"]) == ["a"]

    # The actual ConnectionError text must appear in the error log, proving
    # the except block reported the live failure rather than a stale body.
    logged = [
        rec.getMessage() for rec in caplog.records if rec.levelno >= logging.ERROR
    ]
    assert any("boom" in msg for msg in logged), logged


def test_walk_pages_surfaces_5xx_mid_pagination(caplog):
    """A non-200 response on a paginated page must be detected; previously
    the loop happily called _get_resp_data() on the 5xx body, which —
    lacking 'numberReturned' — silently produced an empty DataFrame and
    the loop quietly stopped with no error logged."""
    import logging

    caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils")

    # Fake a 503 on the second page whose JSON body carries the service's
    # own error description (and no "numberReturned" key).
    bad_page = mock.MagicMock()
    bad_page.status_code = 503
    bad_page.json.return_value = {
        "code": "ServiceUnavailable",
        "description": "upstream timeout",
    }
    bad_page.url = "https://example.com/page2"

    df, _ = _walk_pages_with_failure(bad_page)

    # First page's data is still returned (best-effort).
    assert list(df["val"]) == ["a"]

    # The 5xx must now surface in the error log instead of being swallowed
    # as an empty page.
    logged = [
        rec.getMessage() for rec in caplog.records if rec.levelno >= logging.ERROR
    ]
    assert any("503" in msg or "ServiceUnavailable" in msg for msg in logged), (
        logged
    )


def test_handle_stats_nesting_tolerates_missing_drop_columns():
"""If the upstream stats response shape ever changes such that one of
the columns we try to drop ("type", "properties.data") is absent, the
Expand Down