From da6ac463dd521b522cbe90280c4297d92806746b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 14:04:54 -0500 Subject: [PATCH 1/2] Fix get_nearest_continuous: accept scalar targets and missing time column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring says ``targets`` accepts "anything ``pandas.to_datetime`` consumes", which includes a bare string or ``pd.Timestamp``. But ``pd.to_datetime("2024-01-01T00:00:00Z", utc=True)`` returns a scalar ``Timestamp``, and ``pd.DatetimeIndex(scalar)`` raises ``TypeError`` — so single-value cases crashed despite the documented contract. Wrap a scalar result in a one-element ``DatetimeIndex`` so any ``pandas.to_datetime``-consumable input works. Also: when the user passes ``properties`` that excludes ``time``, the helper used to crash with ``KeyError`` deep inside ``df.assign``. Detect the missing column up front and raise a ``ValueError`` pointing at the likely cause. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/nearest.py | 20 +++++++++++- tests/waterdata_nearest_test.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index 29484875..6bb48187 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -137,7 +137,7 @@ def get_nearest_continuous( ... ) """ _check_nearest_kwargs(kwargs, on_tie) - targets = pd.DatetimeIndex(pd.to_datetime(targets, utc=True)) + targets = _coerce_targets(targets) window_td = pd.Timedelta(window) if len(targets) == 0: @@ -151,6 +151,11 @@ def get_nearest_continuous( filter_lang="cql-text", **kwargs, ) + if "time" not in df.columns: + raise ValueError( + "get_nearest_continuous requires a 'time' column in the response; " + "if a `properties` kwarg was passed, include 'time' in it" + ) if df.empty: return _empty_nearest_result(df), md @@ -172,6 +177,19 @@ def get_nearest_continuous( return pd.DataFrame(selected).reset_index(drop=True), md +def _coerce_targets(targets) -> pd.DatetimeIndex: + """Accept anything ``pandas.to_datetime`` consumes, including a single value. + + A bare scalar (string, ``Timestamp``, ``datetime``, …) becomes a + one-element ``DatetimeIndex``; an iterable round-trips through + ``pd.to_datetime`` directly. + """ + parsed = pd.to_datetime(targets, utc=True) + if isinstance(parsed, pd.DatetimeIndex): + return parsed + return pd.DatetimeIndex([parsed]) + + def _check_nearest_kwargs(kwargs: dict, on_tie: OnTie) -> None: """Reject kwargs the helper owns; validate ``on_tie``.""" for forbidden in ("time", "filter", "filter_lang"): diff --git a/tests/waterdata_nearest_test.py b/tests/waterdata_nearest_test.py index 4dc0ab9d..3f988a6b 100644 --- a/tests/waterdata_nearest_test.py +++ b/tests/waterdata_nearest_test.py @@ -265,3 +265,54 @@ def test_forwards_kwargs_to_get_continuous(patch_get_continuous): _, kwargs = patch_get_continuous.call_args assert kwargs["statistic_id"] == "00011" assert kwargs["approval_status"] == "Approved" + + +def test_accepts_single_string_target(patch_get_continuous): + """A bare scalar target must round-trip through pd.to_datetime. + + Regression: previously `pd.DatetimeIndex(pd.to_datetime("...", utc=True))` + raised TypeError because pd.to_datetime returns a scalar Timestamp for a + single-string input. + """ + patch_get_continuous.return_value = ( + _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), + mock.Mock(), + ) + result, _ = get_nearest_continuous( + "2023-06-15T10:30:31Z", monitoring_location_id="USGS-02238500" + ) + assert len(result) == 1 + assert result["target_time"].iloc[0] == pd.Timestamp( + "2023-06-15T10:30:31Z", tz="UTC" + ) + + +def test_accepts_single_timestamp_target(patch_get_continuous): + """A single ``pd.Timestamp`` target also round-trips.""" + patch_get_continuous.return_value = ( + _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), + mock.Mock(), + ) + target = pd.Timestamp("2023-06-15T10:30:31Z", tz="UTC") + result, _ = get_nearest_continuous(target, monitoring_location_id="USGS-02238500") + assert len(result) == 1 + + +def test_missing_time_column_raises_helpful_error(patch_get_continuous): + """If the response has no 'time' column (e.g. user passed `properties` + that excluded it), raise ValueError instead of crashing with KeyError. + """ + df_no_time = pd.DataFrame( + { + "value": [22.4], + "monitoring_location_id": ["USGS-02238500"], + } + ) + patch_get_continuous.return_value = (df_no_time, mock.Mock()) + + with pytest.raises(ValueError, match="'time' column"): + get_nearest_continuous( + ["2023-06-15T10:30:31Z"], + monitoring_location_id="USGS-02238500", + properties=["value", "monitoring_location_id"], + ) From 23898203427ff71782d091c06c9659ae02af5d2c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 10:09:57 -0500 Subject: [PATCH 2/2] Preserve list-like target inputs and avoid double-tz in test Per copilot review on PR #251: - _coerce_targets: detect non-DatetimeIndex iterables (Series, ndarray) via pd.api.types.is_scalar so the elements are preserved instead of being wrapped in a single-element list. Add a regression test passing a pd.Series of two timestamps and assert both are processed. - Tests: drop the redundant tz='UTC' on pd.Timestamp inputs that already carry a Z suffix; pandas 2.x raises on double timezone specification. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/nearest.py | 8 +++++--- tests/waterdata_nearest_test.py | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index 6bb48187..837d7928 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -181,13 +181,15 @@ def _coerce_targets(targets) -> pd.DatetimeIndex: """Accept anything ``pandas.to_datetime`` consumes, including a single value. A bare scalar (string, ``Timestamp``, ``datetime``, …) becomes a - one-element ``DatetimeIndex``; an iterable round-trips through - ``pd.to_datetime`` directly. + one-element ``DatetimeIndex``; an iterable (list, ``Series``, ``ndarray``) + is wrapped directly so its elements are preserved. """ parsed = pd.to_datetime(targets, utc=True) if isinstance(parsed, pd.DatetimeIndex): return parsed - return pd.DatetimeIndex([parsed]) + if pd.api.types.is_scalar(parsed): + return pd.DatetimeIndex([parsed]) + return pd.DatetimeIndex(parsed) def _check_nearest_kwargs(kwargs: dict, on_tie: OnTie) -> None: diff --git a/tests/waterdata_nearest_test.py b/tests/waterdata_nearest_test.py index 3f988a6b..64deeccd 100644 --- a/tests/waterdata_nearest_test.py +++ b/tests/waterdata_nearest_test.py @@ -282,9 +282,7 @@ def test_accepts_single_string_target(patch_get_continuous): "2023-06-15T10:30:31Z", monitoring_location_id="USGS-02238500" ) assert len(result) == 1 - assert result["target_time"].iloc[0] == pd.Timestamp( - "2023-06-15T10:30:31Z", tz="UTC" - ) + assert result["target_time"].iloc[0] == pd.Timestamp("2023-06-15T10:30:31Z") def test_accepts_single_timestamp_target(patch_get_continuous): @@ -293,11 +291,27 @@ def test_accepts_single_timestamp_target(patch_get_continuous): _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), mock.Mock(), ) - target = pd.Timestamp("2023-06-15T10:30:31Z", tz="UTC") + target = pd.Timestamp("2023-06-15T10:30:31Z") result, _ = get_nearest_continuous(target, monitoring_location_id="USGS-02238500") assert len(result) == 1 +def test_accepts_pandas_series_targets(patch_get_continuous): + """A ``pd.Series`` of timestamps preserves all elements (not just the first).""" + patch_get_continuous.return_value = ( + _fake_df( + [ + {"time": "2023-06-15T10:30:00Z", "value": 22.4}, + {"time": "2023-06-16T10:30:00Z", "value": 22.5}, + ] + ), + mock.Mock(), + ) + targets = pd.Series(["2023-06-15T10:30:31Z", "2023-06-16T10:30:31Z"]) + result, _ = get_nearest_continuous(targets, monitoring_location_id="USGS-02238500") + assert len(result) == 2 + + def test_missing_time_column_raises_helpful_error(patch_get_continuous): """If the response has no 'time' column (e.g. user passed `properties` that excluded it), raise ValueError instead of crashing with KeyError.