# Recovered from a whitespace-collapsed patch (commit 7c66421e, "add row
# ordering by time and site and move frivolous id columns to end of df").
# In that patch this helper is added to dataretrieval/waterdata/utils.py and
# relies on that module's ``pd`` (pandas) name being in scope.
def _sort_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Sort rows by 'time', then by 'monitoring_location_id' when present.

    The frame is returned untouched when it has no 'time' column, even if
    'monitoring_location_id' exists. The input is never modified in place.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing water data.

    Returns
    -------
    pd.DataFrame
        A new DataFrame ordered by time (and by site, when that column is
        available) with a fresh 0..n-1 index, or ``df`` itself when there
        is no 'time' column to sort on.

    """
    if "time" not in df.columns:
        return df

    sort_keys = ["time"]
    if "monitoring_location_id" in df.columns:
        sort_keys.append("monitoring_location_id")

    # pandas honours ``kind`` only for single-column sorts, where the default
    # quicksort does not guarantee the relative order of rows sharing the
    # same 'time'. Mergesort is stable, so tied rows keep the order in which
    # they arrived.
    return df.sort_values(by=sort_keys, kind="mergesort", ignore_index=True)