
Commit 1804b8d

Merge pull request #15 from staskh/14-bug-cannot-reshape-array
Add Pandas 1.5.x compatibility for Databricks environments and fix DST transition bugs
2 parents a585c43 + a6e314f · commit 1804b8d

50 files changed

Lines changed: 2514 additions & 177 deletions


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -179,3 +179,8 @@ sandbox.ipynb
 tests/data/day1.csv
 tests/data/day2.csv
 tests/data/day5.csv
+14-bug/
+
+# Other virtual environments
+.venv-*
+venv-*

iglu_python/active_percent.py

Lines changed: 18 additions & 4 deletions
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 
-from .utils import check_data_columns, localize_naive_timestamp
+from .utils import check_data_columns, get_local_tz
 
 
 def active_percent(
@@ -99,7 +99,7 @@ def active_percent(
     return df
 
 
-def active_percent_single(
+def active_percent_single(  # noqa: C901
     data: pd.Series,
     dt0: Optional[int] = None,
     tz: str = "",
@@ -117,12 +117,18 @@ def active_percent_single(
     if not isinstance(data.index, pd.DatetimeIndex):
         raise ValueError("Series must have a DatetimeIndex")
 
+    # localize data.index to the timezone if it is not already
+    if data.index.tzinfo is None:
+        if not tz or tz == "":
+            tz = get_local_tz()
+        data.index = data.index.tz_localize(tz)
+
     data = data.dropna()
     if len(data) == 0:
         return {"active_percent": 0, "ndays": 0, "start_date": None, "end_date": None}
 
     # Calculate time differences between consecutive measurements
-    time_diffs = np.array(data.index.diff().total_seconds() / 60)  # Convert to minutes
+    time_diffs = np.array(data.index.to_series().diff().dt.total_seconds() / 60)  # Convert to minutes
 
     # Automatically determine dt0 if not provided
     if dt0 is None:
@@ -151,12 +157,20 @@ def active_percent_single(
     elif range_type == "manual":
         # Handle consistent end date if provided
         if consistent_end_date is not None:
-            end_date = localize_naive_timestamp(pd.to_datetime(consistent_end_date))
+            end_date = pd.to_datetime(consistent_end_date)
         else:
             end_date = data.index.max()
         start_date = end_date - pd.Timedelta(days=int(ndays))
 
     # Filter data to the specified date range
+    # bring timestamps to the same timezone as start_date
+    tz = data.index.tz
+    # Localize start_date only if it is naive and tz is not None
+    if start_date.tzinfo is None and tz is not None:
+        start_date = start_date.tz_localize(tz)
+    # Localize end_date only if it is naive and tz is not None
+    if end_date.tzinfo is None and tz is not None:
+        end_date = end_date.tz_localize(tz)
     mask = (data.index >= start_date) & (data.index <= end_date)
     data = data[mask]
 
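Illustration (not part of the commit): a minimal sketch of the localization step the new code path relies on. The series, the timezone string, and the printed value are made up; only the tz_localize-then-diff pattern mirrors active_percent_single above.

import pandas as pd

# Hypothetical 5-minute CGM series with a naive DatetimeIndex
idx = pd.date_range("2024-06-01 00:00", periods=4, freq="5min")
gl = pd.Series([110.0, 112.0, 115.0, 117.0], index=idx)

tz = "America/New_York"  # stand-in for get_local_tz()
if gl.index.tzinfo is None:  # localize only when the index is naive
    gl.index = gl.index.tz_localize(tz)

# Gaps in minutes, computed the same way as in active_percent_single
time_diffs = gl.index.to_series().diff().dt.total_seconds() / 60
print(time_diffs.dropna().unique())  # [5.]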
iglu_python/cv_measures.py

Lines changed: 7 additions & 2 deletions
@@ -82,14 +82,19 @@ def _calculate_series_cv(subject_data: pd.DataFrame | pd.Series, dt0=None, inter
     # active_days is a list of days that have at least 2 non-missing values
     # dt0 is the time frequency for interpolation in minutes
 
-    # calculate devioation and median for each day
+    # calculate deviation and median for each day
+    # with warnings.catch_warnings():
+    #     warnings.simplefilter("ignore", category=RuntimeWarning)
     daily_deviations = np.apply_along_axis(np.nanstd, 1, gd2d, ddof=1)
     daily_mean = np.apply_along_axis(np.nanmean, 1, gd2d)
 
     cv = daily_deviations * 100 / daily_mean
 
     # calculate mean of daily deviations
     cv_mean = np.nanmean(cv)
-    cv_sd = np.nanstd(cv, ddof=1)
+    if len(cv) > 1:
+        cv_sd = np.nanstd(cv, ddof=1)
+    else:
+        cv_sd = np.nan
 
     return {"CVmean": cv_mean, "CVsd": cv_sd}
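Illustration (not part of the commit): why the new length guard matters. With a single daily CV value, np.nanstd(..., ddof=1) has zero degrees of freedom, so NumPy emits a RuntimeWarning and returns NaN anyway; returning NaN explicitly skips the warning. The helper name and sample numbers below are made up.

import numpy as np

def cv_sd_safe(cv):
    # Sample SD of daily CV values; NaN when fewer than two days are available
    return float(np.nanstd(cv, ddof=1)) if len(cv) > 1 else float("nan")

print(cv_sd_safe(np.array([23.1, 27.4, 25.0])))  # ~2.15
print(cv_sd_safe(np.array([23.1])))              # nan, without a RuntimeWarning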

iglu_python/roc.py

Lines changed: 13 additions & 2 deletions
@@ -3,10 +3,10 @@
 import numpy as np
 import pandas as pd
 
-from .utils import CGMS2DayByDay, check_data_columns
+from .utils import CGMS2DayByDay, check_data_columns, get_local_tz
 
 
-def roc(
+def roc(  # noqa: C901
     data: Union[pd.DataFrame, pd.Series],
     timelag: int = 15,
     dt0: int = 5,
@@ -138,6 +138,17 @@ def roc_single(data: pd.DataFrame, timelag: int, dt0: int = None, inter_gap: int
         if len(subject_data) == 0:
             continue
 
+        # Ensure 'time' is a DatetimeIndex before localizing
+        if not tz or tz == "":
+            tz = get_local_tz()
+        if pd.api.types.is_datetime64_any_dtype(subject_data["time"]):
+            if subject_data["time"].dt.tz is None:
+                subject_data["time"] = subject_data["time"].dt.tz_localize(tz)
+            else:
+                subject_data["time"] = subject_data["time"].dt.tz_convert(tz)
+        else:
+            subject_data["time"] = pd.to_datetime(subject_data["time"]).dt.tz_localize(tz)
+
         roc_values = roc_single(subject_data, timelag, dt0, inter_gap, tz)
 
         # Create time points for ROC values
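Illustration (not part of the commit): the same localize-or-convert branching as the hunk above, run on a made-up DataFrame. The data and timezone string are illustrative; the is_datetime64_any_dtype / tz_localize / tz_convert logic mirrors the diff.

import pandas as pd

df = pd.DataFrame({
    "id": ["s1"] * 3,
    "time": ["2024-06-01 08:00", "2024-06-01 08:05", "2024-06-01 08:10"],  # strings, naive
    "gl": [100, 104, 109],
})
tz = "Europe/London"  # stand-in for get_local_tz()

if pd.api.types.is_datetime64_any_dtype(df["time"]):
    if df["time"].dt.tz is None:
        df["time"] = df["time"].dt.tz_localize(tz)   # naive datetimes -> localize
    else:
        df["time"] = df["time"].dt.tz_convert(tz)    # aware datetimes -> convert
else:
    df["time"] = pd.to_datetime(df["time"]).dt.tz_localize(tz)  # parse, then localize

print(df["time"].dt.tz)  # Europe/London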

iglu_python/utils.py

Lines changed: 87 additions & 51 deletions
@@ -45,7 +45,7 @@ def get_local_tz():
     return local_tz
 
 
-def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFrame:
+def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFrame:  # noqa: C901
     """
     Check if the input DataFrame has the required columns and correct data types.
 
@@ -90,8 +90,15 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
         except Exception as e:
             raise ValueError("Column 'time' must be datetime") from e
 
-    if not pd.api.types.is_string_dtype(data["id"]):
-        data["id"] = data["id"].astype(str)
+    # Check if id column is string-like (pandas 1.5.x compatible)
+    try:
+        # Try pandas 2.0+ method first
+        if not pd.api.types.is_string_dtype(data["id"]):
+            data["id"] = data["id"].astype(str)
+    except AttributeError:
+        # Fallback for pandas 1.5.x
+        if not isinstance(data["id"].dtype, object):
+            data["id"] = data["id"].astype(str)
 
     # check if data frame empty
     if data.empty:
@@ -105,25 +112,26 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
     # if data["gl"].isna().any():
     #     warnings.warn("Data contains missing glucose values")
 
-    # convert time to specified timezone
-    # TODO: check if this is correct (R-implementation compatibility)
-    # if tz and tz != "":
-    #     # First remove timezone information, then localize to specified timezone
-    #     data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
-    #
-    # this is implementation compatible with R implementation
-    # but seems incorrect, as it convert time to TZ instead of localizing it to TZ
-    if tz != "":
-        # Create a copy to avoid dtype warning and properly handle timezone conversion
-        data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp).dt.tz_convert(tz)
-    else:
-        # Create a copy to avoid dtype warning
-        data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp)
+    if time_check:
+        # convert time to specified timezone
+        # TODO: check if this is correct (R-implementation compatibility)
+        # if tz and tz != "":
+        #     # First remove timezone information, then localize to specified timezone
+        #     data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
+        #
+        # this is implementation compatible with R implementation
+        # but seems incorrect, as it convert time to TZ instead of localizing it to TZ
+        if tz and tz != "":
+            # Create a copy to avoid dtype warning and properly handle timezone conversion
+            data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp).dt.tz_convert(tz)
+        else:
+            # Create a copy to avoid dtype warning
+            data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp)
 
     return data
 
 
-def CGMS2DayByDay(
+def CGMS2DayByDay(  # noqa: C901
     data: pd.DataFrame | pd.Series,
     dt0: Optional[pd.Timestamp] = None,
     inter_gap: int = 45,
@@ -135,6 +143,7 @@ def CGMS2DayByDay(
     The function takes CGM data and interpolates it onto a uniform time grid,
     with each row representing a day and each column representing a time point.
     Missing values are linearly interpolated when close enough to non-missing values.
+    Note: all datetime indexes are converted into naive format to avoid DST transition bugs.
 
     data : pd.DataFrame or pd.Series
         DataFrame with columns 'id', 'time', and 'gl'. Should only be data for 1 subject.
@@ -172,33 +181,60 @@ def CGMS2DayByDay(
     if isinstance(data, pd.Series):
         if not isinstance(data.index, pd.DatetimeIndex):
             raise ValueError("Series must have a DatetimeIndex")
-        data = pd.DataFrame(
-            {
-                "id": ["subject1"] * len(data.values),
-                "time": data.index,
-                "gl": data.values,
-            }
-        )
-    # Check data format
-    data = check_data_columns(data, tz)
-
-    # Get unique subjects
-    subjects = data["id"].unique()
-    if len(subjects) > 1:
-        raise ValueError("Multiple subjects detected. Please provide a single subject.")
-
-    # Sort by time
-    data = data.sort_values("time")
+        # convert time to naive timezone (so no DST transition issues in np.interp())
+        data.index = data.index.tz_localize(None)
+    elif isinstance(data, pd.DataFrame):
+        # convert dataframe to series
+        # check that all id's are the same
+        if not data["id"].nunique() == 1:
+            raise ValueError("Multiple subjects detected. Please provide a single subject.")
+        # check that time is datetime
+        if not pd.api.types.is_datetime64_any_dtype(data["time"]):
+            try:
+                data["time"] = pd.to_datetime(data["time"])
+            except Exception as e:
+                raise ValueError("Column 'time' must be datetime") from e
+        # Check data types
+        if not pd.api.types.is_numeric_dtype(data["gl"]):
+            try:
+                data["gl"] = pd.to_numeric(data["gl"])
+            except Exception as e:
+                raise ValueError("Column 'gl' must be numeric") from e
+        # convert dataframe to series
+        data_reset = data.reset_index(drop=True)
+        # convert time to naive timezone
+        # (so index would not convert it into UTC with a shift and no DST transition issues in np.interp())
+        data_reset["time"] = data_reset["time"].dt.tz_localize(None)
+        data = pd.Series(data_reset["gl"].values, index=data_reset["time"].values)
+    else:
+        raise ValueError("Input must be a Series or DataFrame")
 
     # Calculate time step (dt0)
     if dt0 is None:
-        # Use most common time difference
-        time_diffs = data["time"].diff().dropna()
-        dt0 = int(time_diffs.mode().iloc[0].total_seconds() / 60)
+        # Use most common time difference (for pandas 1.5.x backward compatibility)
+        time_diffs = pd.Series(data.index).diff().dropna()
+        # Pandas TimedeltaIndex does not have a .mode() method directly.
+        # We'll convert to seconds and use pd.Series.mode()
+        # Use .dt accessor for pandas 1.5.x compatibility
+        dt0 = int((time_diffs.dt.total_seconds() / 60).mode().iloc[0])
+
+    # Create time grid (pandas 1.5.x compatible)
+    min_time = data.index.min()
+    max_time = data.index.max()
+
+    # Use compatible floor/ceil methods for pandas 1.5.x
+    if hasattr(min_time, "floor"):
+        start_time = min_time.floor("D")
+    else:
+        # Fallback for pandas 1.5.x
+        start_time = pd.Timestamp(min_time.date())
+
+    if hasattr(max_time, "ceil"):
+        end_time = max_time.ceil("D")
+    else:
+        # Fallback for pandas 1.5.x
+        end_time = pd.Timestamp(max_time.date()) + pd.Timedelta(days=1)
 
-    # Create time grid
-    start_time = data["time"].min().floor("D")
-    end_time = data["time"].max().ceil("D")
     time_grid = pd.date_range(start=start_time, end=end_time, freq=f"{dt0}min")
     if is_iglu_r_compatible():
         # remove the first time point
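Illustration (not part of the commit): the dt0 estimation from the hunk above on made-up timestamps. The index is wrapped in a Series so .diff() and .mode() behave the same on pandas 1.5.x and 2.x.

import pandas as pd

# Hypothetical CGM timestamps: mostly 5-minute sampling with one longer gap
idx = pd.DatetimeIndex([
    "2024-06-01 08:00", "2024-06-01 08:05", "2024-06-01 08:10",
    "2024-06-01 08:25", "2024-06-01 08:30",
])

time_diffs = pd.Series(idx).diff().dropna()                      # Timedelta Series
dt0 = int((time_diffs.dt.total_seconds() / 60).mode().iloc[0])   # most common step, in minutes
print(dt0)  # 5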
@@ -210,26 +246,26 @@
     # find gaps in the data (using original data indexes, not time grid)
     gaps = []
     for i in range(len(data) - 1):
-        if (data["time"].iloc[i + 1] - data["time"].iloc[i]).total_seconds() > inter_gap * 60:
+        if (data.index[i + 1] - data.index[i]).total_seconds() > inter_gap * 60:
             gaps.append((i, i + 1))
 
     # Interpolate glucose values
     interp_data = np.interp(
         (time_grid - start_time).total_seconds() / 60,
-        (data["time"] - start_time).dt.total_seconds() / 60,
-        data["gl"],
+        (data.index - start_time).total_seconds() / 60,
+        data.values,
         left=np.nan,
         right=np.nan,
     )
 
     # put nan in the gaps
     for gap in gaps:
         gap_start_idx = gap[0]
-        gap_start_time = data["time"].iloc[gap_start_idx]
+        gap_start_time = data.index[gap_start_idx]
         # find the index of the gap start in the time grid
         gap_start_idx_in_time_grid = int(np.floor((gap_start_time - start_time).total_seconds() / (60 * dt0)))
         gap_end_idx = gap[1]
-        gap_end_time = data["time"].iloc[gap_end_idx]
+        gap_end_time = data.index[gap_end_idx]
         # find the index of the gap end in the time grid
         gap_end_idx_in_time_grid = int(
             # -1sec to indicate time before measurement
@@ -238,10 +274,10 @@
         # put nan in the gap
         interp_data[gap_start_idx_in_time_grid:gap_end_idx_in_time_grid] = np.nan
 
-    # for compatibility with the R package, set values to nan before data['time'].min() and after data['time'].max()
-    # find index of timegrid before data['time'].min() and after data['time'].max()
-    # head_min_idx = np.where(time_grid >= data['time'].min())[0][0]
-    # tail_max_idx = np.where(time_grid <= data['time'].max())[0][-1] + 1
+    # for compatibility with the R package, set values to nan before data.index.min() and after data.index.max()
+    # find index of timegrid before data.index.min() and after data.index.max()
+    # head_min_idx = np.where(time_grid >= data.index.min())[0][0]
+    # tail_max_idx = np.where(time_grid <= data.index.max())[0][-1] + 1
     # interp_data[:head_min_idx] = np.nan
     # interp_data[tail_max_idx:] = np.nan
 
@@ -260,7 +296,7 @@
     return interp_data, actual_dates, dt0
 
 
-def gd2d_to_df(gd2d, actual_dates, dt0):
+def gd2d_to_df(gd2d, actual_dates, dt0):  # noqa: C901
     """Convert gd2d (CGMS2DayByDay output) to a pandas DataFrame"""
     df = pd.DataFrame({"time": [], "gl": []})
 
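Illustration (not part of the commit): the naive-timestamp convention the rewrite adopts before calling np.interp. Dropping the timezone keeps wall-clock times, so the minute offsets stay on a regular grid across a DST transition; the timestamps, glucose values, and 5-minute grid below are made up.

import numpy as np
import pandas as pd

# Aware timestamps spanning the US spring-forward (2024-03-10, 02:00 -> 03:00)
aware = pd.DatetimeIndex(
    ["2024-03-10 01:50", "2024-03-10 01:55", "2024-03-10 03:00"]
).tz_localize("America/New_York")

naive = aware.tz_localize(None)      # drop tz, keep wall-clock times
start = naive.min().floor("D")

sample_minutes = (naive - start).total_seconds() / 60           # [110, 115, 180]
grid = pd.date_range(start, start + pd.Timedelta(hours=4), freq="5min")
grid_minutes = (grid - start).total_seconds() / 60

gl = np.array([120.0, 118.0, 110.0])
interp = np.interp(grid_minutes, sample_minutes, gl, left=np.nan, right=np.nan)
print(interp[22:25])  # [120.0, 118.0, ~117.4]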
pyproject.toml

Lines changed: 22 additions & 4 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "iglu_python"
-version = "0.3.1"
+version = "0.4.0"
 description = "Python implementation of the iglu package for continuous glucose monitoring data analysis"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -28,7 +28,7 @@ dependencies = [
     "pandas",
     "tzlocal",
     "openpyxl",
-    "matplotlib"
+    "matplotlib",
 ]
 
 [project.urls]
@@ -39,13 +39,31 @@ Issues = "https://github.com/staskh/iglu_python/issues"
 
 [project.optional-dependencies]
 dev = [
-    "pytest>=7.0.0",
-    "pytest-cov>=4.0.0",
+    "pytest>=8.4.2",
+    "pytest-cov>=7.0.0",
     "black>=25.1.0",
    "isort>=5.0.0",
     "mypy>=1.0.0",
     "ruff>=0.1.0",
     "pre-commit>=3.0.0",
+    "hatch>=1.14.1",
+    "twine>=6.2.0",
+    "pyarrow>=21.0.0",
+]
+test = [
+    "pytest>=8.4.2",
+    "pytest-cov>=7.0.0",
+]
+lint = [
+    "black>=25.1.0",
+    "isort>=5.0.0",
+    "mypy>=1.0.0",
+    "ruff>=0.1.0",
+    "pre-commit>=3.0.0",
+]
+build = [
+    "hatch>=1.14.1",
+    "twine>=6.2.0",
 ]
 
 [tool.hatch.build.targets.wheel]