@@ -45,7 +45,7 @@ def get_local_tz():
4545 return local_tz
4646
4747
48- def check_data_columns (data : pd .DataFrame , time_check = False , tz = "" ) -> pd .DataFrame :
48+ def check_data_columns (data : pd .DataFrame , time_check = False , tz = "" ) -> pd .DataFrame : # noqa: C901
4949 """
5050 Check if the input DataFrame has the required columns and correct data types.
5151
@@ -90,8 +90,15 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
9090 except Exception as e :
9191 raise ValueError ("Column 'time' must be datetime" ) from e
9292
93- if not pd .api .types .is_string_dtype (data ["id" ]):
94- data ["id" ] = data ["id" ].astype (str )
93+ # Check if id column is string-like (pandas 1.5.x compatible)
94+ try :
95+ # Try pandas 2.0+ method first
96+ if not pd .api .types .is_string_dtype (data ["id" ]):
97+ data ["id" ] = data ["id" ].astype (str )
98+ except AttributeError :
99+ # Fallback for pandas 1.5.x
100+ if not isinstance (data ["id" ].dtype , object ):
101+ data ["id" ] = data ["id" ].astype (str )
95102
96103 # check if data frame empty
97104 if data .empty :
@@ -105,25 +112,26 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
105112 # if data["gl"].isna().any():
106113 # warnings.warn("Data contains missing glucose values")
107114
108- # convert time to specified timezone
109- # TODO: check if this is correct (R-implementation compatibility)
110- # if tz and tz != "":
111- # # First remove timezone information, then localize to specified timezone
112- # data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
113- #
114- # this is implementation compatible with R implementation
115- # but seems incorrect, as it convert time to TZ instead of localizing it to TZ
116- if tz != "" :
117- # Create a copy to avoid dtype warning and properly handle timezone conversion
118- data ["time" ] = pd .to_datetime (data ["time" ]).apply (localize_naive_timestamp ).dt .tz_convert (tz )
119- else :
120- # Create a copy to avoid dtype warning
121- data ["time" ] = pd .to_datetime (data ["time" ]).apply (localize_naive_timestamp )
115+ if time_check :
116+ # convert time to specified timezone
117+ # TODO: check if this is correct (R-implementation compatibility)
118+ # if tz and tz != "":
119+ # # First remove timezone information, then localize to specified timezone
120+ # data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
121+ #
122+ # this implementation is compatible with the R implementation,
123+ # but seems incorrect, as it converts time to TZ instead of localizing it to TZ
124+ if tz and tz != "" :
125+ # Create a copy to avoid dtype warning and properly handle timezone conversion
126+ data ["time" ] = pd .to_datetime (data ["time" ]).apply (localize_naive_timestamp ).dt .tz_convert (tz )
127+ else :
128+ # Create a copy to avoid dtype warning
129+ data ["time" ] = pd .to_datetime (data ["time" ]).apply (localize_naive_timestamp )
122130
123131 return data
124132
125133
126- def CGMS2DayByDay (
134+ def CGMS2DayByDay ( # noqa: C901
127135 data : pd .DataFrame | pd .Series ,
128136 dt0 : Optional [pd .Timestamp ] = None ,
129137 inter_gap : int = 45 ,
@@ -135,6 +143,7 @@ def CGMS2DayByDay(
135143 The function takes CGM data and interpolates it onto a uniform time grid,
136144 with each row representing a day and each column representing a time point.
137145 Missing values are linearly interpolated when close enough to non-missing values.
146+ Note: all datetime indexes are converted to timezone-naive format to avoid DST transition bugs.
138147
139148 data : pd.DataFrame or pd.Series
140149 DataFrame with columns 'id', 'time', and 'gl'. Should only be data for 1 subject.
@@ -172,33 +181,60 @@ def CGMS2DayByDay(
172181 if isinstance (data , pd .Series ):
173182 if not isinstance (data .index , pd .DatetimeIndex ):
174183 raise ValueError ("Series must have a DatetimeIndex" )
175- data = pd .DataFrame (
176- {
177- "id" : ["subject1" ] * len (data .values ),
178- "time" : data .index ,
179- "gl" : data .values ,
180- }
181- )
182- # Check data format
183- data = check_data_columns (data , tz )
184-
185- # Get unique subjects
186- subjects = data ["id" ].unique ()
187- if len (subjects ) > 1 :
188- raise ValueError ("Multiple subjects detected. Please provide a single subject." )
189-
190- # Sort by time
191- data = data .sort_values ("time" )
184+ # make the index timezone-naive (avoids DST transition issues in np.interp())
185+ data .index = data .index .tz_localize (None )
186+ elif isinstance (data , pd .DataFrame ):
187+ # convert dataframe to series
188+ # check that all id's are the same
189+ if not data ["id" ].nunique () == 1 :
190+ raise ValueError ("Multiple subjects detected. Please provide a single subject." )
191+ # check that time is datetime
192+ if not pd .api .types .is_datetime64_any_dtype (data ["time" ]):
193+ try :
194+ data ["time" ] = pd .to_datetime (data ["time" ])
195+ except Exception as e :
196+ raise ValueError ("Column 'time' must be datetime" ) from e
197+ # Check data types
198+ if not pd .api .types .is_numeric_dtype (data ["gl" ]):
199+ try :
200+ data ["gl" ] = pd .to_numeric (data ["gl" ])
201+ except Exception as e :
202+ raise ValueError ("Column 'gl' must be numeric" ) from e
203+ # convert dataframe to series
204+ data_reset = data .reset_index (drop = True )
205+ # make the time column timezone-naive
206+ # (so building the index from .values does not shift it to UTC, and np.interp() sees no DST transitions)
207+ data_reset ["time" ] = data_reset ["time" ].dt .tz_localize (None )
208+ data = pd .Series (data_reset ["gl" ].values , index = data_reset ["time" ].values )
209+ else :
210+ raise ValueError ("Input must be a Series or DataFrame" )
192211
193212 # Calculate time step (dt0)
194213 if dt0 is None :
195- # Use most common time difference
196- time_diffs = data ["time" ].diff ().dropna ()
197- dt0 = int (time_diffs .mode ().iloc [0 ].total_seconds () / 60 )
214+ # Use most common time difference (for pandas 1.5.x backward compatibility)
215+ time_diffs = pd .Series (data .index ).diff ().dropna ()
216+ # Pandas TimedeltaIndex does not have a .mode() method directly.
217+ # We'll convert to seconds and use pd.Series.mode()
218+ # Use .dt accessor for pandas 1.5.x compatibility
219+ dt0 = int ((time_diffs .dt .total_seconds () / 60 ).mode ().iloc [0 ])
220+
221+ # Create time grid (pandas 1.5.x compatible)
222+ min_time = data .index .min ()
223+ max_time = data .index .max ()
224+
225+ # Use compatible floor/ceil methods for pandas 1.5.x
226+ if hasattr (min_time , "floor" ):
227+ start_time = min_time .floor ("D" )
228+ else :
229+ # Fallback for pandas 1.5.x
230+ start_time = pd .Timestamp (min_time .date ())
231+
232+ if hasattr (max_time , "ceil" ):
233+ end_time = max_time .ceil ("D" )
234+ else :
235+ # Fallback for pandas 1.5.x
236+ end_time = pd .Timestamp (max_time .date ()) + pd .Timedelta (days = 1 )
198237
199- # Create time grid
200- start_time = data ["time" ].min ().floor ("D" )
201- end_time = data ["time" ].max ().ceil ("D" )
202238 time_grid = pd .date_range (start = start_time , end = end_time , freq = f"{ dt0 } min" )
203239 if is_iglu_r_compatible ():
204240 # remove the first time point
@@ -210,26 +246,26 @@ def CGMS2DayByDay(
210246 # find gaps in the data (using original data indexes, not time grid)
211247 gaps = []
212248 for i in range (len (data ) - 1 ):
213- if (data [ "time" ]. iloc [i + 1 ] - data [ "time" ]. iloc [i ]).total_seconds () > inter_gap * 60 :
249+ if (data . index [i + 1 ] - data . index [i ]).total_seconds () > inter_gap * 60 :
214250 gaps .append ((i , i + 1 ))
215251
216252 # Interpolate glucose values
217253 interp_data = np .interp (
218254 (time_grid - start_time ).total_seconds () / 60 ,
219- (data [ "time" ] - start_time ). dt .total_seconds () / 60 ,
220- data [ "gl" ] ,
255+ (data . index - start_time ).total_seconds () / 60 ,
256+ data . values ,
221257 left = np .nan ,
222258 right = np .nan ,
223259 )
224260
225261 # put nan in the gaps
226262 for gap in gaps :
227263 gap_start_idx = gap [0 ]
228- gap_start_time = data [ "time" ]. iloc [gap_start_idx ]
264+ gap_start_time = data . index [gap_start_idx ]
229265 # find the index of the gap start in the time grid
230266 gap_start_idx_in_time_grid = int (np .floor ((gap_start_time - start_time ).total_seconds () / (60 * dt0 )))
231267 gap_end_idx = gap [1 ]
232- gap_end_time = data [ "time" ]. iloc [gap_end_idx ]
268+ gap_end_time = data . index [gap_end_idx ]
233269 # find the index of the gap end in the time grid
234270 gap_end_idx_in_time_grid = int (
235271 # -1sec to indicate time before measurement
@@ -238,10 +274,10 @@ def CGMS2DayByDay(
238274 # put nan in the gap
239275 interp_data [gap_start_idx_in_time_grid :gap_end_idx_in_time_grid ] = np .nan
240276
241- # for compatibility with the R package, set values to nan before data['time']. min() and after data['time'] .max()
242- # find index of timegrid before data['time']. min() and after data['time'] .max()
243- # head_min_idx = np.where(time_grid >= data['time'] .min())[0][0]
244- # tail_max_idx = np.where(time_grid <= data['time'] .max())[0][-1] + 1
277+ # for compatibility with the R package, set values to nan before data.index. min() and after data.index .max()
278+ # find index of timegrid before data.index. min() and after data.index .max()
279+ # head_min_idx = np.where(time_grid >= data.index .min())[0][0]
280+ # tail_max_idx = np.where(time_grid <= data.index .max())[0][-1] + 1
245281 # interp_data[:head_min_idx] = np.nan
246282 # interp_data[tail_max_idx:] = np.nan
247283
@@ -260,7 +296,7 @@ def CGMS2DayByDay(
260296 return interp_data , actual_dates , dt0
261297
262298
263- def gd2d_to_df (gd2d , actual_dates , dt0 ):
299+ def gd2d_to_df (gd2d , actual_dates , dt0 ): # noqa: C901
264300 """Convert gd2d (CGMS2DayByDay output) to a pandas DataFrame"""
265301 df = pd .DataFrame ({"time" : [], "gl" : []})
266302
0 commit comments