
Commit 1804b8d

Merge pull request #15 from staskh/14-bug-cannot-reshape-array
Add Pandas 1.5.x compatibility for Databricks environments and fix DST transition bugs
2 parents a585c43 + a6e314f · commit 1804b8d

50 files changed

Lines changed: 2514 additions & 177 deletions


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -179,3 +179,8 @@ sandbox.ipynb
 tests/data/day1.csv
 tests/data/day2.csv
 tests/data/day5.csv
+14-bug/
+
+# Other virtual environments
+.venv-*
+venv-*

iglu_python/active_percent.py

Lines changed: 18 additions & 4 deletions
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 
-from .utils import check_data_columns, localize_naive_timestamp
+from .utils import check_data_columns, get_local_tz
 
 
 def active_percent(
@@ -99,7 +99,7 @@ def active_percent(
     return df
 
 
-def active_percent_single(
+def active_percent_single(  # noqa: C901
     data: pd.Series,
     dt0: Optional[int] = None,
     tz: str = "",
@@ -117,12 +117,18 @@ def active_percent_single(
     if not isinstance(data.index, pd.DatetimeIndex):
         raise ValueError("Series must have a DatetimeIndex")
 
+    # localize data.index to the timezone if it is not already
+    if data.index.tzinfo is None:
+        if not tz or tz == "":
+            tz = get_local_tz()
+        data.index = data.index.tz_localize(tz)
+
     data = data.dropna()
     if len(data) == 0:
         return {"active_percent": 0, "ndays": 0, "start_date": None, "end_date": None}
 
     # Calculate time differences between consecutive measurements
-    time_diffs = np.array(data.index.diff().total_seconds() / 60)  # Convert to minutes
+    time_diffs = np.array(data.index.to_series().diff().dt.total_seconds() / 60)  # Convert to minutes
 
     # Automatically determine dt0 if not provided
     if dt0 is None:
@@ -151,12 +157,20 @@ def active_percent_single(
     elif range_type == "manual":
         # Handle consistent end date if provided
         if consistent_end_date is not None:
-            end_date = localize_naive_timestamp(pd.to_datetime(consistent_end_date))
+            end_date = pd.to_datetime(consistent_end_date)
         else:
             end_date = data.index.max()
         start_date = end_date - pd.Timedelta(days=int(ndays))
 
     # Filter data to the specified date range
+    # bring timestamps to the same timezone as start_date
+    tz = data.index.tz
+    # Localize start_date only if it is naive and tz is not None
+    if start_date.tzinfo is None and tz is not None:
+        start_date = start_date.tz_localize(tz)
+    # Localize end_date only if it is naive and tz is not None
+    if end_date.tzinfo is None and tz is not None:
+        end_date = end_date.tz_localize(tz)
     mask = (data.index >= start_date) & (data.index <= end_date)
     data = data[mask]
 
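Illustration (not part of the commit): a minimal sketch of the localization step the new code path relies on. The series, the timezone string, and the printed value are made up; only the tz_localize-then-diff pattern mirrors active_percent_single above.

import pandas as pd

# Hypothetical 5-minute CGM series with a naive DatetimeIndex
idx = pd.date_range("2024-06-01 00:00", periods=4, freq="5min")
gl = pd.Series([110.0, 112.0, 115.0, 117.0], index=idx)

tz = "America/New_York"  # stand-in for get_local_tz()
if gl.index.tzinfo is None:  # localize only when the index is naive
    gl.index = gl.index.tz_localize(tz)

# Gaps in minutes, computed the same way as in active_percent_single
time_diffs = gl.index.to_series().diff().dt.total_seconds() / 60
print(time_diffs.dropna().unique())  # [5.]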
iglu_python/cv_measures.py

Lines changed: 7 additions & 2 deletions
@@ -82,14 +82,19 @@ def _calculate_series_cv(subject_data: pd.DataFrame | pd.Series, dt0=None, inter
     # active_days is a list of days that have at least 2 non-missing values
     # dt0 is the time frequency for interpolation in minutes
 
-    # calculate devioation and median for each day
+    # calculate deviation and median for each day
+    # with warnings.catch_warnings():
+    #     warnings.simplefilter("ignore", category=RuntimeWarning)
     daily_deviations = np.apply_along_axis(np.nanstd, 1, gd2d, ddof=1)
     daily_mean = np.apply_along_axis(np.nanmean, 1, gd2d)
 
     cv = daily_deviations * 100 / daily_mean
 
     # calculate mean of daily deviations
     cv_mean = np.nanmean(cv)
-    cv_sd = np.nanstd(cv, ddof=1)
+    if len(cv) > 1:
+        cv_sd = np.nanstd(cv, ddof=1)
+    else:
+        cv_sd = np.nan
 
     return {"CVmean": cv_mean, "CVsd": cv_sd}
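Illustration (not part of the commit): why the new length guard matters. With a single daily CV value, np.nanstd(..., ddof=1) has zero degrees of freedom, so NumPy emits a RuntimeWarning and returns NaN anyway; returning NaN explicitly skips the warning. The helper name and sample numbers below are made up.

import numpy as np

def cv_sd_safe(cv):
    # Sample SD of daily CV values; NaN when fewer than two days are available
    return float(np.nanstd(cv, ddof=1)) if len(cv) > 1 else float("nan")

print(cv_sd_safe(np.array([23.1, 27.4, 25.0])))  # ~2.15
print(cv_sd_safe(np.array([23.1])))              # nan, without a RuntimeWarning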

iglu_python/roc.py

Lines changed: 13 additions & 2 deletions
@@ -3,10 +3,10 @@
 import numpy as np
 import pandas as pd
 
-from .utils import CGMS2DayByDay, check_data_columns
+from .utils import CGMS2DayByDay, check_data_columns, get_local_tz
 
 
-def roc(
+def roc(  # noqa: C901
     data: Union[pd.DataFrame, pd.Series],
     timelag: int = 15,
     dt0: int = 5,
@@ -138,6 +138,17 @@ def roc_single(data: pd.DataFrame, timelag: int, dt0: int = None, inter_gap: int
         if len(subject_data) == 0:
             continue
 
+        # Ensure 'time' is a DatetimeIndex before localizing
+        if not tz or tz == "":
+            tz = get_local_tz()
+        if pd.api.types.is_datetime64_any_dtype(subject_data["time"]):
+            if subject_data["time"].dt.tz is None:
+                subject_data["time"] = subject_data["time"].dt.tz_localize(tz)
+            else:
+                subject_data["time"] = subject_data["time"].dt.tz_convert(tz)
+        else:
+            subject_data["time"] = pd.to_datetime(subject_data["time"]).dt.tz_localize(tz)
+
         roc_values = roc_single(subject_data, timelag, dt0, inter_gap, tz)
 
         # Create time points for ROC values
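Illustration (not part of the commit): the same localize-or-convert branching as the hunk above, run on a made-up DataFrame. The data and timezone string are illustrative; the is_datetime64_any_dtype / tz_localize / tz_convert logic mirrors the diff.

import pandas as pd

df = pd.DataFrame({
    "id": ["s1"] * 3,
    "time": ["2024-06-01 08:00", "2024-06-01 08:05", "2024-06-01 08:10"],  # strings, naive
    "gl": [100, 104, 109],
})
tz = "Europe/London"  # stand-in for get_local_tz()

if pd.api.types.is_datetime64_any_dtype(df["time"]):
    if df["time"].dt.tz is None:
        df["time"] = df["time"].dt.tz_localize(tz)   # naive datetimes -> localize
    else:
        df["time"] = df["time"].dt.tz_convert(tz)    # aware datetimes -> convert
else:
    df["time"] = pd.to_datetime(df["time"]).dt.tz_localize(tz)  # parse, then localize

print(df["time"].dt.tz)  # Europe/London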

iglu_python/utils.py

Lines changed: 87 additions & 51 deletions
@@ -45,7 +45,7 @@ def get_local_tz():
     return local_tz
 
 
-def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFrame:
+def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFrame:  # noqa: C901
     """
     Check if the input DataFrame has the required columns and correct data types.
 
@@ -90,8 +90,15 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
         except Exception as e:
             raise ValueError("Column 'time' must be datetime") from e
 
-    if not pd.api.types.is_string_dtype(data["id"]):
-        data["id"] = data["id"].astype(str)
+    # Check if id column is string-like (pandas 1.5.x compatible)
+    try:
+        # Try pandas 2.0+ method first
+        if not pd.api.types.is_string_dtype(data["id"]):
+            data["id"] = data["id"].astype(str)
+    except AttributeError:
+        # Fallback for pandas 1.5.x
+        if not isinstance(data["id"].dtype, object):
+            data["id"] = data["id"].astype(str)
 
     # check if data frame empty
     if data.empty:
@@ -105,25 +112,26 @@ def check_data_columns(data: pd.DataFrame, time_check=False, tz="") -> pd.DataFr
     # if data["gl"].isna().any():
     #     warnings.warn("Data contains missing glucose values")
 
-    # convert time to specified timezone
-    # TODO: check if this is correct (R-implementation compatibility)
-    # if tz and tz != "":
-    #     # First remove timezone information, then localize to specified timezone
-    #     data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
-    #
-    # this is implementation compatible with R implementation
-    # but seems incorrect, as it convert time to TZ instead of localizing it to TZ
-    if tz != "":
-        # Create a copy to avoid dtype warning and properly handle timezone conversion
-        data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp).dt.tz_convert(tz)
-    else:
-        # Create a copy to avoid dtype warning
-        data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp)
+    if time_check:
+        # convert time to specified timezone
+        # TODO: check if this is correct (R-implementation compatibility)
+        # if tz and tz != "":
+        #     # First remove timezone information, then localize to specified timezone
+        #     data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None).dt.tz_localize(tz)
+        #
+        # this is implementation compatible with R implementation
+        # but seems incorrect, as it convert time to TZ instead of localizing it to TZ
+        if tz and tz != "":
+            # Create a copy to avoid dtype warning and properly handle timezone conversion
+            data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp).dt.tz_convert(tz)
+        else:
+            # Create a copy to avoid dtype warning
+            data["time"] = pd.to_datetime(data["time"]).apply(localize_naive_timestamp)
 
     return data
 
 
-def CGMS2DayByDay(
+def CGMS2DayByDay(  # noqa: C901
     data: pd.DataFrame | pd.Series,
     dt0: Optional[pd.Timestamp] = None,
     inter_gap: int = 45,
@@ -135,6 +143,7 @@ def CGMS2DayByDay(
     The function takes CGM data and interpolates it onto a uniform time grid,
     with each row representing a day and each column representing a time point.
     Missing values are linearly interpolated when close enough to non-missing values.
+    Note: all datetime indexes are converted into naive format to avoid DST transition bugs.
 
     data : pd.DataFrame or pd.Series
         DataFrame with columns 'id', 'time', and 'gl'. Should only be data for 1 subject.
@@ -172,33 +181,60 @@ def CGMS2DayByDay(
     if isinstance(data, pd.Series):
         if not isinstance(data.index, pd.DatetimeIndex):
             raise ValueError("Series must have a DatetimeIndex")
-        data = pd.DataFrame(
-            {
-                "id": ["subject1"] * len(data.values),
-                "time": data.index,
-                "gl": data.values,
-            }
-        )
-    # Check data format
-    data = check_data_columns(data, tz)
-
-    # Get unique subjects
-    subjects = data["id"].unique()
-    if len(subjects) > 1:
-        raise ValueError("Multiple subjects detected. Please provide a single subject.")
-
-    # Sort by time
-    data = data.sort_values("time")
+        # convert time to naive timezone (so no DST transition issues in np.interp())
+        data.index = data.index.tz_localize(None)
+    elif isinstance(data, pd.DataFrame):
+        # convert dataframe to series
+        # check that all id's are the same
+        if not data["id"].nunique() == 1:
+            raise ValueError("Multiple subjects detected. Please provide a single subject.")
+        # check that time is datetime
+        if not pd.api.types.is_datetime64_any_dtype(data["time"]):
+            try:
+                data["time"] = pd.to_datetime(data["time"])
+            except Exception as e:
+                raise ValueError("Column 'time' must be datetime") from e
+        # Check data types
+        if not pd.api.types.is_numeric_dtype(data["gl"]):
+            try:
+                data["gl"] = pd.to_numeric(data["gl"])
+            except Exception as e:
+                raise ValueError("Column 'gl' must be numeric") from e
+        # convert dataframe to series
+        data_reset = data.reset_index(drop=True)
+        # convert time to naive timezone
+        # (so index would not convert it into UTC with a shift and no DST transition issues in np.interp())
+        data_reset["time"] = data_reset["time"].dt.tz_localize(None)
+        data = pd.Series(data_reset["gl"].values, index=data_reset["time"].values)
+    else:
+        raise ValueError("Input must be a Series or DataFrame")
 
     # Calculate time step (dt0)
     if dt0 is None:
-        # Use most common time difference
-        time_diffs = data["time"].diff().dropna()
-        dt0 = int(time_diffs.mode().iloc[0].total_seconds() / 60)
+        # Use most common time difference (for pandas 1.5.x backward compatibility)
+        time_diffs = pd.Series(data.index).diff().dropna()
+        # Pandas TimedeltaIndex does not have a .mode() method directly.
+        # We'll convert to seconds and use pd.Series.mode()
+        # Use .dt accessor for pandas 1.5.x compatibility
+        dt0 = int((time_diffs.dt.total_seconds() / 60).mode().iloc[0])
+
+    # Create time grid (pandas 1.5.x compatible)
+    min_time = data.index.min()
+    max_time = data.index.max()
+
+    # Use compatible floor/ceil methods for pandas 1.5.x
+    if hasattr(min_time, "floor"):
+        start_time = min_time.floor("D")
+    else:
+        # Fallback for pandas 1.5.x
+        start_time = pd.Timestamp(min_time.date())
+
+    if hasattr(max_time, "ceil"):
+        end_time = max_time.ceil("D")
+    else:
+        # Fallback for pandas 1.5.x
+        end_time = pd.Timestamp(max_time.date()) + pd.Timedelta(days=1)
 
-    # Create time grid
-    start_time = data["time"].min().floor("D")
-    end_time = data["time"].max().ceil("D")
     time_grid = pd.date_range(start=start_time, end=end_time, freq=f"{dt0}min")
     if is_iglu_r_compatible():
         # remove the first time point
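Illustration (not part of the commit): the dt0 estimation from the hunk above on made-up timestamps. The index is wrapped in a Series so .diff() and .mode() behave the same on pandas 1.5.x and 2.x.

import pandas as pd

# Hypothetical CGM timestamps: mostly 5-minute sampling with one longer gap
idx = pd.DatetimeIndex([
    "2024-06-01 08:00", "2024-06-01 08:05", "2024-06-01 08:10",
    "2024-06-01 08:25", "2024-06-01 08:30",
])

time_diffs = pd.Series(idx).diff().dropna()                      # Timedelta Series
dt0 = int((time_diffs.dt.total_seconds() / 60).mode().iloc[0])   # most common step, in minutes
print(dt0)  # 5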
@@ -210,26 +246,26 @@
     # find gaps in the data (using original data indexes, not time grid)
     gaps = []
     for i in range(len(data) - 1):
-        if (data["time"].iloc[i + 1] - data["time"].iloc[i]).total_seconds() > inter_gap * 60:
+        if (data.index[i + 1] - data.index[i]).total_seconds() > inter_gap * 60:
             gaps.append((i, i + 1))
 
     # Interpolate glucose values
     interp_data = np.interp(
         (time_grid - start_time).total_seconds() / 60,
-        (data["time"] - start_time).dt.total_seconds() / 60,
-        data["gl"],
+        (data.index - start_time).total_seconds() / 60,
+        data.values,
         left=np.nan,
         right=np.nan,
     )
 
     # put nan in the gaps
     for gap in gaps:
         gap_start_idx = gap[0]
-        gap_start_time = data["time"].iloc[gap_start_idx]
+        gap_start_time = data.index[gap_start_idx]
         # find the index of the gap start in the time grid
         gap_start_idx_in_time_grid = int(np.floor((gap_start_time - start_time).total_seconds() / (60 * dt0)))
         gap_end_idx = gap[1]
-        gap_end_time = data["time"].iloc[gap_end_idx]
+        gap_end_time = data.index[gap_end_idx]
         # find the index of the gap end in the time grid
         gap_end_idx_in_time_grid = int(
             # -1sec to indicate time before measurement
@@ -238,10 +274,10 @@
         # put nan in the gap
         interp_data[gap_start_idx_in_time_grid:gap_end_idx_in_time_grid] = np.nan
 
-    # for compatibility with the R package, set values to nan before data['time'].min() and after data['time'].max()
-    # find index of timegrid before data['time'].min() and after data['time'].max()
-    # head_min_idx = np.where(time_grid >= data['time'].min())[0][0]
-    # tail_max_idx = np.where(time_grid <= data['time'].max())[0][-1] + 1
+    # for compatibility with the R package, set values to nan before data.index.min() and after data.index.max()
+    # find index of timegrid before data.index.min() and after data.index.max()
+    # head_min_idx = np.where(time_grid >= data.index.min())[0][0]
+    # tail_max_idx = np.where(time_grid <= data.index.max())[0][-1] + 1
     # interp_data[:head_min_idx] = np.nan
     # interp_data[tail_max_idx:] = np.nan
 
@@ -260,7 +296,7 @@
     return interp_data, actual_dates, dt0
 
 
-def gd2d_to_df(gd2d, actual_dates, dt0):
+def gd2d_to_df(gd2d, actual_dates, dt0):  # noqa: C901
     """Convert gd2d (CGMS2DayByDay output) to a pandas DataFrame"""
     df = pd.DataFrame({"time": [], "gl": []})
 
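Illustration (not part of the commit): the naive-timestamp convention the rewrite adopts before calling np.interp. Dropping the timezone keeps wall-clock times, so the minute offsets stay on a regular grid across a DST transition; the timestamps, glucose values, and 5-minute grid below are made up.

import numpy as np
import pandas as pd

# Aware timestamps spanning the US spring-forward (2024-03-10, 02:00 -> 03:00)
aware = pd.DatetimeIndex(
    ["2024-03-10 01:50", "2024-03-10 01:55", "2024-03-10 03:00"]
).tz_localize("America/New_York")

naive = aware.tz_localize(None)      # drop tz, keep wall-clock times
start = naive.min().floor("D")

sample_minutes = (naive - start).total_seconds() / 60           # [110, 115, 180]
grid = pd.date_range(start, start + pd.Timedelta(hours=4), freq="5min")
grid_minutes = (grid - start).total_seconds() / 60

gl = np.array([120.0, 118.0, 110.0])
interp = np.interp(grid_minutes, sample_minutes, gl, left=np.nan, right=np.nan)
print(interp[22:25])  # [120.0, 118.0, ~117.4]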
pyproject.toml

Lines changed: 22 additions & 4 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "iglu_python"
-version = "0.3.1"
+version = "0.4.0"
 description = "Python implementation of the iglu package for continuous glucose monitoring data analysis"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -28,7 +28,7 @@ dependencies = [
     "pandas",
     "tzlocal",
     "openpyxl",
-    "matplotlib"
+    "matplotlib",
 ]
 
 [project.urls]
@@ -39,13 +39,31 @@ Issues = "https://github.com/staskh/iglu_python/issues"
 
 [project.optional-dependencies]
 dev = [
-    "pytest>=7.0.0",
-    "pytest-cov>=4.0.0",
+    "pytest>=8.4.2",
+    "pytest-cov>=7.0.0",
     "black>=25.1.0",
    "isort>=5.0.0",
     "mypy>=1.0.0",
     "ruff>=0.1.0",
     "pre-commit>=3.0.0",
+    "hatch>=1.14.1",
+    "twine>=6.2.0",
+    "pyarrow>=21.0.0",
+]
+test = [
+    "pytest>=8.4.2",
+    "pytest-cov>=7.0.0",
+]
+lint = [
+    "black>=25.1.0",
+    "isort>=5.0.0",
+    "mypy>=1.0.0",
+    "ruff>=0.1.0",
+    "pre-commit>=3.0.0",
+]
+build = [
+    "hatch>=1.14.1",
+    "twine>=6.2.0",
 ]
 
 [tool.hatch.build.targets.wheel]