import numpy as np


def _as_matching_arrays(X_true, X_pred):
    """
    Convert both inputs to float arrays and check that they can be compared.

    Parameters
    ----------
    X_true : array-like
        True values.
    X_pred : array-like
        Predicted values.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        ``X_true`` and ``X_pred`` as float arrays of identical shape.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr = np.asarray(X_true, dtype=float)
    pred_arr = np.asarray(X_pred, dtype=float)

    if true_arr.shape[:1] != pred_arr.shape[:1]:
        raise ValueError("X_true and X_pred must have the same length")
    # Equal length is not enough: (n,) against (n, 1) would silently broadcast
    # to an (n, n) matrix and yield a meaningless metric.
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "X_true and X_pred must have the same shape, "
            f"got {true_arr.shape} and {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("X_true and X_pred must not be empty")

    return true_arr, pred_arr


def bias(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate the BIAS, defined here as the mean of ``X_true - X_pred``.

    A positive value means the predictions are, on average, below the true
    values.

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        The BIAS value.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr, pred_arr = _as_matching_arrays(X_true, X_pred)

    return float(np.mean(true_arr - pred_arr))


def si(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate the Scatter Index (SI).

    The SI is the root of the summed squared difference between the centred
    series, normalised by the sum of ``X_true ** 2``. The normalisation uses
    ``X_true`` only, so swapping the arguments changes the result.

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        The Scatter Index value. It is ``inf`` or ``nan`` when every value of
        ``X_true`` is zero, because the normalisation term is then zero.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr, pred_arr = _as_matching_arrays(X_true, X_pred)

    true_anomaly = true_arr - true_arr.mean()
    pred_anomaly = pred_arr - pred_arr.mean()

    return float(
        np.sqrt(np.sum((true_anomaly - pred_anomaly) ** 2) / np.sum(true_arr**2))
    )


def mse(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate Mean Squared Error (MSE).

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Mean squared error.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr, pred_arr = _as_matching_arrays(X_true, X_pred)

    return float(np.mean((true_arr - pred_arr) ** 2))


def mae(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate Mean Absolute Error (MAE).

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Mean absolute error.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr, pred_arr = _as_matching_arrays(X_true, X_pred)

    return float(np.mean(np.abs(true_arr - pred_arr)))


def rmse(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate Root Mean Squared Error (RMSE).

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        Root mean squared error.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    # Input validation is delegated to mse().
    return float(np.sqrt(mse(X_true, X_pred)))


def r2(X_true: np.ndarray, X_pred: np.ndarray) -> float:
    """
    Calculate the R² score.

    Parameters
    ----------
    X_true : np.ndarray
        True values.
    X_pred : np.ndarray
        Predicted values.

    Returns
    -------
    float
        The R² score. When ``X_true`` is constant the score is undefined
        (zero total variance); in that case 1.0 is returned for a perfect
        prediction and 0.0 otherwise, instead of ``nan`` or ``-inf``.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or if they are empty.
    """

    true_arr, pred_arr = _as_matching_arrays(X_true, X_pred)

    ss_res = np.sum((true_arr - pred_arr) ** 2)
    ss_tot = np.sum((true_arr - true_arr.mean()) ** 2)

    # Detect a constant target through its range as well: rounding in the
    # mean can leave ss_tot tiny but non-zero for a constant series.
    if ss_tot == 0.0 or np.ptp(true_arr) == 0.0:
        return 1.0 if ss_res == 0.0 else 0.0

    return float(1.0 - ss_res / ss_tot)
- - 'fit_params': (optional) dict, parameters for the fit method. - - 'pipeline_attributes_to_store': (optional) list of attribute names to store for later use. - - The pipeline supports advanced parameter passing, including referencing outputs from previous steps - and using callables for dynamic parameter computation. - """ - - # Map model class names to their default fit method - _default_fit_methods = { - "LHS": "generate", - "MDA": "fit_predict", - "KMA": "fit_predict", - "SOM": "fit_predict", - "PCA": "fit_transform", - "RBF": "fit_predict", - } - - def __init__(self, steps: List[Dict[str, Any]]): - """ - Initialize the BlueMathPipeline with a sequence of steps. - - Parameters - ---------- - steps : List[Dict[str, Any]] - A list of dictionaries, each specifying at least 'name' and 'model', - and optionally 'model_init', 'model_init_params', 'fit_method', 'fit_params', - and 'pipeline_attributes_to_store'. - """ - - self.steps = steps - self._pipeline_attributes = {} # Stores attributes from previous models for later reference - - @property - def pipeline_attributes(self) -> Dict[str, Dict[str, Any]]: - """ - Get the stored model attributes from each pipeline step. - - Returns - ------- - Dict[str, Dict[str, Any]] - A dictionary mapping step names to dictionaries of stored attributes. - - Raises - ------ - ValueError - If the pipeline has not been fit yet and no attributes are stored. - """ - - if len(self._pipeline_attributes) == 0: - raise ValueError( - "No model attributes found. Please fit the pipeline first." - ) - - return self._pipeline_attributes - - def fit(self, data: Union[np.ndarray, pd.DataFrame, xr.Dataset] = None): - """ - Fit all models in the pipeline sequentially, passing the output of each step as input to the next. - - For each step, the model is (optionally) initialized, then fit using the specified method and parameters. 
- Parameters and model initialization arguments can be dynamically computed using callables or references - to previous pipeline attributes. - - Parameters - ---------- - data : Union[np.ndarray, pd.DataFrame, xr.Dataset], optional - The input data to fit the models. If None, the pipeline expects each step to handle its own data. - - Returns - ------- - The output of the final step in the pipeline (could be transformed data, predictions, etc.). - """ - - if data is not None: - data = deepcopy(data) # Avoid modifying the original input data - - # Iterate over each step in the pipeline - for step in self.steps: - # If model needs to be initialized (using model_init and model_init_params) - if "model_init" in step and "model_init_params" in step: - for init_param_name, init_param_value in step[ - "model_init_params" - ].items(): - # If the parameter is a callable, call it with (pipeline, step, data) - if callable(init_param_value): - step["model_init_params"][init_param_name] = init_param_value( - self, step, data - ) - # If the parameter is a dict with 'data' and 'function', call the function - elif ( - isinstance(init_param_value, dict) - and "data" in init_param_value - and "function" in init_param_value - and callable(init_param_value["function"]) - ): - # Call the function with (pipeline, step, data) - step["model_init_params"][init_param_name] = init_param_value[ - "function" - ](self, step, init_param_value["data"]) - # If the parameter is the string 'data', replace with the current data - elif isinstance(init_param_value, str): - if init_param_value == "data": - step["model_init_params"][init_param_name] = data - - # Actually instantiate the model with the resolved parameters - step["model"] = step["model_init"](**step["model_init_params"]) - - # Retrieve the model instance for this step - model = step["model"] - default_method = self._default_fit_methods.get(type(model).__name__) - method_name = step.get( - "fit_method", default_method - ) # Use step's method 
or default - if method_name is None: - raise ValueError( - f"No fit method found for model {type(model).__name__}. Please specify a fit_method in the step." - ) - - # Prepare parameters for the fit method, resolving any callables or references - params = step.get("fit_params", {}).copy() - for param_name, param_value in params.items(): - if callable(param_value): - params[param_name] = param_value(self, step, data) - elif ( - isinstance(param_value, dict) - and "data" in param_value - and "function" in param_value - and callable(param_value["function"]) - ): - # Call the function with (data, pipeline, step) - params[param_name] = param_value["function"]( - self, step, param_value["data"] - ) - elif isinstance(param_value, str): - if param_value == "data": - params[param_name] = data - - # Call the fit method on the model with the resolved parameters - method = getattr(model, method_name) - try: - # Some methods expect 'data' as a named argument - data = method(data=data, **params) - except Exception as _e: - # If that fails, try calling without 'data' as a named argument - data = method(**params) - - # Store specified model attributes for later use, if requested - if "pipeline_attributes_to_store" in step: - self._pipeline_attributes[step["name"]] = { - attr_name: getattr(model, attr_name) - for attr_name in step["pipeline_attributes_to_store"] - } - - return data - - def _generate_param_combinations( - self, param_grid: Dict[str, List[Any]] - ) -> List[Dict[str, Any]]: - """ - Generate all possible combinations of parameters from a parameter grid for a single pipeline step. - - Parameters - ---------- - param_grid : Dict[str, List[Any]] - Dictionary mapping parameter names to lists of values to try for each parameter. - - Returns - ------- - List[Dict[str, Any]] - List of dictionaries, each representing a unique combination of parameters. 
- """ - - keys = param_grid.keys() - values = param_grid.values() - combinations = list(itertools.product(*values)) - - return [dict(zip(keys, combo)) for combo in combinations] - - def grid_search( - self, - data: Union[np.ndarray, pd.DataFrame], - param_grid: List[Dict[str, Any]], - metric: Callable = None, - target_data: Union[np.ndarray, pd.DataFrame] = None, - plot: bool = False, - ) -> Dict[str, Any]: - """ - Perform a grid search over all possible parameter combinations for all steps in the pipeline. - - This method evaluates every possible combination of parameters (from the provided grids) for each step, - fits the pipeline, and scores the result using the provided metric or the last model's score method. - The best parameter set (lowest score) is selected and the pipeline is updated accordingly. - - Parameters - ---------- - data : Union[np.ndarray, pd.DataFrame] - The input data to fit the models. - param_grid : List[Dict[str, Any]] - List of parameter grids for each step in the pipeline. Each grid is a dict mapping parameter names - to lists of values to try. Parameters can be for either model_init_params or fit_params. - metric : Callable, optional - Function to evaluate the final output. Should take (y_true, y_pred) as arguments. - If None, will use the last model's built-in score method if available. - target_data : Union[np.ndarray, pd.DataFrame], optional - Target data to evaluate against if using a custom metric. Required if metric is provided. - plot : bool, optional - If True, plot the score for each parameter combination after grid search. Default is False. 
- - Returns - ------- - Dict[str, Any] - Dictionary containing: - - 'best_params': the best parameter set for each step - - 'best_score': the best score achieved - - 'best_output': the output of the pipeline for the best parameters - - 'all_results': a list of all parameter sets and their scores/outputs - - Raises - ------ - ValueError - If the number of parameter grids does not match the number of pipeline steps, - or if a metric is provided but no target_data is given. - """ - - if len(param_grid) != len(self.steps): - raise ValueError( - "Number of parameter grids must match number of pipeline steps" - ) - - if metric is not None and target_data is None: - raise ValueError("target_data must be provided when using a custom metric") - - # Generate all possible parameter combinations for each step - all_param_combinations = [] - for step_params in param_grid: - step_combinations = self._generate_param_combinations(step_params) - all_param_combinations.append(step_combinations) - - # Cartesian product: all possible combinations across all steps - param_combinations = list(itertools.product(*all_param_combinations)) - - best_score = float("inf") # Initialize best score as infinity (lower is better) - best_params = None - best_output = None - all_results = [] - - # Iterate over every possible parameter combination - for step_params in param_combinations: - pipeline_copy = deepcopy(self) # Work on a copy to avoid side effects - - # Update each step in the pipeline with the current parameter set - for step_idx, params in enumerate(step_params): - step = pipeline_copy.steps[step_idx] - # Assign parameters to model_init_params or fit_params as appropriate - if "model_init_params" in step: - for param_name, param_value in params.items(): - if param_name in step["model_init_params"]: - step["model_init_params"][param_name] = param_value - else: - step.setdefault("fit_params", {})[param_name] = param_value - else: - step.setdefault("fit_params", {}).update(params) - # 
Re-initialize the model if model_init_params were updated - if "model_init_params" in step and "model_init" in step: - step["model"] = step["model_init"](**step["model_init_params"]) - - # Fit the pipeline and get the output for this parameter set - output = pipeline_copy.fit(data) - - # Score the output using the provided metric or the model's score method - if metric is not None: - score = metric(target_data, output) - else: - try: - score = pipeline_copy.steps[-1]["model"].score(target_data, output) - except (AttributeError, TypeError): - raise ValueError( - "Either provide a metric function and target_data, " - "or ensure the last model has a score method" - ) - - # Store the result for this parameter set - result = {"params": step_params, "score": score, "output": output} - all_results.append(result) - - # Update the best score/params/output if this is the best so far - if score < best_score: - best_score = score - best_params = step_params - best_output = output - - # After search, update the pipeline with the best parameters found - for step_idx, params in enumerate(best_params): - step = self.steps[step_idx] - if "model_init_params" in step: - for param_name, param_value in params.items(): - if param_name in step["model_init_params"]: - step["model_init_params"][param_name] = param_value - else: - step.setdefault("fit_params", {})[param_name] = param_value - else: - step.setdefault("fit_params", {}).update(params) - if "model_init_params" in step and "model_init" in step: - step["model"] = step["model_init"](**step["model_init_params"]) - - # Plotting if requested - if plot: - try: - import matplotlib.pyplot as plt - - scores = [result["score"] for result in all_results] - plt.figure(figsize=(6, 4)) - plt.plot(range(len(scores)), scores, marker="o", linestyle="-") - plt.xlabel("Parameter Combination Index") - plt.ylabel("Score") - plt.title("Grid Search Scores for Parameter Combinations") - plt.grid(True) - plt.show() - except ImportError: - 
print("matplotlib is not installed. Cannot plot grid search results.") - - return { - "best_params": best_params, - "best_score": best_score, - "best_output": best_output, - "all_results": all_results, - } diff --git a/bluemath_tk/core/plotting/scatter.py b/bluemath_tk/core/plotting/scatter.py index edc57da..a5c2438 100644 --- a/bluemath_tk/core/plotting/scatter.py +++ b/bluemath_tk/core/plotting/scatter.py @@ -5,84 +5,12 @@ from matplotlib.axes import Axes from matplotlib.figure import Figure from scipy.stats import gaussian_kde, probplot -from sklearn.metrics import mean_squared_error +from ..metrics import bias, r2, rmse, si from .base_plotting import DefaultStaticPlotting from .colors import default_colors -def rmse(pred: np.ndarray, tar: np.ndarray) -> float: - """ - Calculate the Root Mean Square Error between predicted and target values. - - Parameters - ---------- - pred : np.ndarray - Array of predicted values. - tar : np.ndarray - Array of target/actual values. - - Returns - ------- - float - The Root Mean Square Error value. - """ - - if len(pred) != len(tar): - raise ValueError("pred and tar must have the same length") - - return np.sqrt(((pred - tar) ** 2).mean()) - - -def bias(pred: np.ndarray, tar: np.ndarray) -> float: - """ - Calculate the bias between predicted and target values. - - Parameters - ---------- - pred : np.ndarray - Array of predicted values. - tar : np.ndarray - Array of target/actual values. - - Returns - ------- - float - The bias value (mean difference between predictions and targets). - """ - - if len(pred) != len(tar): - raise ValueError("pred and tar must have the same length") - - return sum(pred - tar) / len(pred) - - -def si(pred: np.ndarray, tar: np.ndarray) -> float: - """ - Calculate the Scatter Index between predicted and target values. - - Parameters - ---------- - pred : np.ndarray - Array of predicted values. - tar : np.ndarray - Array of target/actual values. - - Returns - ------- - float - The Scatter Index value. 
- """ - - if len(pred) != len(tar): - raise ValueError("pred and tar must have the same length") - - pred_mean = pred.mean() - tar_mean = tar.mean() - - return np.sqrt(sum(((pred - pred_mean) - (tar - tar_mean)) ** 2) / (sum(tar**2))) - - def density_scatter( x: np.ndarray, y: np.ndarray ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: @@ -174,16 +102,12 @@ def validation_scatter( props = dict( boxstyle="round", facecolor="w", edgecolor="grey", linewidth=0.8, alpha=0.5 ) - mse = mean_squared_error(x2, y2) - rmse_e = rmse(x2, y2) - BIAS = bias(x2, y2) - SI = si(x2, y2) label = "\n".join( ( - r"RMSE = %.2f" % (rmse_e,), - r"MSE = %.2f" % (mse,), - r"BIAS = %.2f" % (BIAS,), - R"SI = %.2f" % (SI,), + r"BIAS = %.2f" % (bias(x2, y2),), + r"SI = %.2f" % (si(x2, y2),), + r"RMSE = %.2f" % (rmse(x2, y2),), + r"R² = %.2f" % (r2(x2, y2),), ) ) axs.text( diff --git a/bluemath_tk/deeplearning/_base_model.py b/bluemath_tk/deeplearning/_base_model.py index 4529583..1aafe24 100644 --- a/bluemath_tk/deeplearning/_base_model.py +++ b/bluemath_tk/deeplearning/_base_model.py @@ -7,7 +7,6 @@ from tqdm import tqdm from ..core.models import BlueMathModel -from .metrics import compute_all_metrics class BaseDeepLearningModel(BlueMathModel): @@ -336,48 +335,6 @@ def encode( return np.concatenate(encodings, axis=0) - def evaluate( - self, - X: np.ndarray, - y: Optional[np.ndarray] = None, - spatial: bool = False, - verbose: int = 1, - ) -> Dict[str, float]: - """ - Evaluate the model on data. - - Parameters - ---------- - X : np.ndarray - Input data. - y : np.ndarray, optional - Target data. If None, assumes autoencoder (X is target). Default is None. - spatial : bool, optional - Whether to compute spatial metrics. If True, computes spatial metrics. Default is False. - verbose : int, optional - Verbosity level. If > 0, shows progress bar during prediction. Default is 1. - - Returns - ------- - Dict[str, float] - Dictionary of evaluation metrics. 
- - Raises - ------ - ValueError - If model is not fitted. - """ - - if not self.is_fitted: - raise ValueError("Model must be fitted before evaluation.") - - X_pred = self.predict(X, verbose=verbose) - - if y is None: - y = X # Autoencoder case - - return compute_all_metrics(y, X_pred, spatial=spatial) - def save_pytorch_model(self, model_path: str, **kwargs): """ Save the PyTorch model to a file. diff --git a/bluemath_tk/deeplearning/metrics.py b/bluemath_tk/deeplearning/metrics.py deleted file mode 100644 index 527e85b..0000000 --- a/bluemath_tk/deeplearning/metrics.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -Project: BlueMath_tk -Sub-Module: deeplearning.metrics -Author: GeoOcean Research Group, Universidad de Cantabria -Repository: https://github.com/GeoOcean/BlueMath_tk.git -Status: Under development (Working) - -Evaluation metrics for deep learning models. -""" - -import numpy as np - - -def mse(A: np.ndarray, B: np.ndarray) -> float: - """ - Calculate Mean Squared Error (MSE). - - Parameters - ---------- - A : np.ndarray - True values. - B : np.ndarray - Predicted values. - - Returns - ------- - float - Mean squared error. - """ - return float(np.mean((A - B) ** 2)) - - -def mae(A: np.ndarray, B: np.ndarray) -> float: - """ - Calculate Mean Absolute Error (MAE). - - Parameters - ---------- - A : np.ndarray - True values. - B : np.ndarray - Predicted values. - - Returns - ------- - float - Mean absolute error. - """ - return float(np.mean(np.abs(A - B))) - - -def rmse(A: np.ndarray, B: np.ndarray) -> float: - """ - Calculate Root Mean Squared Error (RMSE). - - Parameters - ---------- - A : np.ndarray - True values. - B : np.ndarray - Predicted values. - - Returns - ------- - float - Root mean squared error. - """ - return float(np.sqrt(mse(A, B))) - - -def r2_overall(X: np.ndarray, Xhat: np.ndarray) -> float: - """ - Calculate overall R² score. - - Parameters - ---------- - X : np.ndarray - True values. - Xhat : np.ndarray - Predicted values. 
- - Returns - ------- - float - Overall R² score. - """ - SST = np.sum((X - X.mean(axis=0, keepdims=True)) ** 2) - SSE = np.sum((X - Xhat) ** 2) - return float(1.0 - SSE / (SST + 1e-12)) - - -def r2_per_feature(X: np.ndarray, Xhat: np.ndarray) -> np.ndarray: - """ - Calculate R² score per feature. - - Parameters - ---------- - X : np.ndarray - True values. - Xhat : np.ndarray - Predicted values. - - Returns - ------- - np.ndarray - Array of R² scores per feature, shape (n_features,). - """ - num = np.sum((X - Xhat) ** 2, axis=0) - den = np.sum((X - X.mean(axis=0, keepdims=True)) ** 2, axis=0) + 1e-12 - return 1.0 - num / den - - -def r2_map_over_time( - X_true: np.ndarray, X_pred: np.ndarray, axis_time: int = 0 -) -> np.ndarray: - """ - Calculate R² map over time for spatial data. - - Parameters - ---------- - X_true : np.ndarray - True values, shape (T, H, W, C) or similar. - X_pred : np.ndarray - Predicted values, same shape as X_true. - axis_time : int, optional - Time axis dimension, by default 0. - - Returns - ------- - np.ndarray - R² map over time, shape (H, W, C) or similar. - """ - Xt = X_true - np.nanmean(X_true, axis=axis_time, keepdims=True) - num = np.nanmean((X_true - X_pred) ** 2, axis=axis_time) - den = np.nanmean(Xt**2, axis=axis_time) + 1e-12 - return 1.0 - num / den - - -def r2(y: np.ndarray, yhat: np.ndarray) -> np.ndarray: - """ - Calculate R² score per feature (alternative implementation). - - Parameters - ---------- - y : np.ndarray - True values. - yhat : np.ndarray - Predicted values. - - Returns - ------- - np.ndarray - Array of R² scores per feature. - """ - ss_res = np.sum((y - yhat) ** 2, axis=0) - ss_tot = np.sum((y - y.mean(axis=0)) ** 2, axis=0) - return 1.0 - ss_res / (ss_tot + 1e-12) - - -def compute_all_metrics( - X_true: np.ndarray, X_pred: np.ndarray, spatial: bool = False -) -> dict: - """ - Compute all available metrics for model evaluation. - - Parameters - ---------- - X_true : np.ndarray - True values. 
def basic_rbf_metric(df_true: pd.DataFrame, df_pred: pd.DataFrame) -> pd.Series:
    """
    Calculate the basic RBF metric: the mean squared error of each column.

    Parameters
    ----------
    df_true : pd.DataFrame
        The true data.
    df_pred : pd.DataFrame
        The predicted data.

    Returns
    -------
    pd.Series
        Mean squared difference per column, indexed by column name. The two
        frames are aligned on index and columns by pandas before subtracting.
    """

    return ((df_true - df_pred) ** 2).mean()


def KFold_cross_validation_RBF(
    subset_data: pd.DataFrame,
    target_data: pd.DataFrame,
    subset_directional_variables: List[str] = None,
    target_directional_variables: List[str] = None,
    subset_custom_scale_factor: dict = None,
    normalize_target_data: bool = True,
    target_custom_scale_factor: dict = None,
    num_workers: int = None,
    iteratively_update_sigma: bool = False,
    rbf_model: "RBF" = None,
    n_splits: int = 5,
    metric: Callable = basic_rbf_metric,
) -> dict:
    """
    Perform K-Fold cross-validation for the RBF model.

    The rows of ``subset_data`` and ``target_data`` are split into ``n_splits``
    consecutive folds (no shuffling). For each fold the model is fitted on the
    training rows, used to predict the held-out rows, and scored with
    ``metric``.

    Parameters
    ----------
    subset_data : pd.DataFrame
        Input data, one row per sample.
    target_data : pd.DataFrame
        Target data, row-aligned by position with ``subset_data``.
    subset_directional_variables : List[str], optional
        Forwarded to ``rbf_model.fit``. Default is None (empty list).
    target_directional_variables : List[str], optional
        Forwarded to ``rbf_model.fit``. For each name, ``<name>_u`` and
        ``<name>_v`` columns are added to the held-out target data through
        ``rbf_model.get_uv_components``. Default is None (empty list).
    subset_custom_scale_factor : dict, optional
        Forwarded to ``rbf_model.fit``. Default is None (empty dict).
    normalize_target_data : bool, optional
        Forwarded to ``rbf_model.fit``. Default is True.
    target_custom_scale_factor : dict, optional
        Forwarded to ``rbf_model.fit``. Default is None (empty dict).
    num_workers : int, optional
        Forwarded to ``rbf_model.fit`` and ``rbf_model.predict``.
    iteratively_update_sigma : bool, optional
        Forwarded to ``rbf_model.fit``. Default is False.
    rbf_model : RBF, optional
        Model to evaluate. If None, a new ``RBF()`` is created. The same
        instance is passed to ``fit`` on every fold, so a model supplied by
        the caller is refitted in place.
    n_splits : int, optional
        Number of folds. Default is 5.
    metric : Callable, optional
        Called as ``metric(true, predicted)`` on the columns listed in
        ``rbf_model.target_processed_variables``. Default is
        ``basic_rbf_metric``.

    Returns
    -------
    dict
        Maps the fold number (0-based) to a dictionary with the keys
        ``metric``, ``train_index``, ``test_index``, ``predictions``,
        ``target_data_train``, ``target_data_test``, ``subset_data_train``
        and ``subset_data_test``.

    Raises
    ------
    ValueError
        If ``subset_data`` and ``target_data`` differ in number of rows.
    """

    if len(subset_data) != len(target_data):
        raise ValueError(
            "subset_data and target_data must have the same number of rows"
        )

    # Build fresh containers per call instead of sharing mutable defaults
    # between calls; objects supplied by the caller are passed through as-is.
    if subset_directional_variables is None:
        subset_directional_variables = []
    if target_directional_variables is None:
        target_directional_variables = []
    if subset_custom_scale_factor is None:
        subset_custom_scale_factor = {}
    if target_custom_scale_factor is None:
        target_custom_scale_factor = {}

    if rbf_model is None:
        rbf_model = RBF()

    # Initialize the K-Fold cross-validation
    kf = KFold(n_splits=n_splits)

    # Loop through the folds
    kfold_results = {}
    for i_fold, (train_index, test_index) in enumerate(
        kf.split(subset_data, target_data)
    ):
        # Get the train and test data
        subset_data_train = subset_data.iloc[train_index]
        subset_data_test = subset_data.iloc[test_index]
        target_data_train = target_data.iloc[train_index]
        # Explicit copy: columns are added below, and writing into an .iloc
        # selection would otherwise trigger pandas' SettingWithCopyWarning.
        target_data_test = target_data.iloc[test_index].copy()

        # Fit the RBF model
        rbf_model.fit(
            subset_data=subset_data_train,
            target_data=target_data_train,
            subset_directional_variables=subset_directional_variables,
            target_directional_variables=target_directional_variables,
            subset_custom_scale_factor=subset_custom_scale_factor,
            normalize_target_data=normalize_target_data,
            target_custom_scale_factor=target_custom_scale_factor,
            num_workers=num_workers,
            iteratively_update_sigma=iteratively_update_sigma,
        )

        # Predict the data
        predictions = rbf_model.predict(
            dataset=subset_data_test, num_workers=num_workers
        )
        # Re-label predictions so they align row by row with the held-out
        # targets when the metric subtracts the two frames.
        predictions.index = target_data_test.index.copy()

        # Calculate directional variables for target data test
        for directional_variable in target_directional_variables:
            (
                target_data_test[f"{directional_variable}_u"],
                target_data_test[f"{directional_variable}_v"],
            ) = rbf_model.get_uv_components(
                x_deg=target_data_test[directional_variable]
            )

        # Store the results
        kfold_results[i_fold] = {
            "metric": metric(
                target_data_test[rbf_model.target_processed_variables],
                predictions[rbf_model.target_processed_variables],
            ),
            "train_index": train_index,
            "test_index": test_index,
            "predictions": predictions,
            "target_data_train": target_data_train,
            "target_data_test": target_data_test,
            "subset_data_train": subset_data_train,
            "subset_data_test": subset_data_test,
        }

    return kfold_results
b/bluemath_tk/predictor/xwt.py index 2e7c153..cb856ce 100644 --- a/bluemath_tk/predictor/xwt.py +++ b/bluemath_tk/predictor/xwt.py @@ -16,7 +16,6 @@ from ..core.dask import setup_dask_client from ..core.decorators import validate_data_xwt from ..core.models import BlueMathModel -from ..core.pipeline import BlueMathPipeline from ..core.plotting.colors import get_cluster_colors, get_config_variables from ..datamining.kma import KMA from ..datamining.pca import PCA @@ -111,7 +110,7 @@ def __init__(self, message="XWT error occurred."): super().__init__(self.message) -class XWT(BlueMathModel, BlueMathPipeline): +class XWT(BlueMathModel): """ Xly Weather Types (XWT) class.