diff --git a/avaframe/ana4Stats/probAna.py b/avaframe/ana4Stats/probAna.py index 18afc1dee..e4344ccdf 100644 --- a/avaframe/ana4Stats/probAna.py +++ b/avaframe/ana4Stats/probAna.py @@ -24,7 +24,6 @@ from avaframe.out3Plot import statsPlots as sP from avaframe.in1Data import getInput as gI - # create local logger # change log level in calling module to DEBUG to see log messages log = logging.getLogger(__name__) @@ -203,17 +202,17 @@ def updateCfgRange(cfg, cfgProb, varName, varDict): if valVariation == "": valVariation = "-" parValue = ( - variationType - + "$" - + valSteps - + "$" - + valVariation - + "$" - + cfgDist["GENERAL"]["minMaxInterval"] - + "$" - + cfgDist["GENERAL"]["buildType"] - + "$" - + cfgDist["GENERAL"]["support"] + variationType + + "$" + + valSteps + + "$" + + valVariation + + "$" + + cfgDist["GENERAL"]["minMaxInterval"] + + "$" + + cfgDist["GENERAL"]["buildType"] + + "$" + + cfgDist["GENERAL"]["support"] ) # if variation using percent elif variationType.lower() == "percent": @@ -225,9 +224,9 @@ def updateCfgRange(cfg, cfgProb, varName, varDict): parValue = valVariation + "$" + valSteps if "ci" in valVariation: message = ( - "Variation Type: range - variationValue is %s not a valid option - only \ - scalar value allowed or consider variationType rangefromci" - % valVariation + "Variation Type: range - variationValue is %s not a valid option - only \ + scalar value allowed or consider variationType rangefromci" + % valVariation ) log.error(message) raise AssertionError(message) @@ -236,9 +235,9 @@ def updateCfgRange(cfg, cfgProb, varName, varDict): parValue = valVariation + "$" + valSteps else: message = ( - "Variation Type: %s - not a valid option, options are: percent, range, \ - normaldistribution, rangefromci" - % variationType + "Variation Type: %s - not a valid option, options are: percent, range, \ + normaldistribution, rangefromci" + % variationType ) log.error(message) raise AssertionError(message) @@ -272,9 +271,9 @@ def 
updateCfgRange(cfg, cfgProb, varName, varDict): valValues = np.linspace(float(valStart), float(valStop), int(valSteps)) else: message = ( - "Variation Type: %s - not a valid option, options are: percent, range, \ - normaldistribution, rangefromci" - % variationType + "Variation Type: %s - not a valid option, options are: percent, range, \ + normaldistribution, rangefromci" + % variationType ) log.error(message) raise AssertionError(message) @@ -616,8 +615,8 @@ def makeDictFromVars(cfg): if (len(varParList) == len(varValues) == len(cfg[lengthsPar].split("|")) == len(varTypes)) is False: message = ( - "For every parameter in varParList a variationValue, %s and variationType needs to be provided" - % lengthsPar + "For every parameter in varParList a variationValue, %s and variationType needs to be provided" + % lengthsPar ) log.error(message) raise AssertionError(message) @@ -801,13 +800,14 @@ def createSampleWithVariationStandardParameters(cfgProb, cfgStart, varParList, v "values": sampleWBounds, "typeList": cfgProb["PROBRUN"]["varParType"].split("|"), "thFromIni": "", + "bounds": np.column_stack((lowerBounds, upperBounds)).tolist() } return paramValuesD def createSampleWithVariationForThParameters( - avaDir, cfgProb, cfgStart, varParList, valVariationValue, varType, thReadFromShp + avaDir, cfgProb, cfgStart, varParList, valVariationValue, varType, thReadFromShp ): """Create a sample of parameters for a desired parameter variation, and fetch thickness values from shp file and perform variation for each feature within @@ -907,23 +907,23 @@ def createSampleWithVariationForThParameters( # set lower and upper bounds depending on varType (percent, range, rangefromci) lowerBounds[fullVarType == "percent"] = varValList[fullVarType == "percent"] - varValList[ fullVarType == "percent" - ] * (fullValVar[fullVarType == "percent"] / 100.0) + ] * (fullValVar[fullVarType == "percent"] / 100.0) upperBounds[fullVarType == "percent"] = varValList[fullVarType == "percent"] + 
varValList[ fullVarType == "percent" - ] * (fullValVar[fullVarType == "percent"] / 100.0) + ] * (fullValVar[fullVarType == "percent"] / 100.0) lowerBounds[fullVarType == "range"] = ( - varValList[fullVarType == "range"] - fullValVar[fullVarType == "range"] + varValList[fullVarType == "range"] - fullValVar[fullVarType == "range"] ) upperBounds[fullVarType == "range"] = ( - varValList[fullVarType == "range"] + fullValVar[fullVarType == "range"] + varValList[fullVarType == "range"] + fullValVar[fullVarType == "range"] ) lowerBounds[fullVarType == "rangefromci"] = ( - varValList[fullVarType == "rangefromci"] - ciValues[fullVarType == "rangefromci"] + varValList[fullVarType == "rangefromci"] - ciValues[fullVarType == "rangefromci"] ) upperBounds[fullVarType == "rangefromci"] = ( - varValList[fullVarType == "rangefromci"] + ciValues[fullVarType == "rangefromci"] + varValList[fullVarType == "rangefromci"] + ciValues[fullVarType == "rangefromci"] ) # create a sample of parameter values using scipy latin hypercube or morris sampling @@ -1102,9 +1102,18 @@ def createCfgFiles(paramValuesDList, comMod, cfg, cfgPath=""): cfgStart[section][par] = str(pVal[index]) else: cfgStart["GENERAL"][par] = str(pVal[index]) - if modName.lower() in ["com1dfa", "com5snowslide", "com6rockavalanche"]: + if modName.lower() in ["com1dfa", "com5snowslide", "com6rockavalanche", 'com8motpsa']: + # Check if visualisation exists in cfgStart, if not add the section + if not cfgStart.has_section("VISUALISATION"): + cfgStart.add_section("VISUALISATION") + cfgStart["VISUALISATION"]["scenario"] = str(count1) cfgStart["INPUT"]["thFromIni"] = paramValuesD["thFromIni"] + + # Safe read with fallback (no KeyError if PROBRUN or sampleMethod is missing) + sample_method = cfg.get("PROBRUN", "sampleMethod", fallback="") + cfgStart["VISUALISATION"]["sampleMethod"] = sample_method + if "releaseScenario" in paramValuesD.keys(): cfgStart["INPUT"]["releaseScenario"] = paramValuesD["releaseScenario"] cfgF = 
pathlib.Path(cfgPath, ("%d_%sCfg.ini" % (countS, modName))) diff --git a/avaframe/ana4Stats/probAnaCfg.ini b/avaframe/ana4Stats/probAnaCfg.ini index 1859b48b4..8fc86af48 100644 --- a/avaframe/ana4Stats/probAnaCfg.ini +++ b/avaframe/ana4Stats/probAnaCfg.ini @@ -14,6 +14,10 @@ unit = kPa peakLim = 1.0 # if only probability analysis is performed check for modName to locate peakFiles avaDir/Outputs/modName/peakFiles modName = com1DFA +# If modName = com8MoTPSA, simulations can be executed in chunks. +# chunkSize defines how many simulations are processed per chunk (running simulations, postprocessing and cleaning the working directory incrementally). +# If left empty, the default chunk size used in the script is 10. +chunkSize = [PROBRUN] @@ -40,7 +44,7 @@ samplingStrategy = 1 # #++++++VARIATION INFO FOR DRAW SAMPLES FROM FULL SET OF VARIATIONS # type of parameters that shall be varied -separated by | (options: float) varParType = float|float -# factor used to create the number of samples, if morris number of samples depends on number of varied variables and number of trajectories, for now use nSample as number of trajectories +# factor used to create the number of samples, if morris: number of samples depends on number of varied variables and number of trajectories, for now use nSample as number of trajectories, n >=2 for morris nSample = 40 # sample method used to create sample (options: latin, morris) sampleMethod = latin @@ -71,13 +75,13 @@ frictModel = samosAT [com4FlowPy_com4FlowPy_override] -# use default com1DFA config as base configuration (True) and override following parameters +# use default com4FlowPy config as base configuration (True) and override following parameters # if False and local is available use local defaultConfig = True [com8MoTPSA_com8MoTPSA_override] -# use default com1DFA config as base configuration (True) and override following parameters +# use default com8MoTPSA config as base configuration (True) and override following parameters # if 
False and local is available use local defaultConfig = True diff --git a/avaframe/ana6Optimisation/README_ana6.md b/avaframe/ana6Optimisation/README_ana6.md new file mode 100644 index 000000000..fe2b84541 --- /dev/null +++ b/avaframe/ana6Optimisation/README_ana6.md @@ -0,0 +1,226 @@ +# ana6 – Sensitivity Analysis & Optimisation + +The `ana6Optimisation` module provides tools for performing Morris sensitivity analysis and parameter optimisation +within the AvaFrame workflow. It supports input parameter ranking, convergence analysis of sensitivity indices and +surrogate-based optimisation strategies. The module can be used either sequentially (Morris analysis followed by +optimisation) or independently for direct optimisation. + +--- + +## Module Structure + +The module contains the following files: + +- `runMorrisSA.py` (configuration: `runMorrisSACfg.ini`) +- `runPlotMorrisConvergence.py` (uses `runMorrisSACfg.ini`) +- `runOptimisation.py` (configuration: `runOptimisationCfg.ini`) +- `optimisationUtils.py` + +--- + +## Workflow + +### Working Directory + +The above-mentioned runScripts must be executed within the directory: `avaframe/ana6Optimisation` + +In `avaframeCfg.ini`, the avalanche reference directory (`avalancheDir`) must include the prefix `../`, for example: +`../data/avaFleisskar` + +This ensures correct relative path resolution within the AvaFrame project structure. + +--- + +### Loss Function and Configuration Settings + +The optimisation compares avalanche simulations with a reference runout polygon. Model performance is evaluated using a +weighted loss function combining: + +- a modified Tversky score *(1 − Tversky)* +- the normalized RMSE of the runout length between simulation and reference + +The Tversky score is computed from areal indicators (TP, FP, FN) within a predefined cropshape. The areal indicators are +calculated using `runPlotAreaRefDiffs.py`. Settings are defined in `runMorrisSACfg.ini` and `runOptimisationCfg.ini`.
+ + The normalized runout length is computed using variables of the AIMEC analysis implemented in `ana3AIMEC.py`. Settings +are defined in `ana3AIMECCfg.ini`. + +In `ana3AIMECCfg.ini`, `probAnaCfg.ini`, `runMorrisSACfg.ini` (for Morris SA) and `runOptimisationCfg.ini` (for +optimisation), the parameter `runoutLayer` or `layer` defines which avalanche layer is used for the analysis (e.g. L1 or +L2). For now, the selected layer must be specified consistently in all configuration files. This is important because +the entire evaluation workflow (including AIMEC analysis, optimisation, and Morris sensitivity analysis) is performed +using this layer. + +For the optimisation workflow, the following parameters must additionally be set in `ana3AIMECCfg.ini`: + +- `resTypes = ppr` +- `anaMod = com8MoTPSA` +- `flagMass = False` (since `com8MoTPSA` currently does not produce a mass file) +- `includeReference = True` + +--- + +### Reference Data + +To compute these goodness-of-fit metrics and to perform the AIMEC analysis, the following data must be provided in: +`avaframe/data/<avalancheDir>/Inputs`. + +The required folder structure is: +Folder: + +- **LINES** + Contains the AIMEC path as `path_aimec.shp` file. + +- **POLYGONS** + Contains the cropshape and defines the maximal extent of runout area that is used for calculating areal indicators. + This shapefile must have the suffix `_cropshape.shp`. + +- **REFDATA** + Defines the runout area of the reference event. This shapefile must have the suffix `_POLY.shp`. + +- **REL** + Defines the release area of the avalanche event. + +File: + +- **Digital Elevation Model (DEM)** + Must be placed directly in the `Inputs` directory and must cover the entire affected area. + +More details are given in the `Inputs` section here: https://docs.avaframe.org/en/latest/moduleCom1DFA.html + +___ + +### Morris Sensitivity Analysis (MorrisSA) + +The Morris sensitivity analysis provides a ranking of input parameters based on their influence on the model response.
+ + Before running `runMorrisSA.py`, the following step is required beforehand: + +- Execute `runAna4ProbAnaCom8MoTPSA.py` +- In `probAnaCfg.ini`: + - Set the sampling method to `'morris'` + - Define the number of Morris trajectories (`nSample`) + - Select the input parameters and define their variation bounds + +This step generates the required simulations and stores the sampled parameters and their bounds in a pickle file. + +**Afterwards:** + +- Run `runMorrisSA.py` + - Configure settings via `runMorrisSACfg.ini` +- The `MORRIS_CONVERGENCE` setting can be ignored for standard sensitivity analysis + +**Outputs:** + +- Pickle file containing: + - Ranked input parameters + - Morris sensitivity indices + - Parameter bounds +- Visualisation plots of the sensitivity results + +--- + +### Morris Convergence Analysis + +The convergence analysis evaluates how the Morris sensitivity indices stabilise with increasing numbers of trajectories. +Its purpose is to determine the minimum number of trajectories that yields robust results. + +**Requirements:** + +- Run `runAna4ProbAnaCom8MoTPSA` multiple times with different numbers of Morris trajectories +- Rename Output folders afterwards with the following naming convention: OutputsR`<nTraj>` + +where `<nTraj>` corresponds to the number of trajectories + +This process is computationally expensive, as it requires a large number of simulations. + +**Execution:** + +- Run `runPlotMorrisConvergence.py` + +**Output:** + +- Convergence plots of Morris sensitivity indices + +--- + +### Optimisation Strategies + +The optimisation process identifies the set of input parameters that yields the best agreement between simulation +results and a defined reference. “Best” is defined by the objective function implemented in the optimisation routine. +The optimisation problem is formulated as a minimisation of the loss function, where lower values indicate better +agreement between simulation results and the reference data.
+ +Two surrogate-based optimisation strategies are implemented. In both approaches, a Gaussian Process (GP) surrogate model +is used to approximate the loss function. The surrogate is trained using results from avalanche simulations and provides +predictions of the loss together with an estimate of the prediction uncertainty. + +#### Surrogate-based Non-sequential Optimisation + +In the non-sequential approach, a trained surrogate predicts the loss for a large number of parameter combinations +generated using Latin Hypercube Sampling (LHS). Parameter sets with the lowest predicted loss values are identified and +analysed statistically. Avalanche simulations are then performed for the best predicted parameter sets as well as for +the mean parameter values derived from the top-performing combinations. + +#### Surrogate-based Bayesian (Sequential) Optimisation + +In Bayesian optimisation, the GP surrogate model is updated iteratively. The procedure starts with a small initial set +of evaluated avalanche simulations. Based on these results, the surrogate model is trained and used to guide the +selection of new simulation points. The next parameter set is selected using the Expected Improvement (EI) acquisition +function. EI balances two objectives: + +- Exploitation – sampling regions where the surrogate predicts low loss values. +- Exploration – sampling regions with high predictive uncertainty. + +After each new avalanche simulation, the GP surrogate model is updated and the process is repeated. The optimisation +stops once a stopping criterion is reached (e.g. maximum number of iterations or very small EI values for several +iterations). + + +--- + +### Optimisation Workflow + +The optimisation can be performed using either non-sequential surrogate-based optimisation or sequential Bayesian +optimisation. The optimisation strategy can be selected in `runOptimisationCfg.ini` via the parameter `optType`. 
+ +Independently of the chosen optimisation strategy, the workflow can be configured in two ways depending on whether +Morris sensitivity analysis is used before: + +**Scenario 1: Without prior Morris analysis (recommended):** + +- Execute `runAna4ProbAnaCom8MoTPSA.py` to generate some initial samples (for surrogate) +- In `probAnaCfg.ini`: + - Set the sampling method to `'latin'` + - Define the number of model runs (`nSample`) + - Select the input parameters and define their variation bounds +- Execute `runOptimisation.py` with scenario 1 in `runOptimisationCfg.ini` + +This is the standard scenario, as Latin Hypercube Sampling provides good coverage of the parameter space, which is +important for training the surrogate model. + +**Scenario 2: With prior Morris analysis:** + +- Parameter ranking is available +- Parameter bounds are already defined +- Execute `runOptimisation.py` with scenario 2 in `runOptimisationCfg.ini` + +This option is mainly intended for experimental use. Morris samples are designed for sensitivity analysis and parameter +ranking, but they do not provide an optimal coverage of the parameter space for surrogate-based optimisation. + +--- + +**Outputs:** + +- Optimal parameter set +- Visualisation plots of the optimisation results and progress + +--- + +## Notes + +- Performing Morris sensitivity analysis before optimisation is recommended to reduce the parameter space. However, + using Morris samples directly for optimisation is not recommended, since they do not provide optimal coverage of the + input parameter space. +- Convergence analysis significantly increases computational cost. +- All workflows are controlled via `.ini` configuration files. 
\ No newline at end of file diff --git a/avaframe/ana6Optimisation/optimisationUtils.py b/avaframe/ana6Optimisation/optimisationUtils.py new file mode 100644 index 000000000..eecf6d540 --- /dev/null +++ b/avaframe/ana6Optimisation/optimisationUtils.py @@ -0,0 +1,948 @@ +import numpy as np +import pathlib +import pickle +import logging +import pandas as pd +import configparser +import os +from scipy.stats import norm, qmc +from datetime import datetime +import re + +from sklearn.model_selection import KFold, cross_validate +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel + +from avaframe.in3Utils import cfgUtils +from avaframe.in3Utils import fileHandlerUtils as fU +from avaframe.com8MoTPSA import com8MoTPSA +from avaframe.ana3AIMEC import ana3AIMEC +from avaframe.ana4Stats import probAna +from avaframe.runScripts.runPlotAreaRefDiffs import runPlotAreaRefDiffs +import avaframe.out3Plot.outAna6Plots as saveResults + +# create local logger +log = logging.getLogger("avaframe") + + +def calcArealIndicatorsAndAimec(cfgOpt, avalancheDir): + """ + Calculate areal indicators between reference polygon and simulation and AIMEC analysis and save data in ana3AIMEC + and out1Peak. + + Parameters + ---------- + cfgOpt : configparser.ConfigParser + Global configuration object. Must contain section "GENERAL" + with keys: + - resType + - thresholdValueSimulation + - modName + avalancheDir : str + Path to the avalanche directory. + + """ + # ToDo: to reduce comp. 
cost: run AIMEC and calcArealIndicators only for new simulations + # Areal indicators + resType = cfgOpt["GENERAL"]["resType"] + thresholdValueSimulation = cfgOpt.getfloat("GENERAL", "thresholdValueSimulation") + modName = cfgOpt["GENERAL"]["modName"] + runPlotAreaRefDiffs(resType, thresholdValueSimulation, modName) + + # AIMEC + cfgAIMEC = cfgUtils.getModuleConfig(ana3AIMEC) + rasterTransfo, resAnalysisDF, plotDict, _, pathDict = ana3AIMEC.fullAimecAnalysis(avalancheDir, cfgAIMEC) + + +def readParamSetDF(inDir, varParList): + """ + Read parameter sets from .ini files in a directory and build a DataFrame. + + Parameters + ---------- + inDir : str or pathlib.Path + Path to directory containing .ini files. + varParList : list of str + List of parameter names to extract values from each .ini file. + + Returns + ------- + paramSetDF : pandas.DataFrame + DataFrame with simName, parameterSet and order as columns + """ + + # List to hold all parameters sets + paramSet = [] + order = [] + sampleMethods = [] + filenames = [] + + # Loop over all files in the folder + for filename in os.listdir(inDir): + if filename.endswith('.ini') and 'sourceConfiguration' not in filename: + filepath = os.path.join(inDir, filename) + + # Load the .ini file + config = configparser.ConfigParser() + config.read(filepath) + + # Set defaults per file + index = np.nan + sampleMethod = np.nan + + if 'VISUALISATION' in config.sections(): + # config is inifile + index = config['VISUALISATION']['scenario'] + if 'sampleMethod' in config['VISUALISATION']: + sampleMethod = config['VISUALISATION']['sampleMethod'] + else: + sampleMethod = np.nan + + row = [] # row contains 1 row + for param in varParList: + section = probAna.fetchParameterSection(config, param) + value = config[section][param] + value = float(value) + row.append(value) + + order.append(index) + sampleMethods.append(sampleMethod) + paramSet.append(row) # rows contains all rows + filenames.append(os.path.splitext(filename)[0]) + + # Convert 
to pandas DF + paramSetDF = pd.DataFrame({ + 'simName': filenames, + 'parameterSet': paramSet, # [row for row in paramSet], # Wrap each row as a list + 'order': pd.to_numeric(order), # convert to int + 'sampleMethods': sampleMethods + }) + return paramSetDF + + +def readArealIndicators(inDir): + """ + Read areal indicator results from a pickle file and convert to a DataFrame. + + Parameters + ---------- + inDir : str or pathlib.Path + Path to pickle file containing indicator results. + + Returns + ------- + indicatorsDF : pandas.DataFrame + DataFrame with simName and areal indicators, + """ + with open(inDir, "rb") as f: + all_results = pickle.load(f) + + indicatorsDF = pd.DataFrame(all_results) + return indicatorsDF + + +def addLossMetrics(df, referenceDF, cfg): + """ + Compute evaluation metrics (recall, precision, F1, Tversky score) and a combined weighted Loss (or optimisation) + variable from a given DataFrame. + + The metrics are based on area (number of pixel would also be possible). Invalid values + (division by zero) are replaced with 0. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame with at least the columns + ``TP_SimRef_area``, ``FP_SimRef_area``, ``FN_SimRef_area``. + referenceDF: pandas.Dataframe + Dataframe with information of the reference in AIMEC e.g. 
reference_sRunout of the polygon + cfg: configparser.ConfigParser + Config parser that contains values of the loss function, either runOptimisationCfg.ini or runMorrisSA.ini file + + Returns + ------- + df : pandas.DataFrame + Same DataFrame with additional columns: + - ``recall`` : float + - ``precision`` : float + - ``f1_score`` : float + - ``tversky_score`` : float + - ``optimisationVariable`` : float + """ + # Decide if loss function is based on ncells or area + basedOn = '_area' + TP = df[f"TP_SimRef{basedOn}"] + FP = df[f"FP_SimRef{basedOn}"] + FN = df[f"FN_SimRef{basedOn}"] + + # Recall = TP / (TP + FN) + denomRecall = TP + FN + df["recall"] = np.where(denomRecall != 0, TP / denomRecall, 0.0) + # Precision = TP / (TP + FP) + denomPrecision = TP + FP + df["precision"] = np.where(denomPrecision != 0, TP / denomPrecision, 0.0) + # F1 Score + denomF1 = df['precision'] + df['recall'] + df["f1_score"] = np.where(denomF1 != 0, 2.0 * df['precision'] * df['recall'] / denomF1, 0.0) + + # Tversky score = TP / (TP + alpha * FP + beta * FN), gives penalty to overshoot --> alpha + alpha = cfg.getfloat('LOSS_PARAMETERS', 'tverskyAlpha') + beta = cfg.getfloat('LOSS_PARAMETERS', 'tverskyBeta') + + denomTversky = TP + alpha * FP + beta * FN + df["tversky_score"] = np.where(denomTversky != 0, TP / denomTversky, 0.0) + # Subtract 1 to ensure that 0 are good values and 1 bad + df['1-tversky'] = 1 - df["tversky_score"] + + # Runout + # RMSE divided with Runout length of Reference + sRunoutRef = referenceDF['reference_sRunout'].values + runoutRMSE = df['runoutLineDiff_poly_RMSE'] + df['runoutRMSENormalised'] = runoutRMSE / sRunoutRef # 0 is good, 1 is bad + + # Weights + wTversky = cfg.getfloat('LOSS_PARAMETERS', 'weightTversky') + wRunout = cfg.getfloat('LOSS_PARAMETERS', 'weightRunout') + + df['optimisationVariable'] = ( + wRunout * df['runoutRMSENormalised'].fillna(1) + wTversky * df['1-tversky'].fillna(1)) + return df + + +def buildFinalDF(avalancheDir, varParList, cfg): + """ 
+ Build the final merged DataFrame for a given avalanche. + + Combines parameter sets, AIMEC results, and areal indicators into one DataFrame, + then computes evaluation metrics 'via addLossMetrics'. The resulting DataFrame contains one row per simulation. + If simulations are available for two layers (e.g. L1 and L2), only the layer specified in the configuration files + is considered. Simulations from the selected layer are kept, while simulations from the other layer are excluded + from the final DataFrame. The analysis layer must be defined in the corresponding .ini configuration files. Further + details are provided in README_ana6.md. + + Parameters + ---------- + avalancheDir : str + Path of avalanche directory + varParList: list of str + List of parameter names that are varied + cfg: configparser.ConfigParser + Config parser that contains values of the loss function, either runOptimisationCfg.ini or runMorrisSA.ini file + + Returns + ------- + finalDF : pandas.DataFrame + Final DataFrame containing: + - `simName` + - `parameterSet` + - `order`: used later for visualisation. Morris, Latin hypercube and sequential samples are assigned an order + index. For each sampling method, the index starts at 0 and increases in increments of 1 up to the + total number of samples of that method. 
+ - Areal indicator columns + - Evaluation metrics (recall, precision, f1_score, tversky_score, optimisationVariable) + """ + + avaName = avalancheDir.split('/')[-1] + + # Folder where ini files from simulations are + inDir = pathlib.Path(avalancheDir, 'Outputs/com8MoTPSA/configurationFiles') + # Read parameterSetDF + paramSetDF = readParamSetDF(inDir, varParList) + + # Get Dataframe from AIMEC analysis + cfgAIMEC = cfgUtils.getModuleConfig(ana3AIMEC) + runoutResTypes = cfgAIMEC['AIMECSETUP']['resTypes'] + thresholdValue = cfgAIMEC['AIMECSETUP']['thresholdValue'] + domainWidth = cfgAIMEC['AIMECSETUP']['domainWidth'] + + AIMECPath = avalancheDir + '/Outputs/ana3AIMEC/com8MoTPSA/' + AIMECFileName = f"Results_{avaName}_{runoutResTypes}_lim_{thresholdValue}_w_{domainWidth}resAnalysisDF.csv" + + df_aimec = pd.read_csv(AIMECPath + AIMECFileName) + # Get data of the reference in AIMEC + referenceDF = pd.read_csv(f"{avalancheDir}/Outputs/ana3AIMEC/com8MoTPSA/referenceDF.csv") + + # Read areal indicators + arealIndicatorDir = pathlib.Path(avalancheDir, 'Outputs', 'out1Peak', 'arealIndicators.pkl') + indicatorsDF = readArealIndicators(arealIndicatorDir) + + # Remove layer suffix _L1 or _L2 from simName for merging, the layer is provided by cfg['GENERAL']['layer']. + layer = cfg['GENERAL']['layer'] + indicatorsDF["simName"] = indicatorsDF["simName"].str.replace(fr"_{layer}$", "", regex=True) + + # Merge df's + df_merged = pd.merge(paramSetDF, df_aimec, on='simName', how='inner') + df_merged = df_merged.merge(indicatorsDF, on="simName", how="left") + + # Add optimisation variables + finalDF = addLossMetrics(df_merged, referenceDF, cfg) + return finalDF + + +def createDFParameterLoss(df, paramSelected): + """ + Create DataFrames linking selected parameters with the loss function. + The meaning of selected depends on the chosen scenario. If Morris sensitivity analysis is not run beforehand, all + varied input parameters from probAnaCfg.ini are included. 
If Morris sensitivity analysis is run beforehand, only the topN + highest-ranked parameters are selected based on the Morris results. + + Parameters + ---------- + df : pandas.DataFrame + DataFrame that contains the finalDF including the input parameters and their values as well as the loss function. + paramSelected : list of str + List of parameters to include in the output DataFrames. This determines which parameters will be used for + optimisation and depends on the scenario. + + Returns + ------- + paramLossDF : pandas.DataFrame + DataFrame with one column per selected parameter and an additional + ``Loss`` column with the raw values of ``optimisationVariable``. + paramLossDFScaled : pandas.DataFrame + Same as ``paramLossDF`` but with the selected parameters normalised + to the range [0, 1] using min–max scaling. + """ + paramLossDF = df[paramSelected].copy() + paramLossDFScaled = (paramLossDF - paramLossDF.min()) / (paramLossDF.max() - paramLossDF.min()) # normalise + paramLossDF['Loss'] = df[ + 'optimisationVariable'] + paramLossDFScaled['Loss'] = df[ + 'optimisationVariable'] + return paramLossDF, paramLossDFScaled + + +def fitSurrogate(df, cfgOpt): + """ + Prepare data and initialize surrogate models for loss prediction. + + Parameters + ---------- + df : pandas.DataFrame + DataFrame containing input parameters as columns and a + column named 'Loss' as target variable. + cfgOpt: configparser.ConfigParser + Config parser of the runOptimisationCfg.ini file. + + Returns + ------- + X : numpy.ndarray + Feature matrix of shape (n_samples, n_features). + y : numpy.ndarray + Target vector of shape (n_samples,). + gp_pipe : sklearn.pipeline.Pipeline + Pipeline consisting of feature standardization and a + Gaussian Process regressor with Matern kernel. 
+ """ + # Prepare X, y + y_col = 'Loss' + X = df.drop(columns=[y_col]).to_numpy(dtype=float) + y = df[y_col].to_numpy(dtype=float).reshape(-1) + n_features = X.shape[1] + + # GP kernel with Matern-Covariance + matern_nu = cfgOpt.getfloat('OPTIMISATION', 'matern_nu') + kernel = ( + ConstantKernel(1.0, (1e-3, 1e3)) # Output variance tells how strong Y varies + * Matern(length_scale=np.ones(n_features), + length_scale_bounds=(1e-2, 1e2), # in z (variance) space + nu=matern_nu) + + WhiteKernel(noise_level=1e-4, noise_level_bounds=(1e-8, 1e-1)) + ) + gp = GaussianProcessRegressor( + kernel=kernel, + alpha=1e-8, + normalize_y=True, + n_restarts_optimizer=10, + random_state=0, + ) + + # Pipelines (feature scaling + model) + gp_pipe = Pipeline([("x_scaler", StandardScaler()), ("model", gp)]) + return X, y, gp_pipe + + +def KFoldCrossValidation(X, y, pipe, cfgOpt, outDir, avaName, pipeName): + """ + Perform k-fold cross-validation for a regression pipeline. + + Parameters + ---------- + X : numpy.ndarray + Feature matrix of shape (n_samples, n_features). + y : numpy.ndarray + Target vector of shape (n_samples,). + pipe : sklearn.pipeline.Pipeline + Regression pipeline to be evaluated. + cfgOpt: configparser.ConfigParser + Config parser of the runOptimisationCfg.ini file. + outDir : pathlib.Path + File path where the generated image will be saved. + avaName : str + Name of the avalanche. Used for naming the output figure. + pipeName : str + Name of the pipeline, used for formatted console output. + + Returns + ------- + scores : dict + Dictionary containing cross-validation results as returned + by sklearn.model_selection.cross_validate. 
+ """ + # Get optType + optType = cfgOpt['OPTIMISATION']['optType'] + # For losses, sklearn uses "neg_*" because higher-is-better internally + rmse_scorer = "neg_root_mean_squared_error" + mae_scorer = "neg_mean_absolute_error" + r2_scorer = "r2" + k = cfgOpt.getint('OPTIMISATION', 'k') + cv = KFold(n_splits=k, shuffle=True, random_state=0) + + scores = cross_validate( + pipe, X, y, cv=cv, + scoring={"rmse": rmse_scorer, "mae": mae_scorer, "r2": r2_scorer}, + return_train_score=True, + error_score="raise" # fail fast if something else is wrong + ) + + # NOTE: rmse/mae were returned as NEGATIVE numbers because the higher is better internally + neg_cols = ["test_rmse", "test_mae", "train_rmse", "train_mae"] + for c in neg_cols: + scores[c] = -scores[c] + # get smoothness parameter nu + matern_nu = cfgOpt.getfloat('OPTIMISATION', 'matern_nu') + row = { + "experiment_name": pipeName, + "n_samples": X.shape[0], + "kernel": f"Matern {matern_nu}", + "noise_model": "WhiteKernel", + } + + for split in ("test", "train"): + for m in ("rmse", "mae", "r2"): + arr = scores[f"{split}_{m}"] + row[f"{split} {m} mean"] = arr.mean() + row[f"{split} {m} std"] = arr.std() + + # Log results of cross validation + log.info(f"\n{pipeName}, {k}-fold CV:") + for split in ("test", "train"): + log.info(f" {split.capitalize()} metrics:") + for m in ("rmse", "mae", "r2"): + arr = scores[f"{split}_{m}"] + log.info(f" {m.upper():<4}: {arr.mean():.4g} ± {arr.std():.4g}") + + df = pd.DataFrame([row]) + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + base = os.path.join(outDir, f"{avaName}_{k}FoldCV_{optType}") + df.to_csv(base + f'_{date}.csv', mode="a", header=not os.path.exists(base + f'_{date}.csv'), + index=False) # checks if header exists + + # Save results as image + saveResults.saveKFoldCVPrintImage(scores, pipeName, k, base + f'_Matern_{matern_nu}_Kernel_{date}.png') + return scores + + +def optimiseNonSeq(pipe, cfgOpt, paramBounds): + """ + Perform 
non-sequential surrogate-based optimization using + Latin Hypercube sampling. + + Parameters + ---------- + pipe : sklearn.pipeline.Pipeline + Trained surrogate model pipeline used to predict loss + and uncertainty. + cfgOpt : configparser.ConfigParser + Config parser of the runOptimisationCfg.ini file. + paramBounds : dict + Dictionary mapping parameter names to (min, max) bounds. + + Returns + ------- + topNStat : pandas.DataFrame + DataFrame containing statistics of the top N surrogate + candidates with the lowest predicted loss. + """ + paramSelected = list(paramBounds.keys()) + bounds = np.array(list(paramBounds.values()), dtype=float) # shape (d,2) + d = bounds.shape[0] + + # Create LH samples + seed = cfgOpt.getint('OPTIMISATION', 'seed') + n_lhs = cfgOpt.getint('OPTIMISATION', 'n_lhs') + sampler = qmc.LatinHypercube(d=d, seed=seed) + sample = sampler.random(n=n_lhs) + X0 = qmc.scale(sample, bounds[:, 0], bounds[:, 1]) + + # Prediction of the loss with GP-Model + mu, sigma = pipe.predict(X0, return_std=True) + # Convert X0 to pandas df for analyze function + df_candidates = pd.DataFrame(X0, columns=paramSelected) + n_top_samples = cfgOpt.getint('OPTIMISATION', 'n_surrogate_top') + topNStat, _ = analyzeTopCandidates(df_candidates, mu, sigma, paramSelected, N=n_top_samples) + return topNStat + + +def analyzeTopCandidates(df_candidates, mu, sigma, param_cols, N): + """ + Analyze the top N surrogate candidates. + + Parameters + ---------- + df_candidates : pandas.DataFrame + Candidate parameter sets. + mu : numpy.ndarray + Predicted loss values. + sigma : numpy.ndarray + Predicted uncertainty values. + param_cols : list of str + Parameter column names. + N : int + Number of best candidates to analyze. + + Returns + ------- + stats : dict + Summary statistics for the top N and best candidate. + topNData : pandas.DataFrame + Top N candidates with predicted mu and sigma. 
+ """ + + # Top N points + idx_topN = np.argsort(mu)[:N] + topNData = df_candidates.iloc[idx_topN].copy() + topNData["mu"] = mu[idx_topN] + topNData["sigma"] = sigma[idx_topN] + + mean_params = topNData[param_cols].mean() + std_params = topNData[param_cols].std() + mean_mu = topNData["mu"].mean() + std_mu = topNData["mu"].std() + mean_sigma = topNData["sigma"].mean() + std_sigma = topNData["sigma"].std() + + log.info(f"Surrogate Top {N} candidates: mean ± std") + + for p in param_cols: + m, s = mean_params[p], std_params[p] + perc = (s / m * 100) if m != 0 else np.nan + log.info(f" {p:30s}: {m:.6f} ± {s:.6f} ({perc:.1f}%%)") + + perc_mu = (std_mu / mean_mu * 100) if mean_mu != 0 else np.nan + perc_sigma = (std_sigma / mean_sigma * 100) if mean_sigma != 0 else np.nan + + log.info(f"mu: {mean_mu:.4f} ± {std_mu:.4f} ({perc_mu:.1f}%%)") + log.info(f"sigma: {mean_sigma:.4f} ± {std_sigma:.4f} ({perc_sigma:.1f}%%)") + + # Best single point + idx_best = np.argmin(mu) + best_params = df_candidates.iloc[idx_best].copy() + best_loss = mu[idx_best] + best_sigma = sigma[idx_best] + + log.info("Best single parameter combination from Surrogate:") + + for p in param_cols: + log.info(f" {p:30s}: {best_params[p]:.4f}") + + log.info(f"mu: {best_loss:.4f}") + log.info(f"sigma: {best_sigma:.4f}") + + return { + "TopNBest": { + "mean_params": mean_params, + "std_params": std_params, + "mean_mu": mean_mu, + "std_mu": std_mu, + "mean_sigma": mean_sigma, + "std_sigma": std_sigma, + }, + "Best": { + "params": best_params, + "mu": best_loss, + "sigma": best_sigma, + } + }, topNData + + +def expectedImprovement(mu, sigma, f_best, xi): + """ + Compute the Expected Improvement (EI) acquisition function + for minimization problems. + Formula taken from: + https://ekamperi.github.io/machine%20learning/2021/06/11/acquisition-functions.html + + Parameters + ---------- + mu : numpy.ndarray + Predicted mean values of the surrogate model. 
+ sigma : numpy.ndarray + Predicted standard deviations of the surrogate model. + f_best : float + Best observed objective value so far. + xi : float + Exploration parameter controlling exploitation–exploration + trade-off. + + Returns + ------- + ei : numpy.ndarray + Expected Improvement values for each candidate. + """ + sigma = np.maximum(sigma, 1e-12) # numeric safety + imp = f_best - mu - xi # minimization, that's why the sign is different, xi for finetuning exploitation + + Z = imp / sigma + ei = imp * norm.cdf(Z) + sigma * norm.pdf(Z) + ei[sigma <= 1e-12] = 0.0 # set EI to zero where sigma is 1e-12 + return ei + + +def lowerConfidenceBound(mu, sigma, k=0.0): + """ + Compute the Lower Confidence Bound (LCB) acquisition function. + Formula taken from: + https://ekamperi.github.io/machine%20learning/2021/06/11/acquisition-functions.html + + Parameters + ---------- + mu : numpy.ndarray + Predicted mean values of the surrogate model. + sigma : numpy.ndarray + Predicted standard deviations of the surrogate model. + k : float, optional + Exploration parameter controlling the trade-off between + exploitation and exploration. + + Returns + ------- + lcb : numpy.ndarray + Lower Confidence Bound values for each candidate. + """ + return -mu + k * sigma + + +def EINextPoint(pipe, y, paramBounds, cfgOpt): + """ + Propose the next evaluation point using Expected Improvement (EI) or lower confidence bound (LCB). + + This function generates a Latin Hypercube sample (LHS) of candidate points within the + parameter bounds, evaluates a surrogate model (``pipe``) on those candidates, and + selects the candidate that maximises EI or LCB for a *minimisation* + problem. + + Parameters + ---------- + pipe : sklearn.pipeline.Pipeline + Trained surrogate model pipeline used to predict loss + and uncertainty. + y : array-like + Observed objective values (loss) from previous evaluations. + paramBounds : dict + Dictionary mapping parameter names to (lower, upper) bounds. 
+ cfgOpt : configparser.ConfigParser + Config parser of the runOptimisationCfg.ini file. + + Returns + ------- + xBest : numpy.ndarray + Vector of the values of the selected parameters that maximises EI among the LHS candidates. + xBestDict : dict + Mapping of parameter name to selected value for ``xBest``. + best_ei : float + Maximum EI value among the candidate set. + best_lcb : float + Maximum LCB value among the candidate set (computed for reference/diagnostics). + """ + paramSelected = list(paramBounds.keys()) + bounds = np.array(list(paramBounds.values()), dtype=float) # shape (d,2) + d = bounds.shape[0] + f_best = np.nanmin(y) + + # Create LH samples + seed = cfgOpt.getint('OPTIMISATION', 'seed') + n_lhs = cfgOpt.getint('OPTIMISATION', 'n_lhs') + sampler = qmc.LatinHypercube(d=d, seed=seed) + sample = sampler.random(n=n_lhs) + X0 = qmc.scale(sample, bounds[:, 0], bounds[:, 1]) + + # Predict with pipe + mu, sigma = pipe.predict(X0, return_std=True) + + log.info("Prediction statistics:") + log.info(" mu mean=%.6f max=%.6f", mu.mean(), mu.max()) + log.info(" sigma mean=%.6f max=%.6f", sigma.mean(), sigma.max()) + + # EI or LCB for minimization + xi = cfgOpt.getfloat('OPTIMISATION', 'xi') + ei = expectedImprovement(mu, sigma, f_best, xi) + lcb = lowerConfidenceBound(mu, sigma) + + xBest = X0[np.argmax(ei)].copy() + # xBest = X0[np.argmax(lcb)].copy() + xBestDict = {feat: float(val) for feat, val in zip(paramSelected, xBest)} + + return xBest, xBestDict, np.max(ei), np.max(lcb) + + +def writeCfgFiles(avalancheDir, paramSets, optimisationType, comModuleName, counter=None): + """ + Generate and write configuration files for a given computation module + based on optimisation results. + + Two configuration files are created: + 1. Using the mean parameter values of the top-N surrogate evaluations. + 2. Using the single best surrogate parameter set. + + Parameters + ---------- + avalancheDir : str or pathlib.Path + Path to the avalanche directory. 
+ paramSets : dict or list of dict + Parameter set(s) to write to config files + optimisationType : str + Optimisation mode ('nonseq' or 'seq'), stored in the config file + for traceability. + comModuleName : str + Name of the computation module (e.g. "com8MoTPSA"). + Used for naming the configuration directory and files. + counter : int or None, optional + If provided, use this value as starting index for scenario/file naming. + Useful when calling this function inside an outer optimisation loop. + + Returns + ------- + cfgFiles : list of pathlib.Path + Paths to the written configuration files. + cfgPath : pathlib.Path + Directory where configuration files were stored. + """ + # Allow single dict as input + if isinstance(paramSets, dict): + paramSets = [paramSets] + + cfgFiles = [] + avaDir = pathlib.Path(avalancheDir) + # Directory where generated configuration files will be saved + cfgPath = avaDir / "Work" / f"{comModuleName}ConfigFiles" + fU.makeADir(cfgPath) + + # If counter is given, start indexing from there; otherwise start at 0 + start = int(counter) if counter is not None else 0 + + for i, xBestDict in enumerate(paramSets): + idx = start + i # index used for scenario + filename + + # Load a fresh module configuration template + cfgCom8MoTPSA = cfgUtils.getModuleConfig(com8MoTPSA, toPrint=False) + # Overwrite parameters in their corresponding sections + for param, val in xBestDict.items(): + section = probAna.fetchParameterSection(cfgCom8MoTPSA, param) + cfgCom8MoTPSA[section][param] = str(val) + + # Assign unique scenario ID and optimisation type for later identification + cfgCom8MoTPSA['VISUALISATION']['scenario'] = str(idx) + cfgCom8MoTPSA["VISUALISATION"]["sampleMethod"] = optimisationType + + # Write configuration file to disk + cfgF = cfgPath / f"{idx}_{comModuleName}Cfg.ini" + with open(cfgF, "w") as configfile: + cfgCom8MoTPSA.write(configfile) + + cfgFiles.append(cfgF) + + return cfgFiles, cfgPath + + +def findSimName(finalDF, paramValue, 
atol=1e-6): + """ + Return the simName in finalDF whose parameter columns match + the given paramValue within a numerical tolerance. + + Parameters + ---------- + finalDF : pandas.DataFrame + Must contain column simName and parameter columns given in paramValue. + paramValue : dict or iterable of (name, value) + Parameter values to match. + atol : float, optional + Absolute tolerance for float comparison (default: 1e-6). + + Returns + ------- + str + The first matching simName. + """ + mask = np.ones(len(finalDF), dtype=bool) + for col, val in dict(paramValue).items(): + s = pd.to_numeric(finalDF[col], errors="coerce") + mask &= np.isclose(s, float(val), atol=atol, rtol=0) + + matches = finalDF.loc[mask, "simName"] + return matches.iloc[0] + + +def loadVariationData(cfgOpt, avaDir, outDir=None): + """ + Load parameter bounds and selected parameters for optimisation. + + The used parameters and their bounds are not defined directly in this + function. Instead, they are obtained from results of previous simulation runs. + + Two execution modes are supported and controlled via cfgOpt['PARAM_BOUNDS']['scenario']: + + Scenario 1 (Manual definition): + - No prior Morris screening. + - Parameter names and corresponding bounds are loaded from a previously saved pickle file 'paramValuesD.pickle' + generated by ``runAna4ProbAnaCom8MoTPSA.py``. The parameter variation is therefore not defined within this + function, but is determined by the configuration specified in ``probAnaCfg.ini``. The ``probAnaCfg.ini`` file + contains the settings used to generate the initial sample set, including parameter ranges and variation rules. + + This is the standard scenario, as Latin Hypercube Sampling provides good coverage of the parameter space, which is + important for training the surrogate model. + + Scenario 2 (Morris pre-run): + - A Morris sensitivity analysis has already been executed. 
+ - Ranked parameters and their bounds are loaded from 'sa_parameters_bounds.pkl' and morris samples are directly + used for optimisation. + - The top-N most influential parameters are selected for optimisation. + + This option is mainly intended for experimental use. Morris samples are designed for sensitivity analysis and + parameter ranking, but they do not provide an optimal coverage of the parameter space for surrogate-based optimisation. + + Parameters + ---------- + cfgOpt: configparser.ConfigParser + Configuration object containing the section 'PARAM_BOUNDS'. + outDir: pathlib.Path + Directory containing the Morris output file ('sa_parameters_bounds.pkl'). + avaDir: str + Directory of the avalanche. + + Returns + ------- + paramBounds : dict + Dictionary mapping parameter names to (min, max) tuples. + paramSelected : list + List of selected parameter names used for optimisation. + """ + # Read scenario flag + scenario = cfgOpt.getint('PARAM_BOUNDS', 'scenario') + + avaName = avaDir.split('/')[-1] + + # Scenario 1: Morris is not run prior + if scenario == 1: + # Load variation data with bounds from pickle file + inDir2 = pathlib.Path(avaDir, 'Outputs', "ana4Stats") + paramValuesD = pd.read_pickle(inDir2 / "paramValuesD.pickle") + paramSelected = paramValuesD['names'] + paramBounds = { + name: (float(bounds[0]), float(bounds[1])) + for name, bounds in zip(paramValuesD["names"], paramValuesD["bounds"]) + } + + # Scenario 2: Morris run prior + elif scenario == 2: + # Load SA data and define how much parameters should be optimized (variation bounds included) + SiDFSort = pd.read_pickle(outDir / f"{avaName}_sortedSAResultsWithBounds.pkl") + topN = cfgOpt.getint('PARAM_BOUNDS', 'topN') + n_available = len(SiDFSort['Parameter']) + # Check if enough input parameters are available + if topN > n_available: + message = ( + f"Invalid number of parameters topN={topN}. Only {n_available} parameters available from Morris sensitivity" + f" analysis." 
+ ) + raise ValueError(message) + paramSelected = list(SiDFSort['Parameter'][:topN]) + paramBounds = dict(zip(paramSelected, SiDFSort['bounds'])) + + else: + message = f"Unknown scenario '{scenario}' for variation data. Expected 1 or 2." + log.error(message) + raise ValueError(message) + + return paramBounds, paramSelected + + +def loadMorrisConvergenceData(cfgMorrisSA, avalancheDir, avaName): + """ + Load Morris sensitivity analysis results for convergence plotting. + + Parameters + ---------- + cfgMorrisSA : configparser.ConfigParser + Morris configuration object containing section + 'MORRIS_CONVERGENCE' and 'GENERAL'. + avalancheDir : str or pathlib.Path + Root avalanche directory. + avaName : str + Avalanche name used in filename construction. + + Returns + ------- + SA_dfs : list[pandas.DataFrame] + Loaded Morris result DataFrames in the order defined in the ini file. + r_vals : list[int] + Corresponding number of trajectories (r) extracted from folder names. + outputs : list[str] + Output folder names as defined in the ini file. + outDir: pathlib.Path + Path where results are saved to. + reference_r : int + The number of trajectories used as reference for ordering the parameters. + """ + + section = cfgMorrisSA["MORRIS_CONVERGENCE"] + + outputs = [] + r_vals = [] + + # Read outputs in ini order + for key, value in section.items(): + if key.lower().startswith("outputs"): + outputs.append(value) + + match = re.search(r"R(\d+)", value) + if match: + r_vals.append(int(match.group(1))) + else: + raise ValueError( + f"Could not extract r-value from folder name '{value}'" + ) + + # Build module path + resultDir = cfgMorrisSA["GENERAL"]["resultDir"] # e.g. Outputs/ana6MorrisSA + moduleDir = pathlib.Path(resultDir).name # e.g. 
ana6MorrisSA + comModuleName = cfgMorrisSA["GENERAL"]["modName"] + + filename = f"{avaName}_sortedSAResultsWithBounds.pkl" + + # Load DataFrames + selectedOutput = cfgMorrisSA['MORRIS_CONVERGENCE']['referenceOutput'] + + # Get reference trajectory number, important for determining the order + match = re.search(r"R(\d+)", selectedOutput) + if match: + reference_r = int(match.group(1)) + else: + raise ValueError( + f"Could not extract r-value from referenceOutput '{selectedOutput}'" + ) + + selectedDir = None + SA_dfs = [] + + for out_name in outputs: + outDir = pathlib.Path(avalancheDir, out_name, moduleDir, comModuleName) + if out_name == selectedOutput: + selectedDir = outDir + with open(outDir / filename, "rb") as fi: + SA_dfs.append(pickle.load(fi)) + + return SA_dfs, r_vals, outputs, selectedDir, reference_r diff --git a/avaframe/ana6Optimisation/runMorrisSA.py b/avaframe/ana6Optimisation/runMorrisSA.py new file mode 100644 index 000000000..2a9ff42b5 --- /dev/null +++ b/avaframe/ana6Optimisation/runMorrisSA.py @@ -0,0 +1,118 @@ +import sys +import numpy as np +import pandas as pd +import pathlib +from datetime import datetime +from SALib.analyze import morris as morris_analyze + +from avaframe.in3Utils import cfgUtils +from avaframe.in3Utils import fileHandlerUtils as fU +import avaframe.out3Plot.outAna6Plots as saveResults +import optimisationUtils + + +def runMorrisSA(): + """ + Run script for the Morris sensitivity analysis. For usage read README_ana6.md. 
+ """ + # Get module config + module = sys.modules[__name__] + cfgMorrisSA = cfgUtils.getModuleConfig(module, toPrint=False) + + # Load avalanche directory from general configuration file + cfgMain = cfgUtils.getGeneralConfig() + avalancheDir = cfgMain['MAIN']['avalancheDir'] + avaName = pathlib.Path(avalancheDir).name + + # Calculate Areal indicators and AIMEC and save the results in Outputs/ana3AIMEC and Outputs/out1Peak + optimisationUtils.calcArealIndicatorsAndAimec(cfgMorrisSA, avalancheDir) + + # Load variation data with bounds from pickle file + inDir = pathlib.Path(avalancheDir, 'Outputs', "ana4Stats") + paramValuesD = pd.read_pickle(inDir / "paramValuesD.pickle") + varParList = paramValuesD['names'] + + # Read and merge results from parameter sets (simulation data), areal indicators and AIMEC + finalDF = optimisationUtils.buildFinalDF(avalancheDir, varParList, cfgMorrisSA) + + # Check if there is simulation in finalDF + nAvailable = len(finalDF) + if nAvailable == 0: + message = "No simulations found in finalDF." 
+ raise RuntimeError(message) + + # Use only morris samples + morrisDF = finalDF[finalDF['sampleMethods'] == 'morris'].copy(deep=True) + # Set order as index + morrisDF.set_index('order', inplace=True) + # Order df based on order (which is the index) + morrisDF.sort_index(inplace=True) + + # Define input for SA + problem = { + 'num_vars': len(varParList), + 'names': varParList, + 'bounds': paramValuesD['bounds'] + } + samples = np.vstack(morrisDF['parameterSet'].values).astype(float) + Y = morrisDF['optimisationVariable'].values + + # Perform SA + Si = morris_analyze.analyze( + problem, + samples, + Y, + conf_level=cfgMorrisSA.getfloat('MORRIS SA', 'conf_level'), + num_levels=cfgMorrisSA.getint('MORRIS SA', 'num_levels') + ) + + # Creating dict from Si + SiData = { + "Parameter": Si['names'], + "mu_star": Si['mu_star'], + "sigma": Si['sigma'], + "mu_star_conf": Si['mu_star_conf']} + + # Convert to dataframe + SiDF = pd.DataFrame(SiData) + + # Create folder for saving the results of the analysis, if not already existing + resultDir = cfgMorrisSA['GENERAL']['resultDir'] + comModuleName = cfgMorrisSA['GENERAL']['modName'] + outDir = pathlib.Path(avalancheDir, resultDir, comModuleName) + fU.makeADir(outDir) + + saveResults.barplotSA(SiDF, avaName, outDir) + saveResults.scatterplotSA(SiDF, avaName, outDir) + saveResults.scatterplotUncertaintySA(SiDF, avaName, outDir) + + # Sort SA results + SiDFSort = SiDF.sort_values("mu_star", ascending=False).reset_index(drop=True) + # Append bounds to SiDFSort + paramBounds = dict(zip(problem["names"], problem["bounds"])) + SiDFSort["bounds"] = SiDFSort["Parameter"].map(paramBounds) + # Save as Pickle for Optimization + SiDFSort.to_pickle(outDir / f"{avaName}_sortedSAResultsWithBounds.pkl") + + # Create df with parameters and the loss function for summary statistics + paramLossDF, paramLossScaledDF = optimisationUtils.createDFParameterLoss(morrisDF, SiDFSort['Parameter']) + + # topN is the number of best morris samples to use for 
statistics + # Raise error if topN is bigger than available model runs + topN = cfgMorrisSA.getint('MORRIS SA', 'topN') + if topN > nAvailable: + message = ( + f"Invalid configuration: Requested top {topN} simulations, but only {nAvailable} Morris simulations are " + "available." + ) + raise ValueError(message) + + paramLossSubsetDF = paramLossDF.sort_values(by='Loss', ascending=True)[:topN] + # Save mean values of best input parameters as csv + date = datetime.now().strftime("%Y%m%d") + csvPath = f"{outDir}/{avaName}_MorrisBEST{topN}Simulations_{date}.csv" + paramLossSubsetDF.describe().to_csv(csvPath) + + +if __name__ == '__main__': + runMorrisSA() diff --git a/avaframe/ana6Optimisation/runMorrisSACfg.ini b/avaframe/ana6Optimisation/runMorrisSACfg.ini new file mode 100644 index 000000000..431f62e8e --- /dev/null +++ b/avaframe/ana6Optimisation/runMorrisSACfg.ini @@ -0,0 +1,53 @@ +### Config File - This file contains the main settings the morris SA + +# Sidenote: (1) when running runMorrisSA.py the working directory needs to be in the ana6Optimisation folder +# (2) avaDirectory in avaframeCfg.ini need ../ as prefix + +# SA ... sensitivity analysis + +[GENERAL] +# USER input for running and plotting a comparison of simulation result to reference polygon +# These parameters correspond to the settings in runPlotAreaRefDiffs.py +resType = ppr +thresholdValueSimulation = 1 +modName = com8MoTPSA +# layer to use for analysis (e.g. L1, L2) +layer = L2 + +# Output Settings: +# in this folder the results will be saved, avalancheDir + this folder +resultDir = Outputs/ana6MorrisSA + +[LOSS_PARAMETERS] +# The loss function evaluates the agreement between simulation results and reference data. It is defined as a weighted sum of a modified Tversky score (1 - Tversky score) and the normalized runout length difference. Areal indicators are calculated only within the avalanche impact area, which is determined by the reference polygon and a cropshape polygon. 
+ +# alpha gives penalty if simulation overshoots the reference (FP) +# beta gives penalty if simulation falls short of the reference (FN) +tverskyAlpha = 2 +tverskyBeta = 1 + +# Loss function: (1-TverskyScore) * weightTversky + RunoutNormalised * weightRunout +weightTversky = 0.25 +weightRunout = 0.75 + +[MORRIS SA] +# confidence level and number of levels used for SA +conf_level = 0.95 +num_levels = 6 +# number of best morris simulations used for statistics (topN < available model runs) +topN = 10 + + +[MORRIS_CONVERGENCE] +# for morris convergence plot, morris needs to be run with different morris trajectories prior +# define Output folder where results of morris analyses with different trajectories (R) are saved, naming: OutputsRNumber +Outputs1 = OutputsR5 +Outputs2 = OutputsR10 +Outputs3 = OutputsR20 +Outputs4 = OutputsR40 + +# get order of parameters from this output, also save result of convergence into child folders of this parent folder +referenceOutput = OutputsR10 + + + diff --git a/avaframe/ana6Optimisation/runOptimisation.py b/avaframe/ana6Optimisation/runOptimisation.py new file mode 100644 index 000000000..67025d5ea --- /dev/null +++ b/avaframe/ana6Optimisation/runOptimisation.py @@ -0,0 +1,200 @@ +import sys +import pathlib +import pickle +import pandas as pd + +from avaframe.in3Utils import cfgUtils, logUtils +from avaframe.in3Utils import fileHandlerUtils as fU +from avaframe.in3Utils import initializeProject as initProj +import avaframe.out3Plot.outAna6Plots as saveResults +from avaframe.com8MoTPSA import com8MoTPSA +import optimisationUtils + + +def runOptimisation(): + """ + Run script for the optimization process. For usage read README_ana6.md. 
+ """ + # Get module config + module = sys.modules[__name__] + cfgOpt = cfgUtils.getModuleConfig(module, toPrint=False) + + # Load avalanche directory from general configuration file + cfgMain = cfgUtils.getGeneralConfig() + avalancheDir = cfgMain['MAIN']['avalancheDir'] + avaName = pathlib.Path(avalancheDir).name + + # Create folder for saving the results of the analysis, if not already existing + resultDirOpt = cfgOpt['GENERAL']['resultDir'] + comModuleName = cfgOpt['GENERAL']['modName'] + outDir = pathlib.Path(avalancheDir, resultDirOpt, comModuleName) + fU.makeADir(outDir) + + # Get config from morris for path to morris results + cfgDir = 'runMorrisSA.py' + cfgMorrisSA = cfgUtils.getModuleConfig(pathlib.Path(cfgDir), toPrint=False) + resultDirMorris = cfgMorrisSA['GENERAL']['resultDir'] + inDir = pathlib.Path(avalancheDir, resultDirMorris, comModuleName) + + # Start logging + logName = 'runOptimisation' + log = logUtils.initiateLogger(avalancheDir, logName) + log.info('MAIN SCRIPT') + log.info('Current avalanche: %s', avalancheDir) + + # Load variation parameters and their bounds + paramBounds, paramSelected = optimisationUtils.loadVariationData(cfgOpt, avalancheDir, inDir) + + # Calculate Areal indicators and AIMEC and save the results in Outputs/ana3AIMEC and Outputs/out1Peak + optimisationUtils.calcArealIndicatorsAndAimec(cfgOpt, avalancheDir) + + # Read and merge results from parameter sets (simulation data), areal indicators and AIMEC + finalDF = optimisationUtils.buildFinalDF(avalancheDir, paramSelected, cfgOpt) + + # Check if there is simulation in finalDF + nAvailable = len(finalDF) + if nAvailable == 0: + message = "No simulations found in finalDF." 
+ log.error(message) + raise RuntimeError(message) + + # ------------------------------------------------------------------------------------------------------------------ + # Two surrogate-based optimisation strategies are available: + # nonseq: Non-sequential optimisation uses a fixed set of simulations to train the surrogate and evaluate the loss. + # seq: Sequential (Bayesian) optimisation uses an initial set of samples to train the surrogate and iteratively + # adds new simulations using the Expected Improvement acquisition function. + optimisationType = cfgOpt['OPTIMISATION']['optType'].lower() + if optimisationType == 'nonseq': + csv_path = outDir / f"{avaName}_BestOrCurrentSimulation_NonSeq.csv" + # Save sim with currently best y + saveResults.saveBestOrSpecificSimulation(finalDF, paramSelected, csv_path=csv_path) + + # Create df with most important parameters and the loss function + emulatorDF, emulatorScaledDF = optimisationUtils.createDFParameterLoss(finalDF, paramSelected) + + # Create surrogate + X, y, gp_pipe = optimisationUtils.fitSurrogate(emulatorDF, cfgOpt) # X,y are features of emulatorDF + + # K fold cross validation + optimisationUtils.KFoldCrossValidation(X, y, gp_pipe, cfgOpt, outDir, avaName, "Gaussian Process Matern Kernel") + + # Fit final pipline + gp_pipe.fit(X, y) + + # Optimize non-sequential (only use pipe once to find best param) + topNStat = optimisationUtils.optimiseNonSeq(gp_pipe, cfgOpt, paramBounds) + paramSets = [ + topNStat["TopNBest"]["mean_params"], + topNStat["Best"]["params"], + ] + # Generate and write config files with mean parameter values of best N surrogate evaluations and with best + # parameter values of surrogate + # Clean input directory(ies) of old work files + initProj.cleanSingleAvaDir(avalancheDir, deleteOutput=False) + + cfgFiles, cfgPath = optimisationUtils.writeCfgFiles(avalancheDir, paramSets, optimisationType, comModuleName) + + # Perform com8MoTPSA simulations + com8MoTPSA.com8MoTPSAMain(cfgMain, 
cfgInfo=cfgPath) + + # Calculate Areal indicators and AIMEC and save the results in Outputs/ana3AIMEC and Outputs/out1Peak + optimisationUtils.calcArealIndicatorsAndAimec(cfgOpt, avalancheDir) + + # Read and merge results from parameter sets (simulation data), areal indicators and AIMEC + finalDF = optimisationUtils.buildFinalDF(avalancheDir, paramSelected, cfgOpt) + + simNameMean = optimisationUtils.findSimName(finalDF, topNStat["TopNBest"]["mean_params"], atol=1e-6) + simNameBest = optimisationUtils.findSimName(finalDF, topNStat["Best"]["params"], atol=1e-6) + + # Create image of table + saveResults.saveTopCandidates(finalDF, paramSelected, cfgOpt, topNStat, + out_path=outDir / f"{avaName}_StatisticsBestParameterValues_NonSeqAnalysis.png", + title=f"{avaName}, NonSeq-Analysis: Best Surrogate vs Best Model Run", + simNameMean=simNameMean, simNameBest=simNameBest) + + # Save latest real sim + saveResults.saveBestOrSpecificSimulation(finalDF, paramSelected, simName=simNameMean, csv_path=csv_path) + + # ------------------------------------------------------------------------------------------------------------------ + elif optimisationType == 'seq': + csv_path = outDir / f"{avaName}_BestOrCurrentSimulation_BO.csv" + # Save sim with currently best y + saveResults.saveBestOrSpecificSimulation(finalDF, paramSelected, csv_path=csv_path) + + eiThreshold = cfgOpt.getfloat('OPTIMISATION', 'eiThreshold') + nGoodSims = cfgOpt.getfloat('OPTIMISATION', 'numberOfGoodSimulations') + countGoodSims = 0 + bo_max_iterations = cfgOpt.getint('OPTIMISATION', 'bo_max_iterations') + + # Before the loop: start counter based on existing BO ("seq") simulations + # Important to avoid same order number across different BO runs + seq_orders = pd.to_numeric(finalDF.loc[finalDF["sampleMethods"] == "seq", "order"], errors="coerce") + start_counter = int(seq_orders.max() + 1) if seq_orders.notna().any() else 0 + + for i in range(bo_max_iterations): + # Create df with most important parameters and 
the loss function + emulatorDF, emulatorScaledDF = optimisationUtils.createDFParameterLoss(finalDF, paramSelected) + # Train surrogate + X, y, gp_pipe = optimisationUtils.fitSurrogate(emulatorDF, cfgOpt) # X,y are features of emulatorDF + + # K fold cross validation + optimisationUtils.KFoldCrossValidation(X, y, gp_pipe, cfgOpt, outDir, avaName, + "Gaussian Process Matern Kernel") + + # Fit final pipline + gp_pipe.fit(X, y) + + # Get next input parameters with EI + xBest, xBestDict, ei, lcb = optimisationUtils.EINextPoint(gp_pipe, y, paramBounds, cfgOpt) + + # Clean input directory(ies) of old work files + initProj.cleanSingleAvaDir(avalancheDir, deleteOutput=False) + + # Generate and write config file + cfgFiles, cfgPath = optimisationUtils.writeCfgFiles(avalancheDir, xBestDict, optimisationType, + comModuleName, counter=start_counter + i) + + # Perform com8MoTPSA simulation + com8MoTPSA.com8MoTPSAMain(cfgMain, cfgInfo=cfgPath) + + # Calculate Areal indicators and AIMEC and save the results in Outputs/ana3AIMEC and Outputs/out1Peak + optimisationUtils.calcArealIndicatorsAndAimec(cfgOpt, avalancheDir) + + # Read and merge results from parameter sets (simulation data), areal indicators and AIMEC + finalDF = optimisationUtils.buildFinalDF(avalancheDir, paramSelected, cfgOpt) + + # Save latest sim + simName = optimisationUtils.findSimName(finalDF, xBestDict, atol=1e-6) + saveResults.saveBestOrSpecificSimulation(finalDF, paramSelected, ei, lcb, simName, + csv_path=csv_path) + # If ei is smaller than threshold, the simulation is counted as 'good', if number of good simulations is + # reached, optimization stops + if ei < eiThreshold: + countGoodSims = countGoodSims + 1 + if countGoodSims >= nGoodSims: + break + + saveResults.saveTopCandidates(finalDF, paramSelected, cfgOpt, + out_path=outDir / f"{avaName}_StatisticsBestParameterValues_BOAnalysis.png", + title=f"{avaName}, Seq-Analysis: Best Model Runs") + + # Save BO plots + n_top_samples = cfgOpt.getint('OPTIMISATION', 
'n_model_top') + emulatorDF, emulatorScaledDF = optimisationUtils.createDFParameterLoss(finalDF, paramSelected) + + saveResults.BOConvergencePlot(finalDF, avaName, outDir, cfgOpt) + saveResults.BOBoxplot(emulatorDF, avaName, outDir, N=n_top_samples) + saveResults.BOBoxplotNormalised(emulatorDF, paramBounds, avaName, outDir, N=n_top_samples) + + else: + message = f"Unknown optimisation type '{optimisationType}'. Expected 'nonseq' or 'seq'." + log.error(message) + raise ValueError(message) + + # Save pickle file of finalDF + with open(outDir / f"{avaName}_finalDF.pickle", "wb") as fi: + pickle.dump(finalDF, fi) + + +if __name__ == '__main__': + runOptimisation() diff --git a/avaframe/ana6Optimisation/runOptimisationCfg.ini b/avaframe/ana6Optimisation/runOptimisationCfg.ini new file mode 100644 index 000000000..9d1f66aa2 --- /dev/null +++ b/avaframe/ana6Optimisation/runOptimisationCfg.ini @@ -0,0 +1,81 @@ +### Config File - This file contains the main settings for the optimisation process + +# Sidenote: (1) when running runOptimisation.py the working directory needs to be in the ana6Optimisation folder +# (2) avaDirectory in avaframeCfg.ini need ../ as prefix + +# BO ... bayesian optimisation +# EI ... expected improvement + + +[GENERAL] +# USER input for running and plotting a comparison of simulation result to reference polygon +# These parameters correspond to the settings in runPlotAreaRefDiffs.py +resType = ppr +thresholdValueSimulation = 1 +modName = com8MoTPSA +# layer to use for analysis (e.g. L1, L2) +layer = L2 + +# Output Settings: +# in this folder the results will be saved, avalancheDir + this folder +resultDir = Outputs/ana6Optimisation + + +[PARAM_BOUNDS] +# 2 scenarios: choose 1 or 2 +# (1): morris is not run prior, this is the preferred scenario, input parameters and their bounds are read from a previously saved pickle file generated by ``runAna4ProbAnaCom8MoTPSA.py``. 
The parameter variation is determined by the configuration specified in ``probAnaCfg.ini``. +# (2): morris is run prior, in this case, a dataframe of ranked input parameters is already saved by runMorrisSA.py as pickle file, The optimisation is then based on morris samples, this is experimental, since Morris samples do not provide optimal coverage of the input space. +# The user only needs to determine how much input topN parameters to use for optimisation. +scenario = 1 + +# topN is only relevant for scenario 2: number of top ranked input parameters to use for optimisation +topN = 7 + + +[LOSS_PARAMETERS] +# The loss function evaluates the agreement between simulation results and reference data. It is defined as a weighted sum of a modified Tversky score (1 - Tversky score) and the normalized runout length difference. Areal indicators are calculated only within the avalanche impact area, which is determined by the reference polygon and a cropshape polygon. + +# alpha gives penalty if simulation overshoots the reference (FP) +# beta gives penalty if simulation falls short of the reference (FN) +tverskyAlpha = 2 +tverskyBeta = 1 + +# Loss function: (1-TverskyScore) * weightTversky + RunoutNormalised * weightRunout +weightTversky = 0.25 +weightRunout = 0.75 + + +[OPTIMISATION] +# Surrogate-based optimisation using a Gaussian Process (GP) surrogate. +# Non-sequential optimisation evaluates a fixed set of Latin Hypercube simulations to train the surrogate and predict the loss. +# Bayesian optimisation also uses the GP surrogate but sequentially selects new simulations using the Expected Improvement acquisition function. 
+ +# Type of optimisation: seq or nonseq +optType = nonseq +# define the smoothness of the matern kernel of the surrogate, e.g.: 0.5, 1.5, 2.5 +matern_nu = 2.5 +# define k for k-fold cross validation +k = 5 +# sampling seed for generation of Latin Hypercube samples used for loss-prediction with surrogate +seed = 12345 +# number of LH samples +n_lhs = 1000000 +# determine the number N of surrogate's best simulations used for statistics +n_surrogate_top = 100 +# determine the number N of models best simulations used for statistics (n < available model runs) +n_model_top = 10 + +# settings for seq: +# xi controls the exploration–exploitation balance in the Expected Improvement acquisition function. +# Increasing xi (e.g., xi = 0.01) encourages exploration (sampling uncertain regions). +# Decreasing xi (e.g., xi = 0.0) favors exploitation (sampling near the current optimum). +# In practice, xi is typically chosen as a small non-negative value (e.g., 0.0–0.1). +xi = 0.0 + +# If EI falls below the threshold, the simulation is counted as 'good'. The sequential optimisation stops once the specified number of good simulations is reached. Otherwise, the optimisation terminates after bo_max_iterations. +eiThreshold = 0.01 +numberOfGoodSimulations = 10 +bo_max_iterations = 40 + + + diff --git a/avaframe/ana6Optimisation/runPlotMorrisConvergence.py b/avaframe/ana6Optimisation/runPlotMorrisConvergence.py new file mode 100644 index 000000000..d2a3e0b1f --- /dev/null +++ b/avaframe/ana6Optimisation/runPlotMorrisConvergence.py @@ -0,0 +1,37 @@ +import pathlib + +import avaframe.out3Plot.outAna6Plots as saveResults +from avaframe.in3Utils import cfgUtils +import optimisationUtils + + +def runPlotMorrisConvergence(): + """ + Run script for the Morris convergence plot. For usage read README_ana6.md. 
+ """ + # Load avalanche directory from general configuration file + cfgMain = cfgUtils.getGeneralConfig() + avalancheDir = cfgMain['MAIN']['avalancheDir'] + avaName = pathlib.Path(avalancheDir).name + + # Load morris config file + cfgDir = 'runMorrisSA.ini' + cfgMorrisSA = cfgUtils.getModuleConfig(pathlib.Path(cfgDir), toPrint=False) + + # Load Morris sensitivity analysis results for convergence plot + SA_dfs, r_vals, outputs, outDir, reference_r = optimisationUtils.loadMorrisConvergenceData(cfgMorrisSA, + avalancheDir, + avaName) + + # Top 7 parameters + saveResults.plotMorrisConvergence(SA_dfs, r_vals, reference_r, k=7, + outpath=outDir / f'{avaName}_MorrisSAConvergencePlotTop7.png', + title=f"{avaName} Convergence plot for top 7 parameters") + # All parameters + saveResults.plotMorrisConvergence(SA_dfs, r_vals, reference_r, k=None, + outpath=outDir / f'{avaName}_MorrisSAConvergencePlotAll.png', + title=f"{avaName} Convergence plot for all parameters") + + +if __name__ == '__main__': + runPlotMorrisConvergence() diff --git a/avaframe/com1DFA/com1DFATools.py b/avaframe/com1DFA/com1DFATools.py index 5ba11a87a..aba363f48 100644 --- a/avaframe/com1DFA/com1DFATools.py +++ b/avaframe/com1DFA/com1DFATools.py @@ -346,6 +346,7 @@ def initializeInputs(avalancheDir, cleanRemeshedRasters, module=com1DFA): simDFExisting, simNameExisting = cfgUtils.readConfigurationInfoFromDone( avalancheDir, specDir="", + modName=modName ) # fetch input data - dem, release-, entrainment- and resistance areas (and secondary release areas) diff --git a/avaframe/com8MoTPSA/com8MoTPSA.py b/avaframe/com8MoTPSA/com8MoTPSA.py index 033e7d4ec..a14716e42 100644 --- a/avaframe/com8MoTPSA/com8MoTPSA.py +++ b/avaframe/com8MoTPSA/com8MoTPSA.py @@ -14,16 +14,18 @@ import avaframe.com1DFA.com1DFA as com1DFA from avaframe.in3Utils import cfgUtils from avaframe.in2Trans import rasterUtils as rU +from avaframe.ana4Stats import probAna from avaframe.in1Data import getInput as gI import 
avaframe.in3Utils.fileHandlerUtils as fU from avaframe.out1Peak import outPlotAllPeak as oP import avaframe.in3Utils.MoTUtils as mT +from avaframe.in3Utils.initializeProject import _checkForFolderAndDelete # create local logger log = logging.getLogger(__name__) -def com8MoTPSAMain(cfgMain, cfgInfo=None): +def com8MoTPSAMain(cfgMain, cfgInfo=None, returnSimName=None): """Run the full MoT-PSA workflow: generate configs, run simulations in parallel, postprocess. Parameters @@ -32,6 +34,8 @@ def com8MoTPSAMain(cfgMain, cfgInfo=None): main AvaFrame configuration (avalancheDir, nCPU, plot flags) cfgInfo : dict or None, optional override configuration info passed to MoTGenerateConfigs + returnSimName : any, optional + if not None, return the first simDict key after running """ # Get all necessary information from the configuration files currentModule = sys.modules[__name__] @@ -48,27 +52,53 @@ def com8MoTPSAMain(cfgMain, cfgInfo=None): # Preprocess the simulations, mainly creating the rcf files rcfFiles = com8MoTPSAPreprocess(simDict, inputSimFiles, cfgMain) + # Check if there is simulation to be run + if not rcfFiles: + log.warning("There is no simulation to be performed for releaseScenario") + return None + # And now we run the simulations startTime = time.time() log.info("--- STARTING (potential) PARALLEL PART ----") - # Get number of CPU Cores wanted - nCPU = cfgUtils.getNumberOfProcesses(cfgMain, len(rcfFiles)) + # Split into chunks to postprocess and clean up working directory incrementally + # Get chunkSize from probAnaCfg.ini, if it is empty use 10 as default + cfgProbAna = cfgUtils.getModuleConfig(probAna) + chunkSize = cfgProbAna.get('GENERAL', 'chunkSize', fallback='') + if chunkSize == '': + chunkSize = 10 + else: + chunkSize = int(chunkSize) + + rcfChunks = [rcfFiles[i:i + chunkSize] for i in range(0, len(rcfFiles), chunkSize)] + + for rcfFilesChunk in rcfChunks: + simNamesChunk = [p.stem for p in rcfFilesChunk] + + nCPU = 
cfgUtils.getNumberOfProcesses(cfgMain, len(rcfFilesChunk)) + + with Pool(processes=nCPU) as pool: + results = pool.map(com8MoTPSATask, rcfFilesChunk) + pool.close() + pool.join() - # Create parallel pool and run - # with multiprocessing.Pool(processes=nCPU) as pool: - with Pool(processes=nCPU) as pool: - results = pool.map(com8MoTPSATask, rcfFiles) - pool.close() - pool.join() + timeNeeded = "%.2f" % (time.time() - startTime) + log.info("Overall (parallel) com8MoTPSA computation took: %s s ", timeNeeded) + log.info("--- ENDING (potential) PARALLEL PART ----") - timeNeeded = "%.2f" % (time.time() - startTime) - log.info("Overall (parallel) com8MoTPSA computation took: %s s " % timeNeeded) - log.info("--- ENDING (potential) PARALLEL PART ----") + # Postprocess the simulations + com8MoTPSAPostprocess(simNamesChunk, cfgMain, inputSimFiles) - # Postprocess the simulations - com8MoTPSAPostprocess(simDict, cfgMain, inputSimFiles) + # Delete folder in Work directory after postprocessing to reduce memory costs + avaDir = cfgMain["MAIN"]["avalancheDir"] + for sim in simNamesChunk: + folderName = "Work/com8MoTPSA/" + sim + _checkForFolderAndDelete(avaDir, folderName) + + if returnSimName is not None and simDict: + return next(iter(simDict)) + return None def copyRawToLayerPeakFiles(workDir, outputDirPeakFile): @@ -106,7 +136,7 @@ def copyRawToLayerPeakFiles(workDir, outputDirPeakFile): shutil.copy2(source, target) -def com8MoTPSAPostprocess(simDict, cfgMain, inputSimFiles): +def com8MoTPSAPostprocess(simNames, cfgMain, inputSimFiles): """Postprocess MoT-PSA results: rename outputs to L1/L2 peak files, generate plots and reports. 
For each simulation, copies DataTime.txt and renames raw MoT-PSA output files @@ -114,8 +144,8 @@ def com8MoTPSAPostprocess(simDict, cfgMain, inputSimFiles): Parameters ---------- - simDict : dict - simulation dictionary + simNames : list + list of simulation name strings cfgMain : configparser.ConfigParser main AvaFrame configuration (avalancheDir, plot flags) inputSimFiles : dict @@ -128,7 +158,7 @@ def com8MoTPSAPostprocess(simDict, cfgMain, inputSimFiles): outputDirPeakFile = pathlib.Path(avalancheDir) / "Outputs" / "com8MoTPSA" / "peakFiles" fU.makeADir(outputDirPeakFile) - for key in simDict: + for key in simNames: workDir = pathlib.Path(avalancheDir) / "Work" / "com8MoTPSA" / str(key) # Copy DataTime.txt @@ -137,6 +167,13 @@ def com8MoTPSAPostprocess(simDict, cfgMain, inputSimFiles): copyRawToLayerPeakFiles(workDir, outputDirPeakFile) + # Write config indicator files to track completed simulations + configFileName = "%s.ini" % key + for saveDir in ["configurationFilesDone", "configurationFilesLatest"]: + configDir = pathlib.Path(avalancheDir, "Outputs", "com8MoTPSA", "configurationFiles", saveDir) + with open((configDir / configFileName), "w") as fi: + fi.write("see directory configurationFiles for info on config") + # create plots and report modName = __name__.split(".")[-1] reportDir = pathlib.Path(avalancheDir, "Outputs", modName, "reports") diff --git a/avaframe/com8MoTPSA/com8MoTPSACfg.ini b/avaframe/com8MoTPSA/com8MoTPSACfg.ini index 6c5a76d08..71ed10960 100644 --- a/avaframe/com8MoTPSA/com8MoTPSACfg.ini +++ b/avaframe/com8MoTPSA/com8MoTPSACfg.ini @@ -162,20 +162,28 @@ Output format = ESRI_ASCII_Grid [Physical_parameters] Gravitational acceleration (m/s^2) = 9.81 +# Flow density for dense layer (rho) Density (kg/m^3) = 250.0 Rheology = Voellmy Parameters = auto +# Dry-friction cefficient (mu) Dry-friction coefficient (-) = 0.400 -Turbulent drag coefficient (-) = 0.0010 +# Turbulent drag coefficient (k_01) +Turbulent drag coefficient (-) = 0.0010 
Effective drag height (m) = 0.0 Centrifugal effects = yes Passive earth-pressure coeff. (-) = 1.0 +# Basal drag coefficient (k_02) Basal drag coeff. 0-2 (-) = 0.008 +# Basal drag coefficient (k_12) Basal drag coeff. 1-2 (-) = 0.04 +# Top drag coefficient k_a2 Top drag coeff. (-) = 0.0001 +# Parabolic shape function of concentration (f_c) Conc_L2_Prof_Coeff_0 (-) = 1.3333 Conc_L2_Prof_Coeff_1 (-) = -0.66667 Conc_L2_Prof_Coeff_2 (-) = 0.0 +# Parabolic shape function of velocity (f_u) Speed_L2_Prof_Coeff_0 (-) = 1.4 Speed_L2_Prof_Coeff_1 (-) = 0.13333 Speed_L2_Prof_Coeff_2 (-) = -1.4 @@ -192,15 +200,22 @@ Entrainment L2 = none Erosion coefficient L1 (-) = 0.0 Bed strength profile = constant Bed friction coefficient (-) = 0.0 +# Bed density (rho_b) Bed density (kg/m^3) = 140.0 Deposition = yes +# Deposition density for dense layer (rho_d) Deposition density L1 (kg/m^3) = 400.0 Suspension model = TJSM +# Shear strength dense layer (tau_s) Avalanche shear strength (Pa) = 0.0 +# Shear strength deposit tau_d Avalanche shear strength deposit (Pa) = 0.0 +# Decay coefficient of the snow avalanche shear strength for suspension process (tau_su) Decay coeff. snow s. s. suspension (-) = 1.0 +# Decay coefficient of the snow avalanche shear strength for deposition process (tau_du) Decay coeff. snow s. s. deposition (-) = 1.0 Entrainment coeff. 
m12 (-) = 0.0 +# Settling speed of powder snow layer to dense layer (w_s) Deposition rate 21 (m/s) = 0.50 Constant density L1 = yes Evolving geometry = no @@ -222,3 +237,7 @@ Initial CFL number (-) = 0.8 # peak files and plots are exported, option to turn off exports when exportData is set to False # this affects export of peak files and also generation of peak file plots exportData = True + +[VISUALISATION] +# scenario name - can be used for plotting +scenario = diff --git a/avaframe/com8MoTPSA/runAna4ProbAnaCom8MoTPSA.py b/avaframe/com8MoTPSA/runAna4ProbAnaCom8MoTPSA.py index 0d482bb7e..3d3da6a3c 100644 --- a/avaframe/com8MoTPSA/runAna4ProbAnaCom8MoTPSA.py +++ b/avaframe/com8MoTPSA/runAna4ProbAnaCom8MoTPSA.py @@ -9,7 +9,6 @@ import argparse # Local imports -from avaframe.com1DFA import com1DFA import avaframe.com8MoTPSA.com8MoTPSA as com8MoTPSA from avaframe.out3Plot import statsPlots as sP from avaframe.ana4Stats import probAna @@ -18,7 +17,6 @@ from avaframe.in3Utils import logUtils import avaframe.in3Utils.fileHandlerUtils as fU from avaframe.out3Plot import outQuickPlot as oP -import configparser def runProbAna(avalancheDir=''): @@ -37,9 +35,6 @@ def runProbAna(avalancheDir=''): # log file name; leave empty to use default runLog.log logName = 'runAna4ProbAna' - # Load general configuration filee - cfgMain = cfgUtils.getGeneralConfig() - # Load avalanche directory from general configuration file, # if not provided as input argument cfgMain = cfgUtils.getGeneralConfig() @@ -97,7 +92,6 @@ def runProbAna(avalancheDir=''): if anaPerformed is False: log.warning('No files found for configuration: %s' % probConf) - # make a plot of the contours inputDir = pathlib.Path(avalancheDir, 'Outputs', 'ana4Stats') outName = '%s_prob_%s_%s_lim%s' % (str(avalancheDir.stem), @@ -116,16 +110,15 @@ def runProbAna(avalancheDir=''): # plot probability maps sP.plotProbMap(avalancheDir, inputDir, cfgProb, demPlot=True) - # # copy outputs to folder called like probability 
configurations - # outputFiles = avalancheDir / 'Outputs' / 'ana4Stats' - # saveFiles = avalancheDir / 'Outputs' / ('ana4Stats_' + probConf) - # shutil.move(outputFiles, saveFiles) + # # copy outputs to folder called like probability configurations + # outputFiles = avalancheDir / 'Outputs' / 'ana4Stats' + # saveFiles = avalancheDir / 'Outputs' / ('ana4Stats_' + probConf) + # shutil.move(outputFiles, saveFiles) return if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Run ana4ProbAna workflow') parser.add_argument('avadir', metavar='a', type=str, nargs='?', default='', help='the avalanche directory') diff --git a/avaframe/in3Utils/cfgUtils.py b/avaframe/in3Utils/cfgUtils.py index dafd133ba..f6d3527e3 100644 --- a/avaframe/in3Utils/cfgUtils.py +++ b/avaframe/in3Utils/cfgUtils.py @@ -55,7 +55,6 @@ from avaframe.in3Utils import logUtils from avaframe.in3Utils import fileHandlerUtils as fU - log = logging.getLogger(__name__) @@ -96,7 +95,7 @@ def getGeneralConfig(nameFile=""): def getModuleConfig( - module, avalancheDir="", fileOverride="", batchCfgDir="", modInfo=False, toPrint=True, onlyDefault=False + module, avalancheDir="", fileOverride="", batchCfgDir="", modInfo=False, toPrint=True, onlyDefault=False ): """Returns the configuration for a given module returns a configParser object OR pathlib.Path (when batchCfgDir is used) @@ -721,12 +720,12 @@ def writeDictToJson(inDict, outFilePath): def createConfigurationInfo( - avaDir, - comModule="com1DFA", - standardCfg="", - writeCSV=False, - specDir="", - simNameList=[], + avaDir, + comModule="com1DFA", + standardCfg="", + writeCSV=False, + specDir="", + simNameList=[], ): """Read configurations from all simulations configuration ini files from directory @@ -958,7 +957,7 @@ def setStrnanToNan(simDF, simDFTest, name): return simDF -def readConfigurationInfoFromDone(avaDir, specDir="", latest=False): +def readConfigurationInfoFromDone(avaDir, specDir="", latest=False, modName=''): """Check 
avaName/Outputs/com1DFA/configurationFilesDone and pass names of all files found in this directory and create corresponding simDF this is useful if e.g. no allConfigurations.csv has @@ -976,6 +975,8 @@ def readConfigurationInfoFromDone(avaDir, specDir="", latest=False): path to a directory where simulation configuration files directory called configurationFiles can be found - optional latest: bool if True check for files found in avaName/Outputs/com1DFA/configurationFilesLatest + modName: str + name of the module to be used for task (optional) Returns -------- @@ -989,7 +990,7 @@ def readConfigurationInfoFromDone(avaDir, specDir="", latest=False): if specDir != "": inDir = pathlib.Path(specDir, "configurationFiles") else: - inDir = pathlib.Path(avaDir, "Outputs", "com1DFA", "configurationFiles") + inDir = pathlib.Path(avaDir, "Outputs", modName, "configurationFiles") # search inDir/configurationFilesDone or inDir/configurationFilesLatest (depending on latest flag) for already existing sims if latest: @@ -1009,7 +1010,7 @@ def readConfigurationInfoFromDone(avaDir, specDir="", latest=False): # create simDF (dataFrame with one row per simulation of configuration files found in configDir) simDF = createConfigurationInfo( avaDir, - comModule="com1DFA", + comModule=modName, standardCfg="", writeCSV=False, specDir=specDir, diff --git a/avaframe/out3Plot/outAna6Plots.py b/avaframe/out3Plot/outAna6Plots.py new file mode 100644 index 000000000..5b03238a8 --- /dev/null +++ b/avaframe/out3Plot/outAna6Plots.py @@ -0,0 +1,934 @@ +import numpy as np +import pandas as pd +import pathlib +import matplotlib.pyplot as plt +from datetime import datetime + + +def barplotSA(SiDF, avaName, outDir): + """ + Create a bar plot of Morris sensitivity results. + + Bars show μ* as percentage of the total sensitivity, with bar width + proportional to σ. A vertical dashed line is drawn where the cumulative μ* + first reaches 80%. 
+ + Parameters + ---------- + SiDF : pandas.DataFrame + DataFrame with at least the following columns: + - ``Parameter`` : str, parameter names + - ``mu_star`` : float, mean absolute elementary effect + - ``mu_star_conf`` : float, optional, confidence interval of μ* + - ``sigma`` : float, standard deviation of elementary effects + avaName : str + Avalanche name, used in the saved filename. + outDir : str or pathlib.Path + Directory where the plot is saved. + """ + + # 1) Sort by mu_star (descending) + df = SiDF.sort_values("mu_star", ascending=False).reset_index(drop=True) + + # 2) Normalize μ* so the percentages sum to 100 + total_mu = df["mu_star"].sum() + mu_pct = 100 * df["mu_star"] / total_mu + + # scale the confidence interval to the same percentage units + if "mu_star_conf" in df: + mu_pct_conf = 100 * df["mu_star_conf"] / total_mu + else: + mu_pct_conf = None + + # 3) Map σ to bar widths + wmin, wmax = 0.3, 0.9 + rng = df["sigma"].max() - df["sigma"].min() + sigma_norm = (df["sigma"] - df["sigma"].min()) / (rng if rng else 1) + bar_widths = wmin + (wmax - wmin) * sigma_norm + + # 4) Plot + x = np.arange(len(df)) + fig, ax = plt.subplots(figsize=(12, 6)) + ax.bar( + x, mu_pct, width=bar_widths, edgecolor="black", + capsize=3 if mu_pct_conf is not None else 0 + ) + + ax.set_xticks(x, df["Parameter"], rotation=60, ha="right") + ax.set_ylabel("μ* (% of total)") + ax.set_title("Sensitivity: μ* (percent of total) with σ mapped to bar width") + + # Show value labels + for xi, yi in zip(x, mu_pct): + ax.text(xi, yi, f"{yi:.1f}%", ha="center", va="bottom", fontsize=9) + + # 7) Save figure + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_paramRanking_{date}.png" + plt.savefig(figName, dpi=300, bbox_inches="tight") + + +def scatterplotSA(SiDF, avaName, outDir): + """ + Create a scatter plot of Morris sensitivity results (μ* vs σ). 
+ + Parameters + ---------- + SiDF : pandas.DataFrame + DataFrame with at least the following columns: + - ``Parameter`` : str, parameter names + - ``mu_star`` : float, mean absolute elementary effect + - ``sigma`` : float, standard deviation of elementary effects + avaName : str + Avalanche name, used in the saved filename. + outDir : str or pathlib.Path + Directory where the plot is saved. + """ + + # Scatter Plot + plt.figure(figsize=(12, 7)) + plt.scatter(SiDF['mu_star'], SiDF['sigma'], color='blue') + + # Annotate name to the points + for i, txt in enumerate(SiDF['Parameter']): + plt.annotate(txt, (SiDF['mu_star'][i], SiDF['sigma'][i]), fontsize=9, xytext=(5, 5), textcoords='offset points') + + # Labels and title + plt.xlabel("mu_star (Influence)", fontsize=12) + plt.ylabel("sigma (Nonlinearity / interactions)", fontsize=12) + plt.title("Morris sensitivity analysis: mu_star vs sigma", fontsize=14) + plt.grid(True) + + # Save figure + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_Scatterplot_{date}.png" + plt.savefig(figName, dpi=300, bbox_inches="tight") + + +def scatterplotUncertaintySA(SiDF, avaName, outDir): + """ + Create a scatter plot of Morris sensitivity results with uncertainty. + Plots μ* vs σ with horizontal error bars given by ``mu_star_conf``. + + Parameters + ---------- + SiDF : pandas.DataFrame + DataFrame with at least the following columns: + - ``Parameter`` : str, parameter names + - ``mu_star`` : float, mean absolute elementary effect + - ``mu_star_conf`` : float, confidence interval of μ* + - ``sigma`` : float, standard deviation of elementary effects + avaName : str + Avalanche name, used in the saved filename. + outDir : str or pathlib.Path + Directory where the plot is saved. 
+ """ + # Plot with error bars + plt.figure(figsize=(12, 7)) + plt.errorbar( + SiDF['mu_star'], SiDF['sigma'], + xerr=SiDF['mu_star_conf'], + fmt='o', color='blue', ecolor='gray', elinewidth=1.5, capsize=4 + ) + + x_range = SiDF['mu_star'].max() - SiDF['mu_star'].min() + y_range = SiDF['sigma'].max() - SiDF['sigma'].min() + + x_offset = 0.01 * x_range + y_offset = 0.01 * y_range + + for i in range(len(SiDF)): + plt.text( + SiDF['mu_star'][i] + x_offset, + SiDF['sigma'][i] + y_offset, + SiDF['Parameter'][i], + fontsize=9 + ) + + # Labels and title + plt.xlabel("mu_star (Influence strength)", fontsize=12) + plt.ylabel("sigma (Nonlinearity / interactions)", fontsize=12) + plt.title("Morris sensitivity analysis: mu_star vs sigma", fontsize=14) + + # Save figure + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_ScatterplotUncertainty_{date}.png" + plt.savefig(figName, dpi=300, bbox_inches="tight") + + +def BOConvergencePlot(finalDF, avaName, outDir, cfgOpt): + """ + Visualises the evolution of the optimisation variable (loss) across sampling phases: + 1. Latin hypercube (scenario 1) OR Morris (scenario 2) + 2. Bayesian optimisation (EI/LCB) -> sampleMethods == "seq" + 3. Non-sequential surrogate-based sampling -> sampleMethods == "nonseq" + + Parameters + ---------- + finalDF : pandas.DataFrame + DataFrame containing simulation results. + + avaName : str + Name of the avalanche. Used for naming the output figure. + + outDir : pathlib.Path + Directory where the convergence plot will be saved. + + cfgOpt: configparser.ConfigParser + Configuration object containing the section 'PARAM_BOUNDS'. 
+ """ + # Read scenario flag + scenario = cfgOpt.getint("PARAM_BOUNDS", "scenario") + + # Color palette + c_best = "#4e79a7" # Blue + c_bo = "#59a14f" # Green + c_lhs = "#76b7b2" # Turquoise + c_nonseq = "#e15759" # Red + c_morris = "#edc948" # Yellow + + df = finalDF.dropna(subset=["sampleMethods", "order", "optimisationVariable"]).copy() + + # Decide which method represents the initial sampling phase + if scenario == 1: + initial_method = "latin" + initial_label = "LatinHypercube" + initial_color = c_lhs + elif scenario == 2: + initial_method = "morris" + initial_label = "Morris" + initial_color = c_morris + else: + message = ( + f"Unknown scenario '{scenario}' in cfgOpt['PARAM_BOUNDS']['scenario']. " + "Expected 1 (Morris) or 2 (Latin hypercube)." + ) + raise ValueError(message) + + # Split by sampling method + initial = df[df["sampleMethods"] == initial_method].sort_values("order") + bo = df[df["sampleMethods"] == "seq"].sort_values("order") + nonseq = df[df["sampleMethods"] == "nonseq"].sort_values("order") + + # Iteration axis: make phases contiguous on x-axis + current_offset = 0 + + if not initial.empty: + initial = initial.copy() + initial["iter"] = initial["order"] + current_offset = int(initial["iter"].max()) + 1 + + if not bo.empty: + bo = bo.copy() + bo["iter"] = bo["order"] + current_offset + current_offset = int(bo["iter"].max()) + 1 + + if not nonseq.empty: + nonseq = nonseq.sort_index().copy() + nonseq["iter"] = np.arange(current_offset, current_offset + len(nonseq)) + current_offset = int(nonseq["iter"].max()) + 1 + + # Combine for best-so-far curve + parts = [initial, bo] + if not nonseq.empty: + parts.append(nonseq) + + if not any(not p.empty for p in parts): + message = "No data available to plot (all sampling phases are empty after filtering)." 
+ raise ValueError(message) + + all_df = pd.concat([p for p in parts if not p.empty], ignore_index=True).sort_values("iter") + all_df["best_so_far"] = all_df["optimisationVariable"].cummin() + + # Plot + fig, ax = plt.subplots(figsize=(9, 5)) + + if not initial.empty: + ax.scatter( + initial["iter"], + initial["optimisationVariable"], + s=35, + alpha=0.75, + color=initial_color, + label=initial_label, + ) + + if not bo.empty: + ax.scatter( + bo["iter"], + bo["optimisationVariable"], + s=40, + alpha=0.85, + color=c_bo, + label="Bayesian optimisation (EI/LCB)", + ) + + if not nonseq.empty: + ax.scatter( + nonseq["iter"], + nonseq["optimisationVariable"], + s=40, + alpha=0.85, + color=c_nonseq, + label="Non-sequential sampling", + ) + + ax.plot( + all_df["iter"], + all_df["best_so_far"], + linewidth=1.5, + color=c_best, + label="Best-so-far", + ) + + # Separator lines (initial is assumed to exist) + if (not bo.empty) or (not nonseq.empty): + ax.axvline(initial["iter"].max() + 0.5, linestyle="--", linewidth=0.7, color="black") + + if (not bo.empty) and (not nonseq.empty): + ax.axvline(bo["iter"].max() + 0.5, linestyle="--", linewidth=0.7, color="black") + + ax.set_xlabel("Iteration") + ax.set_ylabel("Optimisation variable (loss)") + ax.set_title(f"Convergence: {initial_label} → Bayesian optimisation") + ax.legend(frameon=False, loc="best") + + fig.tight_layout() + + # Save figure + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_BOConvergence_{initial_label}_{date}.png" + fig.savefig(figName, dpi=300, bbox_inches="tight") + plt.close(fig) + + +def BOBoxplot(paramLossDF, avaName, outDir, N=10): + """ + Create boxplots of the top-N parameter sets based on loss. + + This function selects the N best-performing simulations (lowest loss) + and visualises the distribution of their parameter values using boxplots. + Each parameter is plotted in a separate subplot. 
+ + Parameters + ---------- + paramLossDF : pandas.DataFrame + DataFrame containing model parameters and corresponding loss values. + + avaName : str + Name of the avalanche. Used for naming the output figure. + + outDir : pathlib.Path + Directory where the boxplot figure will be saved. + + N : int, optional + Number of best-performing simulations (lowest loss) to include. + Default is 10. + """ + df_best = paramLossDF.nsmallest(N, "Loss") + param_cols = paramLossDF.columns.drop('Loss') + + fig, axes = plt.subplots(2, 4, figsize=(10, 6)) + axes = axes.flatten() + for ax, col in zip(axes, param_cols): + ax.boxplot(df_best[col]) + ax.set_xticklabels([col]) + + # If less than 8 parameters, remove empty axes + for ax in axes[len(param_cols):]: + ax.axis("off") + plt.tight_layout() + + # Save figure + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_BOBoxplot_{date}.png" + plt.savefig(figName, dpi=300, bbox_inches="tight") + + +def BOBoxplotNormalised(paramLossDF, paramBounds, avaName, outDir, N=10): + """ + Create normalized boxplots of parameters for the top-N simulations. + + This function selects the N best-performing simulations (lowest loss) + and visualises the distribution of their parameter values after + min–max normalization based on predefined parameter bounds. + + Parameters + ---------- + paramLossDF : pandas.DataFrame + DataFrame containing model parameters and corresponding loss values. + + paramBounds : dict + Dictionary mapping parameter names to their (min, max) bounds: + {parameter_name: (lower_bound, upper_bound)} + + Bounds are used for min–max normalization. + + avaName : str + Name of the avalanche. Used for naming the output figure. + + outDir : pathlib.Path + Directory where the normalized boxplot figure will be saved. + + N : int, optional + Number of best-performing simulations (lowest loss) to include. + Default is 10. 
+ """ + df_best = paramLossDF.nsmallest(N, "Loss") + param_cols = paramLossDF.columns.drop('Loss') + + # normalize using parameter bounds + data = [ + (df_best[c] - paramBounds[c][0]) / (paramBounds[c][1] - paramBounds[c][0]) + for c in param_cols + ] + + fig, ax = plt.subplots(figsize=(15, 7)) + ax.boxplot(data) + ax.set_xticks(range(1, len(param_cols) + 1)) + ax.set_xticklabels(param_cols, rotation=45, ha="right", fontsize=14) + ax.set_ylabel("Normalized value (min–max)", fontsize=14) + ax.set_title(f"Normalized parameter distributions (best {N})", fontsize=16) + ax.tick_params(axis="y", labelsize=12) + fig.tight_layout() + + # Save figure + # Include date, format: YYYYMMDD + date = datetime.now().strftime("%Y%m%d") + figName = f"{outDir}/{avaName}_BOBoxplotNormalised_{date}.png" + plt.savefig(figName, dpi=300, bbox_inches="tight") + + +def saveKFoldCVPrintImage(scores, pipeName, k, out_path): + """ + Save a summary of K-fold cross-validation results as an image. The output image contains metrics for both + training and test sets. + + Reported metrics per split: + - RMSE (Root Mean Squared Error) + - MAE (Mean Absolute Error) + - R² (Coefficient of determination) + + Parameters + ---------- + scores : dict + Dictionary containing cross-validation results. Expected keys: + - "train_rmse", "train_mae", "train_r2" + - "test_rmse", "test_mae", "test_r2" + + pipeName : str + Name of the model or pipeline. Included in the figure header. + + k : int + Number of folds used in cross-validation. + + out_path : str or pathlib.Path + File path where the generated image will be saved. 
+ """ + lines = [f"{pipeName}, {k}-fold CV:\n"] + for split in ("test", "train"): + lines.append(f"{split.capitalize()} metrics:") + for m in ("rmse", "mae", "r2"): + arr = scores[f"{split}_{m}"] + lines.append(f" {m.upper():<4}: {arr.mean():.4g} ± {arr.std():.4g}") + lines.append("") + + text = "\n".join(lines) + + fig = plt.figure(figsize=(6, 4)) + plt.axis("off") + plt.text(0.01, 0.99, text, va="top", family="monospace") + + plt.savefig(out_path, dpi=300, bbox_inches="tight") + plt.close(fig) + + +def saveBestOrSpecificSimulation(finalDF, paramSelected, ei=None, lcb=None, simName=None, csv_path='dummy.csv'): + """ + Save either the best model run (based on optimisationVariable) or a + specified simulation (simName) to a CSV file. + + Parameters + ---------- + finalDF : pandas.DataFrame + containing all simulation results and optimization metrics. + paramSelected : list of str + List of parameter names that should be includedvin the exported output. + ei : float, optional + Expected Improvement value (used in Bayesian optimization). + If None, the column will be created with None. + lcb : float, optional + Lower Confidence Bound value (used in Bayesian optimization). + If None, the column will be created with None. + simName : str, optional + If provided, the row corresponding to this simulation name is saved. + If None, the row with the minimal optimisationVariable is saved. + csv_path : str or pathlib.Path, optional + Path to the CSV file. + + Notes + ----- + Only a subset of relevant columns (including selected parameters) + is written to the output file. 
+ """ + # Subset df, save only important entries + cols_keep = ['simName', 'sampleMethods', 'order', 'Simulation time (s)', 'Minimum time step (s)', + 'Initial CFL number (-)', 'TP_SimRef_cells', 'TP_SimRef_area', 'FP_SimRef_cells', 'FP_SimRef_area', + 'FN_SimRef_cells', 'FN_SimRef_area', 'recall', 'precision', 'f1_score', 'tversky_score', '1-tversky', + 'runoutRMSENormalised', 'optimisationVariable'] + columns_keep = cols_keep[:6] + [p for p in paramSelected if p not in cols_keep] + cols_keep[6:] + df = finalDF[columns_keep] + + if simName is not None: + row = df.loc[df['simName'] == simName].copy() + else: + idx = df['optimisationVariable'].idxmin() + row = df.loc[[idx]].copy() + + # Ensure the optional columns always exist + if ei is not None: + row["ei"] = ei + else: + row['ei'] = None + if lcb is not None: + row["lcb"] = lcb + else: + row['lcb'] = None + # Write to csv + path = pathlib.Path(csv_path) + row.to_csv(path, mode="a", index=False, header=not path.exists()) + + +def saveTopCandidates(finalDF, paramSelected, cfgOpt, results_dict=None, out_path="analysisTable.png", title=None, + simNameMean=None, simNameBest=None): + """ + Create result table(s) for surrogate/model top candidates and save as PNG and CSV. + + Behavior + -------- + - If `results_dict` is provided: creates up to two tables + 1) Surrogate summary (TopNBest mean/std + surrogate single best in "best" column) + Optionally appends a row "optimisationVariable" where: + - "mean" is filled from `simNameMean` (lookup in finalDF) + - "best" is filled from `simNameBest` (lookup in finalDF) + 2) Model summary (Top N + single best), where N is read from cfgOpt['OPTIMISATION']['n_model_top'] + - If `results_dict` is None: creates only the model table (2). + + Outputs + ------- + - PNG at `out_path` + - CSV next to the PNG with suffix `_tables.csv` (tidy format with a `table` column) + + Parameters + ---------- + finalDF : pandas.DataFrame + Must contain column "optimisationVariable". 
+ If simNameMean/simNameBest is used, must contain column "simName". + Must contain parameter columns listed in `paramSelected`. + paramSelected : list[str] + Parameter column names to summarize for the model table. + cfgOpt : configparser.ConfigParser-like + Needs: + - cfgOpt['OPTIMISATION']['n_surrogate_top'] + - cfgOpt['OPTIMISATION']['n_model_top'] + results_dict : dict | None, optional + If provided, expected keys: + - "TopNBest": dict with "mean_params", "std_params", "mean_mu", "std_mu", "mean_sigma", "std_sigma" + - "Best": dict with "params", "mu", "sigma" + out_path : str | pathlib.Path, optional + Path to save PNG figure. + title : str | None, optional + Optional global title for the PNG figure. + simNameMean : str | None, optional + Simulation name whose model optimisationVariable is placed into the surrogate table row + "optimisationVariable" under the "mean" column. + simNameBest : str | None, optional + Simulation name whose model optimisationVariable is placed into the surrogate table row + "optimisationVariable" under the "best" column. + + Returns + ------- + pathlib.Path + Path to the saved PNG figure. 
+ """ + tables = [] + + # ========================================================== + # Surrogate table (optional) + # ========================================================== + if results_dict is not None: + top = results_dict["TopNBest"] + + # --- normalize mean/std params to Series --- + mean_params = top["mean_params"] + std_params = top["std_params"] + + # If params are list/tuple of (name,value) pairs, convert to dict first + if not isinstance(mean_params, dict): + mean_params = dict(mean_params) + if not isinstance(std_params, dict): + std_params = dict(std_params) + + df_top = pd.DataFrame( + { + "mean": pd.Series(mean_params, dtype="float64"), + "std": pd.Series(std_params, dtype="float64"), + } + ) + df_top["relStd [%]"] = relstd(df_top["std"], df_top["mean"]) + + extra = pd.DataFrame( + { + "mean": [top["mean_mu"], top["mean_sigma"]], + "std": [top["std_mu"], top["std_sigma"]], + }, + index=["optimisationVariable_Surrogate", "sigma"], + ) + extra["relStd [%]"] = relstd(extra["std"], extra["mean"]) + + df_top = pd.concat([df_top, extra], axis=0) + + # --- create/attach "best" column from surrogate Best --- + best = results_dict["Best"] + best_params = best["params"] + if not isinstance(best_params, dict): + best_params = dict(best_params) + + best_series = pd.Series(best_params, dtype="float64") + best_series.loc["optimisationVariable_Surrogate"] = float(best["mu"]) + best_series.loc["sigma"] = float(best["sigma"]) + df_top["best"] = best_series + + # --- add model optimisationVariable row: mean and/or best --- + def _get_optvar_for_sim(sim_name, label): + if "simName" not in finalDF.columns: + raise ValueError(f"{label} set but finalDF has no 'simName' column.") + sel = finalDF.loc[finalDF["simName"] == sim_name, "optimisationVariable"] + if sel.empty: + raise ValueError(f"{label}='{sim_name}' not found in finalDF['simName'].") + return float(pd.to_numeric(sel.iloc[0], errors="coerce")) + + if simNameMean is not None or simNameBest is not None: + # 
Ensure row exists + if "optimisationVariable" not in df_top.index: + df_top.loc["optimisationVariable", ["mean", "std", "relStd [%]", "best"]] = [np.nan, np.nan, np.nan, + np.nan] + + if simNameMean is not None: + df_top.loc["optimisationVariable", ["mean", "std", "relStd [%]"]] = [ + _get_optvar_for_sim(simNameMean, "simNameMean"), + np.nan, + np.nan, + ] + if simNameBest is not None: + df_top.loc["optimisationVariable", "best"] = _get_optvar_for_sim(simNameBest, "simNameBest") + + n_surrogate_top = cfgOpt.getint("OPTIMISATION", "n_surrogate_top") + mean_tag = f" ({simNameMean})" if simNameMean else "" + best_tag = f" ({simNameBest})" if simNameBest else "" + + t1 = ( + f"Surrogate: Mean of Top {n_surrogate_top} Best{mean_tag} " + f"+ Single Best{best_tag}" + ) + tables.append((fmt_df(df_top), t1)) + + # ========================================================== + # Model table (always) + # ========================================================== + if "optimisationVariable" not in finalDF.columns: + raise ValueError("finalDF must contain column 'optimisationVariable'.") + + best_idx = finalDF["optimisationVariable"].idxmin() + + # Raise error if topN is bigger than available model runs + n_model_top = cfgOpt.getint("OPTIMISATION", "n_model_top") + nAvailable = len(finalDF.index) + if n_model_top > nAvailable: + message = ( + f"Invalid configuration: Requested top {n_model_top} simulations, but only {nAvailable} simulations are " + "available." 
+ ) + raise ValueError(message) + + topN = finalDF.nsmallest(n_model_top, "optimisationVariable") + + df_topN = summary_table(topN, paramSelected, best_row=finalDF.loc[best_idx]) + + opt_mean = pd.to_numeric(topN["optimisationVariable"], errors="coerce").mean() + opt_std = pd.to_numeric(topN["optimisationVariable"], errors="coerce").std() + + df_topN.loc["optimisationVariable", ["mean", "std", "relStd [%]", "best"]] = [ + opt_mean, + opt_std, + relstd(opt_std, opt_mean), + pd.to_numeric(finalDF.loc[best_idx, "optimisationVariable"], errors="coerce"), + ] + + best_sim = finalDF.at[best_idx, "simName"] if "simName" in finalDF.columns else "" + t2 = f"Model: Mean of Top {n_model_top} Best + Single Best{f' ({best_sim})' if best_sim else ''}" + tables.append((fmt_df(df_topN), t2)) + + # ========================================================== + # Plot + # ========================================================== + out_path = pathlib.Path(out_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + + fig_h = 1.2 + 0.38 * sum(len(df) for df, _ in tables) + fig, axes = plt.subplots(len(tables), 1, figsize=(10, fig_h)) + axes = [axes] if len(tables) == 1 else axes + + if title: + fig.suptitle(title, fontsize=14, y=0.99) + + for ax, (df_disp, t) in zip(axes, tables): + ax.axis("off") + tbl = ax.table( + cellText=df_disp.values, + rowLabels=df_disp.index.tolist(), + colLabels=df_disp.columns.tolist(), + loc="center", + ) + tbl.auto_set_font_size(False) + tbl.set_fontsize(9) + tbl.scale(1, 1.2) + ax.set_title(t, fontsize=12, pad=10) + + plt.tight_layout(rect=(0, 0, 1, 0.98)) + fig.savefig(out_path, dpi=200, bbox_inches="tight") + plt.close(fig) + + # ========================================================== + # CSV export (tidy) + # ========================================================== + csv_path = out_path.with_name(f"{out_path.stem}_tables.csv") + frames = [] + for df_disp, t in tables: + tmp = df_disp.copy().reset_index(names=["row"]) + tmp.insert(0, "table", 
t) + frames.append(tmp) + pd.concat(frames, ignore_index=True).to_csv(csv_path, index=False) + + return out_path + + +def formatSig(x): + """ + Format numbers for display in tables (significant digits, sci notation for small values). + - NaN or non-numeric values → "" + - 0 → "0" + - |x| < 1e-3 → scientific notation with 2 decimal places + - |x| < 100 → 2 significant digits + - |x| ≥ 100 → rounded integer + + Parameters + ---------- + x : Any + Value to be formatted. If conversion to float fails or the value + is NaN, an empty string is returned. + + Returns + ------- + str + Formatted string representation suitable for compact table output. + """ + # --- Number formatting --- + try: + x = float(x) + except (TypeError, ValueError): + return "" + if pd.isna(x): + return "" + if x == 0.0: + return "0" + if abs(x) < 1e-3: + return f"{x:.2e}" # small numbers in scientific notation + elif abs(x) < 100: + return f"{x:.2g}" # 2 significant digits + else: + return str(int(round(x))) + + +def fmt_df(df): + """ + Apply formatSig to common summary columns for prettier display/export. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame containing summary statistics. + + Returns + ------- + pandas.DataFrame + A copy of the input DataFrame with selected columns formatted + as strings for display purposes. + + """ + g = df.copy() + for c in ["mean", "std", "relStd [%]", "best"]: + if c in g.columns: + g[c] = g[c].map(formatSig) + return g + + +def relstd(std, mean): + """ + Relative std in percent; returns NaN if mean is 0 or NaN. + + Parameters + ---------- + std : array-like or scalar + Standard deviation values. + + mean : array-like or scalar + Mean values corresponding to `std`. + + Returns + ------- + numpy.ndarray + Relative standard deviation in percent. 
Returns NaN where: + - mean is 0 + - mean is NaN + - conversion to numeric fails + """ + mean = pd.to_numeric(mean, errors="coerce") + std = pd.to_numeric(std, errors="coerce") + return np.where((mean == 0) | pd.isna(mean), np.nan, std / mean * 100.0) + + +def summary_table(df, cols, best_row=None): + """ + Compute summary statistics (mean, standard deviation, relative standard deviation, + and optional best values) for selected numeric columns. + + The function converts the specified columns to numeric (coercing errors to NaN), + computes column-wise statistics, and returns them in a compact summary table. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame containing the data to summarise. + + cols : list-like + List of column names for which summary statistics should be computed. + + best_row : pandas.Series or dict-like, optional + Row containing reference or "best" parameter values. If provided, + the values corresponding to `cols` are included in the output under + the column "best". If None, the "best" column is filled with NaN. + + Returns + ------- + pandas.DataFrame + DataFrame indexed by `cols` containing: + - "mean" : Column-wise mean + - "std" : Column-wise standard deviation + - "relStd [%]" : Relative standard deviation in percent + - "best" : Reference/best values (if provided) + """ + X = df[cols].apply(pd.to_numeric, errors="coerce") + out = pd.DataFrame({"mean": X.mean(), "std": X.std()}) + out["relStd [%]"] = relstd(out["std"], out["mean"]) + if best_row is not None: + out["best"] = pd.to_numeric(best_row[cols], errors="coerce").values + else: + out["best"] = np.nan + return out + + +def plotMorrisConvergence(SA_dfs, r_values, reference_r, k=None, outpath=None, title=None): + """ + Plot convergence of Morris mu_star sensitivities. + + Parameters + ---------- + SA_dfs : list of pandas.DataFrame + Morris result DataFrames. 
+ Must contain columns: 'mu_star', 'Parameter' + r_values : list[int] + Number of trajectories corresponding to SA_dfs. + reference_r : int + Trajectory count r that defines the reference dataset (parameter ranking/order). + k : int or None, optional + Number of top parameters to plot. If None, all parameters are plotted. + outpath : str or pathlib.Path, optional + If given, figure is saved to this path. + title : str, optional + Plot title. + Notes + -------- + - This code was written with AI. + + """ + + # -------------------------------------------------- + # Normalize (in-place safe) + ALIGN BY PARAMETER NAME + # -------------------------------------------------- + SA_dfs_aligned = [] + for df in SA_dfs: + d = df.copy() + if "pct" not in d.columns: + d["pct"] = d["mu_star"] / d["mu_star"].sum() * 100 + # Key fix: align all dataframes by the unique parameter name + d = d.set_index("Parameter") + SA_dfs_aligned.append(d) + + if reference_r not in r_values: + raise ValueError(f"reference_r={reference_r} not found in r_values") + + ref_idx = r_values.index(reference_r) + SA_ref = SA_dfs_aligned[ref_idx] + + # -------------------------------------------------- + # Select parameters (REFERENCE DEFINES ORDER) + # -------------------------------------------------- + if k is None: + top_params = SA_ref.index.tolist() + else: + top_params = SA_ref["pct"].nlargest(k).index.tolist() + + labels = top_params # already parameter names + + # -------------------------------------------------- + # Stack results (REINDEX EACH DF TO REFERENCE ORDER) + # -------------------------------------------------- + Y = np.column_stack([ + d.reindex(top_params)["pct"].to_numpy() + for d in SA_dfs_aligned + ]) + + # -------------------------------------------------- + # Plot + # -------------------------------------------------- + fig, ax = plt.subplots(figsize=(12, 8)) + ax.grid(True, ls="--", lw=0.8, color="gray", alpha=0.6, zorder=0) + + for label, y in zip(labels, Y): + ax.plot( + 
r_values, + y, + marker="o", + lw=1.5, + label=label + ) + + ax.set_xticks(r_values, [f"r = {r}" for r in r_values]) + ax.set_xlabel("Number of Morris trajectories (r)", fontsize=16) + ax.set_ylabel("Mu* sensitivity (%)", fontsize=16) + + if title is None: + title = "Morris sensitivity convergence" + ax.set_title(title, fontsize=18) + + ax.tick_params(axis="both", labelsize=14) + ax.legend(fontsize=11) + fig.tight_layout() + + if outpath is not None: + fig.savefig(outpath, dpi=300, bbox_inches="tight") diff --git a/avaframe/runScripts/runPlotAreaRefDiffs.py b/avaframe/runScripts/runPlotAreaRefDiffs.py index 9afb82ebf..57bbb703f 100644 --- a/avaframe/runScripts/runPlotAreaRefDiffs.py +++ b/avaframe/runScripts/runPlotAreaRefDiffs.py @@ -6,6 +6,8 @@ # importing general python modules import pathlib import numpy as np +import pickle +import re # Local imports from avaframe.in3Utils import cfgUtils @@ -19,100 +21,62 @@ import avaframe.com1DFA.DFAtools as DFAtls -################USER Input############# -resType = "ppr" -thresholdValueSimulation = 0.9 -modName = "com1DFA" -############################################################ - -# Load avalanche directory from general configuration file -cfgMain = cfgUtils.getGeneralConfig() -avalancheDir = cfgMain["MAIN"]["avalancheDir"] -outDir = pathlib.Path(avalancheDir, "Outputs", "out1Peak") -fU.makeADir(outDir) - -# Start logging -logName = "plotAreaDiff_%s" % (resType) -# Start logging -log = logUtils.initiateLogger(avalancheDir, logName) -log.info("MAIN SCRIPT") -log.info("Current avalanche: %s", avalancheDir) - - -# initialize DEM from avalancheDir (used to perform simulations) -# TODO: if meshCellSize was changed - use actual simulation DEM -dem = gI.readDEM(avalancheDir) -# get normal vector of the grid mesh -dem = gT.getNormalMesh(dem, num=1) -# get real Area -dem = DFAtls.getAreaMesh(dem, 1) -dem["originalHeader"] = dem["header"] - -# read reference data set -inDir = pathlib.Path(avalancheDir, "Inputs") 
-referenceFile, availableFile, _ = gI.getAndCheckInputFiles( - inDir, "REFDATA", "POLY", fileExt="shp", fileSuffix="POLY" -) -# convert polygon to raster with value 1 inside polygon and 0 outside the polygon -referenceLine = shpConv.readLine(referenceFile, "reference", dem) -referenceLine = gT.prepareArea(referenceLine, dem, np.sqrt(2), combine=True, checkOverlap=False) - -# if available zoom into area provided by crop shp file in Inputs/CROPSHAPE -cropFile, cropInfo, _ = gI.getAndCheckInputFiles( - inDir, "POLYGONS", "cropFile", fileExt="shp", fileSuffix="_cropshape" -) -if cropInfo: - cropLine = shpConv.readLine(cropFile, "cropFile", dem) - cropLine = gT.prepareArea(cropLine, dem, np.sqrt(2), combine=True, checkOverlap=False) - -if modName in ["com1DFA", "com5SnowSlide", "com6RockAvalanche", "com8MoTPSA", "com9MoTVoellmy"]: - # load dataFrame for all configurations of simulations in avalancheDir - simDF = cfgUtils.createConfigurationInfo(avalancheDir) - # create data frame that lists all available simulations and path to their result type result files - inputsDF, resTypeList = fU.makeSimFromResDF(avalancheDir, "com1DFA") - # merge parameters as columns to dataDF for matching simNames - dataDF = inputsDF.merge(simDF, left_on="simName", right_on="simName") - - ## loop over all simulations and load desired resType - for index, row in dataDF.iterrows(): - simFile = row[resType] - simData = IOf.readRaster(simFile) - - # compute referenceMask and simulationMask and true positive, false positive and false neg. 
arrays - # here thresholdValueReference is set to 0.9 as when converting the polygon to a raster, - # values inside polygon are set to 1 and outside to 0 - refMask, compMask, indicatorDict = oPD.computeAreaDiff( - referenceLine["rasterData"], - simData["rasterData"], - 0.9, - thresholdValueSimulation, - dem, - cropToArea=cropLine["rasterData"], +def runPlotAreaRefDiffs(resType, thresholdValueSimulation, modName): + # Load avalanche directory from general configuration file + cfgMain = cfgUtils.getGeneralConfig() + avalancheDir = cfgMain["MAIN"]["avalancheDir"] + outDir = pathlib.Path(avalancheDir, "Outputs", "out1Peak") + fU.makeADir(outDir) + + # Start logging + logName = "plotAreaDiff_%s" % (resType) + # Start logging + log = logUtils.initiateLogger(avalancheDir, logName) + log.info("MAIN SCRIPT") + log.info("Current avalanche: %s", avalancheDir) + + # initialize DEM from avalancheDir (used to perform simulations) + # TODO: if meshCellSize was changed - use actual simulation DEM + dem = gI.readDEM(avalancheDir) + # get normal vector of the grid mesh + dem = gT.getNormalMesh(dem, num=1) + # get real Area + dem = DFAtls.getAreaMesh(dem, 1) + dem["originalHeader"] = dem["header"] + + # read reference data set + inDir = pathlib.Path(avalancheDir, "Inputs") + referenceFile, availableFile, _ = gI.getAndCheckInputFiles( + inDir, "REFDATA", "POLY", fileExt="shp", fileSuffix="POLY" ) + # convert polygon to raster with value 1 inside polygon and 0 outside the polygon + referenceLine = shpConv.readLine(referenceFile, "reference", dem) + referenceLine = gT.prepareArea(referenceLine, dem, np.sqrt(2), combine=True, checkOverlap=False) - # plot differences - oPD.plotAreaDiff( - referenceLine["rasterData"], - refMask, - simData["rasterData"], - compMask, - resType, - simData["header"], - thresholdValueSimulation, - outDir, - indicatorDict, - row["simName"], - cropFile=cropFile, + # if available zoom into area provided by crop shp file in Inputs/CROPSHAPE + cropFile, cropInfo, _ 
= gI.getAndCheckInputFiles( + inDir, "POLYGONS", "cropFile", fileExt="shp", fileSuffix="_cropshape" ) -else: - # load all result files - resultDir = pathlib.Path(avalancheDir, "Outputs", modName, "peakFiles") - peakFilesList = list(resultDir.glob("*_%s.tif" % resType)) + list(resultDir.glob("*_%s.asc" % resType)) - for pF in peakFilesList: - simData = IOf.readRaster(pF) - simName = pF.stem + if cropInfo: + cropLine = shpConv.readLine(cropFile, "cropFile", dem) + cropLine = gT.prepareArea(cropLine, dem, np.sqrt(2), combine=True, checkOverlap=False) + + if modName in ["com1DFA", "com5SnowSlide", "com6RockAvalanche"]: + # load dataFrame for all configurations of simulations in avalancheDir + simDF = cfgUtils.createConfigurationInfo(avalancheDir) + # create data frame that lists all available simulations and path to their result type result files + inputsDF, resTypeList = fU.makeSimFromResDF(avalancheDir, "com1DFA") + # merge parameters as columns to dataDF for matching simNames + dataDF = inputsDF.merge(simDF, left_on="simName", right_on="simName") + + ## loop over all simulations and load desired resType + for index, row in dataDF.iterrows(): + simFile = row[resType] + simData = IOf.readRaster(simFile) # compute referenceMask and simulationMask and true positive, false positive and false neg. 
arrays + # here thresholdValueReference is set to 0.9 as when converting the polygon to a raster, + # values inside polygon are set to 1 and outside to 0 refMask, compMask, indicatorDict = oPD.computeAreaDiff( referenceLine["rasterData"], simData["rasterData"], @@ -133,6 +97,76 @@ thresholdValueSimulation, outDir, indicatorDict, - simName, + row["simName"], cropFile=cropFile, ) + else: + # load all result files + resultDir = pathlib.Path(avalancheDir, "Outputs", modName, "peakFiles") + peakFilesList = list(resultDir.glob("*_%s.tif" % resType)) + list(resultDir.glob("*_%s.asc" % resType)) + + allResults = [] + + for pF in peakFilesList: + simData = IOf.readRaster(pF) + simName = pF.stem + + # compute referenceMask and simulationMask and true positive, false positive and false neg. arrays + refMask, compMask, indicatorDict = oPD.computeAreaDiff( + referenceLine["rasterData"], + simData["rasterData"], + 0.9, + thresholdValueSimulation, + dem, + cropToArea=cropLine["rasterData"], + ) + + # plot differences + oPD.plotAreaDiff( + referenceLine["rasterData"], + refMask, + simData["rasterData"], + compMask, + resType, + simData["header"], + thresholdValueSimulation, + outDir, + indicatorDict, + simName, + cropFile=cropFile, + ) + + allResults.append({ + "sim_name": simName, + "res_type": resType, + "threshold": thresholdValueSimulation, + "indicator_dict": indicatorDict, + }) + + # Save summary of TP/FP/FN indicators as pickle + rows = [] + for entry in allResults: + cleanName = re.sub(r"_ppr$", "", entry["sim_name"]) + indicators = entry["indicator_dict"] + row = {"simName": cleanName} + + for key, subdict in indicators.items(): + shortKey = key.replace("truePositive", "TP_SimRef") \ + .replace("falsePositive", "FP_SimRef") \ + .replace("falseNegative", "FN_SimRef") + row["%s_cells" % shortKey] = subdict.get("nCells", None) + row["%s_area" % shortKey] = subdict.get("areaSum", None) + + rows.append(row) + + with open(outDir / "arealIndicators.pkl", "wb") as f: + 
pickle.dump(rows, f) + + +if __name__ == "__main__": + ################USER Input############# + resType = "ppr" + thresholdValueSimulation = 1 + modName = "com8MoTPSA" + + runPlotAreaRefDiffs(resType, thresholdValueSimulation, modName) diff --git a/pyproject.toml b/pyproject.toml index afe62dd2b..cdcde8914 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "contextily", "geopandas", "fiona", + "scikit-learn", ] # Setuptools