From d92f0495e8b7fddedb3c7c461ac4883a632ee8e2 Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Tue, 21 Jan 2025 21:37:53 +0000 Subject: [PATCH 1/8] Interactive ipynb --- mlos_demo_mysql.ipynb | 583 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 582 insertions(+), 1 deletion(-) diff --git a/mlos_demo_mysql.ipynb b/mlos_demo_mysql.ipynb index 5db1f25..58f18a0 100644 --- a/mlos_demo_mysql.ipynb +++ b/mlos_demo_mysql.ipynb @@ -567,12 +567,593 @@ "First, let's select a configuration parameter we want to study along with the benchmark metric we've collected durting our trials. Here we pick the MySQL `innodb_buffer_pool_instances` configuration parameter, and see how cheanging it impacts the benchmark's latency 95th percentile (the `latency_pct` metric). We also pick a secondary configuration parameter, `innodb_flush_method` to add an extra dimension to our analysis." ] }, + { + "cell_type": "markdown", + "id": "f6eb4cd1", + "metadata": {}, + "source": [ + "### First Explore and Learn Interactively" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "15a4de22", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e9be2c4f82d9438ba3e539365e81a1d5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(Dropdown(description='X-axis:', options=('config.innodb_buffer_pool_dump_pct', '…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7a08122fc2dc48dfacf1ebe627c07989", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipywidgets import interact, widgets, HBox, VBox\n", + "\n", + "# Define columns of interest\n", + "config_columns = [col for col in df.columns if col.startswith(\"config.\")]\n", + "result_columns = [col for col in df.columns if col.startswith(\"result.\")]\n", + "status_options = tuple(df[\"status\"].unique()) # Convert to tuple for SelectMultiple\n", + "\n", + "# Define a function for plotting relationships interactively with filtering\n", + "def enhanced_plot_relationship(x_axis, y_axis, status_filter, show_grid, save_plot):\n", + " # Filter data based on status\n", + " filtered_df = df[df[\"status\"].isin(status_filter)]\n", + " \n", + " # Create the plot\n", + " plt.figure(figsize=(12, 8))\n", + " sns.scatterplot(data=filtered_df, x=x_axis, y=y_axis, hue=\"status\")\n", + " plt.title(f\"Relationship: {y_axis} vs {x_axis}\")\n", + " plt.xlabel(x_axis)\n", + " plt.ylabel(y_axis)\n", + " plt.grid(show_grid)\n", + " plt.legend(title=\"Status\")\n", + " plt.show()\n", + "\n", + " # Save the plot if the option is selected\n", + " if save_plot:\n", + " plt.savefig(f\"{x_axis}_vs_{y_axis}.png\", dpi=300)\n", + " print(f\"Plot saved as {x_axis}_vs_{y_axis}.png\")\n", + "\n", + "# Widgets for filtering and plot settings\n", + "x_axis_dropdown = widgets.Dropdown(options=config_columns, description=\"X-axis:\")\n", + "y_axis_dropdown = widgets.Dropdown(options=result_columns, description=\"Y-axis:\")\n", + "status_filter_multi = widgets.SelectMultiple(options=status_options, description=\"Status:\", value=status_options)\n", + "show_grid_toggle = widgets.Checkbox(value=True, description=\"Show Grid\")\n", + "save_plot_toggle = widgets.Checkbox(value=False, description=\"Save Plot\")\n", + "\n", + "# 
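
One detail worth noting in the `enhanced_plot_relationship` helper defined above: it calls `plt.savefig()` after `plt.show()`. With the inline Jupyter backend the figure is typically already closed at that point, so the saved PNG tends to come out blank. A minimal sketch of the save block with the order flipped (same variable names as in the cell above; save first, then show):

    # Save before plt.show(): with the inline backend, show() closes the figure,
    # so a later savefig() would write out an empty canvas.
    if save_plot:
        plt.savefig(f"{x_axis}_vs_{y_axis}.png", dpi=300)
        print(f"Plot saved as {x_axis}_vs_{y_axis}.png")
    plt.show()
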
Display widgets in a neat layout\n", + "ui = VBox([\n", + " HBox([x_axis_dropdown, y_axis_dropdown]),\n", + " HBox([status_filter_multi, show_grid_toggle, save_plot_toggle])\n", + "])\n", + "\n", + "# Connect widgets to the enhanced plotting function\n", + "out = widgets.interactive_output(\n", + " enhanced_plot_relationship,\n", + " {\n", + " \"x_axis\": x_axis_dropdown,\n", + " \"y_axis\": y_axis_dropdown,\n", + " \"status_filter\": status_filter_multi,\n", + " \"show_grid\": show_grid_toggle,\n", + " \"save_plot\": save_plot_toggle,\n", + " }\n", + ")\n", + "\n", + "# Display UI and output\n", + "display(ui, out)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "46d463f7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8f2dac0852c5496bb018b6a949d5cc8a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Result Column:', options=('result.errors', 'result.events', 'resul…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def plot_histogram(result_column, bins):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.histplot(df[result_column].dropna(), bins=bins, kde=True)\n", + " plt.title(f\"Histogram of {result_column}\")\n", + " plt.xlabel(result_column)\n", + " plt.ylabel(\"Frequency\")\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "# Interactive widget for histogram\n", + "widgets.interact(\n", + " plot_histogram,\n", + " result_column=widgets.Dropdown(options=result_columns, description=\"Result Column:\"),\n", + " bins=widgets.IntSlider(value=20, min=5, max=50, step=5, description=\"Bins:\")\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "3f5921c6", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "adc57aae2f05412b83f298a6277c5846", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Group By:', options=('config.innodb_buffer_pool_dump_pct', 'config…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def plot_whisker(selected_config, selected_result):\n", + " \"\"\"\n", + " Create a box-and-whisker plot to show the distribution of a result metric\n", + " grouped by a configuration parameter or status.\n", + " \"\"\"\n", + " plt.figure(figsize=(12, 6))\n", + " sns.boxplot(\n", + " data=df,\n", + " x=selected_config,\n", + " y=selected_result,\n", + " hue=\"status\", # Color-code by status for additional insights\n", + " showfliers=True, # Show outliers\n", + " palette=\"Set2\"\n", + " )\n", + " plt.title(f\"Distribution of {selected_result} by {selected_config} (Grouped by Status)\")\n", + " plt.xlabel(selected_config)\n", + " plt.ylabel(selected_result)\n", + " plt.xticks(rotation=45)\n", + " plt.legend(title=\"Status\", loc=\"upper right\")\n", + " plt.grid(True)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "# Interactive widget for whisker plot\n", + "widgets.interact(\n", + " plot_whisker,\n", + " selected_config=widgets.Dropdown(\n", + " options=config_columns + [\"status\"], # Add 'status' for grouping by trial outcome\n", + " 
description=\"Group By:\"\n", + " ),\n", + " selected_result=widgets.Dropdown(\n", + " options=result_columns,\n", + " description=\"Metric:\"\n", + " )\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4dfe14bb", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3346432cb6824a3f80b62585b46882aa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Correlation Method:', options=('pearson', 'spearman', 'kendall'), …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def plot_heatmap(corr_method):\n", + " # Select only numeric columns from config and result columns\n", + " numeric_columns = df[config_columns + result_columns].select_dtypes(include=[\"number\"])\n", + " \n", + " if numeric_columns.empty:\n", + " print(\"No numeric columns available for correlation.\")\n", + " return\n", + " \n", + " corr_matrix = numeric_columns.corr(method=corr_method)\n", + " \n", + " plt.figure(figsize=(12, 8))\n", + " sns.heatmap(corr_matrix, annot=True, cmap=\"coolwarm\", fmt=\".2f\")\n", + " plt.title(f\"Correlation Heatmap ({corr_method.capitalize()})\")\n", + " plt.show()\n", + "\n", + "# Interactive widget for heatmap\n", + "widgets.interact(\n", + " plot_heatmap,\n", + " corr_method=widgets.Dropdown(\n", + " options=[\"pearson\", \"spearman\", \"kendall\"],\n", + " description=\"Correlation Method:\",\n", + " value=\"pearson\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "7086c477", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "002afb14e8674a37a5d29c04438a3826", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Metric:', options=('result.errors', 'result.events', 'result.laten…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def find_optimal_config(metric, percentile):\n", + " \"\"\"\n", + " Find configurations with a result metric below the specified percentile.\n", + " \"\"\"\n", + " if metric not in result_columns:\n", + " print(f\"Invalid metric selected. 
Please choose from: {result_columns}\")\n", + " return\n", + " \n", + " # Compute the specified percentile threshold\n", + " threshold = df[metric].quantile(percentile / 100)\n", + "\n", + " # Filter configurations meeting the threshold\n", + " optimal_configs = df[df[metric] <= threshold]\n", + "\n", + " # Display the results\n", + " print(f\"Optimal Configurations (Top {percentile}% based on {metric} ≤ {threshold:.2f}):\")\n", + " display(optimal_configs[config_columns + [metric]].sort_values(by=metric))\n", + "\n", + " # Highlight the best configuration\n", + " if not optimal_configs.empty:\n", + " best_config = optimal_configs.loc[optimal_configs[metric].idxmin()]\n", + " print(\"\\nBest Configuration:\")\n", + " display(best_config[config_columns + [metric]])\n", + " else:\n", + " print(\"No configurations meet the criteria.\")\n", + "\n", + "# Interactive widget for percentile analysis\n", + "widgets.interact(\n", + " find_optimal_config,\n", + " metric=widgets.Dropdown(\n", + " options=result_columns,\n", + " description=\"Metric:\"\n", + " ),\n", + " percentile=widgets.IntSlider(\n", + " value=10,\n", + " min=1,\n", + " max=100,\n", + " step=1,\n", + " description=\"Percentile:\"\n", + " )\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "1f9bf98d", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "551ce55d380b431894bd99cd3eb10925", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='X-axis:', options=('config.innodb_buffer_pool_dump_pct', 'config.i…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def plot_pairplots(selected_x, selected_y, hue_col):\n", + " \"\"\"\n", + " Create a pair plot with the given x, y, and hue for visualization.\n", + " \"\"\"\n", + " # Filter relevant columns\n", + " data_subset = df[[selected_x, selected_y, hue_col]].dropna()\n", + "\n", + " # Create the pair plot\n", + " plt.figure(figsize=(10, 6))\n", + " sns.scatterplot(\n", + " data=data_subset, x=selected_x, y=selected_y, hue=hue_col, alpha=0.7\n", + " )\n", + " plt.title(f\"{selected_y} vs {selected_x} (Hue: {hue_col})\")\n", + " plt.xlabel(selected_x)\n", + " plt.ylabel(selected_y)\n", + " plt.grid(True)\n", + " plt.legend(title=hue_col, loc=\"upper right\")\n", + " plt.show()\n", + "\n", + "# Interactive widget for pair plot exploration\n", + "widgets.interact(\n", + " plot_pairplots,\n", + " selected_x=widgets.Dropdown(\n", + " options=config_columns,\n", + " description=\"X-axis:\"\n", + " ),\n", + " selected_y=widgets.Dropdown(\n", + " options=result_columns,\n", + " description=\"Y-axis:\"\n", + " ),\n", + " hue_col=widgets.Dropdown(\n", + " options=[\"status\"] + config_columns + result_columns,\n", + " description=\"Hue:\"\n", + " )\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "a587a2be", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "566f635a9eee4bba883cdb947a85ffd1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Config ID 1:', options=(1088, 1089, 1090, 1125, 1126, 1127, 1128, …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + 
"execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# Function to compare two configurations\n", + "def compare_two_configs(config_id_1, config_id_2, metric):\n", + " \"\"\"\n", + " Compare results of two configuration IDs graphically.\n", + "\n", + " Parameters:\n", + " - config_id_1: First configuration ID\n", + " - config_id_2: Second configuration ID\n", + " - metric: Performance metric to compare\n", + " \"\"\"\n", + " config_1_data = df[df[\"tunable_config_id\"] == config_id_1]\n", + " config_2_data = df[df[\"tunable_config_id\"] == config_id_2]\n", + "\n", + " plt.figure(figsize=(12, 6))\n", + "\n", + " # Plot results for config 1\n", + " sns.lineplot(\n", + " data=config_1_data,\n", + " x=\"trial_id\",\n", + " y=metric,\n", + " marker=\"o\",\n", + " label=f\"Config {config_id_1}\",\n", + " color=\"blue\"\n", + " )\n", + "\n", + " # Plot results for config 2\n", + " sns.lineplot(\n", + " data=config_2_data,\n", + " x=\"trial_id\",\n", + " y=metric,\n", + " marker=\"o\",\n", + " label=f\"Config {config_id_2}\",\n", + " color=\"orange\"\n", + " )\n", + "\n", + " plt.title(f\"Comparison of {metric} for Config {config_id_1} vs Config {config_id_2}\")\n", + " plt.xlabel(\"Trial ID\")\n", + " plt.ylabel(metric)\n", + " plt.legend()\n", + " plt.grid(True)\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "# Interactive widgets for selecting configurations and metric\n", + "interact(\n", + " compare_two_configs,\n", + " config_id_1=widgets.Dropdown(\n", + " options=df[\"tunable_config_id\"].unique(),\n", + " description=\"Config ID 1:\"\n", + " ),\n", + " config_id_2=widgets.Dropdown(\n", + " options=df[\"tunable_config_id\"].unique(),\n", + " description=\"Config ID 2:\"\n", + " ),\n", + " metric=widgets.Dropdown(\n", + " options=result_columns,\n", + " description=\"Metric:\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "03f87a86", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2a4eab45ef2741cabc1fd01d0a11bb1a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='Config 1 ID:', options=(1088, 1089, 1090, 1125, 1126, 1127, 1128, …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def compare_configs(config1, config2):\n", + " # Filter rows for the selected configurations\n", + " config1_row = df.loc[df['tunable_config_id'] == config1]\n", + " config2_row = df.loc[df['tunable_config_id'] == config2]\n", + "\n", + " # Ensure both configurations are found\n", + " if config1_row.empty or config2_row.empty:\n", + " print(\"One or both of the selected configurations do not exist.\")\n", + " return\n", + "\n", + " # Combine configuration and result columns for display\n", + " compare_columns = config_columns + result_columns\n", + " comparison_df = pd.DataFrame({\n", + " \"Parameter\": compare_columns,\n", + " f\"Config {config1}\": config1_row[compare_columns].iloc[0].values,\n", + " f\"Config {config2}\": config2_row[compare_columns].iloc[0].values\n", + " })\n", + "\n", + " # Display the comparison as a table\n", + " display(comparison_df)\n", + "\n", + "# Interactive widget for configuration comparison\n", + "widgets.interact(\n", + " compare_configs,\n", + " config1=widgets.Dropdown(\n", + " 
options=df[\"tunable_config_id\"].unique(),\n", + " description=\"Config 1 ID:\"\n", + " ),\n", + " config2=widgets.Dropdown(\n", + " options=df[\"tunable_config_id\"].unique(),\n", + " description=\"Config 2 ID:\"\n", + " )\n", + ")" + ] + }, { "cell_type": "markdown", "id": "7cb4794f", "metadata": {}, "source": [ - "### First automatically with mlos_viz" + "### Also automatically with mlos_viz" ] }, { From 41851ea917fd35d747b0a641ac41bde8203c22e3 Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Tue, 21 Jan 2025 21:50:11 +0000 Subject: [PATCH 2/8] Example of streamlit integration on codespaces --- app.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 app.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..cb80860 --- /dev/null +++ b/app.py @@ -0,0 +1,95 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import streamlit as st + +from mlos_bench.storage import from_config + + +# Load the storage configuration +@st.cache_resource +def load_storage(): + return from_config(config="storage/sqlite.jsonc") + + +storage = load_storage() + +# Sidebar for Experiment Selection +st.sidebar.title("Azure MySQL Config Analyzer") +experiment_id = st.sidebar.selectbox("Select Experiment", options=storage.experiments.keys()) + +# Load selected experiment +exp = storage.experiments[experiment_id] +df = exp.results_df + +st.title(f"Azure MySQL Experiment: {experiment_id}") +st.write(f"Description: {exp.description}") + +# Metrics and Columns +config_columns = [col for col in df.columns if col.startswith("config.")] +result_columns = [col for col in df.columns if col.startswith("result.")] +metrics = result_columns + +# Section 1: Data Overview +st.header("Data Overview") +if st.checkbox("Show Raw Data"): + st.write(df) + +# Section 2: Compare Configurations +st.header("Compare Configurations") + +config_id_1 = st.selectbox("Config ID 1", df["tunable_config_id"].unique()) +config_id_2 = st.selectbox("Config ID 2", df["tunable_config_id"].unique()) +metric = st.selectbox("Metric to Compare", metrics) + +if st.button("Compare Configurations"): + config_1_data = df[df["tunable_config_id"] == config_id_1] + config_2_data = df[df["tunable_config_id"] == config_id_2] + + fig, ax = plt.subplots(figsize=(10, 6)) + + sns.lineplot( + data=config_1_data, + x="trial_id", + y=metric, + marker="o", + label=f"Config {config_id_1}", + ax=ax, + ) + sns.lineplot( + data=config_2_data, + x="trial_id", + y=metric, + marker="o", + label=f"Config {config_id_2}", + ax=ax, + ) + + ax.set_title(f"Comparison of {metric}") + ax.set_xlabel("Trial ID") + ax.set_ylabel(metric) + ax.legend() + ax.grid() + + st.pyplot(fig) + +# Section 3: Pair Plot +st.header("Pair Plot for Configurations") +selected_columns = st.multiselect("Select Columns for Pair Plot", config_columns + result_columns) + +if st.button("Generate Pair Plot") and selected_columns: + fig = sns.pairplot(df[selected_columns]) + st.pyplot(fig) + +# Section 4: Heatmap +st.header("Correlation Heatmap") +corr_method = st.radio("Correlation Method", ["pearson", "kendall", "spearman"]) + +if st.button("Generate Heatmap"): + corr_matrix = df[selected_columns].corr(method=corr_method) + + fig, ax = plt.subplots(figsize=(12, 8)) + sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax) + st.pyplot(fig) + +# streamlit run app.py --server.port 8501 --server.address 0.0.0.0 From 
2917deb05246ad9e06cb3e0676345a48a34dc5b7 Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 05:18:37 +0000 Subject: [PATCH 3/8] reworking --- app.py | 1325 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 1269 insertions(+), 56 deletions(-) diff --git a/app.py b/app.py index cb80860..196d203 100644 --- a/app.py +++ b/app.py @@ -2,94 +2,1307 @@ import seaborn as sns import matplotlib.pyplot as plt import streamlit as st - +import plotly +import plotly.express as px +import plotly.graph_objs as go +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler from mlos_bench.storage import from_config +# -------------------------------------------------------------------------------- +# Streamlit Configuration +# -------------------------------------------------------------------------------- +st.set_page_config( + page_title="Azure MySQL Config Analyzer", layout="wide", initial_sidebar_state="expanded" +) -# Load the storage configuration +# -------------------------------------------------------------------------------- +# Data Loading and Caching +# -------------------------------------------------------------------------------- @st.cache_resource def load_storage(): + """ + Load the MLOS storage configuration for the experiments. + This function is cached to prevent reloading on every interaction. + """ return from_config(config="storage/sqlite.jsonc") - storage = load_storage() -# Sidebar for Experiment Selection +# -------------------------------------------------------------------------------- +# Sidebar - Experiment Selection and Filtering +# -------------------------------------------------------------------------------- st.sidebar.title("Azure MySQL Config Analyzer") -experiment_id = st.sidebar.selectbox("Select Experiment", options=storage.experiments.keys()) -# Load selected experiment -exp = storage.experiments[experiment_id] -df = exp.results_df +# Experiment Selection +experiment_id = st.sidebar.selectbox( + "Select Experiment", + options=list(storage.experiments.keys()), + help="Choose the experiment you want to analyze.", +) -st.title(f"Azure MySQL Experiment: {experiment_id}") -st.write(f"Description: {exp.description}") +# Load the selected experiment +exp = storage.experiments[experiment_id] +df = exp.results_df.copy() -# Metrics and Columns +# Extract configuration and result columns config_columns = [col for col in df.columns if col.startswith("config.")] result_columns = [col for col in df.columns if col.startswith("result.")] metrics = result_columns -# Section 1: Data Overview -st.header("Data Overview") -if st.checkbox("Show Raw Data"): - st.write(df) +# -------------------------------------------------------------------------------- +# Main Title and Description +# -------------------------------------------------------------------------------- +st.title(f"Azure MySQL Experiment: {experiment_id}") +st.write(f"**Description**: {exp.description}") + +# -------------------------------------------------------------------------------- +# Tabs Creation +# -------------------------------------------------------------------------------- +tabs = st.tabs( + [ + "Dashboard", + "Data Overview", + "Configurations Analysis", + "Failure Analysis", + "Correlation Heatmap", + "Parallel Coordinates", + "Config Params Scatter", + "Top & Bottom Configs", + "Optimization Suggestions", + "Clustering", + "Advanced Statistics", + "Anomaly 
Detection", + "Save Analysis", + ] +) -# Section 2: Compare Configurations -st.header("Compare Configurations") +# -------------------------------------------------------------------------------- +# TAB 1: Dashboard +# -------------------------------------------------------------------------------- +with tabs[0]: + st.header("Dashboard") + st.write("### Key Metrics Overview") -config_id_1 = st.selectbox("Config ID 1", df["tunable_config_id"].unique()) -config_id_2 = st.selectbox("Config ID 2", df["tunable_config_id"].unique()) -metric = st.selectbox("Metric to Compare", metrics) + # Calculate key metrics + total_trials = len(df) + success_trials = df["status"].value_counts().get("SUCCESS", 0) + failure_trials = df["status"].value_counts().get("FAILED", 0) + success_rate = (success_trials / total_trials) * 100 if total_trials > 0 else 0 + failure_rate = (failure_trials / total_trials) * 100 if total_trials > 0 else 0 -if st.button("Compare Configurations"): - config_1_data = df[df["tunable_config_id"] == config_id_1] - config_2_data = df[df["tunable_config_id"] == config_id_2] + # Display key metrics + col1, col2, col3 = st.columns(3) + col1.metric("Total Trials", total_trials) + col2.metric("Successful Trials", success_trials) + col3.metric("Failure Rate (%)", f"{failure_rate:.2f}") - fig, ax = plt.subplots(figsize=(10, 6)) + # Visualization: Success vs Failure + fig = px.pie( + names=["Success", "Failure"], + values=[success_trials, failure_trials], + title="Trial Outcomes", + color=["Success", "Failure"], + color_discrete_map={"Success": "green", "Failure": "red"}, + ) + st.plotly_chart(fig, use_container_width=True) + + # Visualization: Top 5 Metrics + st.write("### Top 5 Metrics") + top_metrics = df[result_columns].mean().sort_values(ascending=False).head(5) + fig_metrics = px.bar( + top_metrics, + x=top_metrics.index.str.replace("result.", "").str.replace("_", " ").str.title(), + y=top_metrics.values, + labels={"x": "Metric", "y": "Average Value"}, + title="Top 5 Average Metrics", + color=top_metrics.values, + color_continuous_scale="Blues", + ) + st.plotly_chart(fig_metrics, use_container_width=True) + +# -------------------------------------------------------------------------------- +# TAB 2: Data Overview +# -------------------------------------------------------------------------------- +with tabs[1]: + st.header("Data Overview") + st.write("Explore experiment data and key statistics.") + + # Data Filtering + with st.expander("Filter Data"): + st.subheader("Apply Filters") + trial_id_filter = st.text_input( + "Filter by Trial ID (comma-separated)", help="Enter trial IDs separated by commas." 
+ ) + status_filter = st.multiselect( + "Filter by Status", + options=df["status"].unique(), + default=df["status"].unique(), + help="Select one or more statuses to filter the trials.", + ) + config_filter = st.multiselect( + "Filter by Configuration ID", + options=df["tunable_config_id"].unique(), + default=df["tunable_config_id"].unique(), + help="Select one or more configuration IDs to filter the trials.", + ) + + if st.button("Apply Filters"): + filtered_df = df.copy() + if trial_id_filter: + try: + trial_ids = [ + int(tid.strip()) + for tid in trial_id_filter.split(",") + if tid.strip().isdigit() + ] + filtered_df = filtered_df[filtered_df["trial_id"].isin(trial_ids)] + except ValueError: + st.error("Please enter valid trial IDs separated by commas.") + if status_filter: + filtered_df = filtered_df[filtered_df["status"].isin(status_filter)] + if config_filter: + filtered_df = filtered_df[filtered_df["tunable_config_id"].isin(config_filter)] + st.session_state.filtered_df = filtered_df + st.success("Filters applied successfully!") + + # Display filtered data or original data + if "filtered_df" in st.session_state: + display_df = st.session_state.filtered_df + else: + display_df = df + + if st.checkbox("Show Data Table"): + st.dataframe(display_df) + st.write("### Descriptive Statistics:") + st.write(display_df.describe()) + +# -------------------------------------------------------------------------------- +# TAB 3: Configurations Analysis +# -------------------------------------------------------------------------------- +with tabs[2]: + st.header("Configurations Analysis") + st.write("Visualize performance metrics across different configurations.") + + config_id = st.selectbox( + "Select Configuration ID", + options=df["tunable_config_id"].unique(), + help="Choose a configuration to analyze its performance over trials.", + ) + metric = st.selectbox( + "Select Metric", options=metrics, help="Choose a performance metric to visualize." 
+ ) - sns.lineplot( - data=config_1_data, + config_data = df[df["tunable_config_id"] == config_id] + fig = px.line( + config_data, x="trial_id", y=metric, - marker="o", - label=f"Config {config_id_1}", - ax=ax, + title=f"{metric.replace('result.', '').replace('_', ' ').title()} over Trials for Configuration {config_id}", + markers=True, + labels={ + "trial_id": "Trial ID", + metric: metric.replace("result.", "").replace("_", " ").title(), + }, + template="plotly_white", ) - sns.lineplot( - data=config_2_data, + st.plotly_chart(fig, use_container_width=True) + + # Additional Insights: Moving Average + window_size = st.slider( + "Select Moving Average Window Size", + 1, + 10, + 3, + help="Smooth the metric by applying a moving average.", + ) + config_data[f"{metric}_MA"] = config_data[metric].rolling(window=window_size).mean() + fig_ma = px.line( + config_data, x="trial_id", - y=metric, - marker="o", - label=f"Config {config_id_2}", - ax=ax, + y=f"{metric}_MA", + title=f"{metric.replace('result.', '').replace('_', ' ').title()} - Moving Average (Window Size={window_size})", + markers=True, + labels={ + "trial_id": "Trial ID", + f"{metric}_MA": f"{metric.replace('result.', '').replace('_', ' ').title()} (MA)", + }, + template="plotly_white", + ) + st.plotly_chart(fig_ma, use_container_width=True) + +# -------------------------------------------------------------------------------- +# TAB 4: Failure Analysis +# -------------------------------------------------------------------------------- +with tabs[3]: + st.header("Failure Analysis") + st.write("Analyze failure rates and trends across trials.") + + if "status" in df.columns: + # Failure Rate Distribution + st.subheader("Failure Rate Distribution") + failure_counts = df["status"].value_counts() + fig_pie = px.pie( + values=failure_counts.values, + names=failure_counts.index, + title="Failure Rate Distribution", + color=failure_counts.index, + color_discrete_map={"FAILED": "red", "SUCCESS": "green"}, + ) + st.plotly_chart(fig_pie, use_container_width=True) + + # Failure Rate Trend Over Trials + st.subheader("Failure Rate Trend Over Trials") + failure_rate_trend = ( + df.groupby("trial_id")["status"] + .apply(lambda x: (x == "FAILED").mean() * 100) + .reset_index() + ) + failure_rate_trend.columns = ["Trial ID", "Failure Rate (%)"] + fig_line = px.line( + failure_rate_trend, + x="Trial ID", + y="Failure Rate (%)", + title="Failure Rate Trend Over Trials", + markers=True, + labels={"Trial ID": "Trial ID", "Failure Rate (%)": "Failure Rate (%)"}, + template="plotly_white", + ) + st.plotly_chart(fig_line, use_container_width=True) + else: + st.info("No 'status' column found in the dataset.") + +# -------------------------------------------------------------------------------- +# TAB 5: Correlation Heatmap +# -------------------------------------------------------------------------------- +with tabs[4]: + st.header("Correlation Heatmap") + st.write("Visualize correlations between selected configuration and result metrics.") + + selected_columns = st.multiselect( + "Select Columns for Heatmap", + options=config_columns + result_columns, + default=config_columns[:2] + result_columns[:2], + help="Choose multiple columns to analyze their correlation.", + ) + + if st.button("Generate Heatmap"): + if selected_columns: + corr_matrix = df[selected_columns].corr() + fig = px.imshow( + corr_matrix, + text_auto=True, + color_continuous_scale="Viridis", + title="Correlation Heatmap", + labels={"color": "Correlation Coefficient"}, + ) + st.plotly_chart(fig, 
use_container_width=True) + else: + st.warning("Please select at least one column to generate the heatmap.") + else: + st.info("Select columns and click 'Generate Heatmap' to visualize correlations.") + +# -------------------------------------------------------------------------------- +# TAB 6: Parallel Coordinates +# -------------------------------------------------------------------------------- +with tabs[5]: + st.header("Parallel Coordinates Plot") + st.write( + "Explore multi-dimensional relationships between configuration parameters and metrics." + ) + + parallel_columns = st.multiselect( + "Select Columns for Parallel Plot", + options=config_columns + result_columns, + default=config_columns[:3] + result_columns[:2], + help="Choose multiple columns to include in the parallel coordinates plot.", + ) + + if parallel_columns: + color_metric = st.selectbox( + "Select Metric for Coloring", + options=result_columns, + help="Choose a result metric to color-code the parallel coordinates.", + ) + fig = px.parallel_coordinates( + df, + dimensions=parallel_columns, + color=color_metric, + color_continuous_scale=px.colors.diverging.Tealrose, + title="Parallel Coordinates Plot", + labels={ + col: col.replace("config.", "").replace("_", " ").title() + for col in parallel_columns + }, + template="plotly_white", + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("Select columns to generate the parallel coordinates plot.") + +# -------------------------------------------------------------------------------- +# TAB 7: Config Params Scatter +# -------------------------------------------------------------------------------- +with tabs[6]: + st.header("Configuration Parameters Scatter Plot") + st.write( + "Analyze the relationship between multiple configuration parameters and a selected result metric." 
+ ) + + if not config_columns: + st.warning("No configuration parameters available in the dataset.") + elif not metrics: + st.warning("No result metrics available in the dataset.") + else: + # Select multiple configuration parameters + selected_config_params = st.multiselect( + "Select Configuration Parameters", + options=config_columns, + default=config_columns[:2], + help="Choose one or more configuration parameters to analyze.", + ) + + # Select one result metric + selected_result_metric = st.selectbox( + "Select Result Metric", + options=metrics, + help="Choose a result metric to analyze against the selected configuration parameters.", + ) + + if selected_config_params: + # Determine layout based on number of selected parameters + plots_per_row = 2 + num_plots = len(selected_config_params) + num_rows = (num_plots + plots_per_row - 1) // plots_per_row + + for row in range(num_rows): + cols = st.columns(plots_per_row) + for i in range(plots_per_row): + plot_index = row * plots_per_row + i + if plot_index < num_plots: + config_param = selected_config_params[plot_index] + with cols[i]: + fig = px.scatter( + df, + x=config_param, + y=selected_result_metric, + color="tunable_config_id", + title=f"{config_param.replace('config.', '').replace('_', ' ').title()} vs {selected_result_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + config_param: config_param.replace("config.", "") + .replace("_", " ") + .title(), + selected_result_metric: selected_result_metric.replace( + "result.", "" + ) + .replace("_", " ") + .title(), + }, + hover_data=["trial_id", "tunable_config_id"], + trendline="ols", + template="plotly_white", + ) + + st.plotly_chart(fig, use_container_width=True) + + # Calculate and display the correlation coefficient + corr_coeff = ( + df[[config_param, selected_result_metric]].corr().iloc[0, 1] + ) + st.markdown(f"**Correlation Coefficient:** {corr_coeff:.2f}") + else: + st.info( + "Please select at least one configuration parameter to generate scatter plots." + ) + +# -------------------------------------------------------------------------------- +# TAB 8: Top & Bottom Configurations +# -------------------------------------------------------------------------------- +with tabs[7]: + st.header("Top and Bottom Configurations") + st.write( + "Identify configurations with the best and worst performance based on selected metrics." 
+ ) + + n_configs = st.slider( + "Number of Configurations to Display", + min_value=1, + max_value=10, + value=5, + help="Select how many top and bottom configurations to display.", + ) + + # Select metric for ranking + tb_metric = st.selectbox( + "Select Metric for Ranking", + options=metrics, + index=0, + key="tb_metric", + help="Choose a metric to rank configurations.", + ) + optimization_method = st.radio( + "Select Optimization Method", + ["Maximize", "Minimize"], + index=0, + key="tb_opt_method", + help="Choose whether to find configurations that maximize or minimize the selected metric.", + ) + + if not df.empty: + if optimization_method == "Maximize": + top_configs = df.nlargest(n_configs, tb_metric) + bottom_configs = df.nsmallest(n_configs, tb_metric) + else: + top_configs = df.nsmallest(n_configs, tb_metric) + bottom_configs = df.nlargest(n_configs, tb_metric) + + st.subheader("Top Configurations") + st.dataframe(top_configs) + + st.subheader("Bottom Configurations") + st.dataframe(bottom_configs) + else: + st.warning("No data available to identify top/bottom configurations.") + +# -------------------------------------------------------------------------------- +# TAB 9: Optimization Suggestions +# -------------------------------------------------------------------------------- +with tabs[8]: + st.header("Optimization Suggestions") + st.write("Discover optimal configurations based on selected performance metrics.") + + target_metric = st.selectbox( + "Select Metric for Optimization", + options=metrics, + index=0, + key="opt_target_metric", + help="Choose a performance metric to optimize.", + ) + optimization_method = st.radio( + "Select Optimization Method", + ["Maximize", "Minimize"], + index=0, + key="opt_method_choice", + help="Choose whether to maximize or minimize the selected metric.", + ) + + if not df.empty: + if optimization_method == "Maximize": + optimal_config = df.loc[df[target_metric].idxmax()] + else: + optimal_config = df.loc[df[target_metric].idxmin()] + + st.write( + f"**Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):**" + ) + st.json(optimal_config[config_columns].to_dict()) + else: + st.warning("No data available for optimization.") + +# -------------------------------------------------------------------------------- +# TAB 10: Clustering +# -------------------------------------------------------------------------------- +with tabs[9]: + st.header("Clustering Analysis") + st.write("Group similar configurations to identify patterns and clusters.") + + cluster_columns = st.multiselect( + "Select Columns for Clustering", + options=config_columns + result_columns, + default=config_columns[:3], + help="Choose multiple columns to perform clustering.", + ) + num_clusters = st.slider( + "Number of Clusters", + min_value=2, + max_value=10, + value=3, + help="Define the number of clusters for K-Means.", + ) + + if len(cluster_columns) >= 2: + if st.button("Generate Clustering"): + clustering_data = df[cluster_columns].dropna() + + # Standardize the data + scaler = StandardScaler() + clustering_data_scaled = scaler.fit_transform(clustering_data) + + # Perform K-Means clustering + kmeans = KMeans(n_clusters=num_clusters, random_state=42) + clusters = kmeans.fit_predict(clustering_data_scaled) + df["cluster"] = clusters + + # Optional: Dimensionality Reduction for 3D Plotting + if len(cluster_columns) > 3: + pca = PCA(n_components=3) + principal_components = pca.fit_transform(clustering_data_scaled) + 
df["PC1"] = principal_components[:, 0] + df["PC2"] = principal_components[:, 1] + df["PC3"] = principal_components[:, 2] + fig = px.scatter_3d( + df, + x="PC1", + y="PC2", + z="PC3", + color="cluster", + title="3D Scatter Plot with PCA and Clustering", + labels={ + "PC1": "Principal Component 1", + "PC2": "Principal Component 2", + "PC3": "Principal Component 3", + }, + template="plotly_white", + ) + elif len(cluster_columns) == 3: + fig = px.scatter_3d( + df, + x=cluster_columns[0], + y=cluster_columns[1], + z=cluster_columns[2], + color="cluster", + title="3D Scatter Plot with Clustering", + labels={ + cluster_columns[0]: cluster_columns[0] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[1]: cluster_columns[1] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[2]: cluster_columns[2] + .replace("config.", "") + .replace("_", " ") + .title(), + }, + template="plotly_white", + ) + else: + fig = px.scatter( + df, + x=cluster_columns[0], + y=cluster_columns[1], + color="cluster", + title="2D Scatter Plot with Clustering", + labels={ + cluster_columns[0]: cluster_columns[0] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[1]: cluster_columns[1] + .replace("config.", "") + .replace("_", " ") + .title(), + }, + template="plotly_white", + ) + + st.plotly_chart(fig, use_container_width=True) + + # Cluster Centroids + centroids = kmeans.cluster_centers_ + centroids_df = pd.DataFrame(centroids, columns=cluster_columns) + st.subheader("Cluster Centroids") + st.write(centroids_df) + else: + st.warning("Please select at least two columns for clustering.") + +# -------------------------------------------------------------------------------- +# TAB 10: Advanced Statistics +# -------------------------------------------------------------------------------- +with tabs[10]: + st.header("Advanced Statistics") + st.write("Perform advanced statistical analyses on the experiment data.") + + # Select Metric for Statistical Analysis + selected_metric = st.selectbox( + "Select Metric for Statistical Analysis", + options=metrics, + help="Choose a result metric to perform statistical tests.", + ) + + # Debugging: Display selected_metric and its type + st.write(f"**Selected Metric:** {selected_metric}") + st.write(f"**Selected Metric Type:** {df[selected_metric].dtype}") + + # Check if the selected metric is numeric + if pd.api.types.is_numeric_dtype(df[selected_metric]): + st.subheader( + f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + st.write(df[selected_metric].describe()) + + # Define the template + template_value = "plotly_white" + st.write(f"**Template Type:** {type(template_value)}, **Value:** {template_value}") + + # Histogram with KDE + try: + fig_hist = px.histogram( + df, + x=selected_metric, + nbins=30, + title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", + marginal="kde", + labels={ + selected_metric: selected_metric.replace("result.", "") + .replace("_", " ") + .title() + }, + template=template_value, # Ensure this is a string + ) + st.plotly_chart(fig_hist, use_container_width=True) + except Exception as e: + st.error(f"An error occurred while generating the histogram: {e}") + + # Box Plot + st.subheader( + f"Box Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + try: + fig_box = px.box( + df, + y=selected_metric, + points="all", + title=f"Box Plot of {selected_metric.replace('result.', 
'').replace('_', ' ').title()}", + labels={ + selected_metric: selected_metric.replace("result.", "") + .replace("_", " ") + .title() + }, + template=template_value, # Ensure this is a string + ) + st.plotly_chart(fig_box, use_container_width=True) + except Exception as e: + st.error(f"An error occurred while generating the box plot: {e}") + + # Violin Plot + st.subheader( + f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + try: + fig_violin = px.violin( + df, + y=selected_metric, + box=True, + points="all", + title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + selected_metric: selected_metric.replace("result.", "") + .replace("_", " ") + .title() + }, + template=template_value, # Ensure this is a string + ) + st.plotly_chart(fig_violin, use_container_width=True) + except Exception as e: + st.error(f"An error occurred while generating the violin plot: {e}") + else: + st.warning( + f"The selected metric '{selected_metric}' is not numeric and cannot be plotted." + ) + + # Display Plotly Version for Debugging + st.subheader("Plotly Version") + st.write(f"Plotly version: {plotly.__version__}") + + # Optional: Display the selected template + st.subheader("Template Information") + st.write(f"Selected Template: {template_value}") + + +# -------------------------------------------------------------------------------- +# TAB 12: Anomaly Detection +# -------------------------------------------------------------------------------- +with tabs[11]: + st.header("Anomaly Detection") + st.write("Identify anomalous trials based on selected metrics.") + + anomaly_metric = st.selectbox( + "Select Metric for Anomaly Detection", + options=metrics, + help="Choose a result metric to perform anomaly detection.", + ) + threshold = st.slider( + "Set Anomaly Threshold (Standard Deviations)", + min_value=1.0, + max_value=5.0, + value=3.0, + step=0.5, + help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", + ) + + mean_val = df[anomaly_metric].mean() + std_val = df[anomaly_metric].std() + upper_bound = mean_val + threshold * std_val + lower_bound = mean_val - threshold * std_val + + anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + + st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") + if not anomalies.empty: + st.write(f"Total Anomalies Detected: {len(anomalies)}") + st.dataframe(anomalies) + + # Visualization: Scatter Plot Highlighting Anomalies + fig_anomaly = px.scatter( + df, + x="trial_id", + y=anomaly_metric, + color=df.index.isin(anomalies.index), + title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + "trial_id": "Trial ID", + anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), + }, + color_discrete_map={True: "red", False: "blue"}, + template="plotly_white", + ) + st.plotly_chart(fig_anomaly, use_container_width=True) + else: + st.success("No anomalies detected based on the current threshold.") + +# -------------------------------------------------------------------------------- +# TAB 13: Save Analysis Report +# -------------------------------------------------------------------------------- +with tabs[12]: + st.header("Save Analysis Report") + st.write("Download a comprehensive analysis report of your experiment.") + + report_options = st.multiselect( + "Select Sections to Include in 
the Report", + options=[ + "Data Overview", + "Configurations Analysis", + "Failure Analysis", + "Correlation Heatmap", + "Parallel Coordinates", + "Config Params Scatter", + "Top & Bottom Configs", + "Optimization Suggestions", + "Clustering", + "Advanced Statistics", + "Anomaly Detection", + ], + default=[ + "Data Overview", + "Configurations Analysis", + "Failure Analysis", + "Correlation Heatmap", + "Top & Bottom Configs", + "Optimization Suggestions", + ], + help="Choose which sections of the analysis you want to include in the report.", + ) + + if st.button("Download Report"): + # Generate the report based on selected sections + report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" + + if "Data Overview" in report_options: + report += "## Data Overview\n" + report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" + + if "Configurations Analysis" in report_options: + report += "## Configurations Analysis\n" + # Example: Include top configuration analysis + top_config = df.loc[ + df["result.metric"].idxmax() + ] # Replace 'result.metric' with actual metric if needed + report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" + + if "Failure Analysis" in report_options: + report += "## Failure Analysis\n" + failure_counts = df["status"].value_counts() + report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" + + if "Correlation Heatmap" in report_options: + report += "## Correlation Heatmap\n" + selected_columns = config_columns + result_columns # Adjust as needed + corr_matrix = df[selected_columns].corr() + report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" + + if "Parallel Coordinates" in report_options: + report += "## Parallel Coordinates\n" + # Example placeholder + report += "### Parallel Coordinates Plot was generated in the application.\n\n" + + if "Config Params Scatter" in report_options: + report += "## Configuration Parameters Scatter Plot\n" + # Example placeholder + report += "### Scatter plots were generated in the application.\n\n" + + if "Top & Bottom Configs" in report_options: + report += "## Top & Bottom Configurations\n" + n_configs = st.session_state.get("n_configs_display", 5) + tb_metric = st.session_state.get("tb_metric", metrics[0]) + optimization_method = st.session_state.get("tb_opt_method", "Maximize") + if optimization_method == "Maximize": + top_configs = df.nlargest(n_configs, tb_metric) + bottom_configs = df.nsmallest(n_configs, tb_metric) + else: + top_configs = df.nsmallest(n_configs, tb_metric) + bottom_configs = df.nlargest(n_configs, tb_metric) + report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" + report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" + + if "Optimization Suggestions" in report_options: + report += "## Optimization Suggestions\n" + target_metric = st.session_state.get("opt_target_metric", metrics[0]) + optimization_method = st.session_state.get("opt_method_choice", "Maximize") + if optimization_method == "Maximize": + optimal_config = df.loc[df[target_metric].idxmax()] + else: + optimal_config = df.loc[df[target_metric].idxmin()] + report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' 
').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" + + if "Clustering" in report_options: + report += "## Clustering Analysis\n" + # Example placeholder + report += "### Clustering results were generated in the application.\n\n" + + if "Advanced Statistics" in report_options: + report += "## Advanced Statistics\n" + selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) + report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" + + if "Anomaly Detection" in report_options: + report += "## Anomaly Detection\n" + anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) + threshold = st.session_state.get("anomaly_threshold", 3.0) + mean_val = df[anomaly_metric].mean() + std_val = df[anomaly_metric].std() + upper_bound = mean_val + threshold * std_val + lower_bound = mean_val - threshold * std_val + anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" + + # Download the report as a text file + st.download_button( + label="Download Report as Text", + data=report, + file_name="analysis_report.txt", + mime="text/plain", + ) + + # Optionally, provide the CSV report + st.subheader("Download Descriptive Statistics") + if st.button("Download Descriptive Statistics as CSV"): + report_csv = df.describe().to_csv() + st.download_button( + label="Download CSV Report", + data=report_csv, + file_name="descriptive_statistics.csv", + mime="text/csv", + ) + + st.info("Select the sections you want to include in the report and click 'Download Report'.") + +# -------------------------------------------------------------------------------- +# TAB 10: Clustering +# -------------------------------------------------------------------------------- +with tabs[9]: + st.header("Clustering Analysis") + st.write("Group similar configurations to identify patterns and clusters.") + + cluster_columns = st.multiselect( + "Select Columns for Clustering", + options=config_columns + result_columns, + default=config_columns[:3], + help="Choose multiple columns to perform clustering.", + key="clustering_columns_select", # Unique key + ) + + num_clusters = st.slider( + "Number of Clusters", + min_value=2, + max_value=10, + value=3, + help="Define the number of clusters for K-Means.", + key="num_clusters_slider_clustering", # Unique key ) - ax.set_title(f"Comparison of {metric}") - ax.set_xlabel("Trial ID") - ax.set_ylabel(metric) - ax.legend() - ax.grid() + if len(cluster_columns) >= 2: + if st.button("Generate Clustering", key="gen cluster"): + clustering_data = df[cluster_columns].dropna() + + # Standardize the data + scaler = StandardScaler() + clustering_data_scaled = scaler.fit_transform(clustering_data) + + # Perform K-Means clustering + kmeans = KMeans(n_clusters=num_clusters, random_state=42) + clusters = kmeans.fit_predict(clustering_data_scaled) + df["cluster"] = clusters + + # Optional: Dimensionality Reduction for 3D Plotting + if len(cluster_columns) > 3: + pca = PCA(n_components=3) + principal_components = pca.fit_transform(clustering_data_scaled) + df["PC1"] = principal_components[:, 0] + df["PC2"] = principal_components[:, 1] + df["PC3"] = principal_components[:, 2] + fig = px.scatter_3d( + df, + x="PC1", + y="PC2", + z="PC3", + color="cluster", + title="3D Scatter Plot with PCA 
and Clustering", + labels={ + "PC1": "Principal Component 1", + "PC2": "Principal Component 2", + "PC3": "Principal Component 3", + }, + template="plotly_white", + ) + elif len(cluster_columns) == 3: + fig = px.scatter_3d( + df, + x=cluster_columns[0], + y=cluster_columns[1], + z=cluster_columns[2], + color="cluster", + title="3D Scatter Plot with Clustering", + labels={ + cluster_columns[0]: cluster_columns[0] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[1]: cluster_columns[1] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[2]: cluster_columns[2] + .replace("config.", "") + .replace("_", " ") + .title(), + }, + template="plotly_white", + ) + else: + fig = px.scatter( + df, + x=cluster_columns[0], + y=cluster_columns[1], + color="cluster", + title="2D Scatter Plot with Clustering", + labels={ + cluster_columns[0]: cluster_columns[0] + .replace("config.", "") + .replace("_", " ") + .title(), + cluster_columns[1]: cluster_columns[1] + .replace("config.", "") + .replace("_", " ") + .title(), + }, + template="plotly_white", + ) + + st.plotly_chart(fig, use_container_width=True) + + # Cluster Centroids + centroids = kmeans.cluster_centers_ + centroids_df = pd.DataFrame(centroids, columns=cluster_columns) + st.subheader("Cluster Centroids") + st.write(centroids_df) + else: + st.warning("Please select at least two columns for clustering.") + +# -------------------------------------------------------------------------------- +# TAB 11: Advanced Statistics +# -------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------- +# TAB 11: Advanced Statistics +# -------------------------------------------------------------------------------- +with tabs[10]: + st.header("Advanced Statistics") + st.write("Perform advanced statistical analyses on the experiment data.") + + selected_metric = st.selectbox( + "Select Metric for Statistical Analysis", + options=metrics, + help="Choose a result metric to perform statistical tests.", + key="sel adv", + ) + + st.subheader( + f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + + # Display data type and missing values + st.write(f"Data Type: {df[selected_metric].dtype}") + st.write(f"Missing Values: {df[selected_metric].isnull().sum()}") + + # Handle missing values + plot_df = df.dropna(subset=[selected_metric]) + + # Check if the selected metric is numeric + if pd.api.types.is_numeric_dtype(plot_df[selected_metric]): + st.write(plot_df[selected_metric].describe()) + + # Histogram with KDE + fig_hist = px.histogram( + plot_df, + x=selected_metric, + nbins=30, + title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", + marginal="kde", + labels={ + selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() + }, + template="plotly_white", + ) + st.plotly_chart(fig_hist, use_container_width=True) + + # Box Plot + st.subheader( + f"Box Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + fig_box = px.box( + plot_df, + y=selected_metric, + points="all", + title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() + }, + template="plotly_white", + ) + st.plotly_chart(fig_box, use_container_width=True) + + # Violin Plot + st.subheader( + f"Violin Plot for 
{selected_metric.replace('result.', '').replace('_', ' ').title()}" + ) + fig_violin = px.violin( + plot_df, + y=selected_metric, + box=True, + points="all", + title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() + }, + template="plotly_white", + ) + st.plotly_chart(fig_violin, use_container_width=True) + else: + st.error( + f"The selected metric '{selected_metric}' is not numeric. Please select a numeric metric for statistical analysis." + ) + + +# -------------------------------------------------------------------------------- +# TAB 12: Anomaly Detection +# -------------------------------------------------------------------------------- +with tabs[11]: + st.header("Anomaly Detection") + st.write("Identify anomalous trials based on selected metrics.") + + anomaly_metric = st.selectbox( + "Select Metric for Anomaly Detection", + options=metrics, + help="Choose a result metric to perform anomaly detection.", + ) + threshold = st.slider( + "Set Anomaly Threshold (Standard Deviations)", + min_value=1.0, + max_value=5.0, + value=3.0, + step=0.5, + help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", + ) + + mean_val = df[anomaly_metric].mean() + std_val = df[anomaly_metric].std() + upper_bound = mean_val + threshold * std_val + lower_bound = mean_val - threshold * std_val + + anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + + st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") + if not anomalies.empty: + st.write(f"Total Anomalies Detected: {len(anomalies)}") + st.dataframe(anomalies) + + # Visualization: Scatter Plot Highlighting Anomalies + fig_anomaly = px.scatter( + df, + x="trial_id", + y=anomaly_metric, + color=df.index.isin(anomalies.index), + title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", + labels={ + "trial_id": "Trial ID", + anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), + }, + color_discrete_map={True: "red", False: "blue"}, + template="plotly_white", + ) + st.plotly_chart(fig_anomaly, use_container_width=True) + else: + st.success("No anomalies detected based on the current threshold.") + +# -------------------------------------------------------------------------------- +# TAB 13: Save Analysis Report +# -------------------------------------------------------------------------------- +with tabs[12]: + st.header("Save Analysis Report") + st.write("Download a comprehensive analysis report of your experiment.") + + report_options = st.multiselect( + "Select Sections to Include in the Report", + options=[ + "Data Overview", + "Configurations Analysis", + "Failure Analysis", + "Correlation Heatmap", + "Parallel Coordinates", + "Config Params Scatter", + "Top & Bottom Configs", + "Optimization Suggestions", + "Clustering", + "Advanced Statistics", + "Anomaly Detection", + ], + default=[ + "Data Overview", + "Configurations Analysis", + "Failure Analysis", + "Correlation Heatmap", + "Top & Bottom Configs", + "Optimization Suggestions", + ], + help="Choose which sections of the analysis you want to include in the report.", + ) + + if st.button("Download Report"): + # Generate the report based on selected sections + report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** 
{exp.description}\n\n" + + if "Data Overview" in report_options: + report += "## Data Overview\n" + report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" + + if "Configurations Analysis" in report_options: + report += "## Configurations Analysis\n" + # Example: Include top configuration analysis + if "result.metric" in df.columns: + top_config = df.loc[ + df["result.metric"].idxmax() + ] # Replace 'result.metric' with actual metric + report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" + else: + report += ( + "### Configurations Analysis details were generated in the application.\n\n" + ) + + if "Failure Analysis" in report_options: + report += "## Failure Analysis\n" + failure_counts = df["status"].value_counts() + report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" + + if "Correlation Heatmap" in report_options: + report += "## Correlation Heatmap\n" + selected_columns = config_columns + result_columns # Adjust as needed + corr_matrix = df[selected_columns].corr() + report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" + + if "Parallel Coordinates" in report_options: + report += "## Parallel Coordinates\n" + report += "### Parallel Coordinates Plot was generated in the application.\n\n" + + if "Config Params Scatter" in report_options: + report += "## Configuration Parameters Scatter Plot\n" + report += "### Scatter plots were generated in the application.\n\n" + + if "Top & Bottom Configs" in report_options: + report += "## Top & Bottom Configurations\n" + n_configs = st.session_state.get("n_configs_display", 5) + tb_metric = st.session_state.get("tb_metric", metrics[0]) + optimization_method = st.session_state.get("tb_opt_method", "Maximize") + if optimization_method == "Maximize": + top_configs = df.nlargest(n_configs, tb_metric) + bottom_configs = df.nsmallest(n_configs, tb_metric) + else: + top_configs = df.nsmallest(n_configs, tb_metric) + bottom_configs = df.nlargest(n_configs, tb_metric) + report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" + report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" + + if "Optimization Suggestions" in report_options: + report += "## Optimization Suggestions\n" + target_metric = st.session_state.get("opt_target_metric", metrics[0]) + optimization_method = st.session_state.get("opt_method_choice", "Maximize") + if optimization_method == "Maximize": + optimal_config = df.loc[df[target_metric].idxmax()] + else: + optimal_config = df.loc[df[target_metric].idxmin()] + report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" - st.pyplot(fig) + if "Clustering" in report_options: + report += "## Clustering Analysis\n" + report += "### Clustering results were generated in the application.\n\n" -# Section 3: Pair Plot -st.header("Pair Plot for Configurations") -selected_columns = st.multiselect("Select Columns for Pair Plot", config_columns + result_columns) + if "Advanced Statistics" in report_options: + report += "## Advanced Statistics\n" + selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) + report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' 
').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" -if st.button("Generate Pair Plot") and selected_columns: - fig = sns.pairplot(df[selected_columns]) - st.pyplot(fig) + if "Anomaly Detection" in report_options: + report += "## Anomaly Detection\n" + anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) + threshold = st.session_state.get("anomaly_threshold", 3.0) + mean_val = df[anomaly_metric].mean() + std_val = df[anomaly_metric].std() + upper_bound = mean_val + threshold * std_val + lower_bound = mean_val - threshold * std_val + anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" -# Section 4: Heatmap -st.header("Correlation Heatmap") -corr_method = st.radio("Correlation Method", ["pearson", "kendall", "spearman"]) + # Download the report as a text file + st.download_button( + label="Download Report as Text", + data=report, + file_name="analysis_report.txt", + mime="text/plain", + ) -if st.button("Generate Heatmap"): - corr_matrix = df[selected_columns].corr(method=corr_method) + # Optionally, provide the CSV report + st.subheader("Download Descriptive Statistics") + if st.button("Download Descriptive Statistics as CSV"): + report_csv = df.describe().to_csv() + st.download_button( + label="Download CSV Report", + data=report_csv, + file_name="descriptive_statistics.csv", + mime="text/csv", + ) - fig, ax = plt.subplots(figsize=(12, 8)) - sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax) - st.pyplot(fig) + st.info("Select the sections you want to include in the report and click 'Download Report'.") -# streamlit run app.py --server.port 8501 --server.address 0.0.0.0 +# -------------------------------------------------------------------------------- +# Additional UI/UX Enhancements +# -------------------------------------------------------------------------------- +st.sidebar.markdown("---") +st.sidebar.markdown("#### Tips for Better Workflow") +st.sidebar.markdown( + """ +- **Start with the Dashboard** to get an overview of key metrics. +- **Use Data Overview** to understand and filter your dataset. +- **Configurations Analysis** helps visualize specific configuration performances. +- **Failure Analysis** highlights trial outcomes and trends. +- **Correlation Heatmap** and **Parallel Coordinates** allow in-depth correlation and multi-dimensional analysis. +- **Config Params Scatter** plots relationships between configuration parameters and metrics. +- **Top & Bottom Configs** identify the best and worst-performing configurations. +- **Optimization Suggestions** provide insights into optimal configurations. +- **Clustering** groups similar configurations for pattern recognition. +- **Advanced Statistics** offers detailed statistical analyses of your metrics. +- **Anomaly Detection** helps identify outliers and unusual trial performances. +- **Save Analysis** lets you download a comprehensive report of your findings. 
+ """ +) From 51a2f45ebe7a5eb111f4b8e8d405e37ad62364cc Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 05:21:59 +0000 Subject: [PATCH 4/8] reworking --- app.py | 1308 ---------------------------------------- dashboard.py | 1627 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1627 insertions(+), 1308 deletions(-) delete mode 100644 app.py create mode 100644 dashboard.py diff --git a/app.py b/app.py deleted file mode 100644 index 196d203..0000000 --- a/app.py +++ /dev/null @@ -1,1308 +0,0 @@ -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -import streamlit as st -import plotly -import plotly.express as px -import plotly.graph_objs as go -from sklearn.cluster import KMeans -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler -from mlos_bench.storage import from_config - -# -------------------------------------------------------------------------------- -# Streamlit Configuration -# -------------------------------------------------------------------------------- -st.set_page_config( - page_title="Azure MySQL Config Analyzer", layout="wide", initial_sidebar_state="expanded" -) - -# -------------------------------------------------------------------------------- -# Data Loading and Caching -# -------------------------------------------------------------------------------- -@st.cache_resource -def load_storage(): - """ - Load the MLOS storage configuration for the experiments. - This function is cached to prevent reloading on every interaction. - """ - return from_config(config="storage/sqlite.jsonc") - -storage = load_storage() - -# -------------------------------------------------------------------------------- -# Sidebar - Experiment Selection and Filtering -# -------------------------------------------------------------------------------- -st.sidebar.title("Azure MySQL Config Analyzer") - -# Experiment Selection -experiment_id = st.sidebar.selectbox( - "Select Experiment", - options=list(storage.experiments.keys()), - help="Choose the experiment you want to analyze.", -) - -# Load the selected experiment -exp = storage.experiments[experiment_id] -df = exp.results_df.copy() - -# Extract configuration and result columns -config_columns = [col for col in df.columns if col.startswith("config.")] -result_columns = [col for col in df.columns if col.startswith("result.")] -metrics = result_columns - -# -------------------------------------------------------------------------------- -# Main Title and Description -# -------------------------------------------------------------------------------- -st.title(f"Azure MySQL Experiment: {experiment_id}") -st.write(f"**Description**: {exp.description}") - -# -------------------------------------------------------------------------------- -# Tabs Creation -# -------------------------------------------------------------------------------- -tabs = st.tabs( - [ - "Dashboard", - "Data Overview", - "Configurations Analysis", - "Failure Analysis", - "Correlation Heatmap", - "Parallel Coordinates", - "Config Params Scatter", - "Top & Bottom Configs", - "Optimization Suggestions", - "Clustering", - "Advanced Statistics", - "Anomaly Detection", - "Save Analysis", - ] -) - -# -------------------------------------------------------------------------------- -# TAB 1: Dashboard -# -------------------------------------------------------------------------------- -with tabs[0]: - st.header("Dashboard") - st.write("### Key 
Metrics Overview") - - # Calculate key metrics - total_trials = len(df) - success_trials = df["status"].value_counts().get("SUCCESS", 0) - failure_trials = df["status"].value_counts().get("FAILED", 0) - success_rate = (success_trials / total_trials) * 100 if total_trials > 0 else 0 - failure_rate = (failure_trials / total_trials) * 100 if total_trials > 0 else 0 - - # Display key metrics - col1, col2, col3 = st.columns(3) - col1.metric("Total Trials", total_trials) - col2.metric("Successful Trials", success_trials) - col3.metric("Failure Rate (%)", f"{failure_rate:.2f}") - - # Visualization: Success vs Failure - fig = px.pie( - names=["Success", "Failure"], - values=[success_trials, failure_trials], - title="Trial Outcomes", - color=["Success", "Failure"], - color_discrete_map={"Success": "green", "Failure": "red"}, - ) - st.plotly_chart(fig, use_container_width=True) - - # Visualization: Top 5 Metrics - st.write("### Top 5 Metrics") - top_metrics = df[result_columns].mean().sort_values(ascending=False).head(5) - fig_metrics = px.bar( - top_metrics, - x=top_metrics.index.str.replace("result.", "").str.replace("_", " ").str.title(), - y=top_metrics.values, - labels={"x": "Metric", "y": "Average Value"}, - title="Top 5 Average Metrics", - color=top_metrics.values, - color_continuous_scale="Blues", - ) - st.plotly_chart(fig_metrics, use_container_width=True) - -# -------------------------------------------------------------------------------- -# TAB 2: Data Overview -# -------------------------------------------------------------------------------- -with tabs[1]: - st.header("Data Overview") - st.write("Explore experiment data and key statistics.") - - # Data Filtering - with st.expander("Filter Data"): - st.subheader("Apply Filters") - trial_id_filter = st.text_input( - "Filter by Trial ID (comma-separated)", help="Enter trial IDs separated by commas." 
- ) - status_filter = st.multiselect( - "Filter by Status", - options=df["status"].unique(), - default=df["status"].unique(), - help="Select one or more statuses to filter the trials.", - ) - config_filter = st.multiselect( - "Filter by Configuration ID", - options=df["tunable_config_id"].unique(), - default=df["tunable_config_id"].unique(), - help="Select one or more configuration IDs to filter the trials.", - ) - - if st.button("Apply Filters"): - filtered_df = df.copy() - if trial_id_filter: - try: - trial_ids = [ - int(tid.strip()) - for tid in trial_id_filter.split(",") - if tid.strip().isdigit() - ] - filtered_df = filtered_df[filtered_df["trial_id"].isin(trial_ids)] - except ValueError: - st.error("Please enter valid trial IDs separated by commas.") - if status_filter: - filtered_df = filtered_df[filtered_df["status"].isin(status_filter)] - if config_filter: - filtered_df = filtered_df[filtered_df["tunable_config_id"].isin(config_filter)] - st.session_state.filtered_df = filtered_df - st.success("Filters applied successfully!") - - # Display filtered data or original data - if "filtered_df" in st.session_state: - display_df = st.session_state.filtered_df - else: - display_df = df - - if st.checkbox("Show Data Table"): - st.dataframe(display_df) - st.write("### Descriptive Statistics:") - st.write(display_df.describe()) - -# -------------------------------------------------------------------------------- -# TAB 3: Configurations Analysis -# -------------------------------------------------------------------------------- -with tabs[2]: - st.header("Configurations Analysis") - st.write("Visualize performance metrics across different configurations.") - - config_id = st.selectbox( - "Select Configuration ID", - options=df["tunable_config_id"].unique(), - help="Choose a configuration to analyze its performance over trials.", - ) - metric = st.selectbox( - "Select Metric", options=metrics, help="Choose a performance metric to visualize." 
- ) - - config_data = df[df["tunable_config_id"] == config_id] - fig = px.line( - config_data, - x="trial_id", - y=metric, - title=f"{metric.replace('result.', '').replace('_', ' ').title()} over Trials for Configuration {config_id}", - markers=True, - labels={ - "trial_id": "Trial ID", - metric: metric.replace("result.", "").replace("_", " ").title(), - }, - template="plotly_white", - ) - st.plotly_chart(fig, use_container_width=True) - - # Additional Insights: Moving Average - window_size = st.slider( - "Select Moving Average Window Size", - 1, - 10, - 3, - help="Smooth the metric by applying a moving average.", - ) - config_data[f"{metric}_MA"] = config_data[metric].rolling(window=window_size).mean() - fig_ma = px.line( - config_data, - x="trial_id", - y=f"{metric}_MA", - title=f"{metric.replace('result.', '').replace('_', ' ').title()} - Moving Average (Window Size={window_size})", - markers=True, - labels={ - "trial_id": "Trial ID", - f"{metric}_MA": f"{metric.replace('result.', '').replace('_', ' ').title()} (MA)", - }, - template="plotly_white", - ) - st.plotly_chart(fig_ma, use_container_width=True) - -# -------------------------------------------------------------------------------- -# TAB 4: Failure Analysis -# -------------------------------------------------------------------------------- -with tabs[3]: - st.header("Failure Analysis") - st.write("Analyze failure rates and trends across trials.") - - if "status" in df.columns: - # Failure Rate Distribution - st.subheader("Failure Rate Distribution") - failure_counts = df["status"].value_counts() - fig_pie = px.pie( - values=failure_counts.values, - names=failure_counts.index, - title="Failure Rate Distribution", - color=failure_counts.index, - color_discrete_map={"FAILED": "red", "SUCCESS": "green"}, - ) - st.plotly_chart(fig_pie, use_container_width=True) - - # Failure Rate Trend Over Trials - st.subheader("Failure Rate Trend Over Trials") - failure_rate_trend = ( - df.groupby("trial_id")["status"] - .apply(lambda x: (x == "FAILED").mean() * 100) - .reset_index() - ) - failure_rate_trend.columns = ["Trial ID", "Failure Rate (%)"] - fig_line = px.line( - failure_rate_trend, - x="Trial ID", - y="Failure Rate (%)", - title="Failure Rate Trend Over Trials", - markers=True, - labels={"Trial ID": "Trial ID", "Failure Rate (%)": "Failure Rate (%)"}, - template="plotly_white", - ) - st.plotly_chart(fig_line, use_container_width=True) - else: - st.info("No 'status' column found in the dataset.") - -# -------------------------------------------------------------------------------- -# TAB 5: Correlation Heatmap -# -------------------------------------------------------------------------------- -with tabs[4]: - st.header("Correlation Heatmap") - st.write("Visualize correlations between selected configuration and result metrics.") - - selected_columns = st.multiselect( - "Select Columns for Heatmap", - options=config_columns + result_columns, - default=config_columns[:2] + result_columns[:2], - help="Choose multiple columns to analyze their correlation.", - ) - - if st.button("Generate Heatmap"): - if selected_columns: - corr_matrix = df[selected_columns].corr() - fig = px.imshow( - corr_matrix, - text_auto=True, - color_continuous_scale="Viridis", - title="Correlation Heatmap", - labels={"color": "Correlation Coefficient"}, - ) - st.plotly_chart(fig, use_container_width=True) - else: - st.warning("Please select at least one column to generate the heatmap.") - else: - st.info("Select columns and click 'Generate Heatmap' to visualize 
correlations.") - -# -------------------------------------------------------------------------------- -# TAB 6: Parallel Coordinates -# -------------------------------------------------------------------------------- -with tabs[5]: - st.header("Parallel Coordinates Plot") - st.write( - "Explore multi-dimensional relationships between configuration parameters and metrics." - ) - - parallel_columns = st.multiselect( - "Select Columns for Parallel Plot", - options=config_columns + result_columns, - default=config_columns[:3] + result_columns[:2], - help="Choose multiple columns to include in the parallel coordinates plot.", - ) - - if parallel_columns: - color_metric = st.selectbox( - "Select Metric for Coloring", - options=result_columns, - help="Choose a result metric to color-code the parallel coordinates.", - ) - fig = px.parallel_coordinates( - df, - dimensions=parallel_columns, - color=color_metric, - color_continuous_scale=px.colors.diverging.Tealrose, - title="Parallel Coordinates Plot", - labels={ - col: col.replace("config.", "").replace("_", " ").title() - for col in parallel_columns - }, - template="plotly_white", - ) - st.plotly_chart(fig, use_container_width=True) - else: - st.info("Select columns to generate the parallel coordinates plot.") - -# -------------------------------------------------------------------------------- -# TAB 7: Config Params Scatter -# -------------------------------------------------------------------------------- -with tabs[6]: - st.header("Configuration Parameters Scatter Plot") - st.write( - "Analyze the relationship between multiple configuration parameters and a selected result metric." - ) - - if not config_columns: - st.warning("No configuration parameters available in the dataset.") - elif not metrics: - st.warning("No result metrics available in the dataset.") - else: - # Select multiple configuration parameters - selected_config_params = st.multiselect( - "Select Configuration Parameters", - options=config_columns, - default=config_columns[:2], - help="Choose one or more configuration parameters to analyze.", - ) - - # Select one result metric - selected_result_metric = st.selectbox( - "Select Result Metric", - options=metrics, - help="Choose a result metric to analyze against the selected configuration parameters.", - ) - - if selected_config_params: - # Determine layout based on number of selected parameters - plots_per_row = 2 - num_plots = len(selected_config_params) - num_rows = (num_plots + plots_per_row - 1) // plots_per_row - - for row in range(num_rows): - cols = st.columns(plots_per_row) - for i in range(plots_per_row): - plot_index = row * plots_per_row + i - if plot_index < num_plots: - config_param = selected_config_params[plot_index] - with cols[i]: - fig = px.scatter( - df, - x=config_param, - y=selected_result_metric, - color="tunable_config_id", - title=f"{config_param.replace('config.', '').replace('_', ' ').title()} vs {selected_result_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - config_param: config_param.replace("config.", "") - .replace("_", " ") - .title(), - selected_result_metric: selected_result_metric.replace( - "result.", "" - ) - .replace("_", " ") - .title(), - }, - hover_data=["trial_id", "tunable_config_id"], - trendline="ols", - template="plotly_white", - ) - - st.plotly_chart(fig, use_container_width=True) - - # Calculate and display the correlation coefficient - corr_coeff = ( - df[[config_param, selected_result_metric]].corr().iloc[0, 1] - ) - st.markdown(f"**Correlation 
Coefficient:** {corr_coeff:.2f}") - else: - st.info( - "Please select at least one configuration parameter to generate scatter plots." - ) - -# -------------------------------------------------------------------------------- -# TAB 8: Top & Bottom Configurations -# -------------------------------------------------------------------------------- -with tabs[7]: - st.header("Top and Bottom Configurations") - st.write( - "Identify configurations with the best and worst performance based on selected metrics." - ) - - n_configs = st.slider( - "Number of Configurations to Display", - min_value=1, - max_value=10, - value=5, - help="Select how many top and bottom configurations to display.", - ) - - # Select metric for ranking - tb_metric = st.selectbox( - "Select Metric for Ranking", - options=metrics, - index=0, - key="tb_metric", - help="Choose a metric to rank configurations.", - ) - optimization_method = st.radio( - "Select Optimization Method", - ["Maximize", "Minimize"], - index=0, - key="tb_opt_method", - help="Choose whether to find configurations that maximize or minimize the selected metric.", - ) - - if not df.empty: - if optimization_method == "Maximize": - top_configs = df.nlargest(n_configs, tb_metric) - bottom_configs = df.nsmallest(n_configs, tb_metric) - else: - top_configs = df.nsmallest(n_configs, tb_metric) - bottom_configs = df.nlargest(n_configs, tb_metric) - - st.subheader("Top Configurations") - st.dataframe(top_configs) - - st.subheader("Bottom Configurations") - st.dataframe(bottom_configs) - else: - st.warning("No data available to identify top/bottom configurations.") - -# -------------------------------------------------------------------------------- -# TAB 9: Optimization Suggestions -# -------------------------------------------------------------------------------- -with tabs[8]: - st.header("Optimization Suggestions") - st.write("Discover optimal configurations based on selected performance metrics.") - - target_metric = st.selectbox( - "Select Metric for Optimization", - options=metrics, - index=0, - key="opt_target_metric", - help="Choose a performance metric to optimize.", - ) - optimization_method = st.radio( - "Select Optimization Method", - ["Maximize", "Minimize"], - index=0, - key="opt_method_choice", - help="Choose whether to maximize or minimize the selected metric.", - ) - - if not df.empty: - if optimization_method == "Maximize": - optimal_config = df.loc[df[target_metric].idxmax()] - else: - optimal_config = df.loc[df[target_metric].idxmin()] - - st.write( - f"**Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):**" - ) - st.json(optimal_config[config_columns].to_dict()) - else: - st.warning("No data available for optimization.") - -# -------------------------------------------------------------------------------- -# TAB 10: Clustering -# -------------------------------------------------------------------------------- -with tabs[9]: - st.header("Clustering Analysis") - st.write("Group similar configurations to identify patterns and clusters.") - - cluster_columns = st.multiselect( - "Select Columns for Clustering", - options=config_columns + result_columns, - default=config_columns[:3], - help="Choose multiple columns to perform clustering.", - ) - num_clusters = st.slider( - "Number of Clusters", - min_value=2, - max_value=10, - value=3, - help="Define the number of clusters for K-Means.", - ) - - if len(cluster_columns) >= 2: - if st.button("Generate Clustering"): - clustering_data = 
df[cluster_columns].dropna() - - # Standardize the data - scaler = StandardScaler() - clustering_data_scaled = scaler.fit_transform(clustering_data) - - # Perform K-Means clustering - kmeans = KMeans(n_clusters=num_clusters, random_state=42) - clusters = kmeans.fit_predict(clustering_data_scaled) - df["cluster"] = clusters - - # Optional: Dimensionality Reduction for 3D Plotting - if len(cluster_columns) > 3: - pca = PCA(n_components=3) - principal_components = pca.fit_transform(clustering_data_scaled) - df["PC1"] = principal_components[:, 0] - df["PC2"] = principal_components[:, 1] - df["PC3"] = principal_components[:, 2] - fig = px.scatter_3d( - df, - x="PC1", - y="PC2", - z="PC3", - color="cluster", - title="3D Scatter Plot with PCA and Clustering", - labels={ - "PC1": "Principal Component 1", - "PC2": "Principal Component 2", - "PC3": "Principal Component 3", - }, - template="plotly_white", - ) - elif len(cluster_columns) == 3: - fig = px.scatter_3d( - df, - x=cluster_columns[0], - y=cluster_columns[1], - z=cluster_columns[2], - color="cluster", - title="3D Scatter Plot with Clustering", - labels={ - cluster_columns[0]: cluster_columns[0] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[1]: cluster_columns[1] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[2]: cluster_columns[2] - .replace("config.", "") - .replace("_", " ") - .title(), - }, - template="plotly_white", - ) - else: - fig = px.scatter( - df, - x=cluster_columns[0], - y=cluster_columns[1], - color="cluster", - title="2D Scatter Plot with Clustering", - labels={ - cluster_columns[0]: cluster_columns[0] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[1]: cluster_columns[1] - .replace("config.", "") - .replace("_", " ") - .title(), - }, - template="plotly_white", - ) - - st.plotly_chart(fig, use_container_width=True) - - # Cluster Centroids - centroids = kmeans.cluster_centers_ - centroids_df = pd.DataFrame(centroids, columns=cluster_columns) - st.subheader("Cluster Centroids") - st.write(centroids_df) - else: - st.warning("Please select at least two columns for clustering.") - -# -------------------------------------------------------------------------------- -# TAB 10: Advanced Statistics -# -------------------------------------------------------------------------------- -with tabs[10]: - st.header("Advanced Statistics") - st.write("Perform advanced statistical analyses on the experiment data.") - - # Select Metric for Statistical Analysis - selected_metric = st.selectbox( - "Select Metric for Statistical Analysis", - options=metrics, - help="Choose a result metric to perform statistical tests.", - ) - - # Debugging: Display selected_metric and its type - st.write(f"**Selected Metric:** {selected_metric}") - st.write(f"**Selected Metric Type:** {df[selected_metric].dtype}") - - # Check if the selected metric is numeric - if pd.api.types.is_numeric_dtype(df[selected_metric]): - st.subheader( - f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - st.write(df[selected_metric].describe()) - - # Define the template - template_value = "plotly_white" - st.write(f"**Template Type:** {type(template_value)}, **Value:** {template_value}") - - # Histogram with KDE - try: - fig_hist = px.histogram( - df, - x=selected_metric, - nbins=30, - title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - marginal="kde", - labels={ - selected_metric: 
selected_metric.replace("result.", "") - .replace("_", " ") - .title() - }, - template=template_value, # Ensure this is a string - ) - st.plotly_chart(fig_hist, use_container_width=True) - except Exception as e: - st.error(f"An error occurred while generating the histogram: {e}") - - # Box Plot - st.subheader( - f"Box Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - try: - fig_box = px.box( - df, - y=selected_metric, - points="all", - title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - selected_metric: selected_metric.replace("result.", "") - .replace("_", " ") - .title() - }, - template=template_value, # Ensure this is a string - ) - st.plotly_chart(fig_box, use_container_width=True) - except Exception as e: - st.error(f"An error occurred while generating the box plot: {e}") - - # Violin Plot - st.subheader( - f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - try: - fig_violin = px.violin( - df, - y=selected_metric, - box=True, - points="all", - title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - selected_metric: selected_metric.replace("result.", "") - .replace("_", " ") - .title() - }, - template=template_value, # Ensure this is a string - ) - st.plotly_chart(fig_violin, use_container_width=True) - except Exception as e: - st.error(f"An error occurred while generating the violin plot: {e}") - else: - st.warning( - f"The selected metric '{selected_metric}' is not numeric and cannot be plotted." - ) - - # Display Plotly Version for Debugging - st.subheader("Plotly Version") - st.write(f"Plotly version: {plotly.__version__}") - - # Optional: Display the selected template - st.subheader("Template Information") - st.write(f"Selected Template: {template_value}") - - -# -------------------------------------------------------------------------------- -# TAB 12: Anomaly Detection -# -------------------------------------------------------------------------------- -with tabs[11]: - st.header("Anomaly Detection") - st.write("Identify anomalous trials based on selected metrics.") - - anomaly_metric = st.selectbox( - "Select Metric for Anomaly Detection", - options=metrics, - help="Choose a result metric to perform anomaly detection.", - ) - threshold = st.slider( - "Set Anomaly Threshold (Standard Deviations)", - min_value=1.0, - max_value=5.0, - value=3.0, - step=0.5, - help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", - ) - - mean_val = df[anomaly_metric].mean() - std_val = df[anomaly_metric].std() - upper_bound = mean_val + threshold * std_val - lower_bound = mean_val - threshold * std_val - - anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - - st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") - if not anomalies.empty: - st.write(f"Total Anomalies Detected: {len(anomalies)}") - st.dataframe(anomalies) - - # Visualization: Scatter Plot Highlighting Anomalies - fig_anomaly = px.scatter( - df, - x="trial_id", - y=anomaly_metric, - color=df.index.isin(anomalies.index), - title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - "trial_id": "Trial ID", - anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), - }, - color_discrete_map={True: "red", False: "blue"}, - template="plotly_white", - ) - 
st.plotly_chart(fig_anomaly, use_container_width=True) - else: - st.success("No anomalies detected based on the current threshold.") - -# -------------------------------------------------------------------------------- -# TAB 13: Save Analysis Report -# -------------------------------------------------------------------------------- -with tabs[12]: - st.header("Save Analysis Report") - st.write("Download a comprehensive analysis report of your experiment.") - - report_options = st.multiselect( - "Select Sections to Include in the Report", - options=[ - "Data Overview", - "Configurations Analysis", - "Failure Analysis", - "Correlation Heatmap", - "Parallel Coordinates", - "Config Params Scatter", - "Top & Bottom Configs", - "Optimization Suggestions", - "Clustering", - "Advanced Statistics", - "Anomaly Detection", - ], - default=[ - "Data Overview", - "Configurations Analysis", - "Failure Analysis", - "Correlation Heatmap", - "Top & Bottom Configs", - "Optimization Suggestions", - ], - help="Choose which sections of the analysis you want to include in the report.", - ) - - if st.button("Download Report"): - # Generate the report based on selected sections - report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" - - if "Data Overview" in report_options: - report += "## Data Overview\n" - report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" - - if "Configurations Analysis" in report_options: - report += "## Configurations Analysis\n" - # Example: Include top configuration analysis - top_config = df.loc[ - df["result.metric"].idxmax() - ] # Replace 'result.metric' with actual metric if needed - report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" - - if "Failure Analysis" in report_options: - report += "## Failure Analysis\n" - failure_counts = df["status"].value_counts() - report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" - - if "Correlation Heatmap" in report_options: - report += "## Correlation Heatmap\n" - selected_columns = config_columns + result_columns # Adjust as needed - corr_matrix = df[selected_columns].corr() - report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" - - if "Parallel Coordinates" in report_options: - report += "## Parallel Coordinates\n" - # Example placeholder - report += "### Parallel Coordinates Plot was generated in the application.\n\n" - - if "Config Params Scatter" in report_options: - report += "## Configuration Parameters Scatter Plot\n" - # Example placeholder - report += "### Scatter plots were generated in the application.\n\n" - - if "Top & Bottom Configs" in report_options: - report += "## Top & Bottom Configurations\n" - n_configs = st.session_state.get("n_configs_display", 5) - tb_metric = st.session_state.get("tb_metric", metrics[0]) - optimization_method = st.session_state.get("tb_opt_method", "Maximize") - if optimization_method == "Maximize": - top_configs = df.nlargest(n_configs, tb_metric) - bottom_configs = df.nsmallest(n_configs, tb_metric) - else: - top_configs = df.nsmallest(n_configs, tb_metric) - bottom_configs = df.nlargest(n_configs, tb_metric) - report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" - report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" - - if "Optimization 
Suggestions" in report_options: - report += "## Optimization Suggestions\n" - target_metric = st.session_state.get("opt_target_metric", metrics[0]) - optimization_method = st.session_state.get("opt_method_choice", "Maximize") - if optimization_method == "Maximize": - optimal_config = df.loc[df[target_metric].idxmax()] - else: - optimal_config = df.loc[df[target_metric].idxmin()] - report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" - - if "Clustering" in report_options: - report += "## Clustering Analysis\n" - # Example placeholder - report += "### Clustering results were generated in the application.\n\n" - - if "Advanced Statistics" in report_options: - report += "## Advanced Statistics\n" - selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) - report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" - - if "Anomaly Detection" in report_options: - report += "## Anomaly Detection\n" - anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) - threshold = st.session_state.get("anomaly_threshold", 3.0) - mean_val = df[anomaly_metric].mean() - std_val = df[anomaly_metric].std() - upper_bound = mean_val + threshold * std_val - lower_bound = mean_val - threshold * std_val - anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" - - # Download the report as a text file - st.download_button( - label="Download Report as Text", - data=report, - file_name="analysis_report.txt", - mime="text/plain", - ) - - # Optionally, provide the CSV report - st.subheader("Download Descriptive Statistics") - if st.button("Download Descriptive Statistics as CSV"): - report_csv = df.describe().to_csv() - st.download_button( - label="Download CSV Report", - data=report_csv, - file_name="descriptive_statistics.csv", - mime="text/csv", - ) - - st.info("Select the sections you want to include in the report and click 'Download Report'.") - -# -------------------------------------------------------------------------------- -# TAB 10: Clustering -# -------------------------------------------------------------------------------- -with tabs[9]: - st.header("Clustering Analysis") - st.write("Group similar configurations to identify patterns and clusters.") - - cluster_columns = st.multiselect( - "Select Columns for Clustering", - options=config_columns + result_columns, - default=config_columns[:3], - help="Choose multiple columns to perform clustering.", - key="clustering_columns_select", # Unique key - ) - - num_clusters = st.slider( - "Number of Clusters", - min_value=2, - max_value=10, - value=3, - help="Define the number of clusters for K-Means.", - key="num_clusters_slider_clustering", # Unique key - ) - - if len(cluster_columns) >= 2: - if st.button("Generate Clustering", key="gen cluster"): - clustering_data = df[cluster_columns].dropna() - - # Standardize the data - scaler = StandardScaler() - clustering_data_scaled = scaler.fit_transform(clustering_data) - - # Perform K-Means clustering - kmeans = KMeans(n_clusters=num_clusters, random_state=42) - clusters = kmeans.fit_predict(clustering_data_scaled) - df["cluster"] = clusters - - # Optional: Dimensionality Reduction 
for 3D Plotting - if len(cluster_columns) > 3: - pca = PCA(n_components=3) - principal_components = pca.fit_transform(clustering_data_scaled) - df["PC1"] = principal_components[:, 0] - df["PC2"] = principal_components[:, 1] - df["PC3"] = principal_components[:, 2] - fig = px.scatter_3d( - df, - x="PC1", - y="PC2", - z="PC3", - color="cluster", - title="3D Scatter Plot with PCA and Clustering", - labels={ - "PC1": "Principal Component 1", - "PC2": "Principal Component 2", - "PC3": "Principal Component 3", - }, - template="plotly_white", - ) - elif len(cluster_columns) == 3: - fig = px.scatter_3d( - df, - x=cluster_columns[0], - y=cluster_columns[1], - z=cluster_columns[2], - color="cluster", - title="3D Scatter Plot with Clustering", - labels={ - cluster_columns[0]: cluster_columns[0] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[1]: cluster_columns[1] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[2]: cluster_columns[2] - .replace("config.", "") - .replace("_", " ") - .title(), - }, - template="plotly_white", - ) - else: - fig = px.scatter( - df, - x=cluster_columns[0], - y=cluster_columns[1], - color="cluster", - title="2D Scatter Plot with Clustering", - labels={ - cluster_columns[0]: cluster_columns[0] - .replace("config.", "") - .replace("_", " ") - .title(), - cluster_columns[1]: cluster_columns[1] - .replace("config.", "") - .replace("_", " ") - .title(), - }, - template="plotly_white", - ) - - st.plotly_chart(fig, use_container_width=True) - - # Cluster Centroids - centroids = kmeans.cluster_centers_ - centroids_df = pd.DataFrame(centroids, columns=cluster_columns) - st.subheader("Cluster Centroids") - st.write(centroids_df) - else: - st.warning("Please select at least two columns for clustering.") - -# -------------------------------------------------------------------------------- -# TAB 11: Advanced Statistics -# -------------------------------------------------------------------------------- -# -------------------------------------------------------------------------------- -# TAB 11: Advanced Statistics -# -------------------------------------------------------------------------------- -with tabs[10]: - st.header("Advanced Statistics") - st.write("Perform advanced statistical analyses on the experiment data.") - - selected_metric = st.selectbox( - "Select Metric for Statistical Analysis", - options=metrics, - help="Choose a result metric to perform statistical tests.", - key="sel adv", - ) - - st.subheader( - f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - - # Display data type and missing values - st.write(f"Data Type: {df[selected_metric].dtype}") - st.write(f"Missing Values: {df[selected_metric].isnull().sum()}") - - # Handle missing values - plot_df = df.dropna(subset=[selected_metric]) - - # Check if the selected metric is numeric - if pd.api.types.is_numeric_dtype(plot_df[selected_metric]): - st.write(plot_df[selected_metric].describe()) - - # Histogram with KDE - fig_hist = px.histogram( - plot_df, - x=selected_metric, - nbins=30, - title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - marginal="kde", - labels={ - selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() - }, - template="plotly_white", - ) - st.plotly_chart(fig_hist, use_container_width=True) - - # Box Plot - st.subheader( - f"Box Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - fig_box = px.box( 
- plot_df, - y=selected_metric, - points="all", - title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() - }, - template="plotly_white", - ) - st.plotly_chart(fig_box, use_container_width=True) - - # Violin Plot - st.subheader( - f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" - ) - fig_violin = px.violin( - plot_df, - y=selected_metric, - box=True, - points="all", - title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() - }, - template="plotly_white", - ) - st.plotly_chart(fig_violin, use_container_width=True) - else: - st.error( - f"The selected metric '{selected_metric}' is not numeric. Please select a numeric metric for statistical analysis." - ) - - -# -------------------------------------------------------------------------------- -# TAB 12: Anomaly Detection -# -------------------------------------------------------------------------------- -with tabs[11]: - st.header("Anomaly Detection") - st.write("Identify anomalous trials based on selected metrics.") - - anomaly_metric = st.selectbox( - "Select Metric for Anomaly Detection", - options=metrics, - help="Choose a result metric to perform anomaly detection.", - ) - threshold = st.slider( - "Set Anomaly Threshold (Standard Deviations)", - min_value=1.0, - max_value=5.0, - value=3.0, - step=0.5, - help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", - ) - - mean_val = df[anomaly_metric].mean() - std_val = df[anomaly_metric].std() - upper_bound = mean_val + threshold * std_val - lower_bound = mean_val - threshold * std_val - - anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - - st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") - if not anomalies.empty: - st.write(f"Total Anomalies Detected: {len(anomalies)}") - st.dataframe(anomalies) - - # Visualization: Scatter Plot Highlighting Anomalies - fig_anomaly = px.scatter( - df, - x="trial_id", - y=anomaly_metric, - color=df.index.isin(anomalies.index), - title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", - labels={ - "trial_id": "Trial ID", - anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), - }, - color_discrete_map={True: "red", False: "blue"}, - template="plotly_white", - ) - st.plotly_chart(fig_anomaly, use_container_width=True) - else: - st.success("No anomalies detected based on the current threshold.") - -# -------------------------------------------------------------------------------- -# TAB 13: Save Analysis Report -# -------------------------------------------------------------------------------- -with tabs[12]: - st.header("Save Analysis Report") - st.write("Download a comprehensive analysis report of your experiment.") - - report_options = st.multiselect( - "Select Sections to Include in the Report", - options=[ - "Data Overview", - "Configurations Analysis", - "Failure Analysis", - "Correlation Heatmap", - "Parallel Coordinates", - "Config Params Scatter", - "Top & Bottom Configs", - "Optimization Suggestions", - "Clustering", - "Advanced Statistics", - "Anomaly Detection", - ], - default=[ - "Data Overview", - "Configurations Analysis", - "Failure Analysis", - 
"Correlation Heatmap", - "Top & Bottom Configs", - "Optimization Suggestions", - ], - help="Choose which sections of the analysis you want to include in the report.", - ) - - if st.button("Download Report"): - # Generate the report based on selected sections - report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" - - if "Data Overview" in report_options: - report += "## Data Overview\n" - report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" - - if "Configurations Analysis" in report_options: - report += "## Configurations Analysis\n" - # Example: Include top configuration analysis - if "result.metric" in df.columns: - top_config = df.loc[ - df["result.metric"].idxmax() - ] # Replace 'result.metric' with actual metric - report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" - else: - report += ( - "### Configurations Analysis details were generated in the application.\n\n" - ) - - if "Failure Analysis" in report_options: - report += "## Failure Analysis\n" - failure_counts = df["status"].value_counts() - report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" - - if "Correlation Heatmap" in report_options: - report += "## Correlation Heatmap\n" - selected_columns = config_columns + result_columns # Adjust as needed - corr_matrix = df[selected_columns].corr() - report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" - - if "Parallel Coordinates" in report_options: - report += "## Parallel Coordinates\n" - report += "### Parallel Coordinates Plot was generated in the application.\n\n" - - if "Config Params Scatter" in report_options: - report += "## Configuration Parameters Scatter Plot\n" - report += "### Scatter plots were generated in the application.\n\n" - - if "Top & Bottom Configs" in report_options: - report += "## Top & Bottom Configurations\n" - n_configs = st.session_state.get("n_configs_display", 5) - tb_metric = st.session_state.get("tb_metric", metrics[0]) - optimization_method = st.session_state.get("tb_opt_method", "Maximize") - if optimization_method == "Maximize": - top_configs = df.nlargest(n_configs, tb_metric) - bottom_configs = df.nsmallest(n_configs, tb_metric) - else: - top_configs = df.nsmallest(n_configs, tb_metric) - bottom_configs = df.nlargest(n_configs, tb_metric) - report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" - report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" - - if "Optimization Suggestions" in report_options: - report += "## Optimization Suggestions\n" - target_metric = st.session_state.get("opt_target_metric", metrics[0]) - optimization_method = st.session_state.get("opt_method_choice", "Maximize") - if optimization_method == "Maximize": - optimal_config = df.loc[df[target_metric].idxmax()] - else: - optimal_config = df.loc[df[target_metric].idxmin()] - report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" - - if "Clustering" in report_options: - report += "## Clustering Analysis\n" - report += "### Clustering results were generated in the application.\n\n" - - if "Advanced Statistics" in report_options: - report += "## Advanced Statistics\n" - selected_metric = 
st.session_state.get("advanced_stat_metric", metrics[0]) - report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" - - if "Anomaly Detection" in report_options: - report += "## Anomaly Detection\n" - anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) - threshold = st.session_state.get("anomaly_threshold", 3.0) - mean_val = df[anomaly_metric].mean() - std_val = df[anomaly_metric].std() - upper_bound = mean_val + threshold * std_val - lower_bound = mean_val - threshold * std_val - anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" - - # Download the report as a text file - st.download_button( - label="Download Report as Text", - data=report, - file_name="analysis_report.txt", - mime="text/plain", - ) - - # Optionally, provide the CSV report - st.subheader("Download Descriptive Statistics") - if st.button("Download Descriptive Statistics as CSV"): - report_csv = df.describe().to_csv() - st.download_button( - label="Download CSV Report", - data=report_csv, - file_name="descriptive_statistics.csv", - mime="text/csv", - ) - - st.info("Select the sections you want to include in the report and click 'Download Report'.") - -# -------------------------------------------------------------------------------- -# Additional UI/UX Enhancements -# -------------------------------------------------------------------------------- -st.sidebar.markdown("---") -st.sidebar.markdown("#### Tips for Better Workflow") -st.sidebar.markdown( - """ -- **Start with the Dashboard** to get an overview of key metrics. -- **Use Data Overview** to understand and filter your dataset. -- **Configurations Analysis** helps visualize specific configuration performances. -- **Failure Analysis** highlights trial outcomes and trends. -- **Correlation Heatmap** and **Parallel Coordinates** allow in-depth correlation and multi-dimensional analysis. -- **Config Params Scatter** plots relationships between configuration parameters and metrics. -- **Top & Bottom Configs** identify the best and worst-performing configurations. -- **Optimization Suggestions** provide insights into optimal configurations. -- **Clustering** groups similar configurations for pattern recognition. -- **Advanced Statistics** offers detailed statistical analyses of your metrics. -- **Anomaly Detection** helps identify outliers and unusual trial performances. -- **Save Analysis** lets you download a comprehensive report of your findings. 
-    """
-)
diff --git a/dashboard.py b/dashboard.py
new file mode 100644
index 0000000..679e2ea
--- /dev/null
+++ b/dashboard.py
@@ -0,0 +1,1627 @@
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objs as go
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+from mlos_bench.storage import from_config
+import logging
+from typing import Dict, List, Tuple
+from dataclasses import dataclass
+
+# ------------------------------------------------------------------------------
+# Configuration and Setup
+# ------------------------------------------------------------------------------
+@dataclass
+class Config:
+    """Application configuration settings"""
+    TITLE: str = "MySQL Configuration Analysis Dashboard"
+    DESCRIPTION: str = "Analyze and optimize MySQL database configurations"
+    DB_CONFIG_PATH: str = "storage/sqlite.jsonc"
+    THEME: str = "plotly_white"
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Initialize page configuration first
+st.set_page_config(page_title=Config.TITLE, layout="wide", initial_sidebar_state="expanded")
+
+# ------------------------------------------------------------------------------
+# Data Loading and Processing
+# ------------------------------------------------------------------------------
+@st.cache_resource
+def load_data():
+    """Load and cache the database storage"""
+    try:
+        storage = from_config(config=Config.DB_CONFIG_PATH)
+        logger.info("Successfully loaded database storage")
+        return storage
+    except Exception as e:
+        logger.error(f"Failed to load storage: {e}")
+        st.error(f"Failed to load storage: {str(e)}")
+        return None
+
+class DataProcessor:
+    """Handle data processing operations"""
+
+    @staticmethod
+    def get_column_types(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
+        """Extract configuration and result columns"""
+        config_cols = [col for col in df.columns if col.startswith("config.")]
+        result_cols = [col for col in df.columns if col.startswith("result.")]
+        return config_cols, result_cols
+
+    @staticmethod
+    def calculate_stats(df: pd.DataFrame) -> Dict:
+        """Calculate key statistics from the dataset"""
+        total = len(df)
+        success = df["status"].value_counts().get("SUCCESS", 0)
+        failed = df["status"].value_counts().get("FAILED", 0)
+        success_rate = (success / total * 100) if total > 0 else 0
+
+        return {
+            "total": total,
+            "success": success,
+            "failed": failed,
+            "success_rate": success_rate,
+            "failure_rate": 100 - success_rate
+        }
+
+    @staticmethod
+    def perform_clustering(df: pd.DataFrame, columns: List[str], n_clusters: int) -> Dict:
+        """Perform KMeans clustering on selected columns"""
+        X = df[columns].fillna(df[columns].mean())
+        scaler = StandardScaler()
+        X_scaled = scaler.fit_transform(X)
+
+        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+        clusters = kmeans.fit_predict(X_scaled)
+
+        return {
+            "labels": clusters,
+            "centroids": kmeans.cluster_centers_,
+            "inertia": kmeans.inertia_
+        }
+
+class Visualizer:
+    """Handle data visualization"""
+
+    @staticmethod
+    def plot_trial_outcomes(stats: Dict) -> go.Figure:
+        """Create pie chart of trial outcomes"""
+        return px.pie(
+            names=["Success", "Failed"],
+            values=[stats["success"], stats["failed"]],
+            color=["Success", "Failed"],
+            title="Trial Outcomes",
+            color_discrete_map={"Success": "green", "Failed": "red"},
+            template=Config.THEME
+        )
+
+    @staticmethod
+    def plot_metric_distribution(df: pd.DataFrame, metric: str) -> go.Figure:
+        """Create distribution plot for a 
metric""" + return px.histogram( + df, x=metric, + title=f"Distribution of {metric.replace('result.', '').replace('_', ' ').title()}", + template=Config.THEME, + marginal="box" + ) + + @staticmethod + def plot_correlation_heatmap(df: pd.DataFrame, columns: List[str]) -> go.Figure: + """Create correlation heatmap""" + corr_matrix = df[columns].corr() + return px.imshow( + corr_matrix, + title="Correlation Heatmap", + template=Config.THEME, + aspect="auto" + ) + +class Dashboard: + """Main dashboard application""" + + def __init__(self): + self.storage = load_data() + self.processor = DataProcessor() + self.visualizer = Visualizer() + + def run(self): + """Run the dashboard application""" + if not self.storage: + st.error("Failed to initialize dashboard. Please check the logs.") + return + + st.title(Config.TITLE) + st.markdown(Config.DESCRIPTION) + + # Sidebar for experiment selection and controls + self.setup_sidebar() + + # Main content + if "selected_experiment" in st.session_state: + self.display_experiment_analysis() + + def setup_sidebar(self): + """Setup sidebar controls""" + st.sidebar.title("Controls") + + # Experiment selection + experiments = list(self.storage.experiments.keys()) + selected = st.sidebar.selectbox( + "Select Experiment", + experiments, + key="selected_experiment" + ) + + if selected: + st.session_state.exp = self.storage.experiments[selected] + st.session_state.df = st.session_state.exp.results_df.copy() + + def display_experiment_analysis(self): + """Display the main analysis content""" + df = st.session_state.df + + # Get column types + config_cols, result_cols = self.processor.get_column_types(df) + + # Calculate statistics + stats = self.processor.calculate_stats(df) + + # Create tabs for different analyses + tabs = st.tabs([ + "Overview", + "Configuration Analysis", + "Performance Metrics", + "Clustering", + "Optimization" + ]) + + # Overview Tab + with tabs[0]: + self.display_overview(stats) + + # Configuration Analysis Tab + with tabs[1]: + self.display_config_analysis(df, config_cols) + + # Performance Metrics Tab + with tabs[2]: + self.display_performance_metrics(df, result_cols) + + # Clustering Tab + with tabs[3]: + self.display_clustering_analysis(df, config_cols) + + # Optimization Tab + with tabs[4]: + self.display_optimization(df, config_cols, result_cols) + + def display_overview(self, stats: Dict): + """Display overview statistics and charts""" + st.header("Overview") + + # Display metrics + col1, col2, col3 = st.columns(3) + col1.metric("Total Trials", stats["total"]) + col2.metric("Successful Trials", stats["success"]) + col3.metric("Failure Rate", f"{stats['failure_rate']:.1f}%") + + # Display trial outcomes pie chart + st.plotly_chart( + self.visualizer.plot_trial_outcomes(stats), + use_container_width=True + ) + + def display_config_analysis(self, df: pd.DataFrame, config_cols: List[str]): + """Display configuration analysis""" + st.header("Configuration Analysis") + + # Select configurations to analyze + selected_configs = st.multiselect( + "Select Configuration Parameters", + config_cols, + default=config_cols[:2] + ) + + if selected_configs: + # Display correlation heatmap + st.plotly_chart( + self.visualizer.plot_correlation_heatmap(df, selected_configs), + use_container_width=True + ) + + def display_performance_metrics(self, df: pd.DataFrame, result_cols: List[str]): + """Display performance metrics analysis""" + st.header("Performance Metrics") + + # Select metric to analyze + selected_metric = st.selectbox( + "Select Metric", + 
result_cols + ) + + if selected_metric: + # Display distribution plot + st.plotly_chart( + self.visualizer.plot_metric_distribution(df, selected_metric), + use_container_width=True + ) + + # Display summary statistics + st.write("Summary Statistics:") + st.dataframe(df[selected_metric].describe()) + + def display_clustering_analysis(self, df: pd.DataFrame, config_cols: List[str]): + """Display clustering analysis""" + st.header("Clustering Analysis") + + # Clustering controls + n_clusters = st.slider("Number of Clusters", 2, 10, 3) + selected_features = st.multiselect( + "Select Features for Clustering", + config_cols, + default=config_cols[:3] + ) + + if selected_features and st.button("Perform Clustering"): + clustering_results = self.processor.perform_clustering( + df, selected_features, n_clusters + ) + + # Add cluster labels to dataframe + df_cluster = df.copy() + df_cluster['Cluster'] = clustering_results['labels'] + + # Display cluster visualization + if len(selected_features) >= 2: + fig = px.scatter( + df_cluster, + x=selected_features[0], + y=selected_features[1], + color='Cluster', + title="Cluster Visualization", + template=Config.THEME + ) + st.plotly_chart(fig, use_container_width=True) + + def display_optimization(self, df: pd.DataFrame, config_cols: List[str], result_cols: List[str]): + """Display optimization suggestions""" + st.header("Configuration Optimization") + + # Select target metric + target_metric = st.selectbox( + "Select Target Metric for Optimization", + result_cols + ) + + if target_metric: + # Find best configuration + best_idx = df[target_metric].idxmax() + best_config = df.loc[best_idx, config_cols] + + st.write("### Best Performing Configuration") + st.write(f"Target Metric Value: {df.loc[best_idx, target_metric]:.2f}") + st.dataframe(pd.DataFrame([best_config])) + +def main(): + """Main entry point""" + try: + dashboard = Dashboard() + dashboard.run() + except Exception as e: + logger.error(f"Dashboard error: {e}") + st.error(f"An error occurred: {str(e)}") + +if __name__ == "__main__": + main() + +# import pandas as pd +# import seaborn as sns +# import matplotlib.pyplot as plt +# import streamlit as st +# import plotly +# import plotly.express as px +# import plotly.graph_objs as go +# from sklearn.cluster import KMeans +# from sklearn.decomposition import PCA +# from sklearn.preprocessing import StandardScaler +# from mlos_bench.storage import from_config + +# # -------------------------------------------------------------------------------- +# # Streamlit Configuration +# # -------------------------------------------------------------------------------- +# st.set_page_config( +# page_title="Azure MySQL Config Analyzer", layout="wide", initial_sidebar_state="expanded" +# ) + +# # -------------------------------------------------------------------------------- +# # Data Loading and Caching +# # -------------------------------------------------------------------------------- +# @st.cache_resource +# def load_storage(): +# """ +# Load the MLOS storage configuration for the experiments. +# This function is cached to prevent reloading on every interaction. 
+# """ +# return from_config(config="storage/sqlite.jsonc") + +# storage = load_storage() + +# # -------------------------------------------------------------------------------- +# # Sidebar - Experiment Selection and Filtering +# # -------------------------------------------------------------------------------- +# st.sidebar.title("Azure MySQL Config Analyzer") + +# # Experiment Selection +# experiment_id = st.sidebar.selectbox( +# "Select Experiment", +# options=list(storage.experiments.keys()), +# help="Choose the experiment you want to analyze.", +# ) + +# # Load the selected experiment +# exp = storage.experiments[experiment_id] +# df = exp.results_df.copy() + +# # Extract configuration and result columns +# config_columns = [col for col in df.columns if col.startswith("config.")] +# result_columns = [col for col in df.columns if col.startswith("result.")] +# metrics = result_columns + +# # -------------------------------------------------------------------------------- +# # Main Title and Description +# # -------------------------------------------------------------------------------- +# st.title(f"Azure MySQL Experiment: {experiment_id}") +# st.write(f"**Description**: {exp.description}") + +# # -------------------------------------------------------------------------------- +# # Tabs Creation +# # -------------------------------------------------------------------------------- +# tabs = st.tabs( +# [ +# "Dashboard", +# "Data Overview", +# "Configurations Analysis", +# "Failure Analysis", +# "Correlation Heatmap", +# "Parallel Coordinates", +# "Config Params Scatter", +# "Top & Bottom Configs", +# "Optimization Suggestions", +# "Clustering", +# "Advanced Statistics", +# "Anomaly Detection", +# "Save Analysis", +# ] +# ) + +# # -------------------------------------------------------------------------------- +# # TAB 1: Dashboard +# # -------------------------------------------------------------------------------- +# with tabs[0]: +# st.header("Dashboard") +# st.write("### Key Metrics Overview") + +# # Calculate key metrics +# total_trials = len(df) +# success_trials = df["status"].value_counts().get("SUCCESS", 0) +# failure_trials = df["status"].value_counts().get("FAILED", 0) +# success_rate = (success_trials / total_trials) * 100 if total_trials > 0 else 0 +# failure_rate = (failure_trials / total_trials) * 100 if total_trials > 0 else 0 + +# # Display key metrics +# col1, col2, col3 = st.columns(3) +# col1.metric("Total Trials", total_trials) +# col2.metric("Successful Trials", success_trials) +# col3.metric("Failure Rate (%)", f"{failure_rate:.2f}") + +# # Visualization: Success vs Failure +# fig = px.pie( +# names=["Success", "Failure"], +# values=[success_trials, failure_trials], +# title="Trial Outcomes", +# color=["Success", "Failure"], +# color_discrete_map={"Success": "green", "Failure": "red"}, +# ) +# st.plotly_chart(fig, use_container_width=True) + +# # Visualization: Top 5 Metrics +# st.write("### Top 5 Metrics") +# top_metrics = df[result_columns].mean().sort_values(ascending=False).head(5) +# fig_metrics = px.bar( +# top_metrics, +# x=top_metrics.index.str.replace("result.", "").str.replace("_", " ").str.title(), +# y=top_metrics.values, +# labels={"x": "Metric", "y": "Average Value"}, +# title="Top 5 Average Metrics", +# color=top_metrics.values, +# color_continuous_scale="Blues", +# ) +# st.plotly_chart(fig_metrics, use_container_width=True) + +# # -------------------------------------------------------------------------------- +# # TAB 2: Data Overview +# # 
-------------------------------------------------------------------------------- +# with tabs[1]: +# st.header("Data Overview") +# st.write("Explore experiment data and key statistics.") + +# # Data Filtering +# with st.expander("Filter Data"): +# st.subheader("Apply Filters") +# trial_id_filter = st.text_input( +# "Filter by Trial ID (comma-separated)", help="Enter trial IDs separated by commas." +# ) +# status_filter = st.multiselect( +# "Filter by Status", +# options=df["status"].unique(), +# default=df["status"].unique(), +# help="Select one or more statuses to filter the trials.", +# ) +# config_filter = st.multiselect( +# "Filter by Configuration ID", +# options=df["tunable_config_id"].unique(), +# default=df["tunable_config_id"].unique(), +# help="Select one or more configuration IDs to filter the trials.", +# ) + +# if st.button("Apply Filters"): +# filtered_df = df.copy() +# if trial_id_filter: +# try: +# trial_ids = [ +# int(tid.strip()) +# for tid in trial_id_filter.split(",") +# if tid.strip().isdigit() +# ] +# filtered_df = filtered_df[filtered_df["trial_id"].isin(trial_ids)] +# except ValueError: +# st.error("Please enter valid trial IDs separated by commas.") +# if status_filter: +# filtered_df = filtered_df[filtered_df["status"].isin(status_filter)] +# if config_filter: +# filtered_df = filtered_df[filtered_df["tunable_config_id"].isin(config_filter)] +# st.session_state.filtered_df = filtered_df +# st.success("Filters applied successfully!") + +# # Display filtered data or original data +# if "filtered_df" in st.session_state: +# display_df = st.session_state.filtered_df +# else: +# display_df = df + +# if st.checkbox("Show Data Table"): +# st.dataframe(display_df) +# st.write("### Descriptive Statistics:") +# st.write(display_df.describe()) + +# # -------------------------------------------------------------------------------- +# # TAB 3: Configurations Analysis +# # -------------------------------------------------------------------------------- +# with tabs[2]: +# st.header("Configurations Analysis") +# st.write("Visualize performance metrics across different configurations.") + +# config_id = st.selectbox( +# "Select Configuration ID", +# options=df["tunable_config_id"].unique(), +# help="Choose a configuration to analyze its performance over trials.", +# ) +# metric = st.selectbox( +# "Select Metric", options=metrics, help="Choose a performance metric to visualize." 
+# ) + +# config_data = df[df["tunable_config_id"] == config_id] +# fig = px.line( +# config_data, +# x="trial_id", +# y=metric, +# title=f"{metric.replace('result.', '').replace('_', ' ').title()} over Trials for Configuration {config_id}", +# markers=True, +# labels={ +# "trial_id": "Trial ID", +# metric: metric.replace("result.", "").replace("_", " ").title(), +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig, use_container_width=True) + +# # Additional Insights: Moving Average +# window_size = st.slider( +# "Select Moving Average Window Size", +# 1, +# 10, +# 3, +# help="Smooth the metric by applying a moving average.", +# ) +# config_data[f"{metric}_MA"] = config_data[metric].rolling(window=window_size).mean() +# fig_ma = px.line( +# config_data, +# x="trial_id", +# y=f"{metric}_MA", +# title=f"{metric.replace('result.', '').replace('_', ' ').title()} - Moving Average (Window Size={window_size})", +# markers=True, +# labels={ +# "trial_id": "Trial ID", +# f"{metric}_MA": f"{metric.replace('result.', '').replace('_', ' ').title()} (MA)", +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig_ma, use_container_width=True) + +# # -------------------------------------------------------------------------------- +# # TAB 4: Failure Analysis +# # -------------------------------------------------------------------------------- +# with tabs[3]: +# st.header("Failure Analysis") +# st.write("Analyze failure rates and trends across trials.") + +# if "status" in df.columns: +# # Failure Rate Distribution +# st.subheader("Failure Rate Distribution") +# failure_counts = df["status"].value_counts() +# fig_pie = px.pie( +# values=failure_counts.values, +# names=failure_counts.index, +# title="Failure Rate Distribution", +# color=failure_counts.index, +# color_discrete_map={"FAILED": "red", "SUCCESS": "green"}, +# ) +# st.plotly_chart(fig_pie, use_container_width=True) + +# # Failure Rate Trend Over Trials +# st.subheader("Failure Rate Trend Over Trials") +# failure_rate_trend = ( +# df.groupby("trial_id")["status"] +# .apply(lambda x: (x == "FAILED").mean() * 100) +# .reset_index() +# ) +# failure_rate_trend.columns = ["Trial ID", "Failure Rate (%)"] +# fig_line = px.line( +# failure_rate_trend, +# x="Trial ID", +# y="Failure Rate (%)", +# title="Failure Rate Trend Over Trials", +# markers=True, +# labels={"Trial ID": "Trial ID", "Failure Rate (%)": "Failure Rate (%)"}, +# template="plotly_white", +# ) +# st.plotly_chart(fig_line, use_container_width=True) +# else: +# st.info("No 'status' column found in the dataset.") + +# # -------------------------------------------------------------------------------- +# # TAB 5: Correlation Heatmap +# # -------------------------------------------------------------------------------- +# with tabs[4]: +# st.header("Correlation Heatmap") +# st.write("Visualize correlations between selected configuration and result metrics.") + +# selected_columns = st.multiselect( +# "Select Columns for Heatmap", +# options=config_columns + result_columns, +# default=config_columns[:2] + result_columns[:2], +# help="Choose multiple columns to analyze their correlation.", +# ) + +# if st.button("Generate Heatmap"): +# if selected_columns: +# corr_matrix = df[selected_columns].corr() +# fig = px.imshow( +# corr_matrix, +# text_auto=True, +# color_continuous_scale="Viridis", +# title="Correlation Heatmap", +# labels={"color": "Correlation Coefficient"}, +# ) +# st.plotly_chart(fig, use_container_width=True) +# else: +# st.warning("Please select at least one column to 
generate the heatmap.") +# else: +# st.info("Select columns and click 'Generate Heatmap' to visualize correlations.") + +# # -------------------------------------------------------------------------------- +# # TAB 6: Parallel Coordinates +# # -------------------------------------------------------------------------------- +# with tabs[5]: +# st.header("Parallel Coordinates Plot") +# st.write( +# "Explore multi-dimensional relationships between configuration parameters and metrics." +# ) + +# parallel_columns = st.multiselect( +# "Select Columns for Parallel Plot", +# options=config_columns + result_columns, +# default=config_columns[:3] + result_columns[:2], +# help="Choose multiple columns to include in the parallel coordinates plot.", +# ) + +# if parallel_columns: +# color_metric = st.selectbox( +# "Select Metric for Coloring", +# options=result_columns, +# help="Choose a result metric to color-code the parallel coordinates.", +# ) +# fig = px.parallel_coordinates( +# df, +# dimensions=parallel_columns, +# color=color_metric, +# color_continuous_scale=px.colors.diverging.Tealrose, +# title="Parallel Coordinates Plot", +# labels={ +# col: col.replace("config.", "").replace("_", " ").title() +# for col in parallel_columns +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig, use_container_width=True) +# else: +# st.info("Select columns to generate the parallel coordinates plot.") + +# # -------------------------------------------------------------------------------- +# # TAB 7: Config Params Scatter +# # -------------------------------------------------------------------------------- +# with tabs[6]: +# st.header("Configuration Parameters Scatter Plot") +# st.write( +# "Analyze the relationship between multiple configuration parameters and a selected result metric." 
+# ) + +# if not config_columns: +# st.warning("No configuration parameters available in the dataset.") +# elif not metrics: +# st.warning("No result metrics available in the dataset.") +# else: +# # Select multiple configuration parameters +# selected_config_params = st.multiselect( +# "Select Configuration Parameters", +# options=config_columns, +# default=config_columns[:2], +# help="Choose one or more configuration parameters to analyze.", +# ) + +# # Select one result metric +# selected_result_metric = st.selectbox( +# "Select Result Metric", +# options=metrics, +# help="Choose a result metric to analyze against the selected configuration parameters.", +# ) + +# if selected_config_params: +# # Determine layout based on number of selected parameters +# plots_per_row = 2 +# num_plots = len(selected_config_params) +# num_rows = (num_plots + plots_per_row - 1) // plots_per_row + +# for row in range(num_rows): +# cols = st.columns(plots_per_row) +# for i in range(plots_per_row): +# plot_index = row * plots_per_row + i +# if plot_index < num_plots: +# config_param = selected_config_params[plot_index] +# with cols[i]: +# fig = px.scatter( +# df, +# x=config_param, +# y=selected_result_metric, +# color="tunable_config_id", +# title=f"{config_param.replace('config.', '').replace('_', ' ').title()} vs {selected_result_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# config_param: config_param.replace("config.", "") +# .replace("_", " ") +# .title(), +# selected_result_metric: selected_result_metric.replace( +# "result.", "" +# ) +# .replace("_", " ") +# .title(), +# }, +# hover_data=["trial_id", "tunable_config_id"], +# trendline="ols", +# template="plotly_white", +# ) + +# st.plotly_chart(fig, use_container_width=True) + +# # Calculate and display the correlation coefficient +# corr_coeff = ( +# df[[config_param, selected_result_metric]].corr().iloc[0, 1] +# ) +# st.markdown(f"**Correlation Coefficient:** {corr_coeff:.2f}") +# else: +# st.info( +# "Please select at least one configuration parameter to generate scatter plots." +# ) + +# # -------------------------------------------------------------------------------- +# # TAB 8: Top & Bottom Configurations +# # -------------------------------------------------------------------------------- +# with tabs[7]: +# st.header("Top and Bottom Configurations") +# st.write( +# "Identify configurations with the best and worst performance based on selected metrics." 
+# ) + +# n_configs = st.slider( +# "Number of Configurations to Display", +# min_value=1, +# max_value=10, +# value=5, +# help="Select how many top and bottom configurations to display.", +# ) + +# # Select metric for ranking +# tb_metric = st.selectbox( +# "Select Metric for Ranking", +# options=metrics, +# index=0, +# key="tb_metric", +# help="Choose a metric to rank configurations.", +# ) +# optimization_method = st.radio( +# "Select Optimization Method", +# ["Maximize", "Minimize"], +# index=0, +# key="tb_opt_method", +# help="Choose whether to find configurations that maximize or minimize the selected metric.", +# ) + +# if not df.empty: +# if optimization_method == "Maximize": +# top_configs = df.nlargest(n_configs, tb_metric) +# bottom_configs = df.nsmallest(n_configs, tb_metric) +# else: +# top_configs = df.nsmallest(n_configs, tb_metric) +# bottom_configs = df.nlargest(n_configs, tb_metric) + +# st.subheader("Top Configurations") +# st.dataframe(top_configs) + +# st.subheader("Bottom Configurations") +# st.dataframe(bottom_configs) +# else: +# st.warning("No data available to identify top/bottom configurations.") + +# # -------------------------------------------------------------------------------- +# # TAB 9: Optimization Suggestions +# # -------------------------------------------------------------------------------- +# with tabs[8]: +# st.header("Optimization Suggestions") +# st.write("Discover optimal configurations based on selected performance metrics.") + +# target_metric = st.selectbox( +# "Select Metric for Optimization", +# options=metrics, +# index=0, +# key="opt_target_metric", +# help="Choose a performance metric to optimize.", +# ) +# optimization_method = st.radio( +# "Select Optimization Method", +# ["Maximize", "Minimize"], +# index=0, +# key="opt_method_choice", +# help="Choose whether to maximize or minimize the selected metric.", +# ) + +# if not df.empty: +# if optimization_method == "Maximize": +# optimal_config = df.loc[df[target_metric].idxmax()] +# else: +# optimal_config = df.loc[df[target_metric].idxmin()] + +# st.write( +# f"**Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):**" +# ) +# st.json(optimal_config[config_columns].to_dict()) +# else: +# st.warning("No data available for optimization.") + +# # -------------------------------------------------------------------------------- +# # TAB 10: Clustering +# # -------------------------------------------------------------------------------- +# with tabs[9]: +# st.header("Clustering Analysis") +# st.write("Group similar configurations to identify patterns and clusters.") + +# cluster_columns = st.multiselect( +# "Select Columns for Clustering", +# options=config_columns + result_columns, +# default=config_columns[:3], +# help="Choose multiple columns to perform clustering.", +# ) +# num_clusters = st.slider( +# "Number of Clusters", +# min_value=2, +# max_value=10, +# value=3, +# help="Define the number of clusters for K-Means.", +# ) + +# if len(cluster_columns) >= 2: +# if st.button("Generate Clustering"): +# clustering_data = df[cluster_columns].dropna() + +# # Standardize the data +# scaler = StandardScaler() +# clustering_data_scaled = scaler.fit_transform(clustering_data) + +# # Perform K-Means clustering +# kmeans = KMeans(n_clusters=num_clusters, random_state=42) +# clusters = kmeans.fit_predict(clustering_data_scaled) +# df["cluster"] = clusters + +# # Optional: Dimensionality Reduction for 3D Plotting +# if len(cluster_columns) > 
3: +# pca = PCA(n_components=3) +# principal_components = pca.fit_transform(clustering_data_scaled) +# df["PC1"] = principal_components[:, 0] +# df["PC2"] = principal_components[:, 1] +# df["PC3"] = principal_components[:, 2] +# fig = px.scatter_3d( +# df, +# x="PC1", +# y="PC2", +# z="PC3", +# color="cluster", +# title="3D Scatter Plot with PCA and Clustering", +# labels={ +# "PC1": "Principal Component 1", +# "PC2": "Principal Component 2", +# "PC3": "Principal Component 3", +# }, +# template="plotly_white", +# ) +# elif len(cluster_columns) == 3: +# fig = px.scatter_3d( +# df, +# x=cluster_columns[0], +# y=cluster_columns[1], +# z=cluster_columns[2], +# color="cluster", +# title="3D Scatter Plot with Clustering", +# labels={ +# cluster_columns[0]: cluster_columns[0] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[1]: cluster_columns[1] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[2]: cluster_columns[2] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# }, +# template="plotly_white", +# ) +# else: +# fig = px.scatter( +# df, +# x=cluster_columns[0], +# y=cluster_columns[1], +# color="cluster", +# title="2D Scatter Plot with Clustering", +# labels={ +# cluster_columns[0]: cluster_columns[0] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[1]: cluster_columns[1] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# }, +# template="plotly_white", +# ) + +# st.plotly_chart(fig, use_container_width=True) + +# # Cluster Centroids +# centroids = kmeans.cluster_centers_ +# centroids_df = pd.DataFrame(centroids, columns=cluster_columns) +# st.subheader("Cluster Centroids") +# st.write(centroids_df) +# else: +# st.warning("Please select at least two columns for clustering.") + +# # -------------------------------------------------------------------------------- +# # TAB 10: Advanced Statistics +# # -------------------------------------------------------------------------------- +# with tabs[10]: +# st.header("Advanced Statistics") +# st.write("Perform advanced statistical analyses on the experiment data.") + +# # Select Metric for Statistical Analysis +# selected_metric = st.selectbox( +# "Select Metric for Statistical Analysis", +# options=metrics, +# help="Choose a result metric to perform statistical tests.", +# ) + +# # Debugging: Display selected_metric and its type +# st.write(f"**Selected Metric:** {selected_metric}") +# st.write(f"**Selected Metric Type:** {df[selected_metric].dtype}") + +# # Check if the selected metric is numeric +# if pd.api.types.is_numeric_dtype(df[selected_metric]): +# st.subheader( +# f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) +# st.write(df[selected_metric].describe()) + +# # Define the template +# template_value = "plotly_white" +# st.write(f"**Template Type:** {type(template_value)}, **Value:** {template_value}") + +# # Histogram with KDE +# try: +# fig_hist = px.histogram( +# df, +# x=selected_metric, +# nbins=30, +# title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# marginal="kde", +# labels={ +# selected_metric: selected_metric.replace("result.", "") +# .replace("_", " ") +# .title() +# }, +# template=template_value, # Ensure this is a string +# ) +# st.plotly_chart(fig_hist, use_container_width=True) +# except Exception as e: +# st.error(f"An error occurred while generating the histogram: {e}") + +# # Box Plot +# st.subheader( +# f"Box 
Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) +# try: +# fig_box = px.box( +# df, +# y=selected_metric, +# points="all", +# title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# selected_metric: selected_metric.replace("result.", "") +# .replace("_", " ") +# .title() +# }, +# template=template_value, # Ensure this is a string +# ) +# st.plotly_chart(fig_box, use_container_width=True) +# except Exception as e: +# st.error(f"An error occurred while generating the box plot: {e}") + +# # Violin Plot +# st.subheader( +# f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) +# try: +# fig_violin = px.violin( +# df, +# y=selected_metric, +# box=True, +# points="all", +# title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# selected_metric: selected_metric.replace("result.", "") +# .replace("_", " ") +# .title() +# }, +# template=template_value, # Ensure this is a string +# ) +# st.plotly_chart(fig_violin, use_container_width=True) +# except Exception as e: +# st.error(f"An error occurred while generating the violin plot: {e}") +# else: +# st.warning( +# f"The selected metric '{selected_metric}' is not numeric and cannot be plotted." +# ) + +# # Display Plotly Version for Debugging +# st.subheader("Plotly Version") +# st.write(f"Plotly version: {plotly.__version__}") + +# # Optional: Display the selected template +# st.subheader("Template Information") +# st.write(f"Selected Template: {template_value}") + + +# # -------------------------------------------------------------------------------- +# # TAB 12: Anomaly Detection +# # -------------------------------------------------------------------------------- +# with tabs[11]: +# st.header("Anomaly Detection") +# st.write("Identify anomalous trials based on selected metrics.") + +# anomaly_metric = st.selectbox( +# "Select Metric for Anomaly Detection", +# options=metrics, +# help="Choose a result metric to perform anomaly detection.", +# ) +# threshold = st.slider( +# "Set Anomaly Threshold (Standard Deviations)", +# min_value=1.0, +# max_value=5.0, +# value=3.0, +# step=0.5, +# help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", +# ) + +# mean_val = df[anomaly_metric].mean() +# std_val = df[anomaly_metric].std() +# upper_bound = mean_val + threshold * std_val +# lower_bound = mean_val - threshold * std_val + +# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + +# st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") +# if not anomalies.empty: +# st.write(f"Total Anomalies Detected: {len(anomalies)}") +# st.dataframe(anomalies) + +# # Visualization: Scatter Plot Highlighting Anomalies +# fig_anomaly = px.scatter( +# df, +# x="trial_id", +# y=anomaly_metric, +# color=df.index.isin(anomalies.index), +# title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# "trial_id": "Trial ID", +# anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), +# }, +# color_discrete_map={True: "red", False: "blue"}, +# template="plotly_white", +# ) +# st.plotly_chart(fig_anomaly, use_container_width=True) +# else: +# st.success("No anomalies detected based on the current threshold.") + +# # -------------------------------------------------------------------------------- +# # 
TAB 13: Save Analysis Report +# # -------------------------------------------------------------------------------- +# with tabs[12]: +# st.header("Save Analysis Report") +# st.write("Download a comprehensive analysis report of your experiment.") + +# report_options = st.multiselect( +# "Select Sections to Include in the Report", +# options=[ +# "Data Overview", +# "Configurations Analysis", +# "Failure Analysis", +# "Correlation Heatmap", +# "Parallel Coordinates", +# "Config Params Scatter", +# "Top & Bottom Configs", +# "Optimization Suggestions", +# "Clustering", +# "Advanced Statistics", +# "Anomaly Detection", +# ], +# default=[ +# "Data Overview", +# "Configurations Analysis", +# "Failure Analysis", +# "Correlation Heatmap", +# "Top & Bottom Configs", +# "Optimization Suggestions", +# ], +# help="Choose which sections of the analysis you want to include in the report.", +# ) + +# if st.button("Download Report"): +# # Generate the report based on selected sections +# report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" + +# if "Data Overview" in report_options: +# report += "## Data Overview\n" +# report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" + +# if "Configurations Analysis" in report_options: +# report += "## Configurations Analysis\n" +# # Example: Include top configuration analysis +# top_config = df.loc[ +# df["result.metric"].idxmax() +# ] # Replace 'result.metric' with actual metric if needed +# report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" + +# if "Failure Analysis" in report_options: +# report += "## Failure Analysis\n" +# failure_counts = df["status"].value_counts() +# report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" + +# if "Correlation Heatmap" in report_options: +# report += "## Correlation Heatmap\n" +# selected_columns = config_columns + result_columns # Adjust as needed +# corr_matrix = df[selected_columns].corr() +# report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" + +# if "Parallel Coordinates" in report_options: +# report += "## Parallel Coordinates\n" +# # Example placeholder +# report += "### Parallel Coordinates Plot was generated in the application.\n\n" + +# if "Config Params Scatter" in report_options: +# report += "## Configuration Parameters Scatter Plot\n" +# # Example placeholder +# report += "### Scatter plots were generated in the application.\n\n" + +# if "Top & Bottom Configs" in report_options: +# report += "## Top & Bottom Configurations\n" +# n_configs = st.session_state.get("n_configs_display", 5) +# tb_metric = st.session_state.get("tb_metric", metrics[0]) +# optimization_method = st.session_state.get("tb_opt_method", "Maximize") +# if optimization_method == "Maximize": +# top_configs = df.nlargest(n_configs, tb_metric) +# bottom_configs = df.nsmallest(n_configs, tb_metric) +# else: +# top_configs = df.nsmallest(n_configs, tb_metric) +# bottom_configs = df.nlargest(n_configs, tb_metric) +# report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" +# report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" + +# if "Optimization Suggestions" in report_options: +# report += "## Optimization Suggestions\n" +# target_metric = st.session_state.get("opt_target_metric", metrics[0]) +# 
optimization_method = st.session_state.get("opt_method_choice", "Maximize") +# if optimization_method == "Maximize": +# optimal_config = df.loc[df[target_metric].idxmax()] +# else: +# optimal_config = df.loc[df[target_metric].idxmin()] +# report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" + +# if "Clustering" in report_options: +# report += "## Clustering Analysis\n" +# # Example placeholder +# report += "### Clustering results were generated in the application.\n\n" + +# if "Advanced Statistics" in report_options: +# report += "## Advanced Statistics\n" +# selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) +# report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" + +# if "Anomaly Detection" in report_options: +# report += "## Anomaly Detection\n" +# anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) +# threshold = st.session_state.get("anomaly_threshold", 3.0) +# mean_val = df[anomaly_metric].mean() +# std_val = df[anomaly_metric].std() +# upper_bound = mean_val + threshold * std_val +# lower_bound = mean_val - threshold * std_val +# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] +# report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" + +# # Download the report as a text file +# st.download_button( +# label="Download Report as Text", +# data=report, +# file_name="analysis_report.txt", +# mime="text/plain", +# ) + +# # Optionally, provide the CSV report +# st.subheader("Download Descriptive Statistics") +# if st.button("Download Descriptive Statistics as CSV"): +# report_csv = df.describe().to_csv() +# st.download_button( +# label="Download CSV Report", +# data=report_csv, +# file_name="descriptive_statistics.csv", +# mime="text/csv", +# ) + +# st.info("Select the sections you want to include in the report and click 'Download Report'.") + +# # -------------------------------------------------------------------------------- +# # TAB 10: Clustering +# # -------------------------------------------------------------------------------- +# with tabs[9]: +# st.header("Clustering Analysis") +# st.write("Group similar configurations to identify patterns and clusters.") + +# cluster_columns = st.multiselect( +# "Select Columns for Clustering", +# options=config_columns + result_columns, +# default=config_columns[:3], +# help="Choose multiple columns to perform clustering.", +# key="clustering_columns_select", # Unique key +# ) + +# num_clusters = st.slider( +# "Number of Clusters", +# min_value=2, +# max_value=10, +# value=3, +# help="Define the number of clusters for K-Means.", +# key="num_clusters_slider_clustering", # Unique key +# ) + +# if len(cluster_columns) >= 2: +# if st.button("Generate Clustering", key="gen cluster"): +# clustering_data = df[cluster_columns].dropna() + +# # Standardize the data +# scaler = StandardScaler() +# clustering_data_scaled = scaler.fit_transform(clustering_data) + +# # Perform K-Means clustering +# kmeans = KMeans(n_clusters=num_clusters, random_state=42) +# clusters = kmeans.fit_predict(clustering_data_scaled) +# df["cluster"] = clusters + +# # Optional: Dimensionality Reduction for 3D Plotting +# if len(cluster_columns) > 3: +# pca = 
PCA(n_components=3) +# principal_components = pca.fit_transform(clustering_data_scaled) +# df["PC1"] = principal_components[:, 0] +# df["PC2"] = principal_components[:, 1] +# df["PC3"] = principal_components[:, 2] +# fig = px.scatter_3d( +# df, +# x="PC1", +# y="PC2", +# z="PC3", +# color="cluster", +# title="3D Scatter Plot with PCA and Clustering", +# labels={ +# "PC1": "Principal Component 1", +# "PC2": "Principal Component 2", +# "PC3": "Principal Component 3", +# }, +# template="plotly_white", +# ) +# elif len(cluster_columns) == 3: +# fig = px.scatter_3d( +# df, +# x=cluster_columns[0], +# y=cluster_columns[1], +# z=cluster_columns[2], +# color="cluster", +# title="3D Scatter Plot with Clustering", +# labels={ +# cluster_columns[0]: cluster_columns[0] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[1]: cluster_columns[1] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[2]: cluster_columns[2] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# }, +# template="plotly_white", +# ) +# else: +# fig = px.scatter( +# df, +# x=cluster_columns[0], +# y=cluster_columns[1], +# color="cluster", +# title="2D Scatter Plot with Clustering", +# labels={ +# cluster_columns[0]: cluster_columns[0] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# cluster_columns[1]: cluster_columns[1] +# .replace("config.", "") +# .replace("_", " ") +# .title(), +# }, +# template="plotly_white", +# ) + +# st.plotly_chart(fig, use_container_width=True) + +# # Cluster Centroids +# centroids = kmeans.cluster_centers_ +# centroids_df = pd.DataFrame(centroids, columns=cluster_columns) +# st.subheader("Cluster Centroids") +# st.write(centroids_df) +# else: +# st.warning("Please select at least two columns for clustering.") + +# # -------------------------------------------------------------------------------- +# # TAB 11: Advanced Statistics +# # -------------------------------------------------------------------------------- +# # -------------------------------------------------------------------------------- +# # TAB 11: Advanced Statistics +# # -------------------------------------------------------------------------------- +# with tabs[10]: +# st.header("Advanced Statistics") +# st.write("Perform advanced statistical analyses on the experiment data.") + +# selected_metric = st.selectbox( +# "Select Metric for Statistical Analysis", +# options=metrics, +# help="Choose a result metric to perform statistical tests.", +# key="sel adv", +# ) + +# st.subheader( +# f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) + +# # Display data type and missing values +# st.write(f"Data Type: {df[selected_metric].dtype}") +# st.write(f"Missing Values: {df[selected_metric].isnull().sum()}") + +# # Handle missing values +# plot_df = df.dropna(subset=[selected_metric]) + +# # Check if the selected metric is numeric +# if pd.api.types.is_numeric_dtype(plot_df[selected_metric]): +# st.write(plot_df[selected_metric].describe()) + +# # Histogram with KDE +# fig_hist = px.histogram( +# plot_df, +# x=selected_metric, +# nbins=30, +# title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# marginal="kde", +# labels={ +# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig_hist, use_container_width=True) + +# # Box Plot +# st.subheader( +# f"Box Plot for 
{selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) +# fig_box = px.box( +# plot_df, +# y=selected_metric, +# points="all", +# title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig_box, use_container_width=True) + +# # Violin Plot +# st.subheader( +# f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" +# ) +# fig_violin = px.violin( +# plot_df, +# y=selected_metric, +# box=True, +# points="all", +# title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() +# }, +# template="plotly_white", +# ) +# st.plotly_chart(fig_violin, use_container_width=True) +# else: +# st.error( +# f"The selected metric '{selected_metric}' is not numeric. Please select a numeric metric for statistical analysis." +# ) + + +# # -------------------------------------------------------------------------------- +# # TAB 12: Anomaly Detection +# # -------------------------------------------------------------------------------- +# with tabs[11]: +# st.header("Anomaly Detection") +# st.write("Identify anomalous trials based on selected metrics.") + +# anomaly_metric = st.selectbox( +# "Select Metric for Anomaly Detection", +# options=metrics, +# help="Choose a result metric to perform anomaly detection.", +# ) +# threshold = st.slider( +# "Set Anomaly Threshold (Standard Deviations)", +# min_value=1.0, +# max_value=5.0, +# value=3.0, +# step=0.5, +# help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", +# ) + +# mean_val = df[anomaly_metric].mean() +# std_val = df[anomaly_metric].std() +# upper_bound = mean_val + threshold * std_val +# lower_bound = mean_val - threshold * std_val + +# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] + +# st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") +# if not anomalies.empty: +# st.write(f"Total Anomalies Detected: {len(anomalies)}") +# st.dataframe(anomalies) + +# # Visualization: Scatter Plot Highlighting Anomalies +# fig_anomaly = px.scatter( +# df, +# x="trial_id", +# y=anomaly_metric, +# color=df.index.isin(anomalies.index), +# title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", +# labels={ +# "trial_id": "Trial ID", +# anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), +# }, +# color_discrete_map={True: "red", False: "blue"}, +# template="plotly_white", +# ) +# st.plotly_chart(fig_anomaly, use_container_width=True) +# else: +# st.success("No anomalies detected based on the current threshold.") + +# # -------------------------------------------------------------------------------- +# # TAB 13: Save Analysis Report +# # -------------------------------------------------------------------------------- +# with tabs[12]: +# st.header("Save Analysis Report") +# st.write("Download a comprehensive analysis report of your experiment.") + +# report_options = st.multiselect( +# "Select Sections to Include in the Report", +# options=[ +# "Data Overview", +# "Configurations Analysis", +# "Failure Analysis", +# "Correlation Heatmap", +# "Parallel Coordinates", +# "Config Params Scatter", +# "Top & Bottom 
Configs", +# "Optimization Suggestions", +# "Clustering", +# "Advanced Statistics", +# "Anomaly Detection", +# ], +# default=[ +# "Data Overview", +# "Configurations Analysis", +# "Failure Analysis", +# "Correlation Heatmap", +# "Top & Bottom Configs", +# "Optimization Suggestions", +# ], +# help="Choose which sections of the analysis you want to include in the report.", +# ) + +# if st.button("Download Report"): +# # Generate the report based on selected sections +# report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" + +# if "Data Overview" in report_options: +# report += "## Data Overview\n" +# report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" + +# if "Configurations Analysis" in report_options: +# report += "## Configurations Analysis\n" +# # Example: Include top configuration analysis +# if "result.metric" in df.columns: +# top_config = df.loc[ +# df["result.metric"].idxmax() +# ] # Replace 'result.metric' with actual metric +# report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" +# else: +# report += ( +# "### Configurations Analysis details were generated in the application.\n\n" +# ) + +# if "Failure Analysis" in report_options: +# report += "## Failure Analysis\n" +# failure_counts = df["status"].value_counts() +# report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" + +# if "Correlation Heatmap" in report_options: +# report += "## Correlation Heatmap\n" +# selected_columns = config_columns + result_columns # Adjust as needed +# corr_matrix = df[selected_columns].corr() +# report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" + +# if "Parallel Coordinates" in report_options: +# report += "## Parallel Coordinates\n" +# report += "### Parallel Coordinates Plot was generated in the application.\n\n" + +# if "Config Params Scatter" in report_options: +# report += "## Configuration Parameters Scatter Plot\n" +# report += "### Scatter plots were generated in the application.\n\n" + +# if "Top & Bottom Configs" in report_options: +# report += "## Top & Bottom Configurations\n" +# n_configs = st.session_state.get("n_configs_display", 5) +# tb_metric = st.session_state.get("tb_metric", metrics[0]) +# optimization_method = st.session_state.get("tb_opt_method", "Maximize") +# if optimization_method == "Maximize": +# top_configs = df.nlargest(n_configs, tb_metric) +# bottom_configs = df.nsmallest(n_configs, tb_metric) +# else: +# top_configs = df.nsmallest(n_configs, tb_metric) +# bottom_configs = df.nlargest(n_configs, tb_metric) +# report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" +# report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" + +# if "Optimization Suggestions" in report_options: +# report += "## Optimization Suggestions\n" +# target_metric = st.session_state.get("opt_target_metric", metrics[0]) +# optimization_method = st.session_state.get("opt_method_choice", "Maximize") +# if optimization_method == "Maximize": +# optimal_config = df.loc[df[target_metric].idxmax()] +# else: +# optimal_config = df.loc[df[target_metric].idxmin()] +# report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" + +# if 
"Clustering" in report_options: +# report += "## Clustering Analysis\n" +# report += "### Clustering results were generated in the application.\n\n" + +# if "Advanced Statistics" in report_options: +# report += "## Advanced Statistics\n" +# selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) +# report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" + +# if "Anomaly Detection" in report_options: +# report += "## Anomaly Detection\n" +# anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) +# threshold = st.session_state.get("anomaly_threshold", 3.0) +# mean_val = df[anomaly_metric].mean() +# std_val = df[anomaly_metric].std() +# upper_bound = mean_val + threshold * std_val +# lower_bound = mean_val - threshold * std_val +# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] +# report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" + +# # Download the report as a text file +# st.download_button( +# label="Download Report as Text", +# data=report, +# file_name="analysis_report.txt", +# mime="text/plain", +# ) + +# # Optionally, provide the CSV report +# st.subheader("Download Descriptive Statistics") +# if st.button("Download Descriptive Statistics as CSV"): +# report_csv = df.describe().to_csv() +# st.download_button( +# label="Download CSV Report", +# data=report_csv, +# file_name="descriptive_statistics.csv", +# mime="text/csv", +# ) + +# st.info("Select the sections you want to include in the report and click 'Download Report'.") + +# # -------------------------------------------------------------------------------- +# # Additional UI/UX Enhancements +# # -------------------------------------------------------------------------------- +# st.sidebar.markdown("---") +# st.sidebar.markdown("#### Tips for Better Workflow") +# st.sidebar.markdown( +# """ +# - **Start with the Dashboard** to get an overview of key metrics. +# - **Use Data Overview** to understand and filter your dataset. +# - **Configurations Analysis** helps visualize specific configuration performances. +# - **Failure Analysis** highlights trial outcomes and trends. +# - **Correlation Heatmap** and **Parallel Coordinates** allow in-depth correlation and multi-dimensional analysis. +# - **Config Params Scatter** plots relationships between configuration parameters and metrics. +# - **Top & Bottom Configs** identify the best and worst-performing configurations. +# - **Optimization Suggestions** provide insights into optimal configurations. +# - **Clustering** groups similar configurations for pattern recognition. +# - **Advanced Statistics** offers detailed statistical analyses of your metrics. +# - **Anomaly Detection** helps identify outliers and unusual trial performances. +# - **Save Analysis** lets you download a comprehensive report of your findings. 
+# """ +# ) From bf4d6e8203bd4841bb0ac394c45ea3c8ac6408be Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 05:39:17 +0000 Subject: [PATCH 5/8] adding old gui --- backend.py | 121 +++ dashboard.py | 1627 ----------------------------------------- frontend.py | 1305 +++++++++++++++++++++++++++++++++ mlos_demo_mysql.ipynb | 4 +- 4 files changed, 1428 insertions(+), 1629 deletions(-) create mode 100644 backend.py delete mode 100644 dashboard.py create mode 100644 frontend.py diff --git a/backend.py b/backend.py new file mode 100644 index 0000000..bb5bb55 --- /dev/null +++ b/backend.py @@ -0,0 +1,121 @@ +from datetime import datetime, timedelta +import time +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler +import os +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect, HTTPException +from pydantic import BaseModel +import pandas as pd +import json +from pathlib import Path +from azure.identity import DefaultAzureCredential +from mlos_bench.storage import from_config +from copy import deepcopy +import subprocess +import logging +import asyncio +from fastapi.middleware.cors import CORSMiddleware +import re +import json5 + +app = FastAPI() + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# Load the storage config and connect to the storage +try: + storage = storage = from_config(config="storage/sqlite.jsonc") +except Exception as e: + raise HTTPException(status_code=500, detail=f"Error loading storage configuration: {e}") + + +@app.get("/experiments") +def get_experiments(): + return list(storage.experiments.keys()) + + +@app.get("/experiment_results/{experiment_id}") +def get_experiment_results(experiment_id: str): + try: + exp = storage.experiments[experiment_id] + return exp.results_df.to_dict(orient="records") + except KeyError: + raise HTTPException(status_code=404, detail="Experiment not found") + + +def count_categorical_values(df: pd.DataFrame) -> str: + categorical_counts = {} + for col in df.select_dtypes(include=["object", "category"]).columns: + counts = df[col].value_counts().to_dict() + categorical_counts[col] = counts + + count_str = "Categorical Counts:\n" + for col, counts in categorical_counts.items(): + count_str += f"{col}:\n" + for value, count in counts.items(): + count_str += f" {value}: {count}\n" + + return count_str + + +# Load credentials from the JSON file +try: + with open("azure_openai_credentials.json", "r") as file: + credentials = json.load(file) +except Exception as e: + print("Error no open ai cred") + +# Try to create the AzureOpenAI client +try: + client = AzureOpenAI( + azure_endpoint=credentials["azure_endpoint"], + api_key=credentials["api_key"], + api_version=credentials["api_version"], + ) +except Exception as e: + print("Error creating AzureOpenAI client:", e) + + +class ExperimentExplanationRequest(BaseModel): + experiment_id: str + + +@app.post("/get_experiment_explanation") +def get_experiment_explanation(request: ExperimentExplanationRequest): + experiment_id = request.experiment_id + try: + exp = storage.experiments[experiment_id] + # Taking only the first 10 rows for simplicity + df = exp.results_df.tail(10) + experiment_data = df.to_dict(orient="records") + + df_head = exp.results_df.head(10) + experiment_data_head = df_head.to_dict(orient="records") + + df_des = exp.results_df.describe() + experiment_data_des = df_des.to_dict(orient="records") + + count_str = 
count_categorical_values(df) + + prompt = f"Explain the following experiment data: First 10 rows {experiment_data_head} last 10 {experiment_data} & descriptive stats {experiment_data_des} & categorical vars counts {count_str}. Give me params to complement config. params present in the data. Also explain what each param does and params for MySQL config that would complement what we have and can boost preformance if tuned. Explain which are dangreous to tune as it might fail the server. Also talk about parameters that are safe to tune. Talk about each in list format so that you are listing all information relevant to a param under its name" + + response = client.chat.completions.create( + model="gpt4o", # model = "deployment_name". + messages=[{"role": "assistant", "content": prompt}], + max_tokens=1000, + ) + + explanation = response.choices[0].message.content.strip() + print(explanation) + return {"explanation": explanation} + except KeyError: + raise HTTPException(status_code=404, detail="Experiment not found") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000, reload=True) diff --git a/dashboard.py b/dashboard.py deleted file mode 100644 index 679e2ea..0000000 --- a/dashboard.py +++ /dev/null @@ -1,1627 +0,0 @@ -import streamlit as st -import pandas as pd -import plotly.express as px -import plotly.graph_objs as go -from sklearn.cluster import KMeans -from sklearn.preprocessing import StandardScaler -from mlos_bench.storage import from_config -import logging -from typing import Dict, List, Tuple -from dataclasses import dataclass - -# ------------------------------------------------------------------------------ -# Configuration and Setup -# ------------------------------------------------------------------------------ -@dataclass -class Config: - """Application configuration settings""" - TITLE: str = "MySQL Configuration Analysis Dashboard" - DESCRIPTION: str = "Analyze and optimize MySQL database configurations" - DB_CONFIG_PATH: str = "storage/sqlite.jsonc" - THEME: str = "plotly_white" - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Initialize page configuration first -st.set_page_config(page_title=Config.TITLE, layout="wide", initial_sidebar_state="expanded") - -# ------------------------------------------------------------------------------ -# Data Loading and Processing -# ------------------------------------------------------------------------------ -@st.cache_resource -def load_data(): - """Load and cache the database storage""" - try: - storage = from_config(config=Config.DB_CONFIG_PATH) - logger.info("Successfully loaded database storage") - return storage - except Exception as e: - logger.error(f"Failed to load storage: {e}") - st.error(f"Failed to load storage: {str(e)}") - return None - -class DataProcessor: - """Handle data processing operations""" - - @staticmethod - def get_column_types(df: pd.DataFrame) -> Tuple[List[str], List[str]]: - """Extract configuration and result columns""" - config_cols = [col for col in df.columns if col.startswith("config.")] - result_cols = [col for col in df.columns if col.startswith("result.")] - return config_cols, result_cols - - @staticmethod - def calculate_stats(df: pd.DataFrame) -> Dict: - """Calculate key statistics from the dataset""" - total = len(df) - success = df["status"].value_counts().get("SUCCESS", 0) - failed = df["status"].value_counts().get("FAILED", 0) - success_rate = (success / total * 100) if total > 0 else 0 - - 
return { - "total": total, - "success": success, - "failed": failed, - "success_rate": success_rate, - "failure_rate": 100 - success_rate - } - - @staticmethod - def perform_clustering(df: pd.DataFrame, columns: List[str], n_clusters: int) -> Dict: - """Perform KMeans clustering on selected columns""" - X = df[columns].fillna(df[columns].mean()) - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - - kmeans = KMeans(n_clusters=n_clusters, random_state=42) - clusters = kmeans.fit_predict(X_scaled) - - return { - "labels": clusters, - "centroids": kmeans.cluster_centers_, - "inertia": kmeans.inertia_ - } - -class Visualizer: - """Handle data visualization""" - - @staticmethod - def plot_trial_outcomes(stats: Dict) -> go.Figure: - """Create pie chart of trial outcomes""" - return px.pie( - names=["Success", "Failed"], - values=[stats["success"], stats["failed"]], - title="Trial Outcomes", - color_discrete_map={"Success": "green", "Failed": "red"}, - template=Config.THEME - ) - - @staticmethod - def plot_metric_distribution(df: pd.DataFrame, metric: str) -> go.Figure: - """Create distribution plot for a metric""" - return px.histogram( - df, x=metric, - title=f"Distribution of {metric.replace('result.', '').replace('_', ' ').title()}", - template=Config.THEME, - marginal="box" - ) - - @staticmethod - def plot_correlation_heatmap(df: pd.DataFrame, columns: List[str]) -> go.Figure: - """Create correlation heatmap""" - corr_matrix = df[columns].corr() - return px.imshow( - corr_matrix, - title="Correlation Heatmap", - template=Config.THEME, - aspect="auto" - ) - -class Dashboard: - """Main dashboard application""" - - def __init__(self): - self.storage = load_data() - self.processor = DataProcessor() - self.visualizer = Visualizer() - - def run(self): - """Run the dashboard application""" - if not self.storage: - st.error("Failed to initialize dashboard. 
Please check the logs.") - return - - st.title(Config.TITLE) - st.markdown(Config.DESCRIPTION) - - # Sidebar for experiment selection and controls - self.setup_sidebar() - - # Main content - if "selected_experiment" in st.session_state: - self.display_experiment_analysis() - - def setup_sidebar(self): - """Setup sidebar controls""" - st.sidebar.title("Controls") - - # Experiment selection - experiments = list(self.storage.experiments.keys()) - selected = st.sidebar.selectbox( - "Select Experiment", - experiments, - key="selected_experiment" - ) - - if selected: - st.session_state.exp = self.storage.experiments[selected] - st.session_state.df = st.session_state.exp.results_df.copy() - - def display_experiment_analysis(self): - """Display the main analysis content""" - df = st.session_state.df - - # Get column types - config_cols, result_cols = self.processor.get_column_types(df) - - # Calculate statistics - stats = self.processor.calculate_stats(df) - - # Create tabs for different analyses - tabs = st.tabs([ - "Overview", - "Configuration Analysis", - "Performance Metrics", - "Clustering", - "Optimization" - ]) - - # Overview Tab - with tabs[0]: - self.display_overview(stats) - - # Configuration Analysis Tab - with tabs[1]: - self.display_config_analysis(df, config_cols) - - # Performance Metrics Tab - with tabs[2]: - self.display_performance_metrics(df, result_cols) - - # Clustering Tab - with tabs[3]: - self.display_clustering_analysis(df, config_cols) - - # Optimization Tab - with tabs[4]: - self.display_optimization(df, config_cols, result_cols) - - def display_overview(self, stats: Dict): - """Display overview statistics and charts""" - st.header("Overview") - - # Display metrics - col1, col2, col3 = st.columns(3) - col1.metric("Total Trials", stats["total"]) - col2.metric("Successful Trials", stats["success"]) - col3.metric("Failure Rate", f"{stats['failure_rate']:.1f}%") - - # Display trial outcomes pie chart - st.plotly_chart( - self.visualizer.plot_trial_outcomes(stats), - use_container_width=True - ) - - def display_config_analysis(self, df: pd.DataFrame, config_cols: List[str]): - """Display configuration analysis""" - st.header("Configuration Analysis") - - # Select configurations to analyze - selected_configs = st.multiselect( - "Select Configuration Parameters", - config_cols, - default=config_cols[:2] - ) - - if selected_configs: - # Display correlation heatmap - st.plotly_chart( - self.visualizer.plot_correlation_heatmap(df, selected_configs), - use_container_width=True - ) - - def display_performance_metrics(self, df: pd.DataFrame, result_cols: List[str]): - """Display performance metrics analysis""" - st.header("Performance Metrics") - - # Select metric to analyze - selected_metric = st.selectbox( - "Select Metric", - result_cols - ) - - if selected_metric: - # Display distribution plot - st.plotly_chart( - self.visualizer.plot_metric_distribution(df, selected_metric), - use_container_width=True - ) - - # Display summary statistics - st.write("Summary Statistics:") - st.dataframe(df[selected_metric].describe()) - - def display_clustering_analysis(self, df: pd.DataFrame, config_cols: List[str]): - """Display clustering analysis""" - st.header("Clustering Analysis") - - # Clustering controls - n_clusters = st.slider("Number of Clusters", 2, 10, 3) - selected_features = st.multiselect( - "Select Features for Clustering", - config_cols, - default=config_cols[:3] - ) - - if selected_features and st.button("Perform Clustering"): - clustering_results = 
self.processor.perform_clustering( - df, selected_features, n_clusters - ) - - # Add cluster labels to dataframe - df_cluster = df.copy() - df_cluster['Cluster'] = clustering_results['labels'] - - # Display cluster visualization - if len(selected_features) >= 2: - fig = px.scatter( - df_cluster, - x=selected_features[0], - y=selected_features[1], - color='Cluster', - title="Cluster Visualization", - template=Config.THEME - ) - st.plotly_chart(fig, use_container_width=True) - - def display_optimization(self, df: pd.DataFrame, config_cols: List[str], result_cols: List[str]): - """Display optimization suggestions""" - st.header("Configuration Optimization") - - # Select target metric - target_metric = st.selectbox( - "Select Target Metric for Optimization", - result_cols - ) - - if target_metric: - # Find best configuration - best_idx = df[target_metric].idxmax() - best_config = df.loc[best_idx, config_cols] - - st.write("### Best Performing Configuration") - st.write(f"Target Metric Value: {df.loc[best_idx, target_metric]:.2f}") - st.dataframe(pd.DataFrame([best_config])) - -def main(): - """Main entry point""" - try: - dashboard = Dashboard() - dashboard.run() - except Exception as e: - logger.error(f"Dashboard error: {e}") - st.error(f"An error occurred: {str(e)}") - -if __name__ == "__main__": - main() - -# import pandas as pd -# import seaborn as sns -# import matplotlib.pyplot as plt -# import streamlit as st -# import plotly -# import plotly.express as px -# import plotly.graph_objs as go -# from sklearn.cluster import KMeans -# from sklearn.decomposition import PCA -# from sklearn.preprocessing import StandardScaler -# from mlos_bench.storage import from_config - -# # -------------------------------------------------------------------------------- -# # Streamlit Configuration -# # -------------------------------------------------------------------------------- -# st.set_page_config( -# page_title="Azure MySQL Config Analyzer", layout="wide", initial_sidebar_state="expanded" -# ) - -# # -------------------------------------------------------------------------------- -# # Data Loading and Caching -# # -------------------------------------------------------------------------------- -# @st.cache_resource -# def load_storage(): -# """ -# Load the MLOS storage configuration for the experiments. -# This function is cached to prevent reloading on every interaction. 
-# """ -# return from_config(config="storage/sqlite.jsonc") - -# storage = load_storage() - -# # -------------------------------------------------------------------------------- -# # Sidebar - Experiment Selection and Filtering -# # -------------------------------------------------------------------------------- -# st.sidebar.title("Azure MySQL Config Analyzer") - -# # Experiment Selection -# experiment_id = st.sidebar.selectbox( -# "Select Experiment", -# options=list(storage.experiments.keys()), -# help="Choose the experiment you want to analyze.", -# ) - -# # Load the selected experiment -# exp = storage.experiments[experiment_id] -# df = exp.results_df.copy() - -# # Extract configuration and result columns -# config_columns = [col for col in df.columns if col.startswith("config.")] -# result_columns = [col for col in df.columns if col.startswith("result.")] -# metrics = result_columns - -# # -------------------------------------------------------------------------------- -# # Main Title and Description -# # -------------------------------------------------------------------------------- -# st.title(f"Azure MySQL Experiment: {experiment_id}") -# st.write(f"**Description**: {exp.description}") - -# # -------------------------------------------------------------------------------- -# # Tabs Creation -# # -------------------------------------------------------------------------------- -# tabs = st.tabs( -# [ -# "Dashboard", -# "Data Overview", -# "Configurations Analysis", -# "Failure Analysis", -# "Correlation Heatmap", -# "Parallel Coordinates", -# "Config Params Scatter", -# "Top & Bottom Configs", -# "Optimization Suggestions", -# "Clustering", -# "Advanced Statistics", -# "Anomaly Detection", -# "Save Analysis", -# ] -# ) - -# # -------------------------------------------------------------------------------- -# # TAB 1: Dashboard -# # -------------------------------------------------------------------------------- -# with tabs[0]: -# st.header("Dashboard") -# st.write("### Key Metrics Overview") - -# # Calculate key metrics -# total_trials = len(df) -# success_trials = df["status"].value_counts().get("SUCCESS", 0) -# failure_trials = df["status"].value_counts().get("FAILED", 0) -# success_rate = (success_trials / total_trials) * 100 if total_trials > 0 else 0 -# failure_rate = (failure_trials / total_trials) * 100 if total_trials > 0 else 0 - -# # Display key metrics -# col1, col2, col3 = st.columns(3) -# col1.metric("Total Trials", total_trials) -# col2.metric("Successful Trials", success_trials) -# col3.metric("Failure Rate (%)", f"{failure_rate:.2f}") - -# # Visualization: Success vs Failure -# fig = px.pie( -# names=["Success", "Failure"], -# values=[success_trials, failure_trials], -# title="Trial Outcomes", -# color=["Success", "Failure"], -# color_discrete_map={"Success": "green", "Failure": "red"}, -# ) -# st.plotly_chart(fig, use_container_width=True) - -# # Visualization: Top 5 Metrics -# st.write("### Top 5 Metrics") -# top_metrics = df[result_columns].mean().sort_values(ascending=False).head(5) -# fig_metrics = px.bar( -# top_metrics, -# x=top_metrics.index.str.replace("result.", "").str.replace("_", " ").str.title(), -# y=top_metrics.values, -# labels={"x": "Metric", "y": "Average Value"}, -# title="Top 5 Average Metrics", -# color=top_metrics.values, -# color_continuous_scale="Blues", -# ) -# st.plotly_chart(fig_metrics, use_container_width=True) - -# # -------------------------------------------------------------------------------- -# # TAB 2: Data Overview -# # 
-------------------------------------------------------------------------------- -# with tabs[1]: -# st.header("Data Overview") -# st.write("Explore experiment data and key statistics.") - -# # Data Filtering -# with st.expander("Filter Data"): -# st.subheader("Apply Filters") -# trial_id_filter = st.text_input( -# "Filter by Trial ID (comma-separated)", help="Enter trial IDs separated by commas." -# ) -# status_filter = st.multiselect( -# "Filter by Status", -# options=df["status"].unique(), -# default=df["status"].unique(), -# help="Select one or more statuses to filter the trials.", -# ) -# config_filter = st.multiselect( -# "Filter by Configuration ID", -# options=df["tunable_config_id"].unique(), -# default=df["tunable_config_id"].unique(), -# help="Select one or more configuration IDs to filter the trials.", -# ) - -# if st.button("Apply Filters"): -# filtered_df = df.copy() -# if trial_id_filter: -# try: -# trial_ids = [ -# int(tid.strip()) -# for tid in trial_id_filter.split(",") -# if tid.strip().isdigit() -# ] -# filtered_df = filtered_df[filtered_df["trial_id"].isin(trial_ids)] -# except ValueError: -# st.error("Please enter valid trial IDs separated by commas.") -# if status_filter: -# filtered_df = filtered_df[filtered_df["status"].isin(status_filter)] -# if config_filter: -# filtered_df = filtered_df[filtered_df["tunable_config_id"].isin(config_filter)] -# st.session_state.filtered_df = filtered_df -# st.success("Filters applied successfully!") - -# # Display filtered data or original data -# if "filtered_df" in st.session_state: -# display_df = st.session_state.filtered_df -# else: -# display_df = df - -# if st.checkbox("Show Data Table"): -# st.dataframe(display_df) -# st.write("### Descriptive Statistics:") -# st.write(display_df.describe()) - -# # -------------------------------------------------------------------------------- -# # TAB 3: Configurations Analysis -# # -------------------------------------------------------------------------------- -# with tabs[2]: -# st.header("Configurations Analysis") -# st.write("Visualize performance metrics across different configurations.") - -# config_id = st.selectbox( -# "Select Configuration ID", -# options=df["tunable_config_id"].unique(), -# help="Choose a configuration to analyze its performance over trials.", -# ) -# metric = st.selectbox( -# "Select Metric", options=metrics, help="Choose a performance metric to visualize." 
-# ) - -# config_data = df[df["tunable_config_id"] == config_id] -# fig = px.line( -# config_data, -# x="trial_id", -# y=metric, -# title=f"{metric.replace('result.', '').replace('_', ' ').title()} over Trials for Configuration {config_id}", -# markers=True, -# labels={ -# "trial_id": "Trial ID", -# metric: metric.replace("result.", "").replace("_", " ").title(), -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig, use_container_width=True) - -# # Additional Insights: Moving Average -# window_size = st.slider( -# "Select Moving Average Window Size", -# 1, -# 10, -# 3, -# help="Smooth the metric by applying a moving average.", -# ) -# config_data[f"{metric}_MA"] = config_data[metric].rolling(window=window_size).mean() -# fig_ma = px.line( -# config_data, -# x="trial_id", -# y=f"{metric}_MA", -# title=f"{metric.replace('result.', '').replace('_', ' ').title()} - Moving Average (Window Size={window_size})", -# markers=True, -# labels={ -# "trial_id": "Trial ID", -# f"{metric}_MA": f"{metric.replace('result.', '').replace('_', ' ').title()} (MA)", -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig_ma, use_container_width=True) - -# # -------------------------------------------------------------------------------- -# # TAB 4: Failure Analysis -# # -------------------------------------------------------------------------------- -# with tabs[3]: -# st.header("Failure Analysis") -# st.write("Analyze failure rates and trends across trials.") - -# if "status" in df.columns: -# # Failure Rate Distribution -# st.subheader("Failure Rate Distribution") -# failure_counts = df["status"].value_counts() -# fig_pie = px.pie( -# values=failure_counts.values, -# names=failure_counts.index, -# title="Failure Rate Distribution", -# color=failure_counts.index, -# color_discrete_map={"FAILED": "red", "SUCCESS": "green"}, -# ) -# st.plotly_chart(fig_pie, use_container_width=True) - -# # Failure Rate Trend Over Trials -# st.subheader("Failure Rate Trend Over Trials") -# failure_rate_trend = ( -# df.groupby("trial_id")["status"] -# .apply(lambda x: (x == "FAILED").mean() * 100) -# .reset_index() -# ) -# failure_rate_trend.columns = ["Trial ID", "Failure Rate (%)"] -# fig_line = px.line( -# failure_rate_trend, -# x="Trial ID", -# y="Failure Rate (%)", -# title="Failure Rate Trend Over Trials", -# markers=True, -# labels={"Trial ID": "Trial ID", "Failure Rate (%)": "Failure Rate (%)"}, -# template="plotly_white", -# ) -# st.plotly_chart(fig_line, use_container_width=True) -# else: -# st.info("No 'status' column found in the dataset.") - -# # -------------------------------------------------------------------------------- -# # TAB 5: Correlation Heatmap -# # -------------------------------------------------------------------------------- -# with tabs[4]: -# st.header("Correlation Heatmap") -# st.write("Visualize correlations between selected configuration and result metrics.") - -# selected_columns = st.multiselect( -# "Select Columns for Heatmap", -# options=config_columns + result_columns, -# default=config_columns[:2] + result_columns[:2], -# help="Choose multiple columns to analyze their correlation.", -# ) - -# if st.button("Generate Heatmap"): -# if selected_columns: -# corr_matrix = df[selected_columns].corr() -# fig = px.imshow( -# corr_matrix, -# text_auto=True, -# color_continuous_scale="Viridis", -# title="Correlation Heatmap", -# labels={"color": "Correlation Coefficient"}, -# ) -# st.plotly_chart(fig, use_container_width=True) -# else: -# st.warning("Please select at least one column to 
generate the heatmap.") -# else: -# st.info("Select columns and click 'Generate Heatmap' to visualize correlations.") - -# # -------------------------------------------------------------------------------- -# # TAB 6: Parallel Coordinates -# # -------------------------------------------------------------------------------- -# with tabs[5]: -# st.header("Parallel Coordinates Plot") -# st.write( -# "Explore multi-dimensional relationships between configuration parameters and metrics." -# ) - -# parallel_columns = st.multiselect( -# "Select Columns for Parallel Plot", -# options=config_columns + result_columns, -# default=config_columns[:3] + result_columns[:2], -# help="Choose multiple columns to include in the parallel coordinates plot.", -# ) - -# if parallel_columns: -# color_metric = st.selectbox( -# "Select Metric for Coloring", -# options=result_columns, -# help="Choose a result metric to color-code the parallel coordinates.", -# ) -# fig = px.parallel_coordinates( -# df, -# dimensions=parallel_columns, -# color=color_metric, -# color_continuous_scale=px.colors.diverging.Tealrose, -# title="Parallel Coordinates Plot", -# labels={ -# col: col.replace("config.", "").replace("_", " ").title() -# for col in parallel_columns -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig, use_container_width=True) -# else: -# st.info("Select columns to generate the parallel coordinates plot.") - -# # -------------------------------------------------------------------------------- -# # TAB 7: Config Params Scatter -# # -------------------------------------------------------------------------------- -# with tabs[6]: -# st.header("Configuration Parameters Scatter Plot") -# st.write( -# "Analyze the relationship between multiple configuration parameters and a selected result metric." 
-# ) - -# if not config_columns: -# st.warning("No configuration parameters available in the dataset.") -# elif not metrics: -# st.warning("No result metrics available in the dataset.") -# else: -# # Select multiple configuration parameters -# selected_config_params = st.multiselect( -# "Select Configuration Parameters", -# options=config_columns, -# default=config_columns[:2], -# help="Choose one or more configuration parameters to analyze.", -# ) - -# # Select one result metric -# selected_result_metric = st.selectbox( -# "Select Result Metric", -# options=metrics, -# help="Choose a result metric to analyze against the selected configuration parameters.", -# ) - -# if selected_config_params: -# # Determine layout based on number of selected parameters -# plots_per_row = 2 -# num_plots = len(selected_config_params) -# num_rows = (num_plots + plots_per_row - 1) // plots_per_row - -# for row in range(num_rows): -# cols = st.columns(plots_per_row) -# for i in range(plots_per_row): -# plot_index = row * plots_per_row + i -# if plot_index < num_plots: -# config_param = selected_config_params[plot_index] -# with cols[i]: -# fig = px.scatter( -# df, -# x=config_param, -# y=selected_result_metric, -# color="tunable_config_id", -# title=f"{config_param.replace('config.', '').replace('_', ' ').title()} vs {selected_result_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# config_param: config_param.replace("config.", "") -# .replace("_", " ") -# .title(), -# selected_result_metric: selected_result_metric.replace( -# "result.", "" -# ) -# .replace("_", " ") -# .title(), -# }, -# hover_data=["trial_id", "tunable_config_id"], -# trendline="ols", -# template="plotly_white", -# ) - -# st.plotly_chart(fig, use_container_width=True) - -# # Calculate and display the correlation coefficient -# corr_coeff = ( -# df[[config_param, selected_result_metric]].corr().iloc[0, 1] -# ) -# st.markdown(f"**Correlation Coefficient:** {corr_coeff:.2f}") -# else: -# st.info( -# "Please select at least one configuration parameter to generate scatter plots." -# ) - -# # -------------------------------------------------------------------------------- -# # TAB 8: Top & Bottom Configurations -# # -------------------------------------------------------------------------------- -# with tabs[7]: -# st.header("Top and Bottom Configurations") -# st.write( -# "Identify configurations with the best and worst performance based on selected metrics." 
-# ) - -# n_configs = st.slider( -# "Number of Configurations to Display", -# min_value=1, -# max_value=10, -# value=5, -# help="Select how many top and bottom configurations to display.", -# ) - -# # Select metric for ranking -# tb_metric = st.selectbox( -# "Select Metric for Ranking", -# options=metrics, -# index=0, -# key="tb_metric", -# help="Choose a metric to rank configurations.", -# ) -# optimization_method = st.radio( -# "Select Optimization Method", -# ["Maximize", "Minimize"], -# index=0, -# key="tb_opt_method", -# help="Choose whether to find configurations that maximize or minimize the selected metric.", -# ) - -# if not df.empty: -# if optimization_method == "Maximize": -# top_configs = df.nlargest(n_configs, tb_metric) -# bottom_configs = df.nsmallest(n_configs, tb_metric) -# else: -# top_configs = df.nsmallest(n_configs, tb_metric) -# bottom_configs = df.nlargest(n_configs, tb_metric) - -# st.subheader("Top Configurations") -# st.dataframe(top_configs) - -# st.subheader("Bottom Configurations") -# st.dataframe(bottom_configs) -# else: -# st.warning("No data available to identify top/bottom configurations.") - -# # -------------------------------------------------------------------------------- -# # TAB 9: Optimization Suggestions -# # -------------------------------------------------------------------------------- -# with tabs[8]: -# st.header("Optimization Suggestions") -# st.write("Discover optimal configurations based on selected performance metrics.") - -# target_metric = st.selectbox( -# "Select Metric for Optimization", -# options=metrics, -# index=0, -# key="opt_target_metric", -# help="Choose a performance metric to optimize.", -# ) -# optimization_method = st.radio( -# "Select Optimization Method", -# ["Maximize", "Minimize"], -# index=0, -# key="opt_method_choice", -# help="Choose whether to maximize or minimize the selected metric.", -# ) - -# if not df.empty: -# if optimization_method == "Maximize": -# optimal_config = df.loc[df[target_metric].idxmax()] -# else: -# optimal_config = df.loc[df[target_metric].idxmin()] - -# st.write( -# f"**Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):**" -# ) -# st.json(optimal_config[config_columns].to_dict()) -# else: -# st.warning("No data available for optimization.") - -# # -------------------------------------------------------------------------------- -# # TAB 10: Clustering -# # -------------------------------------------------------------------------------- -# with tabs[9]: -# st.header("Clustering Analysis") -# st.write("Group similar configurations to identify patterns and clusters.") - -# cluster_columns = st.multiselect( -# "Select Columns for Clustering", -# options=config_columns + result_columns, -# default=config_columns[:3], -# help="Choose multiple columns to perform clustering.", -# ) -# num_clusters = st.slider( -# "Number of Clusters", -# min_value=2, -# max_value=10, -# value=3, -# help="Define the number of clusters for K-Means.", -# ) - -# if len(cluster_columns) >= 2: -# if st.button("Generate Clustering"): -# clustering_data = df[cluster_columns].dropna() - -# # Standardize the data -# scaler = StandardScaler() -# clustering_data_scaled = scaler.fit_transform(clustering_data) - -# # Perform K-Means clustering -# kmeans = KMeans(n_clusters=num_clusters, random_state=42) -# clusters = kmeans.fit_predict(clustering_data_scaled) -# df["cluster"] = clusters - -# # Optional: Dimensionality Reduction for 3D Plotting -# if len(cluster_columns) > 
3: -# pca = PCA(n_components=3) -# principal_components = pca.fit_transform(clustering_data_scaled) -# df["PC1"] = principal_components[:, 0] -# df["PC2"] = principal_components[:, 1] -# df["PC3"] = principal_components[:, 2] -# fig = px.scatter_3d( -# df, -# x="PC1", -# y="PC2", -# z="PC3", -# color="cluster", -# title="3D Scatter Plot with PCA and Clustering", -# labels={ -# "PC1": "Principal Component 1", -# "PC2": "Principal Component 2", -# "PC3": "Principal Component 3", -# }, -# template="plotly_white", -# ) -# elif len(cluster_columns) == 3: -# fig = px.scatter_3d( -# df, -# x=cluster_columns[0], -# y=cluster_columns[1], -# z=cluster_columns[2], -# color="cluster", -# title="3D Scatter Plot with Clustering", -# labels={ -# cluster_columns[0]: cluster_columns[0] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[1]: cluster_columns[1] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[2]: cluster_columns[2] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# }, -# template="plotly_white", -# ) -# else: -# fig = px.scatter( -# df, -# x=cluster_columns[0], -# y=cluster_columns[1], -# color="cluster", -# title="2D Scatter Plot with Clustering", -# labels={ -# cluster_columns[0]: cluster_columns[0] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[1]: cluster_columns[1] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# }, -# template="plotly_white", -# ) - -# st.plotly_chart(fig, use_container_width=True) - -# # Cluster Centroids -# centroids = kmeans.cluster_centers_ -# centroids_df = pd.DataFrame(centroids, columns=cluster_columns) -# st.subheader("Cluster Centroids") -# st.write(centroids_df) -# else: -# st.warning("Please select at least two columns for clustering.") - -# # -------------------------------------------------------------------------------- -# # TAB 10: Advanced Statistics -# # -------------------------------------------------------------------------------- -# with tabs[10]: -# st.header("Advanced Statistics") -# st.write("Perform advanced statistical analyses on the experiment data.") - -# # Select Metric for Statistical Analysis -# selected_metric = st.selectbox( -# "Select Metric for Statistical Analysis", -# options=metrics, -# help="Choose a result metric to perform statistical tests.", -# ) - -# # Debugging: Display selected_metric and its type -# st.write(f"**Selected Metric:** {selected_metric}") -# st.write(f"**Selected Metric Type:** {df[selected_metric].dtype}") - -# # Check if the selected metric is numeric -# if pd.api.types.is_numeric_dtype(df[selected_metric]): -# st.subheader( -# f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) -# st.write(df[selected_metric].describe()) - -# # Define the template -# template_value = "plotly_white" -# st.write(f"**Template Type:** {type(template_value)}, **Value:** {template_value}") - -# # Histogram with KDE -# try: -# fig_hist = px.histogram( -# df, -# x=selected_metric, -# nbins=30, -# title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# marginal="kde", -# labels={ -# selected_metric: selected_metric.replace("result.", "") -# .replace("_", " ") -# .title() -# }, -# template=template_value, # Ensure this is a string -# ) -# st.plotly_chart(fig_hist, use_container_width=True) -# except Exception as e: -# st.error(f"An error occurred while generating the histogram: {e}") - -# # Box Plot -# st.subheader( -# f"Box 
Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) -# try: -# fig_box = px.box( -# df, -# y=selected_metric, -# points="all", -# title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# selected_metric: selected_metric.replace("result.", "") -# .replace("_", " ") -# .title() -# }, -# template=template_value, # Ensure this is a string -# ) -# st.plotly_chart(fig_box, use_container_width=True) -# except Exception as e: -# st.error(f"An error occurred while generating the box plot: {e}") - -# # Violin Plot -# st.subheader( -# f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) -# try: -# fig_violin = px.violin( -# df, -# y=selected_metric, -# box=True, -# points="all", -# title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# selected_metric: selected_metric.replace("result.", "") -# .replace("_", " ") -# .title() -# }, -# template=template_value, # Ensure this is a string -# ) -# st.plotly_chart(fig_violin, use_container_width=True) -# except Exception as e: -# st.error(f"An error occurred while generating the violin plot: {e}") -# else: -# st.warning( -# f"The selected metric '{selected_metric}' is not numeric and cannot be plotted." -# ) - -# # Display Plotly Version for Debugging -# st.subheader("Plotly Version") -# st.write(f"Plotly version: {plotly.__version__}") - -# # Optional: Display the selected template -# st.subheader("Template Information") -# st.write(f"Selected Template: {template_value}") - - -# # -------------------------------------------------------------------------------- -# # TAB 12: Anomaly Detection -# # -------------------------------------------------------------------------------- -# with tabs[11]: -# st.header("Anomaly Detection") -# st.write("Identify anomalous trials based on selected metrics.") - -# anomaly_metric = st.selectbox( -# "Select Metric for Anomaly Detection", -# options=metrics, -# help="Choose a result metric to perform anomaly detection.", -# ) -# threshold = st.slider( -# "Set Anomaly Threshold (Standard Deviations)", -# min_value=1.0, -# max_value=5.0, -# value=3.0, -# step=0.5, -# help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", -# ) - -# mean_val = df[anomaly_metric].mean() -# std_val = df[anomaly_metric].std() -# upper_bound = mean_val + threshold * std_val -# lower_bound = mean_val - threshold * std_val - -# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - -# st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") -# if not anomalies.empty: -# st.write(f"Total Anomalies Detected: {len(anomalies)}") -# st.dataframe(anomalies) - -# # Visualization: Scatter Plot Highlighting Anomalies -# fig_anomaly = px.scatter( -# df, -# x="trial_id", -# y=anomaly_metric, -# color=df.index.isin(anomalies.index), -# title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# "trial_id": "Trial ID", -# anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), -# }, -# color_discrete_map={True: "red", False: "blue"}, -# template="plotly_white", -# ) -# st.plotly_chart(fig_anomaly, use_container_width=True) -# else: -# st.success("No anomalies detected based on the current threshold.") - -# # -------------------------------------------------------------------------------- -# # 
TAB 13: Save Analysis Report -# # -------------------------------------------------------------------------------- -# with tabs[12]: -# st.header("Save Analysis Report") -# st.write("Download a comprehensive analysis report of your experiment.") - -# report_options = st.multiselect( -# "Select Sections to Include in the Report", -# options=[ -# "Data Overview", -# "Configurations Analysis", -# "Failure Analysis", -# "Correlation Heatmap", -# "Parallel Coordinates", -# "Config Params Scatter", -# "Top & Bottom Configs", -# "Optimization Suggestions", -# "Clustering", -# "Advanced Statistics", -# "Anomaly Detection", -# ], -# default=[ -# "Data Overview", -# "Configurations Analysis", -# "Failure Analysis", -# "Correlation Heatmap", -# "Top & Bottom Configs", -# "Optimization Suggestions", -# ], -# help="Choose which sections of the analysis you want to include in the report.", -# ) - -# if st.button("Download Report"): -# # Generate the report based on selected sections -# report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" - -# if "Data Overview" in report_options: -# report += "## Data Overview\n" -# report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" - -# if "Configurations Analysis" in report_options: -# report += "## Configurations Analysis\n" -# # Example: Include top configuration analysis -# top_config = df.loc[ -# df["result.metric"].idxmax() -# ] # Replace 'result.metric' with actual metric if needed -# report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" - -# if "Failure Analysis" in report_options: -# report += "## Failure Analysis\n" -# failure_counts = df["status"].value_counts() -# report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" - -# if "Correlation Heatmap" in report_options: -# report += "## Correlation Heatmap\n" -# selected_columns = config_columns + result_columns # Adjust as needed -# corr_matrix = df[selected_columns].corr() -# report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" - -# if "Parallel Coordinates" in report_options: -# report += "## Parallel Coordinates\n" -# # Example placeholder -# report += "### Parallel Coordinates Plot was generated in the application.\n\n" - -# if "Config Params Scatter" in report_options: -# report += "## Configuration Parameters Scatter Plot\n" -# # Example placeholder -# report += "### Scatter plots were generated in the application.\n\n" - -# if "Top & Bottom Configs" in report_options: -# report += "## Top & Bottom Configurations\n" -# n_configs = st.session_state.get("n_configs_display", 5) -# tb_metric = st.session_state.get("tb_metric", metrics[0]) -# optimization_method = st.session_state.get("tb_opt_method", "Maximize") -# if optimization_method == "Maximize": -# top_configs = df.nlargest(n_configs, tb_metric) -# bottom_configs = df.nsmallest(n_configs, tb_metric) -# else: -# top_configs = df.nsmallest(n_configs, tb_metric) -# bottom_configs = df.nlargest(n_configs, tb_metric) -# report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" -# report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" - -# if "Optimization Suggestions" in report_options: -# report += "## Optimization Suggestions\n" -# target_metric = st.session_state.get("opt_target_metric", metrics[0]) -# 
optimization_method = st.session_state.get("opt_method_choice", "Maximize") -# if optimization_method == "Maximize": -# optimal_config = df.loc[df[target_metric].idxmax()] -# else: -# optimal_config = df.loc[df[target_metric].idxmin()] -# report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" - -# if "Clustering" in report_options: -# report += "## Clustering Analysis\n" -# # Example placeholder -# report += "### Clustering results were generated in the application.\n\n" - -# if "Advanced Statistics" in report_options: -# report += "## Advanced Statistics\n" -# selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) -# report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" - -# if "Anomaly Detection" in report_options: -# report += "## Anomaly Detection\n" -# anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) -# threshold = st.session_state.get("anomaly_threshold", 3.0) -# mean_val = df[anomaly_metric].mean() -# std_val = df[anomaly_metric].std() -# upper_bound = mean_val + threshold * std_val -# lower_bound = mean_val - threshold * std_val -# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] -# report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" - -# # Download the report as a text file -# st.download_button( -# label="Download Report as Text", -# data=report, -# file_name="analysis_report.txt", -# mime="text/plain", -# ) - -# # Optionally, provide the CSV report -# st.subheader("Download Descriptive Statistics") -# if st.button("Download Descriptive Statistics as CSV"): -# report_csv = df.describe().to_csv() -# st.download_button( -# label="Download CSV Report", -# data=report_csv, -# file_name="descriptive_statistics.csv", -# mime="text/csv", -# ) - -# st.info("Select the sections you want to include in the report and click 'Download Report'.") - -# # -------------------------------------------------------------------------------- -# # TAB 10: Clustering -# # -------------------------------------------------------------------------------- -# with tabs[9]: -# st.header("Clustering Analysis") -# st.write("Group similar configurations to identify patterns and clusters.") - -# cluster_columns = st.multiselect( -# "Select Columns for Clustering", -# options=config_columns + result_columns, -# default=config_columns[:3], -# help="Choose multiple columns to perform clustering.", -# key="clustering_columns_select", # Unique key -# ) - -# num_clusters = st.slider( -# "Number of Clusters", -# min_value=2, -# max_value=10, -# value=3, -# help="Define the number of clusters for K-Means.", -# key="num_clusters_slider_clustering", # Unique key -# ) - -# if len(cluster_columns) >= 2: -# if st.button("Generate Clustering", key="gen cluster"): -# clustering_data = df[cluster_columns].dropna() - -# # Standardize the data -# scaler = StandardScaler() -# clustering_data_scaled = scaler.fit_transform(clustering_data) - -# # Perform K-Means clustering -# kmeans = KMeans(n_clusters=num_clusters, random_state=42) -# clusters = kmeans.fit_predict(clustering_data_scaled) -# df["cluster"] = clusters - -# # Optional: Dimensionality Reduction for 3D Plotting -# if len(cluster_columns) > 3: -# pca = 
PCA(n_components=3) -# principal_components = pca.fit_transform(clustering_data_scaled) -# df["PC1"] = principal_components[:, 0] -# df["PC2"] = principal_components[:, 1] -# df["PC3"] = principal_components[:, 2] -# fig = px.scatter_3d( -# df, -# x="PC1", -# y="PC2", -# z="PC3", -# color="cluster", -# title="3D Scatter Plot with PCA and Clustering", -# labels={ -# "PC1": "Principal Component 1", -# "PC2": "Principal Component 2", -# "PC3": "Principal Component 3", -# }, -# template="plotly_white", -# ) -# elif len(cluster_columns) == 3: -# fig = px.scatter_3d( -# df, -# x=cluster_columns[0], -# y=cluster_columns[1], -# z=cluster_columns[2], -# color="cluster", -# title="3D Scatter Plot with Clustering", -# labels={ -# cluster_columns[0]: cluster_columns[0] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[1]: cluster_columns[1] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[2]: cluster_columns[2] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# }, -# template="plotly_white", -# ) -# else: -# fig = px.scatter( -# df, -# x=cluster_columns[0], -# y=cluster_columns[1], -# color="cluster", -# title="2D Scatter Plot with Clustering", -# labels={ -# cluster_columns[0]: cluster_columns[0] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# cluster_columns[1]: cluster_columns[1] -# .replace("config.", "") -# .replace("_", " ") -# .title(), -# }, -# template="plotly_white", -# ) - -# st.plotly_chart(fig, use_container_width=True) - -# # Cluster Centroids -# centroids = kmeans.cluster_centers_ -# centroids_df = pd.DataFrame(centroids, columns=cluster_columns) -# st.subheader("Cluster Centroids") -# st.write(centroids_df) -# else: -# st.warning("Please select at least two columns for clustering.") - -# # -------------------------------------------------------------------------------- -# # TAB 11: Advanced Statistics -# # -------------------------------------------------------------------------------- -# # -------------------------------------------------------------------------------- -# # TAB 11: Advanced Statistics -# # -------------------------------------------------------------------------------- -# with tabs[10]: -# st.header("Advanced Statistics") -# st.write("Perform advanced statistical analyses on the experiment data.") - -# selected_metric = st.selectbox( -# "Select Metric for Statistical Analysis", -# options=metrics, -# help="Choose a result metric to perform statistical tests.", -# key="sel adv", -# ) - -# st.subheader( -# f"Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) - -# # Display data type and missing values -# st.write(f"Data Type: {df[selected_metric].dtype}") -# st.write(f"Missing Values: {df[selected_metric].isnull().sum()}") - -# # Handle missing values -# plot_df = df.dropna(subset=[selected_metric]) - -# # Check if the selected metric is numeric -# if pd.api.types.is_numeric_dtype(plot_df[selected_metric]): -# st.write(plot_df[selected_metric].describe()) - -# # Histogram with KDE -# fig_hist = px.histogram( -# plot_df, -# x=selected_metric, -# nbins=30, -# title=f"Distribution of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# marginal="kde", -# labels={ -# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig_hist, use_container_width=True) - -# # Box Plot -# st.subheader( -# f"Box Plot for 
{selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) -# fig_box = px.box( -# plot_df, -# y=selected_metric, -# points="all", -# title=f"Box Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig_box, use_container_width=True) - -# # Violin Plot -# st.subheader( -# f"Violin Plot for {selected_metric.replace('result.', '').replace('_', ' ').title()}" -# ) -# fig_violin = px.violin( -# plot_df, -# y=selected_metric, -# box=True, -# points="all", -# title=f"Violin Plot of {selected_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# selected_metric: selected_metric.replace("result.", "").replace("_", " ").title() -# }, -# template="plotly_white", -# ) -# st.plotly_chart(fig_violin, use_container_width=True) -# else: -# st.error( -# f"The selected metric '{selected_metric}' is not numeric. Please select a numeric metric for statistical analysis." -# ) - - -# # -------------------------------------------------------------------------------- -# # TAB 12: Anomaly Detection -# # -------------------------------------------------------------------------------- -# with tabs[11]: -# st.header("Anomaly Detection") -# st.write("Identify anomalous trials based on selected metrics.") - -# anomaly_metric = st.selectbox( -# "Select Metric for Anomaly Detection", -# options=metrics, -# help="Choose a result metric to perform anomaly detection.", -# ) -# threshold = st.slider( -# "Set Anomaly Threshold (Standard Deviations)", -# min_value=1.0, -# max_value=5.0, -# value=3.0, -# step=0.5, -# help="Define how many standard deviations away from the mean a data point should be to be considered an anomaly.", -# ) - -# mean_val = df[anomaly_metric].mean() -# std_val = df[anomaly_metric].std() -# upper_bound = mean_val + threshold * std_val -# lower_bound = mean_val - threshold * std_val - -# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] - -# st.subheader(f"Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}") -# if not anomalies.empty: -# st.write(f"Total Anomalies Detected: {len(anomalies)}") -# st.dataframe(anomalies) - -# # Visualization: Scatter Plot Highlighting Anomalies -# fig_anomaly = px.scatter( -# df, -# x="trial_id", -# y=anomaly_metric, -# color=df.index.isin(anomalies.index), -# title=f"Anomaly Detection in {anomaly_metric.replace('result.', '').replace('_', ' ').title()}", -# labels={ -# "trial_id": "Trial ID", -# anomaly_metric: anomaly_metric.replace("result.", "").replace("_", " ").title(), -# }, -# color_discrete_map={True: "red", False: "blue"}, -# template="plotly_white", -# ) -# st.plotly_chart(fig_anomaly, use_container_width=True) -# else: -# st.success("No anomalies detected based on the current threshold.") - -# # -------------------------------------------------------------------------------- -# # TAB 13: Save Analysis Report -# # -------------------------------------------------------------------------------- -# with tabs[12]: -# st.header("Save Analysis Report") -# st.write("Download a comprehensive analysis report of your experiment.") - -# report_options = st.multiselect( -# "Select Sections to Include in the Report", -# options=[ -# "Data Overview", -# "Configurations Analysis", -# "Failure Analysis", -# "Correlation Heatmap", -# "Parallel Coordinates", -# "Config Params Scatter", -# "Top & Bottom 
Configs", -# "Optimization Suggestions", -# "Clustering", -# "Advanced Statistics", -# "Anomaly Detection", -# ], -# default=[ -# "Data Overview", -# "Configurations Analysis", -# "Failure Analysis", -# "Correlation Heatmap", -# "Top & Bottom Configs", -# "Optimization Suggestions", -# ], -# help="Choose which sections of the analysis you want to include in the report.", -# ) - -# if st.button("Download Report"): -# # Generate the report based on selected sections -# report = f"# Azure MySQL Config Analyzer Report\n\n## Experiment: {experiment_id}\n\n**Description:** {exp.description}\n\n" - -# if "Data Overview" in report_options: -# report += "## Data Overview\n" -# report += f"### Descriptive Statistics\n{df.describe().to_markdown()}\n\n" - -# if "Configurations Analysis" in report_options: -# report += "## Configurations Analysis\n" -# # Example: Include top configuration analysis -# if "result.metric" in df.columns: -# top_config = df.loc[ -# df["result.metric"].idxmax() -# ] # Replace 'result.metric' with actual metric -# report += f"### Optimal Configuration\n{top_config[config_columns].to_dict()}\n\n" -# else: -# report += ( -# "### Configurations Analysis details were generated in the application.\n\n" -# ) - -# if "Failure Analysis" in report_options: -# report += "## Failure Analysis\n" -# failure_counts = df["status"].value_counts() -# report += f"### Failure Rate Distribution\n{failure_counts.to_dict()}\n\n" - -# if "Correlation Heatmap" in report_options: -# report += "## Correlation Heatmap\n" -# selected_columns = config_columns + result_columns # Adjust as needed -# corr_matrix = df[selected_columns].corr() -# report += f"### Correlation Matrix\n{corr_matrix.to_markdown()}\n\n" - -# if "Parallel Coordinates" in report_options: -# report += "## Parallel Coordinates\n" -# report += "### Parallel Coordinates Plot was generated in the application.\n\n" - -# if "Config Params Scatter" in report_options: -# report += "## Configuration Parameters Scatter Plot\n" -# report += "### Scatter plots were generated in the application.\n\n" - -# if "Top & Bottom Configs" in report_options: -# report += "## Top & Bottom Configurations\n" -# n_configs = st.session_state.get("n_configs_display", 5) -# tb_metric = st.session_state.get("tb_metric", metrics[0]) -# optimization_method = st.session_state.get("tb_opt_method", "Maximize") -# if optimization_method == "Maximize": -# top_configs = df.nlargest(n_configs, tb_metric) -# bottom_configs = df.nsmallest(n_configs, tb_metric) -# else: -# top_configs = df.nsmallest(n_configs, tb_metric) -# bottom_configs = df.nlargest(n_configs, tb_metric) -# report += f"### Top {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{top_configs.to_markdown()}\n\n" -# report += f"### Bottom {n_configs} Configurations Based on {tb_metric.replace('result.', '').replace('_', ' ').title()}\n{bottom_configs.to_markdown()}\n\n" - -# if "Optimization Suggestions" in report_options: -# report += "## Optimization Suggestions\n" -# target_metric = st.session_state.get("opt_target_metric", metrics[0]) -# optimization_method = st.session_state.get("opt_method_choice", "Maximize") -# if optimization_method == "Maximize": -# optimal_config = df.loc[df[target_metric].idxmax()] -# else: -# optimal_config = df.loc[df[target_metric].idxmin()] -# report += f"### Optimal Configuration ({optimization_method} {target_metric.replace('result.', '').replace('_', ' ').title()}):\n{optimal_config[config_columns].to_dict()}\n\n" - -# if 
"Clustering" in report_options: -# report += "## Clustering Analysis\n" -# report += "### Clustering results were generated in the application.\n\n" - -# if "Advanced Statistics" in report_options: -# report += "## Advanced Statistics\n" -# selected_metric = st.session_state.get("advanced_stat_metric", metrics[0]) -# report += f"### Statistical Summary for {selected_metric.replace('result.', '').replace('_', ' ').title()}\n{df[selected_metric].describe().to_markdown()}\n\n" - -# if "Anomaly Detection" in report_options: -# report += "## Anomaly Detection\n" -# anomaly_metric = st.session_state.get("anomaly_metric", metrics[0]) -# threshold = st.session_state.get("anomaly_threshold", 3.0) -# mean_val = df[anomaly_metric].mean() -# std_val = df[anomaly_metric].std() -# upper_bound = mean_val + threshold * std_val -# lower_bound = mean_val - threshold * std_val -# anomalies = df[(df[anomaly_metric] > upper_bound) | (df[anomaly_metric] < lower_bound)] -# report += f"### Anomalies in {anomaly_metric.replace('result.', '').replace('_', ' ').title()} (Threshold: {threshold} Std Dev)\n{anomalies.to_markdown()}\n\n" - -# # Download the report as a text file -# st.download_button( -# label="Download Report as Text", -# data=report, -# file_name="analysis_report.txt", -# mime="text/plain", -# ) - -# # Optionally, provide the CSV report -# st.subheader("Download Descriptive Statistics") -# if st.button("Download Descriptive Statistics as CSV"): -# report_csv = df.describe().to_csv() -# st.download_button( -# label="Download CSV Report", -# data=report_csv, -# file_name="descriptive_statistics.csv", -# mime="text/csv", -# ) - -# st.info("Select the sections you want to include in the report and click 'Download Report'.") - -# # -------------------------------------------------------------------------------- -# # Additional UI/UX Enhancements -# # -------------------------------------------------------------------------------- -# st.sidebar.markdown("---") -# st.sidebar.markdown("#### Tips for Better Workflow") -# st.sidebar.markdown( -# """ -# - **Start with the Dashboard** to get an overview of key metrics. -# - **Use Data Overview** to understand and filter your dataset. -# - **Configurations Analysis** helps visualize specific configuration performances. -# - **Failure Analysis** highlights trial outcomes and trends. -# - **Correlation Heatmap** and **Parallel Coordinates** allow in-depth correlation and multi-dimensional analysis. -# - **Config Params Scatter** plots relationships between configuration parameters and metrics. -# - **Top & Bottom Configs** identify the best and worst-performing configurations. -# - **Optimization Suggestions** provide insights into optimal configurations. -# - **Clustering** groups similar configurations for pattern recognition. -# - **Advanced Statistics** offers detailed statistical analyses of your metrics. -# - **Anomaly Detection** helps identify outliers and unusual trial performances. -# - **Save Analysis** lets you download a comprehensive report of your findings. 
-# """ -# ) diff --git a/frontend.py b/frontend.py new file mode 100644 index 0000000..407bcfa --- /dev/null +++ b/frontend.py @@ -0,0 +1,1305 @@ +import plotly.express as px +import streamlit as st +import requests +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import mlos_viz +from mlos_bench.storage import from_config +import re +import warnings +from pathlib import Path +import json5 as json +import os + +# Load the storage config and connect to the storage +try: + storage = storage = from_config(config="storage/sqlite.jsonc") +except Exception as e: + st.error(f"Error loading storage configuration: {e}") + storage = None + +# Suppress specific FutureWarning from seaborn +warnings.filterwarnings("ignore", category=FutureWarning) + +# Ensure the backend is running on this port +backend_url = "http://localhost:8000" + +# Base directory for the project +base_dir = Path(__file__).resolve().parent + + +@st.cache_data +def get_experiments(): + response = requests.get(f"{backend_url}/experiments") + if response.status_code == 200: + return response.json() + else: + st.error("Failed to fetch experiments") + return [] + + +def get_experiment_results(experiment_id): + response = requests.get(f"{backend_url}/experiment_results/{experiment_id}") + if response.status_code == 200: + return pd.DataFrame(response.json()) + else: + st.error(f"Failed to fetch results for experiment {experiment_id}") + return pd.DataFrame() + + +def get_experiment_explanation(experiment_id): + response = requests.post( + f"{backend_url}/get_experiment_explanation", json={"experiment_id": experiment_id} + ) + if response.status_code == 200: + return response.json()["explanation"] + else: + st.error(f"Failed to get explanation: {response.text}") + return "" + + +experiment_ids = get_experiments() + + +# Function to plot the average metrics for each configuration +def plot_average_metrics(experiment_id, storage, metric): + exp = storage.experiments[experiment_id] + df = exp.results_df + + # Select numeric columns only, along with 'tunable_config_id' + numeric_df = df.select_dtypes(include="number") + numeric_df["tunable_config_id"] = df["tunable_config_id"] + + # Group by 'tunable_config_id' and calculate the mean for numeric columns + average_metrics = numeric_df.groupby("tunable_config_id").mean().reset_index() + + metrics = ["result.reads", "result.writes", "result.total", metric] + metric_labels = [ + "Average Reads", + "Average Writes", + "Average Transactions", + "Average Score", + metric, + ] + + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + fig.suptitle("Average Metrics for Each Configuration") + + for ax, metric, label in zip(axes.flatten(), metrics, metric_labels): + if metric in average_metrics.columns: + ax.bar(average_metrics["tunable_config_id"], average_metrics[metric], color="blue") + ax.set_xlabel("Configuration ID") + ax.set_ylabel(label) + ax.set_title(label) + ax.tick_params(axis="x", rotation=45) + else: + ax.set_visible(False) + + plt.tight_layout(rect=[0, 0.03, 1, 0.95]) + st.pyplot(fig) + + +# Function to plot the failure rate by configuration +def plot_failure_rate(experiment_id, storage): + exp = storage.experiments[experiment_id] + df = exp.results_df + failure_rate_data = ( + df.groupby("tunable_config_id")["status"] + .apply(lambda x: (x == "FAILED").mean()) + .reset_index() + ) + failure_rate_data.columns = ["tunable_config_id", "failure_rate"] + + plt.figure(figsize=(10, 6)) + sns.barplot(data=failure_rate_data, x="tunable_config_id", y="failure_rate") + 
plt.xlabel("Configuration ID") + plt.ylabel("Failure Rate") + plt.title("Failure Rate by Configuration") + plt.xticks(rotation=45) + plt.tight_layout() + st.pyplot(plt) + + +# Function to plot the metric percentiles +def plot_metric_percentiles(experiment_id, storage, metric): + exp = storage.experiments[experiment_id] + df = exp.results_df + + # Ensure metric is numeric + df[metric] = pd.to_numeric(df[metric], errors="coerce") + + # Drop rows with NaN values in metric + df = df.dropna(subset=[metric]) + + latency_percentiles = ( + df.groupby("tunable_config_id")[metric].quantile([0.25, 0.5, 0.75]).unstack().reset_index() + ) + latency_percentiles.columns = [ + "config_id", + "25th_percentile", + "50th_percentile", + "75th_percentile", + ] + + plt.figure(figsize=(10, 6)) + sns.boxplot(data=df, x="tunable_config_id", y=metric) + plt.xlabel("Configuration ID") + plt.ylabel("Result Score") + plt.title(f"{metric} Percentiles by Configuration") + plt.xticks(rotation=45) + plt.tight_layout() + st.pyplot(plt) + + +# Function to plot whisker plots for configurations within a specific experiment + + +def plot_whisker_plots(df, target_col, n=5): + """ + Plots whisker plots for the top N and bottom N configurations with respect to a target column. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot on the y-axis. + n (int): The number of top and bottom configurations to plot. + """ + if "tunable_config_id" not in df.columns or target_col not in df.columns: + st.error(f"'tunable_config_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric and drop NaNs + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + df = df.dropna(subset=[target_col]) + + # Calculate the average of the target column for each configuration + config_avg = df.groupby("tunable_config_id")[target_col].mean().reset_index() + + # Filter out configurations that do not have any result values + config_avg = config_avg.dropna(subset=[target_col]) + + # Select top N configurations + top_n_configs = config_avg.nlargest(n, target_col)["tunable_config_id"] + + # Filter the DataFrame to include only the top N configurations + top_configs = df[df["tunable_config_id"].isin(top_n_configs)] + + # Sort the top configurations by the target column + top_configs = top_configs.sort_values(by=target_col, ascending=False) + + # Plot whisker plots for the top N configurations + fig_top = px.box( + top_configs, + x="tunable_config_id", + y=target_col, + title=f"Whisker Plot for Top {n} Configurations by {target_col}", + labels={"tunable_config_id": "Configuration ID", target_col: target_col}, + ) + st.plotly_chart(fig_top, use_container_width=True) + + # Select bottom N configurations + bottom_n_configs = config_avg.nsmallest(n, target_col)["tunable_config_id"] + + # Filter the DataFrame to include only the bottom N configurations + bottom_configs = df[df["tunable_config_id"].isin(bottom_n_configs)] + + # Sort the bottom configurations by the target column + bottom_configs = bottom_configs.sort_values(by=target_col, ascending=True) + + # Plot whisker plots for the bottom N configurations + fig_bottom = px.box( + bottom_configs, + x="tunable_config_id", + y=target_col, + title=f"Whisker Plot for Bottom {n} Configurations by {target_col}", + labels={"tunable_config_id": "Configuration ID", target_col: target_col}, + ) + st.plotly_chart(fig_bottom, use_container_width=True) + + +# Function to plot correlation 
between parameter changes and latency +def plot_param_latency_correlation(experiment_id, storage, metric): + exp = storage.experiments[experiment_id] + df = exp.results_df + + # Pivot the data to have parameters as columns + param_pivot = df.pivot_table(index="trial_id", columns="param_id", values="param_value") + combined_data_with_params = param_pivot.join(df.set_index("trial_id")[[metric]]) + + # Calculate the correlation + param_latency_corr = combined_data_with_params.corr()[metric].drop(metric).to_frame() + param_latency_corr.columns = ["Correlation with Score"] + + # Plot the heatmap + if not param_latency_corr.empty: + plt.figure(figsize=(10, 8)) + sns.heatmap(param_latency_corr, annot=True, cmap="coolwarm", linewidths=0.5) + plt.title("Correlation Between Parameter Changes and Score") + st.pyplot(plt) + else: + st.write("Correlation matrix is empty or contains only NaN values.") + + +# Function to plot correlation matrix between result columns and configuration parameters +def plot_correlation_matrix_with_config(df): + # st.title('Correlation Matrix Between Results and Configurations') + + # Select columns that start with 'result' or 'config' + result_columns = [col for col in df.columns if col.startswith("result")] + config_columns = [col for col in df.columns if col.startswith("config")] + + # Ensure both config and result columns are present + if not result_columns: + st.warning("No result columns found.") + return + if not config_columns: + st.warning("No config columns found.") + return + + # Select numeric columns from both result and config columns + numeric_result_df = df[result_columns].apply(pd.to_numeric, errors="coerce") + numeric_config_df = df[config_columns].apply(pd.to_numeric, errors="coerce") + + # Combine both dataframes to ensure they are in the same dataframe + combined_numeric_df = pd.concat([numeric_result_df, numeric_config_df], axis=1) + + # Ensure there are numeric columns + if combined_numeric_df.empty: + st.warning("No numeric columns available for correlation matrix.") + return + + # Drop columns with all NaN values + combined_numeric_df.dropna(axis=1, how="all", inplace=True) + + # Compute correlation matrix + corr = combined_numeric_df.corr() + + # Plot the correlation matrix using Seaborn and Matplotlib + plt.figure(figsize=(12, 10)) + sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5) + # plt.title('Correlation Matrix Between Results and Configurations') + st.pyplot(plt) + + +# Function to plot top and bottom configurations based on the target column +def plot_config_top_bottom(df, target_col, config_prefix="config."): + if target_col not in df.columns: + st.error(f"Target column '{target_col}' not found in DataFrame.") + return + + # Sort the DataFrame by target column in descending order + sorted_df = df.sort_values(by=target_col, ascending=False) + + # Select top and bottom configurations + top_configs = sorted_df.head(5) + bottom_configs = sorted_df.tail(5) + + config_columns = [col for col in df.columns if col.startswith(config_prefix)] + + # Plot top configurations + plt.figure(figsize=(15, 6)) + plt.subplot(1, 2, 1) + for config in config_columns: + plt.plot(top_configs["tunable_config_id"], top_configs[config], label=config) + plt.xlabel("Configuration ID") + plt.ylabel("Configuration Value") + plt.title(f"Top 5 Configurations by {target_col}") + plt.legend() + + # Plot bottom configurations + plt.subplot(1, 2, 2) + for config in config_columns: + plt.plot(bottom_configs["tunable_config_id"], bottom_configs[config], 
label=config) + plt.xlabel("Configuration ID") + plt.ylabel("Configuration Value") + plt.title(f"Bottom 5 Configurations by {target_col}") + plt.legend() + + plt.tight_layout() + st.pyplot(plt) + + +def display_config_details(experiment_data, config_prefix="config."): + """ + Display configuration details from the experiment data. + Filters columns that start with the specified prefix and displays them in Streamlit. + Assumes the experiment data can be accessed or converted to a DataFrame. + """ + # Accessing or converting experiment data to DataFrame + if hasattr(experiment_data, "results_df"): + df = experiment_data.results_df # Access the DataFrame if it's a property + else: + st.error("Experiment data does not contain 'results_df'. Check the data structure.") + return + + # Check for DataFrame columns + if not hasattr(df, "columns"): + st.error("Data is not a valid DataFrame.") + return + + # Filter columns that start with config_prefix + config_columns = [col for col in df.columns if col.startswith(config_prefix)] + if not config_columns: + st.warning("No configuration columns found.") + return + + # Assuming there is a unique identifier in the DataFrame to select configurations + if "tunable_config_id" in df.columns: + config_ids = df["tunable_config_id"].dropna().unique() + selected_config_id = st.selectbox("Select Configuration ID:", config_ids) + # Display details for the selected configuration ID + config_details = df[df["tunable_config_id"] == selected_config_id][config_columns] + # Iterate through each row and display each column value line by line + for _, row in config_details.iterrows(): + for col in config_columns: + st.text(f"{col}: {row[col]}") # Using st.text for plain text + else: + st.error("No 'tunable_config_id' column found in the DataFrame.") + + +def plot_line_scatter_chart(df, target_col, benchmark_col="result.Benchmark Type"): + if ( + "trial_id" not in df.columns + or target_col not in df.columns + or benchmark_col not in df.columns + ): + st.error( + f"'trial_id', '{target_col}', or '{benchmark_col}' column not found in DataFrame." + ) + return + + plot_data = df[["trial_id", target_col, benchmark_col]].dropna().sort_values(by="trial_id") + if plot_data.empty: + st.error(f"No data available for plotting with target column '{target_col}'.") + return + + fig = px.scatter( + plot_data, + x="trial_id", + y=target_col, + color=benchmark_col, + title=f"Line and Scatter Plot of trial_id vs {target_col}", + labels={"trial_id": "Trial ID", target_col: target_col}, + ) + + st.plotly_chart(fig, use_container_width=True) + + +def plot_success_failure_distribution(df): + """ + Plots a pie chart for the overall success and failure distribution using Plotly. + """ + status_counts = df["status"].value_counts() + fig = px.pie( + values=status_counts.values, + names=status_counts.index, + title="Overall Success/Failure Distribution", + ) + st.plotly_chart(fig, use_container_width=True) + + +def plot_success_failure_by_config(df): + """ + Plots a bar chart for the count of successes and failures by configuration using Plotly. 
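+ Bars are stacked by trial status for each tunable_config_id.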
+ """ + status_by_config = ( + df.groupby(["tunable_config_id", "status"]).size().unstack(fill_value=0).reset_index() + ) + status_by_config = status_by_config.melt( + id_vars="tunable_config_id", var_name="status", value_name="count" + ) + fig = px.bar( + status_by_config, + x="tunable_config_id", + y="count", + color="status", + barmode="stack", + title="Success/Failure Count by Configuration", + ) + fig.update_layout(xaxis_title="Configuration ID", yaxis_title="Count") + st.plotly_chart(fig, use_container_width=True) + + +def plot_failure_rate_by_config(df): + """ + Plots a bar chart for the failure rate by configuration using Plotly. + """ + failure_rate_data = ( + df.groupby("tunable_config_id")["status"] + .apply(lambda x: (x == "FAILED").mean()) + .reset_index() + ) + failure_rate_data.columns = ["tunable_config_id", "failure_rate"] + fig = px.bar( + failure_rate_data, + x="tunable_config_id", + y="failure_rate", + title="Failure Rate by Configuration", + ) + fig.update_layout(xaxis_title="Configuration ID", yaxis_title="Failure Rate") + st.plotly_chart(fig, use_container_width=True) + + +# Main function to plot the selected view +def plot_failure_metrics(experiment_id, storage, view_type): + exp = storage.experiments[experiment_id] + df = exp.results_df + + if view_type == "Pie Chart": + plot_success_failure_distribution(df) + elif view_type == "Bar Chart - Success/Failure Count": + plot_success_failure_by_config(df) + elif view_type == "Bar Chart - Failure Rate": + plot_failure_rate_by_config(df) + + +def plot_heatmap(df): + # Select numeric columns only + numeric_df = df.select_dtypes(include=["int64", "float64"]) + + config_columns = [col for col in numeric_df.columns if col.startswith("config")] + result_columns = [col for col in numeric_df.columns if col.startswith("result")] + + combined_data = numeric_df[config_columns + result_columns].apply( + pd.to_numeric, errors="coerce" + ) + correlation_matrix = combined_data.corr() + config_result_corr = correlation_matrix.loc[config_columns, result_columns] + + fig = px.imshow( + config_result_corr, text_auto=True, color_continuous_scale="RdBu", aspect="auto" + ) + fig.update_layout( + title="Heatmap of Configuration Parameters vs Performance Metrics", + xaxis_title="Performance Metrics", + yaxis_title="Configuration Parameters", + ) + st.plotly_chart(fig, use_container_width=True) + + +def plot_correlation_table_target(df, target_col): + # Select numeric columns only + numeric_df = df.select_dtypes(include=["int64", "float64"]) + + result_columns = [col for col in numeric_df.columns if col.startswith("config")] + + numeric_df[target_col] = pd.to_numeric(numeric_df[target_col], errors="coerce") + + correlations = ( + numeric_df[result_columns].corrwith(numeric_df[target_col]).sort_values(ascending=False) + ) + correlations_df = pd.DataFrame(correlations, columns=["Correlation"]).reset_index() + correlations_df.columns = ["Config Column", "Correlation"] + + fig = px.imshow( + correlations_df.set_index("Config Column").T, + text_auto=True, + color_continuous_scale="RdBu", + aspect="auto", + ) + fig.update_layout( + title=f"Correlation Heatmap with {target_col}", + xaxis_title="Config Columns", + yaxis_title="Correlation", + ) + st.plotly_chart(fig, use_container_width=True) + + +def plot_top_bottom_configs_scatter(df, target_col, n=5): + """ + Plots the top N and bottom N configurations on a line and scatter plot with respect to a target column. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. 
+ target_col (str): The name of the target column to plot on the y-axis. + n (int): The number of top and bottom configurations to plot. + """ + if "trial_id" not in df.columns or target_col not in df.columns: + st.error(f"'trial_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Sort the DataFrame by the target column + sorted_df = df.sort_values(by=target_col, ascending=False) + + # Select top N and bottom N configurations + top_configs = sorted_df.head(n) + bottom_configs = sorted_df.tail(n) + + plt.figure(figsize=(12, 6)) + + # Plot top N configurations + plt.plot( + top_configs["trial_id"], + top_configs[target_col], + linestyle="-", + marker="o", + color="blue", + label=f"Top {n} Trials", + ) + + # Plot bottom N configurations + plt.plot( + bottom_configs["trial_id"], + bottom_configs[target_col], + linestyle="-", + marker="o", + color="red", + label=f"Bottom {n} Trials", + ) + + plt.title(f"Top {n} and Bottom {n} Trials by {target_col}") + plt.xlabel("trial_id") + plt.ylabel(target_col) + plt.legend() + plt.grid(True) + + st.pyplot(plt) + + +def plot_config_scatter(df, target_col, n=5): + """ + Plots scatter plots for the top N and bottom N configurations with respect to a target column. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot on the y-axis. + n (int): The number of top and bottom configurations to plot. + """ + if "tunable_config_id" not in df.columns or target_col not in df.columns: + st.error(f"'tunable_config_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Sort the DataFrame by the target column + sorted_df = df.sort_values(by=target_col, ascending=False) + + # Select top N and bottom N configurations + top_configs = sorted_df.head(n) + bottom_configs = sorted_df.tail(n) + + # Plot top N configurations + plt.figure(figsize=(12, 6)) + plt.scatter( + top_configs["tunable_config_id"], + top_configs[target_col], + color="blue", + label=f"Top {n} Configs", + ) + plt.xlabel("Configuration ID") + plt.ylabel(target_col) + plt.title(f"Scatter Plot for Top {n} Configurations by {target_col}") + plt.legend() + plt.grid(True) + st.pyplot(plt) + + # Plot bottom N configurations + plt.figure(figsize=(12, 6)) + plt.scatter( + bottom_configs["tunable_config_id"], + bottom_configs[target_col], + color="red", + label=f"Bottom {n} Configs", + ) + plt.xlabel("Configuration ID") + plt.ylabel(target_col) + plt.title(f"Scatter Plot for Bottom {n} Configurations by {target_col}") + plt.legend() + plt.grid(True) + st.pyplot(plt) + + +def compare_whisker_plots(df, target_col, config_id_1, config_id_2): + """ + Plots whisker plots for two specific configurations with respect to a target column on the same plot. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot on the y-axis. + config_id_1 (int): The ID of the first configuration to plot. + config_id_2 (int): The ID of the second configuration to plot. 
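+ Both configurations are combined into a single Plotly box plot with all individual points shown.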
+ """ + if "tunable_config_id" not in df.columns or target_col not in df.columns: + st.error(f"'tunable_config_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Filter the DataFrame for the two configurations + config_1_data = df[df["tunable_config_id"] == config_id_1] + config_2_data = df[df["tunable_config_id"] == config_id_2] + + if config_1_data.empty or config_2_data.empty: + st.error("One or both configuration IDs do not exist in the DataFrame.") + return + + # Combine the data for plotting + combined_data = pd.concat([config_1_data, config_2_data]) + + fig = px.box( + combined_data, + x="tunable_config_id", + y=target_col, + points="all", + labels={"tunable_config_id": "Configuration ID", target_col: target_col}, + title=f"Whisker Plot for Configurations {config_id_1} and {config_id_2} by {target_col}", + ) + + st.plotly_chart(fig, use_container_width=True) + + +from scipy.stats import gaussian_kde +import plotly.graph_objects as go +import numpy as np + + +def compare_score_distributions(df, target_col, config_id_1, config_id_2): + """ + Plots the distribution of scores for two specific configurations side by side. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot the distribution of. + config_id_1 (int): The ID of the first configuration to plot. + config_id_2 (int): The ID of the second configuration to plot. + """ + if "tunable_config_id" not in df.columns or target_col not in df.columns: + st.error(f"'tunable_config_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Filter the DataFrame for the two configurations + config_1_data = df[df["tunable_config_id"] == config_id_1][target_col].dropna() + config_2_data = df[df["tunable_config_id"] == config_id_2][target_col].dropna() + + if config_1_data.empty or config_2_data.empty: + st.error("One or both configuration IDs do not exist in the DataFrame.") + return + + # Calculate KDE for both configurations + kde_1 = gaussian_kde(config_1_data) + kde_2 = gaussian_kde(config_2_data) + + # Create an array of x values for plotting the KDE + x_min = min(config_1_data.min(), config_2_data.min()) + x_max = max(config_1_data.max(), config_2_data.max()) + x_vals = np.linspace(x_min, x_max, 500) + + fig = go.Figure() + + fig.add_trace( + go.Scatter(x=x_vals, y=kde_1(x_vals), mode="lines", name=f"Config {config_id_1}") + ) + + fig.add_trace( + go.Scatter(x=x_vals, y=kde_2(x_vals), mode="lines", name=f"Config {config_id_2}") + ) + + fig.update_layout( + title_text=f"Score Distribution for Configurations {config_id_1} and {config_id_2}", + xaxis_title_text=target_col, + yaxis_title_text="Density", + legend_title_text="Configuration", + ) + + st.plotly_chart(fig, use_container_width=True) + + +# Function to create 3D scatter plot + + +def plot_3d_config_result( + df, config_col1, config_col2, result_col, benchmark_col="result.Benchmark Type" +): + if ( + config_col1 not in df.columns + or config_col2 not in df.columns + or result_col not in df.columns + or benchmark_col not in df.columns + ): + st.error( + f"One or more columns: '{config_col1}', '{config_col2}', '{result_col}', or '{benchmark_col}' not found in DataFrame." 
+ ) + return + + df[config_col1] = pd.to_numeric(df[config_col1], errors="coerce") + df[config_col2] = pd.to_numeric(df[config_col2], errors="coerce") + df[result_col] = pd.to_numeric(df[result_col], errors="coerce") + + df = df.dropna(subset=[config_col1, config_col2, result_col, benchmark_col]) + + fig = px.scatter_3d( + df, + x=config_col1, + y=config_col2, + z=result_col, + color=benchmark_col, + labels={"x": config_col1, "y": config_col2, "z": result_col}, + title=f"3D Scatter Plot of {config_col1}, {config_col2}, and {result_col} by {benchmark_col}", + ) + + fig.update_layout( + legend=dict( + font=dict(size=10), + itemsizing="constant", + ), + scene=dict( + xaxis_title=config_col1, + yaxis_title=config_col2, + zaxis_title=result_col, + aspectmode="manual", + aspectratio=dict(x=1.2, y=1.2, z=1), + ), + margin=dict(l=0, r=0, t=40, b=0), + ) + + st.plotly_chart(fig, use_container_width=True) + + +import plotly.graph_objs as go + + +def plot_3d_surface_config_result( + df, config_col1, config_col2, result_col, benchmark_col="result.Benchmark Type" +): + if ( + config_col1 not in df.columns + or config_col2 not in df.columns + or result_col not in df.columns + or benchmark_col not in df.columns + ): + st.error( + f"One or more columns: '{config_col1}', '{config_col2}', '{result_col}', or '{benchmark_col}' not found in DataFrame." + ) + return + + df[config_col1] = pd.to_numeric(df[config_col1], errors="coerce") + df[config_col2] = pd.to_numeric(df[config_col2], errors="coerce") + df[result_col] = pd.to_numeric(df[result_col], errors="coerce") + + df = df.dropna(subset=[config_col1, config_col2, result_col, benchmark_col]) + + unique_benchmarks = df[benchmark_col].unique() + fig = go.Figure() + + for benchmark in unique_benchmarks: + benchmark_df = df[df[benchmark_col] == benchmark] + pivot_table = benchmark_df.pivot_table( + index=config_col1, columns=config_col2, values=result_col + ).fillna(0) + + fig.add_trace( + go.Surface( + z=pivot_table.values, x=pivot_table.columns, y=pivot_table.index, name=benchmark + ) + ) + + fig.update_layout( + title=f"3D Surface Plot of {config_col1}, {config_col2}, and {result_col} by {benchmark_col}", + scene=dict( + xaxis_title=config_col1, + yaxis_title=config_col2, + zaxis_title=result_col, + aspectmode="manual", + aspectratio=dict(x=1.2, y=1.2, z=1), + ), + margin=dict(l=0, r=0, t=40, b=0), + legend=dict( + font=dict(size=10), + itemsizing="constant", + ), + ) + + st.plotly_chart(fig, use_container_width=True) + + +def plot_2d_scatter(df, result_col, config_col, benchmark_col="result.Benchmark Type"): + """ + Creates a 2D scatter plot to visualize the impact of a configuration parameter on a selected benchmark result. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + result_col (str): The name of the result column to plot on the y-axis. + config_col (str): The name of the configuration column to plot on the x-axis. + benchmark_col (str): The name of the benchmark column to use for color differentiation. 
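+ Values are coerced to numeric and rows with missing data are dropped before plotting.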
+ """ + if ( + result_col not in df.columns + or config_col not in df.columns + or benchmark_col not in df.columns + ): + st.error("One or more columns not found in DataFrame.") + return + + df[result_col] = pd.to_numeric(df[result_col], errors="coerce") + df[config_col] = pd.to_numeric(df[config_col], errors="coerce") + df = df.dropna(subset=[result_col, config_col, benchmark_col]) + + fig = px.scatter( + df, + x=config_col, + y=result_col, + color=benchmark_col, + title=f"Scatter Plot of {config_col} vs {result_col}", + labels={config_col: config_col, result_col: result_col}, + ) + + st.plotly_chart(fig, use_container_width=True) + + +def plot_whisker_plots_all(df, target_col, benchmark_col="result.Benchmark Type"): + """ + Plots whisker plots for all configurations with respect to a target column and differentiates by benchmark type. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot on the y-axis. + benchmark_col (str): The name of the benchmark column to use for color differentiation. + """ + if ( + "tunable_config_id" not in df.columns + or target_col not in df.columns + or benchmark_col not in df.columns + ): + st.error( + f"'tunable_config_id', '{target_col}', or '{benchmark_col}' column not found in DataFrame." + ) + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Drop rows with NaN values in target column + df = df.dropna(subset=[target_col]) + + # Plot whisker plots for all configurations with color differentiation by benchmark type + fig = px.box( + df, + x="tunable_config_id", + y=target_col, + color=benchmark_col, + points="all", + labels={"tunable_config_id": "Configuration ID", target_col: target_col}, + title=f"Whisker Plot for All Configurations by {target_col}", + ) + + st.plotly_chart(fig, use_container_width=True) + + +def get_trial_ranges_by_benchmark(df): + # Adjust this to match your actual column name + benchmark_col = "result.Benchmark Type" + if benchmark_col not in df.columns: + st.error(f"Benchmark column '{benchmark_col}' not found in DataFrame.") + return {} + + benchmark_types = df[benchmark_col].unique() + trial_ranges = {} + for benchmark in benchmark_types: + trial_ids = sorted(df[df[benchmark_col] == benchmark]["trial_id"].unique()) + if trial_ids: + ranges = [] + range_start = trial_ids[0] + previous_id = trial_ids[0] + for trial_id in trial_ids[1:]: + if trial_id != previous_id + 1: + ranges.append((range_start, previous_id)) + range_start = trial_id + previous_id = trial_id + ranges.append((range_start, previous_id)) + trial_ranges[benchmark] = ranges + else: + trial_ranges[benchmark] = [] + return trial_ranges + + +def plot_violin_plot(df, target_col, config_id_1, config_id_2): + """ + Plots a violin plot for two specific configurations with respect to a target column. + + Parameters: + df (pd.DataFrame): The DataFrame containing the data to plot. + target_col (str): The name of the target column to plot on the y-axis. + config_id_1 (int): The ID of the first configuration to plot. + config_id_2 (int): The ID of the second configuration to plot. 
+ """ + if "tunable_config_id" not in df.columns or target_col not in df.columns: + st.error(f"'tunable_config_id' or '{target_col}' column not found in DataFrame.") + return + + # Ensure the target column is numeric + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + + # Filter the DataFrame for the two configurations + config_1_data = df[df["tunable_config_id"] == config_id_1] + config_2_data = df[df["tunable_config_id"] == config_id_2] + + if config_1_data.empty or config_2_data.empty: + st.error("One or both configuration IDs do not exist in the DataFrame.") + return + + # Combine the data for plotting + combined_data = pd.concat([config_1_data, config_2_data]) + + fig = px.violin( + combined_data, + x="tunable_config_id", + y=target_col, + box=True, + points="all", + labels={"tunable_config_id": "Configuration ID", target_col: target_col}, + title=f"Violin Plot for Configurations {config_id_1} and {config_id_2} by {target_col}", + ) + + st.plotly_chart(fig, use_container_width=True) + + +def compare_two_experiments(experiment_id_1, experiment_id_2, storage, target_col): + df1 = storage.experiments[experiment_id_1].results_df + df2 = storage.experiments[experiment_id_2].results_df + + if target_col not in df1.columns or target_col not in df2.columns: + st.error( + f"The target column '{target_col}' does not exist in one of the selected experiments." + ) + return + + df1[target_col] = pd.to_numeric(df1[target_col], errors="coerce") + df2[target_col] = pd.to_numeric(df2[target_col], errors="coerce") + + fig = go.Figure() + + fig.add_trace( + go.Scatter( + x=df1["trial_id"], + y=df1[target_col], + mode="lines+markers", + name=f"Experiment {experiment_id_1}", + # Adding labels for points + text=[f"Trial {i}" for i in df1["trial_id"]], + hoverinfo="text+y", + ) + ) + + fig.add_trace( + go.Scatter( + x=df2["trial_id"], + y=df2[target_col], + mode="lines+markers", + name=f"Experiment {experiment_id_2}", + # Adding labels for points + text=[f"Trial {i}" for i in df2["trial_id"]], + hoverinfo="text+y", + ) + ) + + fig.update_layout( + title=f"Comparison of {target_col} between Experiment {experiment_id_1} and {experiment_id_2}", + xaxis_title="Trial ID", + yaxis_title=target_col, + legend_title="Experiment", + ) + + st.plotly_chart(fig, use_container_width=True) + + +def compare_multiple_experiments(experiment_ids, storage, target_col): + """ + Compare multiple experiments by plotting the selected target column. + + Parameters: + experiment_ids (list): List of experiment IDs to compare. + storage: The storage object containing experiment data. + target_col (str): The name of the target column to compare. + """ + # Scatter plot comparison + fig_scatter = go.Figure() + + for experiment_id in experiment_ids: + df = storage.experiments[experiment_id].results_df + + if target_col not in df.columns: + st.error( + f"The target column '{target_col}' does not exist in experiment {experiment_id}." 
+ ) + return + + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + df = df.dropna(subset=[target_col]) + + fig_scatter.add_trace( + go.Scatter( + x=df["trial_id"], + y=df[target_col], + mode="markers", + name=f"Experiment {experiment_id}", + text=[f"Trial {i}" for i in df["trial_id"]], + hoverinfo="text+y", + ) + ) + + fig_scatter.update_layout( + title=f"Scatter Plot Comparison of {target_col} across Experiments", + xaxis_title="Trial ID", + yaxis_title=target_col, + legend_title="Experiment", + ) + st.plotly_chart(fig_scatter, use_container_width=True) + + # Line plot comparison + fig_line = go.Figure() + + for experiment_id in experiment_ids: + df = storage.experiments[experiment_id].results_df + + if target_col not in df.columns: + st.error( + f"The target column '{target_col}' does not exist in experiment {experiment_id}." + ) + return + + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + df = df.dropna(subset=[target_col]) + + fig_line.add_trace( + go.Scatter( + x=df["trial_id"], + y=df[target_col], + mode="lines+markers", + name=f"Experiment {experiment_id}", + text=[f"Trial {i}" for i in df["trial_id"]], + hoverinfo="text+y", + ) + ) + + fig_line.update_layout( + title=f"Line Plot Comparison of {target_col} across Experiments", + xaxis_title="Trial ID", + yaxis_title=target_col, + legend_title="Experiment", + ) + st.plotly_chart(fig_line, use_container_width=True) + + # Box plot comparison + df_combined = pd.DataFrame() + + for experiment_id in experiment_ids: + df = storage.experiments[experiment_id].results_df + df["experiment_id"] = experiment_id + + if target_col not in df.columns: + st.error( + f"The target column '{target_col}' does not exist in experiment {experiment_id}." + ) + return + + df[target_col] = pd.to_numeric(df[target_col], errors="coerce") + df_combined = pd.concat([df_combined, df]) + + df_combined = df_combined.dropna(subset=[target_col]) + + fig_box = px.box( + df_combined, + x="experiment_id", + y=target_col, + title=f"Box Plot Comparison of {target_col} across Experiments", + labels={"experiment_id": "Experiment ID", target_col: target_col}, + ) + st.plotly_chart(fig_box, use_container_width=True) + + # Violin plot comparison + fig_violin = px.violin( + df_combined, + x="experiment_id", + y=target_col, + box=True, + points="all", + title=f"Violin Plot Comparison of {target_col} across Experiments", + labels={"experiment_id": "Experiment ID", target_col: target_col}, + ) + st.plotly_chart(fig_violin, use_container_width=True) + + # Correlation matrix comparison + # for experiment_id in experiment_ids: + # df = storage.experiments[experiment_id].results_df + + # st.write(f"Correlation Matrix for Experiment {experiment_id}") + # plot_heatmap(df) + # plot_correlation_table_target(df, target_col) + + +if storage: + st.title("Analytics Panel") + + st.write("Welcome to the Panel. 
View and analyze the results of your experiments here.") + st.header("Select and View Experiment Details To Start Analyzing & Monitoring") + selected_experiment_id = st.selectbox("Select Experiment ID", list(storage.experiments.keys())) + + with st.expander("View Experiment Results Dataframe Details"): + df = storage.experiments[selected_experiment_id].results_df + st.dataframe(df) + + st.write("Descriptive Statistics:") + st.dataframe(df.describe()) + + if selected_experiment_id: + tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs( + [ + "Failure Metrics", + "Trial Ranges", + "Graphs", + "Correlation", + "Compare Configurations", + "Compare Experiments", + "ChatGPT", + ] + ) + + with tab1: + st.header("Failure Metrics") + view_type = st.selectbox( + "Select View Type", + ["Pie Chart", "Bar Chart - Success/Failure Count", "Bar Chart - Failure Rate"], + ) + try: + plot_failure_metrics(selected_experiment_id, storage, view_type) + except: + st.write("Failure Metrics not available") + + with tab2: + st.header("Trial Ranges by Benchmark Type") + try: + trial_ranges = get_trial_ranges_by_benchmark(df) + for benchmark, ranges in trial_ranges.items(): + if ranges: + st.subheader(f"Benchmark: {benchmark}") + for start, end in ranges: + st.write(f" - Trial ID Range: {start} - {end}") + else: + st.write(f"Benchmark: {benchmark} has no trials") + except: + st.write("Trial Ranges by Benchmark Type not available") + + with tab3: + st.header("Graphs") + st.subheader("Select a Column to Graph Data On") + try: + config_columns = [col for col in df.columns if col.startswith("config")] + result_columns = [col for col in df.columns if col.startswith("result")] + + target_col = st.selectbox("Select Target Column", result_columns) + + st.subheader("Scatter of Trials & Target Column") + plot_line_scatter_chart(df, target_col) + except: + st.write("Scatter Plot not available") + + st.subheader("Scatter of Target Column With One Config Parameter") + config_col = st.selectbox("Select Configuration Column", config_columns) + + try: + plot_2d_scatter(df, target_col, config_col) + except: + st.write("2D Scatter Plot not available") + + st.header("Result Column & Two Config Params") + try: + if config_columns and result_columns: + config_col1 = st.selectbox("Select First Configuration Column", config_columns) + config_col2 = st.selectbox( + "Select Second Configuration Column", config_columns + ) + result_col = st.selectbox("Select Result Column", result_columns) + + plot_3d_config_result(df, config_col1, config_col2, result_col) + except: + st.write("3D Scatter Plot not available") + + with tab4: + st.header("Correlation of Target Column With Parameters") + try: + plot_heatmap(df) + plot_correlation_table_target(df, target_col) + except: + st.write("Correlation Heatmap not available") + + try: + st.subheader("Mlos_Viz Metrics") + exp = storage.experiments[selected_experiment_id] + st.set_option("deprecation.showPyplotGlobalUse", False) + fig = mlos_viz.plot(exp) + st.pyplot(fig) + except: + st.write("Mlos_Viz Metrics not available") + + with tab5: + st.header("Compare Two Configurations") + try: + config_id_1 = st.selectbox( + "Select First Configuration ID", df["tunable_config_id"].unique() + ) + config_id_2 = st.selectbox( + "Select Second Configuration ID", df["tunable_config_id"].unique() + ) + compare_whisker_plots(df, target_col, config_id_1, config_id_2) + plot_violin_plot(df, target_col, config_id_1, config_id_2) + except: + st.write("Comparison Plots not available") + + try: + 
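# Overlay KDE-based score distributions for the two selected configurations. +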
compare_score_distributions(df, target_col, config_id_1, config_id_2) + except: + st.write("Score Distributions not available") + + try: + display_config_details(storage.experiments[selected_experiment_id]) + except: + st.write("Config Details not available") + + with tab6: + st.header("Compare Multiple Experiments") + try: + experiment_ids = list(storage.experiments.keys()) + selected_experiment_ids = st.multiselect("Select Experiment IDs", experiment_ids) + + target_col_for_comparison = st.selectbox( + "Select Target Column for Comparison", + ( + [ + col + for col in storage.experiments[ + selected_experiment_ids[0] + ].results_df.columns + if col.startswith("result") + ] + if selected_experiment_ids + else [] + ), + ) + + compare_multiple_experiments( + selected_experiment_ids, storage, target_col_for_comparison + ) + except Exception as e: + st.write("Multiple Experiments Comparison not available due to error: ", e) + + with tab7: + st.header("ChatGPT Explanation") + explanation = "Click the button to fetch the experiment explanation." + if st.button("Fetch Experiment Explanation"): + try: + explanation = get_experiment_explanation(selected_experiment_id) + except: + explanation = "Experiment explanation not available." + st.subheader("Experiment Explanation") + st.write(explanation) + +else: + st.warning("Storage configuration not loaded. Cannot display experiments.") diff --git a/mlos_demo_mysql.ipynb b/mlos_demo_mysql.ipynb index 58f18a0..543ff4d 100644 --- a/mlos_demo_mysql.ipynb +++ b/mlos_demo_mysql.ipynb @@ -1909,7 +1909,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mlos", "language": "python", "name": "python3" }, @@ -1923,7 +1923,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.13.1" } }, "nbformat": 4, From 0a9f54217070098bc98c07c45f5c7b6c34c9747a Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 06:14:25 +0000 Subject: [PATCH 6/8] working summer implementation for viz. panel --- frontend.py | 331 +++++++++++----------------------------------------- 1 file changed, 70 insertions(+), 261 deletions(-) diff --git a/frontend.py b/frontend.py index 407bcfa..2379f69 100644 --- a/frontend.py +++ b/frontend.py @@ -357,12 +357,8 @@ def display_config_details(experiment_data, config_prefix="config."): st.error("No 'tunable_config_id' column found in the DataFrame.") -def plot_line_scatter_chart(df, target_col, benchmark_col="result.Benchmark Type"): - if ( - "trial_id" not in df.columns - or target_col not in df.columns - or benchmark_col not in df.columns - ): +def plot_line_scatter_chart(df, target_col, benchmark_col="results.latency_pct"): + if "trial_id" not in df.columns or target_col not in df.columns: st.error( f"'trial_id', '{target_col}', or '{benchmark_col}' column not found in DataFrame." ) @@ -712,214 +708,6 @@ def compare_score_distributions(df, target_col, config_id_1, config_id_2): st.plotly_chart(fig, use_container_width=True) -# Function to create 3D scatter plot - - -def plot_3d_config_result( - df, config_col1, config_col2, result_col, benchmark_col="result.Benchmark Type" -): - if ( - config_col1 not in df.columns - or config_col2 not in df.columns - or result_col not in df.columns - or benchmark_col not in df.columns - ): - st.error( - f"One or more columns: '{config_col1}', '{config_col2}', '{result_col}', or '{benchmark_col}' not found in DataFrame." 
- ) - return - - df[config_col1] = pd.to_numeric(df[config_col1], errors="coerce") - df[config_col2] = pd.to_numeric(df[config_col2], errors="coerce") - df[result_col] = pd.to_numeric(df[result_col], errors="coerce") - - df = df.dropna(subset=[config_col1, config_col2, result_col, benchmark_col]) - - fig = px.scatter_3d( - df, - x=config_col1, - y=config_col2, - z=result_col, - color=benchmark_col, - labels={"x": config_col1, "y": config_col2, "z": result_col}, - title=f"3D Scatter Plot of {config_col1}, {config_col2}, and {result_col} by {benchmark_col}", - ) - - fig.update_layout( - legend=dict( - font=dict(size=10), - itemsizing="constant", - ), - scene=dict( - xaxis_title=config_col1, - yaxis_title=config_col2, - zaxis_title=result_col, - aspectmode="manual", - aspectratio=dict(x=1.2, y=1.2, z=1), - ), - margin=dict(l=0, r=0, t=40, b=0), - ) - - st.plotly_chart(fig, use_container_width=True) - - -import plotly.graph_objs as go - - -def plot_3d_surface_config_result( - df, config_col1, config_col2, result_col, benchmark_col="result.Benchmark Type" -): - if ( - config_col1 not in df.columns - or config_col2 not in df.columns - or result_col not in df.columns - or benchmark_col not in df.columns - ): - st.error( - f"One or more columns: '{config_col1}', '{config_col2}', '{result_col}', or '{benchmark_col}' not found in DataFrame." - ) - return - - df[config_col1] = pd.to_numeric(df[config_col1], errors="coerce") - df[config_col2] = pd.to_numeric(df[config_col2], errors="coerce") - df[result_col] = pd.to_numeric(df[result_col], errors="coerce") - - df = df.dropna(subset=[config_col1, config_col2, result_col, benchmark_col]) - - unique_benchmarks = df[benchmark_col].unique() - fig = go.Figure() - - for benchmark in unique_benchmarks: - benchmark_df = df[df[benchmark_col] == benchmark] - pivot_table = benchmark_df.pivot_table( - index=config_col1, columns=config_col2, values=result_col - ).fillna(0) - - fig.add_trace( - go.Surface( - z=pivot_table.values, x=pivot_table.columns, y=pivot_table.index, name=benchmark - ) - ) - - fig.update_layout( - title=f"3D Surface Plot of {config_col1}, {config_col2}, and {result_col} by {benchmark_col}", - scene=dict( - xaxis_title=config_col1, - yaxis_title=config_col2, - zaxis_title=result_col, - aspectmode="manual", - aspectratio=dict(x=1.2, y=1.2, z=1), - ), - margin=dict(l=0, r=0, t=40, b=0), - legend=dict( - font=dict(size=10), - itemsizing="constant", - ), - ) - - st.plotly_chart(fig, use_container_width=True) - - -def plot_2d_scatter(df, result_col, config_col, benchmark_col="result.Benchmark Type"): - """ - Creates a 2D scatter plot to visualize the impact of a configuration parameter on a selected benchmark result. - - Parameters: - df (pd.DataFrame): The DataFrame containing the data to plot. - result_col (str): The name of the result column to plot on the y-axis. - config_col (str): The name of the configuration column to plot on the x-axis. - benchmark_col (str): The name of the benchmark column to use for color differentiation. 
- """ - if ( - result_col not in df.columns - or config_col not in df.columns - or benchmark_col not in df.columns - ): - st.error("One or more columns not found in DataFrame.") - return - - df[result_col] = pd.to_numeric(df[result_col], errors="coerce") - df[config_col] = pd.to_numeric(df[config_col], errors="coerce") - df = df.dropna(subset=[result_col, config_col, benchmark_col]) - - fig = px.scatter( - df, - x=config_col, - y=result_col, - color=benchmark_col, - title=f"Scatter Plot of {config_col} vs {result_col}", - labels={config_col: config_col, result_col: result_col}, - ) - - st.plotly_chart(fig, use_container_width=True) - - -def plot_whisker_plots_all(df, target_col, benchmark_col="result.Benchmark Type"): - """ - Plots whisker plots for all configurations with respect to a target column and differentiates by benchmark type. - - Parameters: - df (pd.DataFrame): The DataFrame containing the data to plot. - target_col (str): The name of the target column to plot on the y-axis. - benchmark_col (str): The name of the benchmark column to use for color differentiation. - """ - if ( - "tunable_config_id" not in df.columns - or target_col not in df.columns - or benchmark_col not in df.columns - ): - st.error( - f"'tunable_config_id', '{target_col}', or '{benchmark_col}' column not found in DataFrame." - ) - return - - # Ensure the target column is numeric - df[target_col] = pd.to_numeric(df[target_col], errors="coerce") - - # Drop rows with NaN values in target column - df = df.dropna(subset=[target_col]) - - # Plot whisker plots for all configurations with color differentiation by benchmark type - fig = px.box( - df, - x="tunable_config_id", - y=target_col, - color=benchmark_col, - points="all", - labels={"tunable_config_id": "Configuration ID", target_col: target_col}, - title=f"Whisker Plot for All Configurations by {target_col}", - ) - - st.plotly_chart(fig, use_container_width=True) - - -def get_trial_ranges_by_benchmark(df): - # Adjust this to match your actual column name - benchmark_col = "result.Benchmark Type" - if benchmark_col not in df.columns: - st.error(f"Benchmark column '{benchmark_col}' not found in DataFrame.") - return {} - - benchmark_types = df[benchmark_col].unique() - trial_ranges = {} - for benchmark in benchmark_types: - trial_ids = sorted(df[df[benchmark_col] == benchmark]["trial_id"].unique()) - if trial_ids: - ranges = [] - range_start = trial_ids[0] - previous_id = trial_ids[0] - for trial_id in trial_ids[1:]: - if trial_id != previous_id + 1: - ranges.append((range_start, previous_id)) - range_start = trial_id - previous_id = trial_id - ranges.append((range_start, previous_id)) - trial_ranges[benchmark] = ranges - else: - trial_ranges[benchmark] = [] - return trial_ranges - - def plot_violin_plot(df, target_col, config_id_1, config_id_2): """ Plots a violin plot for two specific configurations with respect to a target column. 
@@ -1149,11 +937,13 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): st.write("Descriptive Statistics:") st.dataframe(df.describe()) + available_result_columns = [col for col in df.columns if col.startswith("result")] + target_col = st.selectbox("Select a Result Column", available_result_columns) + if selected_experiment_id: - tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs( + tab1, tab3, tab4, tab5, tab6, tab7 = st.tabs( [ "Failure Metrics", - "Trial Ranges", "Graphs", "Correlation", "Compare Configurations", @@ -1173,54 +963,69 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): except: st.write("Failure Metrics not available") - with tab2: - st.header("Trial Ranges by Benchmark Type") - try: - trial_ranges = get_trial_ranges_by_benchmark(df) - for benchmark, ranges in trial_ranges.items(): - if ranges: - st.subheader(f"Benchmark: {benchmark}") - for start, end in ranges: - st.write(f" - Trial ID Range: {start} - {end}") - else: - st.write(f"Benchmark: {benchmark} has no trials") - except: - st.write("Trial Ranges by Benchmark Type not available") - with tab3: st.header("Graphs") - st.subheader("Select a Column to Graph Data On") - try: - config_columns = [col for col in df.columns if col.startswith("config")] - result_columns = [col for col in df.columns if col.startswith("result")] - - target_col = st.selectbox("Select Target Column", result_columns) - - st.subheader("Scatter of Trials & Target Column") - plot_line_scatter_chart(df, target_col) - except: - st.write("Scatter Plot not available") - - st.subheader("Scatter of Target Column With One Config Parameter") - config_col = st.selectbox("Select Configuration Column", config_columns) + st.subheader("Select Columns to Graph Data On") try: - plot_2d_scatter(df, target_col, config_col) - except: - st.write("2D Scatter Plot not available") + # Identify columns of interest + config_columns = [col for col in df.columns if col.startswith("config.")] + result_columns = [col for col in df.columns if col.startswith("result.")] + status_options = df["status"].unique().tolist() + + # Streamlit UI for interactive input + st.header("Plot Settings") + selected_x_axes = st.multiselect( + "Select X-axis Columns (Configurations)", + options=config_columns, + default=config_columns[:1], # Default to the first config column + ) + selected_y_axes = st.multiselect( + "Select Y-axis Columns (Results)", + options=result_columns, + default=result_columns[:1], # Default to the first result column + ) + status_filter = st.multiselect( + "Filter by Status", + options=status_options, + default=status_options, # Default to include all statuses + ) + show_grid = st.checkbox("Show Grid", value=True) + save_plots = st.checkbox("Save Plots", value=False) + + # Filter the DataFrame based on status + filtered_df = df[df["status"].isin(status_filter)] + + # Plot multiple scatter plots + st.header("Interactive Scatter Plots") + for x_axis in selected_x_axes: + for y_axis in selected_y_axes: + st.subheader(f"Plot: {y_axis} vs {x_axis}") + + # Create scatter plot using Plotly + fig = px.scatter( + filtered_df, + x=x_axis, + y=y_axis, + color="status", + title=f"{y_axis} vs {x_axis}", + labels={"status": "Status", x_axis: x_axis, y_axis: y_axis}, + ) + fig.update_layout( + xaxis_title=x_axis, + yaxis_title=y_axis, + showlegend=True, + ) + st.plotly_chart(fig, use_container_width=True) + + # Save the plot if selected + if save_plots: + filename = f"{x_axis}_vs_{y_axis}.html" + fig.write_html(filename) + st.success(f"Plot 
saved as {filename}") - st.header("Result Column & Two Config Params") - try: - if config_columns and result_columns: - config_col1 = st.selectbox("Select First Configuration Column", config_columns) - config_col2 = st.selectbox( - "Select Second Configuration Column", config_columns - ) - result_col = st.selectbox("Select Result Column", result_columns) - - plot_3d_config_result(df, config_col1, config_col2, result_col) - except: - st.write("3D Scatter Plot not available") + except Exception as e: + st.error(f"An error occurred: {e}") with tab4: st.header("Correlation of Target Column With Parameters") @@ -1242,14 +1047,18 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): with tab5: st.header("Compare Two Configurations") try: + available_result_columns = [col for col in df.columns if col.startswith("result")] + target_col_config = st.selectbox( + "Select a Result Column", available_result_columns, key="t_col" + ) config_id_1 = st.selectbox( "Select First Configuration ID", df["tunable_config_id"].unique() ) config_id_2 = st.selectbox( "Select Second Configuration ID", df["tunable_config_id"].unique() ) - compare_whisker_plots(df, target_col, config_id_1, config_id_2) - plot_violin_plot(df, target_col, config_id_1, config_id_2) + compare_whisker_plots(df, target_col_config, config_id_1, config_id_2) + plot_violin_plot(df, target_col_config, config_id_1, config_id_2) except: st.write("Comparison Plots not available") From b2c13e13a1d3dde551427b6047c6796f96484007 Mon Sep 17 00:00:00 2001 From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 06:30:54 +0000 Subject: [PATCH 7/8] better graphs tab (messy code needs clean up) --- frontend.py | 330 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 269 insertions(+), 61 deletions(-) diff --git a/frontend.py b/frontend.py index 2379f69..32393ff 100644 --- a/frontend.py +++ b/frontend.py @@ -1,4 +1,5 @@ import plotly.express as px +import plotly.graph_objects as go import streamlit as st import requests import pandas as pd @@ -964,68 +965,275 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): st.write("Failure Metrics not available") with tab3: - st.header("Graphs") - st.subheader("Select Columns to Graph Data On") - - try: - # Identify columns of interest - config_columns = [col for col in df.columns if col.startswith("config.")] - result_columns = [col for col in df.columns if col.startswith("result.")] - status_options = df["status"].unique().tolist() - - # Streamlit UI for interactive input - st.header("Plot Settings") - selected_x_axes = st.multiselect( - "Select X-axis Columns (Configurations)", - options=config_columns, - default=config_columns[:1], # Default to the first config column - ) - selected_y_axes = st.multiselect( - "Select Y-axis Columns (Results)", - options=result_columns, - default=result_columns[:1], # Default to the first result column - ) - status_filter = st.multiselect( - "Filter by Status", - options=status_options, - default=status_options, # Default to include all statuses - ) - show_grid = st.checkbox("Show Grid", value=True) - save_plots = st.checkbox("Save Plots", value=False) - - # Filter the DataFrame based on status - filtered_df = df[df["status"].isin(status_filter)] - - # Plot multiple scatter plots - st.header("Interactive Scatter Plots") - for x_axis in selected_x_axes: - for y_axis in selected_y_axes: - st.subheader(f"Plot: {y_axis} vs {x_axis}") - - # Create scatter plot using Plotly - fig = px.scatter( - 
filtered_df, - x=x_axis, - y=y_axis, - color="status", - title=f"{y_axis} vs {x_axis}", - labels={"status": "Status", x_axis: x_axis, y_axis: y_axis}, - ) - fig.update_layout( - xaxis_title=x_axis, - yaxis_title=y_axis, - showlegend=True, - ) - st.plotly_chart(fig, use_container_width=True) - - # Save the plot if selected - if save_plots: - filename = f"{x_axis}_vs_{y_axis}.html" - fig.write_html(filename) - st.success(f"Plot saved as {filename}") + st.header("📊 Data Visualization") - except Exception as e: - st.error(f"An error occurred: {e}") + if df.empty: + st.warning("No data available for visualization. Please load some data first.") + else: + try: + with st.expander("📋 Visualization Settings", expanded=True): + # Column Selection + config_columns = [col for col in df.columns if col.startswith("config.")] + result_columns = [col for col in df.columns if col.startswith("result.")] + status_options = df["status"].unique().tolist() + + # Main Plot Settings + col1, col2 = st.columns(2) + with col1: + selected_x_axes = st.multiselect( + "X-axis (Configurations)", + options=config_columns, + default=config_columns[:1], + help="Select configuration parameters for X-axis", + ) + with col2: + selected_y_axes = st.multiselect( + "Y-axis (Results)", + options=result_columns, + default=result_columns[:1], + help="Select result metrics for Y-axis", + ) + + # Plot Type and Filters + col3, col4, col5 = st.columns(3) + with col3: + plot_type = st.selectbox( + "Plot Type", + options=[ + "Scatter", + "Line", + "Box", + "Violin", + "2D Histogram", + "3D Scatter", + "Parallel Coordinates", + ], + help="Select visualization type", + ) + with col4: + status_filter = st.multiselect( + "Status Filter", + options=status_options, + default=status_options, + help="Filter by status", + ) + with col5: + template = st.selectbox( + "Plot Theme", + options=[ + "plotly", + "plotly_white", + "plotly_dark", + "ggplot2", + "seaborn", + ], + help="Select plot visual theme", + ) + + # Additional Options + col6, col7, col8, col9 = st.columns(4) + with col6: + show_stats = st.checkbox("Show Statistics", value=True) + with col7: + show_trend = st.checkbox("Show Trend Line", value=True) + with col8: + marginal_plot = st.checkbox("Show Marginal Plots", value=True) + with col9: + save_plots = st.checkbox("Enable Save Plots", value=False) + + # Filter data + filtered_df = df[df["status"].isin(status_filter)] + + if filtered_df.empty: + st.warning("No data points match the selected filters.") + elif not selected_x_axes or not selected_y_axes: + st.warning("Please select both X and Y axis parameters.") + else: + st.subheader("📈 Interactive Plots") + + for x_axis in selected_x_axes: + for y_axis in selected_y_axes: + st.write(f"### {y_axis} vs {x_axis}") + + if plot_type == "Parallel Coordinates": + # Special handling for parallel coordinates + selected_cols = [x_axis, y_axis] + [ + col for col in result_columns if col != y_axis + ][:4] + fig = px.parallel_coordinates( + filtered_df, + dimensions=selected_cols, + color=y_axis, + template=template, + ) + elif plot_type == "3D Scatter": + # 3D scatter with additional dimension + extra_dim = next( + (col for col in result_columns if col != y_axis), + result_columns[0], + ) + fig = px.scatter_3d( + filtered_df, + x=x_axis, + y=y_axis, + z=extra_dim, + color="status", + template=template, + hover_data=["trial_id", "ts_start"], + ) + elif plot_type == "2D Histogram": + fig = px.density_heatmap( + filtered_df, + x=x_axis, + y=y_axis, + marginal_x="histogram", + marginal_y="histogram", + 
template=template, + ) + else: + # Standard 2D plots with marginal plots + if plot_type == "Scatter": + fig = px.scatter( + filtered_df, + x=x_axis, + y=y_axis, + color="status", + template=template, + hover_data=["trial_id", "ts_start"], + marginal_x="histogram" if marginal_plot else None, + marginal_y="histogram" if marginal_plot else None, + ) + elif plot_type == "Line": + fig = px.line( + filtered_df.sort_values(x_axis), + x=x_axis, + y=y_axis, + color="status", + template=template, + markers=True, + ) + elif plot_type == "Box": + fig = px.box( + filtered_df, + x=x_axis, + y=y_axis, + color="status", + template=template, + points="all", + ) + else: # Violin + fig = px.violin( + filtered_df, + x=x_axis, + y=y_axis, + color="status", + template=template, + box=True, + points="all", + ) + + # Add trend line for appropriate plot types + if show_trend and plot_type in ["Scatter", "Line"]: + try: + x_data = filtered_df[x_axis].astype(float) + y_data = filtered_df[y_axis].astype(float) + + # Calculate trend line + z = np.polyfit(x_data, y_data, 1) + p = np.poly1d(z) + + # Add trend line trace + fig.add_trace( + go.Scatter( + x=x_data, + y=p(x_data), + name=f"Trend (R²={stats.pearsonr(x_data, y_data)[0]**2:.3f})", + line=dict(color="red", dash="dash"), + showlegend=True, + ) + ) + except Exception as e: + st.warning(f"Could not add trend line: {str(e)}") + + # Update layout + fig.update_layout( + title=dict( + text=f"{y_axis} vs {x_axis}", x=0.5, xanchor="center" + ), + showlegend=True, + height=600, + ) + + # Display plot + st.plotly_chart(fig, use_container_width=True) + + # Show statistics if enabled + if show_stats: + with st.expander("📊 Statistical Analysis", expanded=False): + try: + # Basic statistics + col1, col2 = st.columns(2) + with col1: + st.write("X-axis Statistics:") + st.write(filtered_df[x_axis].describe()) + with col2: + st.write("Y-axis Statistics:") + st.write(filtered_df[y_axis].describe()) + + # Correlation analysis + if ( + filtered_df[x_axis].dtype.kind in "biufc" + and filtered_df[y_axis].dtype.kind in "biufc" + ): + pearson_corr = stats.pearsonr( + filtered_df[x_axis], filtered_df[y_axis] + ) + spearman_corr = stats.spearmanr( + filtered_df[x_axis], filtered_df[y_axis] + ) + st.write("### Correlation Analysis") + st.write( + f"Pearson correlation: {pearson_corr[0]:.4f} (p-value: {pearson_corr[1]:.4f})" + ) + st.write( + f"Spearman correlation: {spearman_corr[0]:.4f} (p-value: {spearman_corr[1]:.4f})" + ) + except Exception as e: + st.write( + "Could not calculate some statistics (non-numeric data or other error)" + ) + + # Save plot functionality + if save_plots: + col1, col2 = st.columns(2) + with col1: + # Save as HTML (interactive) + html_filename = f"{x_axis}_vs_{y_axis}_{plot_type}.html" + fig.write_html(html_filename) + with open(html_filename, "rb") as f: + st.download_button( + label="Download Interactive Plot (HTML)", + data=f, + file_name=html_filename, + mime="text/html", + ) + with col2: + # Save as PNG (static) + png_filename = f"{x_axis}_vs_{y_axis}_{plot_type}.png" + fig.write_image(png_filename) + with open(png_filename, "rb") as f: + st.download_button( + label="Download Static Plot (PNG)", + data=f, + file_name=png_filename, + mime="image/png", + ) + + st.markdown("---") # Visual separator between plots + + except Exception as e: + st.error(f"An error occurred during visualization: {str(e)}") + st.exception(e) with tab4: st.header("Correlation of Target Column With Parameters") From bc8c43acec1db9c2475d2b423a1d22f32f26f284 Mon Sep 17 00:00:00 2001 
From: Yaseen Shady <139421618+yshady-acheev@users.noreply.github.com> Date: Wed, 22 Jan 2025 06:53:18 +0000 Subject: [PATCH 8/8] Parallel plot --- frontend.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 1 deletion(-) diff --git a/frontend.py b/frontend.py index 32393ff..a87f478 100644 --- a/frontend.py +++ b/frontend.py @@ -13,6 +13,11 @@ import json5 as json import os +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +from scipy.stats import ttest_ind, mannwhitneyu + + # Load the storage config and connect to the storage try: storage = storage = from_config(config="storage/sqlite.jsonc") @@ -218,6 +223,79 @@ def plot_whisker_plots(df, target_col, n=5): st.plotly_chart(fig_bottom, use_container_width=True) +def run_pairwise_stat_tests( + df, result_col, group_col="tunable_config_id", alpha=0.05, test_type="ttest" +): + """ + Perform pairwise statistical significance tests on a result column, + grouped by a configuration column or tunable_config_id. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame containing the data. + result_col : str + The name of the result column on which to base the test. + group_col : str + The column that identifies distinct configurations/groups (default = "tunable_config_id"). + alpha : float + The significance level for the test (default = 0.05). + test_type : str + Which test to use: "ttest" for independent two-sample t-test, + or "mannwhitney" for Mann-Whitney U test. + + Returns + ------- + pd.DataFrame + A DataFrame listing each pair of config groups, test statistic, p-value, + and a boolean indicating significance at the chosen alpha level. + """ + # Drop rows where result_col is NaN or infinite + df = df.dropna(subset=[result_col]).copy() + df = df[np.isfinite(df[result_col])] + + # Ensure result_col is numeric + df[result_col] = pd.to_numeric(df[result_col], errors="coerce") + + # Get unique configurations + configs = df[group_col].unique() + results = [] + + # Compare each pair of unique configs + for i in range(len(configs)): + for j in range(i + 1, len(configs)): + cfg_i = configs[i] + cfg_j = configs[j] + data_i = df.loc[df[group_col] == cfg_i, result_col] + data_j = df.loc[df[group_col] == cfg_j, result_col] + + # Skip if no data in one group + if data_i.empty or data_j.empty: + continue + + # Perform the chosen test + if test_type == "mannwhitney": + stat, pval = mannwhitneyu(data_i, data_j, alternative="two-sided") + else: + # Default to t-test + stat, pval = ttest_ind(data_i, data_j, equal_var=False, nan_policy="omit") + + is_significant = pval < alpha + results.append( + { + "Config_A": cfg_i, + "Config_B": cfg_j, + "N_A": len(data_i), + "N_B": len(data_j), + "Test_Statistic": stat, + "p-value": pval, + "Significant": is_significant, + } + ) + + return pd.DataFrame(results) + + # Function to plot correlation between parameter changes and latency def plot_param_latency_correlation(experiment_id, storage, metric): exp = storage.experiments[experiment_id] @@ -942,7 +1020,7 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): target_col = st.selectbox("Select a Result Column", available_result_columns) if selected_experiment_id: - tab1, tab3, tab4, tab5, tab6, tab7 = st.tabs( + tab1, tab3, tab4, tab5, tab6, tab7, tab8, tab9 = st.tabs( [ "Failure Metrics", "Graphs", @@ -950,6 +1028,8 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): "Compare Configurations", "Compare Experiments", "ChatGPT", + 
"Statistically Significant", + "Parallel Plot", ] ) @@ -1318,5 +1398,104 @@ def compare_multiple_experiments(experiment_ids, storage, target_col): st.subheader("Experiment Explanation") st.write(explanation) + ####################################### + # NEW TAB: Statistical Significance + ####################################### + with tab8: + st.header("Statistical Significance (Based on a Result Column)") + + # 1. Let user pick which result column to analyze + df = storage.experiments[selected_experiment_id].results_df + result_cols = [c for c in df.columns if c.startswith("result")] + if not result_cols: + st.warning("No columns found that start with 'result'.") + else: + chosen_result_col = st.selectbox( + "Select Result Column for Statistical Test:", options=result_cols, index=0 + ) + + # 2. Select test type + test_type = st.selectbox( + "Select Statistical Test:", + options=["ttest", "mannwhitney"], + index=0, + help="Choose a test: 'ttest' (two-sample t-test) or 'mannwhitney' (non-parametric).", + ) + + # 3. Significance level alpha + alpha = st.number_input( + "Significance Level (alpha)", + min_value=0.001, + max_value=0.1, + value=0.05, + step=0.01, + ) + + # 4. Perform pairwise tests + # Group by default on "tunable_config_id"; you can also gather + # unique config.* columns and group if you prefer. + if st.button("Run Pairwise Tests"): + st.write( + f"Performing pairwise {test_type} on `{chosen_result_col}`, alpha={alpha} ..." + ) + results_df = run_pairwise_stat_tests( + df=df, + result_col=chosen_result_col, + group_col="tunable_config_id", + alpha=alpha, + test_type=test_type, + ) + + if results_df.empty: + st.warning("No pairs or no valid data to compare.") + else: + # 5. Display results + st.dataframe(results_df) + + # Optionally highlight significant pairs + st.write("Significant Pairs:") + significant_pairs = results_df[results_df["Significant"] == True] + if significant_pairs.empty: + st.info( + "No significant differences found at alpha = {:.3f}".format(alpha) + ) + else: + st.write(significant_pairs) + with tab9: + st.header("Parallel Coordinates Plot") + st.write( + "Explore multi-dimensional relationships between configuration parameters and metrics." + ) + + parallel_columns = st.multiselect( + "Select Columns for Parallel Plot", + options=config_columns + result_columns, + default=config_columns[:3] + result_columns[:2], + help="Choose multiple columns to include in the parallel coordinates plot.", + ) + + if parallel_columns: + color_metric = st.selectbox( + "Select Metric for Coloring", + options=result_columns, + help="Choose a result metric to color-code the parallel coordinates.", + ) + fig = px.parallel_coordinates( + df, + dimensions=parallel_columns, + color=color_metric, + color_continuous_scale=px.colors.diverging.Tealrose, + title="Parallel Coordinates Plot", + labels={ + col: col.replace("config.", "").replace("_", " ").title() + for col in parallel_columns + }, + template="plotly_white", + ) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("Select columns to generate the parallel coordinates plot.") + + else: st.warning("Storage configuration not loaded. Cannot display experiments.")