KalooIna · sihemdjh · Feb 4, 2026 · Feb 4, 2026
diff --git a/01_eda.ipynb b/01_eda.ipynb
@@ -0,0 +1,324 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cybersecurity Attack Analysis - Exploratory Data Analysis\n",
+    "---\n",
+    "This notebook demonstrates how to use the modular cybersecurity analysis toolkit.\n",
+    "\n",
+    "**Key Principle**: This notebook calls functions from `src/`. It does NOT define reusable logic."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standard library\n",
+    "import sys\n",
+    "import warnings\n",
+    "\n",
+    "# Third-party\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Import our custom modules\n",
+    "sys.path.append('..')  # Add parent directory to path\n",
+    "\n",
+    "from src.data_loader import load_dataset, get_missing_value_summary, get_dataset_info\n",
+    "from src.features import create_all_features\n",
+    "from src.utils import (\n",
+    "    setup_plotting_style,\n",
+    "    plot_attack_distribution,\n",
+    "    plot_proxy_analysis,\n",
+    "    plot_ip_analysis,\n",
+    "    plot_protocol_analysis,\n",
+    "    plot_packet_analysis,\n",
+    "    statistical_test_packet_length,\n",
+    "    print_comprehensive_summary\n",
+    ")\n",
+    "\n",
+    "# Silence only deprecation-style warnings (DeprecationWarning, FutureWarning).\n",
+    "# A blanket filterwarnings('ignore') would also hide warnings that can flag\n",
+    "# real data or numerical problems in the analysis cells below.\n",
+    "for _noisy_category in (DeprecationWarning, FutureWarning):\n",
+    "    warnings.filterwarnings('ignore', category=_noisy_category)\n",
+    "\n",
+    "# Setup plotting\n",
+    "setup_plotting_style()\n",
+    "\n",
+    "print(\"✓ Modules imported successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the dataset\n",
+    "df = load_dataset('../data/cybersecurity_attacks.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick peek at the data\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Data Quality Check"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get comprehensive dataset info\n",
+    "get_dataset_info(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check missing values\n",
+    "missing_summary = get_missing_value_summary(df)\n",
+    "missing_summary.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Feature Engineering\n",
+    "\n",
+    "Create derived features that will help with analysis and modeling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create all features at once\n",
+    "df = create_all_features(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check new features: report every expected column and flag any that is\n",
+    "# absent from df, instead of silently skipping it.\n",
+    "print(\"New columns added:\")\n",
+    "new_cols = ['has_proxy', 'src_ip_class', 'dst_ip_class', 'src_is_private',\n",
+    "            'dst_is_private', 'is_bidirectional', 'src_port_category',\n",
+    "            'dst_port_category', 'packet_length_bin', 'anomaly_category']\n",
+    "missing_cols = [col for col in new_cols if col not in df.columns]\n",
+    "for col in new_cols:\n",
+    "    marker = '✗' if col in missing_cols else '✓'\n",
+    "    print(f\"  {marker} {col}\")\n",
+    "if missing_cols:\n",
+    "    # Make a failed or renamed feature visible right here rather than in a later cell.\n",
+    "    print(f\"\\n{len(missing_cols)} expected column(s) missing: {missing_cols}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Exploratory Analysis\n",
+    "\n",
+    "### 5.1 Attack Type Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_attack_distribution(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.2 Proxy Usage Patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_proxy_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.3 IP Address Patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_ip_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.4 Protocol Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_protocol_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.5 Packet Length Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_packet_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Statistical Tests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test if packet length differs significantly across attack types\n",
+    "statistical_test_packet_length(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Custom Analysis\n",
+    "\n",
+    "This section is for ad-hoc analysis. Use the functions from `src/` or write temporary code here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example: Deep dive into a specific attack type\n",
+    "attack_type = 'DDoS'  # Change this to analyze different attacks\n",
+    "subset = df[df['Attack Type'] == attack_type]\n",
+    "\n",
+    "print(f\"\\nAnalysis of {attack_type} attacks:\")\n",
+    "print(f\"Total records: {len(subset):,}\")\n",
+    "\n",
+    "if subset.empty:\n",
+    "    # A mistyped attack_type would otherwise print empty tables with no hint.\n",
+    "    known_types = df['Attack Type'].dropna().unique().tolist()\n",
+    "    print(f\"No rows match {attack_type!r}. Available attack types: {known_types}\")\n",
+    "else:\n",
+    "    print(\"\\nTop 5 destination ports:\")\n",
+    "    print(subset['Destination Port'].value_counts().head())\n",
+    "    print(\"\\nProtocol distribution:\")\n",
+    "    print(subset['Protocol'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Final Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_comprehensive_summary(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Next Steps\n",
+    "\n",
+    "1. **Model Training**: Create `src/model.py` with training functions\n",
+    "2. **Preprocessing**: Add encoding/scaling functions to `src/preprocessing.py`\n",
+    "3. **More Features**: Extend `src/features.py` with new feature ideas\n",
+    "4. **Save Results**: Export processed data for modeling\n",
+    "\n",
+    "**Remember**: If you write a useful function in this notebook, move it to `src/`!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}