diff --git a/01_eda.ipynb b/01_eda.ipynb
new file mode 100644
index 0000000..6a96606
--- /dev/null
+++ b/01_eda.ipynb
@@ -0,0 +1,324 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cybersecurity Attack Analysis - Exploratory Data Analysis\n",
+    "---\n",
+    "This notebook demonstrates how to use the modular cybersecurity analysis toolkit.\n",
+    "\n",
+    "**Key Principle**: This notebook calls functions from `src/`. It does NOT define reusable logic."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Standard library first, then our custom modules from `src/`\n",
+    "import sys\n",
+    "import warnings\n",
+    "\n",
+    "if '..' not in sys.path:  # guard keeps re-runs of this cell idempotent\n",
+    "    sys.path.append('..')  # Add parent directory to path\n",
+    "\n",
+    "from src.data_loader import load_dataset, get_missing_value_summary, get_dataset_info\n",
+    "from src.features import create_all_features\n",
+    "from src.utils import (\n",
+    "    setup_plotting_style,\n",
+    "    plot_attack_distribution,\n",
+    "    plot_proxy_analysis,\n",
+    "    plot_ip_analysis,\n",
+    "    plot_protocol_analysis,\n",
+    "    plot_packet_analysis,\n",
+    "    statistical_test_packet_length,\n",
+    "    print_comprehensive_summary\n",
+    ")\n",
+    "\n",
+    "warnings.filterwarnings('ignore')  # NOTE(review): hides ALL warnings, incl. deprecations\n",
+    "setup_plotting_style()  # Setup plotting\n",
+    "\n",
+    "print(\"✓ Modules imported successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the dataset\n",
+    "df = load_dataset('../data/cybersecurity_attacks.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick peek at the data\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Data Quality Check"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get comprehensive dataset info\n",
+    "get_dataset_info(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check missing values\n",
+    "missing_summary = get_missing_value_summary(df)\n",
+    "missing_summary.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Feature Engineering\n",
+    "\n",
+    "Create derived features that will help with analysis and modeling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create all features at once\n",
+    "df = create_all_features(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check new features: report every expected column, flagging any that\n",
+    "# create_all_features() did not produce instead of silently skipping it\n",
+    "new_cols = ['has_proxy', 'src_ip_class', 'dst_ip_class', 'src_is_private',\n",
+    "            'dst_is_private', 'is_bidirectional', 'src_port_category',\n",
+    "            'dst_port_category', 'packet_length_bin', 'anomaly_category']\n",
+    "print(\"New columns added:\")\n",
+    "for col in new_cols:\n",
+    "    print(f\"  {'✓' if col in df.columns else '✗ MISSING:'} {col}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Exploratory Analysis\n",
+    "\n",
+    "### 5.1 Attack Type Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_attack_distribution(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.2 Proxy Usage Patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_proxy_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.3 IP Address Patterns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_ip_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.4 Protocol Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_protocol_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.5 Packet Length Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_packet_analysis(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Statistical Tests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test if packet length differs significantly across attack types\n",
+    "statistical_test_packet_length(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Custom Analysis\n",
+    "\n",
+    "This section is for ad-hoc analysis. Use the functions from `src/` or write temporary code here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example: Deep dive into a specific attack type\n",
+    "attack_type = 'DDoS'  # Change this to analyze different attacks\n",
+    "subset = df[df['Attack Type'] == attack_type]\n",
+    "# Fail loudly on a mistyped attack type instead of printing empty tables\n",
+    "assert not subset.empty, f\"No rows found for attack type {attack_type!r}\"\n",
+    "\n",
+    "print(f\"\\nAnalysis of {attack_type} attacks:\")\n",
+    "print(f\"Total records: {len(subset):,}\")\n",
+    "print(\"\\nTop 5 destination ports:\")\n",
+    "print(subset['Destination Port'].value_counts().head())\n",
+    "print(\"\\nProtocol distribution:\")\n",
+    "print(subset['Protocol'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. Final Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_comprehensive_summary(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Next Steps\n",
+    "\n",
+    "1. **Model Training**: Create `src/model.py` with training functions\n",
+    "2. **Preprocessing**: Add encoding/scaling functions to `src/preprocessing.py`\n",
+    "3. **More Features**: Extend `src/features.py` with new feature ideas\n",
+    "4. **Save Results**: Export processed data for modeling\n",
+    "\n",
+    "**Remember**: If you write a useful function in this notebook, move it to `src/`!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/cybersecurity_eda.ipynb b/cybersecurity_eda.ipynb
deleted file mode 100644
index fae9856..0000000
--- a/cybersecurity_eda.ipynb
+++ /dev/null
@@ -1,1271 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Cybersecurity Attack Type Detection - EDA\n",
-    "## Focus: Proxy + IP Trends, Spoofing Detection, and Data Bin Trends\n",
-    "\n",
-    "**Team Member:** [Your Name]  \n",
-    "**Date:** January 31, 2026  \n",
-    "**Dataset:** 40,000 rows, 25 features"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 1. Setup and Data Loading"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import libraries\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "from collections import Counter\n",
-    "import warnings\n",
-    "warnings.filterwarnings('ignore')\n",
-    "\n",
-    "# Set visualization style\n",
-    "sns.set_style(\"whitegrid\")\n",
-    "plt.rcParams['figure.figsize'] = (15, 8)\n",
-    "plt.rcParams['font.size'] = 10\n",
-    "\n",
-    "print(\"✓ Libraries imported successfully!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load dataset\n",
-    "# TODO: Update the filepath to your actual CSV file location\n",
-    "df = pd.read_csv('your_dataset.csv')\n",
-    "\n",
-    "print(f\"Dataset Shape: {df.shape}\")\n",
-    "print(f\"Total Records: {df.shape[0]:,}\")\n",
-    "print(f\"Total Features: {df.shape[1]}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Display first few rows\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Data types and basic info\n",
-    "df.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Missing values analysis\n",
-    "missing_df = pd.DataFrame({\n",
-    "    'Missing_Count': df.isnull().sum(),\n",
-    "    'Percentage': (df.isnull().sum() / len(df)) * 100,\n",
-    "    'Distinct_Count': df.nunique(),\n",
-    "    'Distinct_Percentage': (df.nunique() / len(df)) * 100\n",
-    "}).sort_values('Missing_Count', ascending=False)\n",
-    "\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"MISSING VALUES AND DISTINCTNESS ANALYSIS\")\n",
-    "print(\"=\"*80)\n",
-    "print(missing_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Attack Type distribution\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    print(\"\\nAttack Type Distribution:\")\n",
-    "    print(df['Attack Type'].value_counts())\n",
-    "    \n",
-    "    plt.figure(figsize=(12, 6))\n",
-    "    df['Attack Type'].value_counts().plot(kind='bar', color='steelblue', edgecolor='black')\n",
-    "    plt.title('Attack Type Distribution', fontsize=14, fontweight='bold')\n",
-    "    plt.xlabel('Attack Type')\n",
-    "    plt.ylabel('Count')\n",
-    "    plt.xticks(rotation=45, ha='right')\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 2. PART 1: Proxy Information Analysis\n",
-    "\n",
-    "**Key Insights from Data Profiling:**\n",
-    "- 50% missing values (19,851 out of 40,000)\n",
-    "- 20,148 distinct values when present (highly diverse)\n",
-    "- This suggests proxy info is present only for certain attacks/sources"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Proxy Information Analysis\n",
-    "print(\"=\"*80)\n",
-    "print(\"PROXY INFORMATION ANALYSIS\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "if 'Proxy Information' in df.columns:\n",
-    "    # Basic statistics\n",
-    "    total_records = len(df)\n",
-    "    proxy_present = df['Proxy Information'].notna().sum()\n",
-    "    proxy_missing = df['Proxy Information'].isna().sum()\n",
-    "    unique_proxies = df['Proxy Information'].nunique()\n",
-    "    \n",
-    "    print(f\"\\nProxy Information Statistics:\")\n",
-    "    print(f\"  - Total records: {total_records:,}\")\n",
-    "    print(f\"  - Records WITH proxy info: {proxy_present:,} ({proxy_present/total_records*100:.2f}%)\")\n",
-    "    print(f\"  - Records WITHOUT proxy info: {proxy_missing:,} ({proxy_missing/total_records*100:.2f}%)\")\n",
-    "    print(f\"  - Unique proxy values: {unique_proxies:,}\")\n",
-    "    \n",
-    "    # Create binary feature: has_proxy\n",
-    "    df['has_proxy'] = df['Proxy Information'].notna().astype(int)\n",
-    "    \n",
-    "    print(f\"\\nProxy Usage Distribution:\")\n",
-    "    print(df['has_proxy'].value_counts())\n",
-    "else:\n",
-    "    print(\"Warning: 'Proxy Information' column not found!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize proxy usage patterns\n",
-    "if 'has_proxy' in df.columns:\n",
-    "    fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
-    "    \n",
-    "    # 1. Overall proxy usage pie chart\n",
-    "    proxy_counts = df['has_proxy'].value_counts()\n",
-    "    labels = ['No Proxy', 'With Proxy']\n",
-    "    colors = ['lightcoral', 'lightgreen']\n",
-    "    axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%', \n",
-    "                    colors=colors, startangle=90)\n",
-    "    axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold')\n",
-    "    \n",
-    "    # 2. Proxy usage by Attack Type\n",
-    "    if 'Attack Type' in df.columns:\n",
-    "        proxy_attack = pd.crosstab(df['Attack Type'], df['has_proxy'], normalize='index') * 100\n",
-    "        proxy_attack.plot(kind='bar', ax=axes[0, 1], stacked=False, \n",
-    "                         color=['lightcoral', 'lightgreen'])\n",
-    "        axes[0, 1].set_title('Proxy Usage by Attack Type (%)', fontsize=14, fontweight='bold')\n",
-    "        axes[0, 1].set_xlabel('Attack Type')\n",
-    "        axes[0, 1].set_ylabel('Percentage')\n",
-    "        axes[0, 1].legend(['No Proxy', 'With Proxy'])\n",
-    "        axes[0, 1].tick_params(axis='x', rotation=45)\n",
-    "        \n",
-    "        # Print statistical summary\n",
-    "        print(\"\\nProxy Usage by Attack Type:\")\n",
-    "        print(proxy_attack)\n",
-    "    \n",
-    "    # 3. Proxy usage by Severity Level\n",
-    "    if 'Severity Level' in df.columns:\n",
-    "        proxy_severity = pd.crosstab(df['Severity Level'], df['has_proxy'])\n",
-    "        proxy_severity.plot(kind='bar', ax=axes[1, 0], color=['lightcoral', 'lightgreen'])\n",
-    "        axes[1, 0].set_title('Proxy Usage by Severity Level', fontsize=14, fontweight='bold')\n",
-    "        axes[1, 0].set_xlabel('Severity Level')\n",
-    "        axes[1, 0].set_ylabel('Count')\n",
-    "        axes[1, 0].legend(['No Proxy', 'With Proxy'])\n",
-    "        axes[1, 0].tick_params(axis='x', rotation=45)\n",
-    "    \n",
-    "    # 4. Proxy usage over time\n",
-    "    if 'Timestamp' in df.columns:\n",
-    "        df_temp = df.copy()\n",
-    "        df_temp['Timestamp'] = pd.to_datetime(df_temp['Timestamp'], errors='coerce')\n",
-    "        df_temp = df_temp.dropna(subset=['Timestamp'])\n",
-    "        df_temp['Date'] = df_temp['Timestamp'].dt.date\n",
-    "        \n",
-    "        proxy_time = df_temp.groupby('Date')['has_proxy'].agg(['sum', 'count'])\n",
-    "        proxy_time['percentage'] = (proxy_time['sum'] / proxy_time['count']) * 100\n",
-    "        \n",
-    "        axes[1, 1].plot(proxy_time.index, proxy_time['percentage'], \n",
-    "                       marker='o', color='steelblue', linewidth=2)\n",
-    "        axes[1, 1].set_title('Proxy Usage Trend Over Time', fontsize=14, fontweight='bold')\n",
-    "        axes[1, 1].set_xlabel('Date')\n",
-    "        axes[1, 1].set_ylabel('Percentage Using Proxy')\n",
-    "        axes[1, 1].tick_params(axis='x', rotation=45)\n",
-    "        axes[1, 1].grid(True, alpha=0.3)\n",
-    "    \n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Analyze relationship with Log Source (Firewall vs Server)\n",
-    "if 'Log Source' in df.columns and 'has_proxy' in df.columns:\n",
-    "    print(\"\\nProxy Usage by Log Source:\")\n",
-    "    log_proxy = pd.crosstab(df['Log Source'], df['has_proxy'], normalize='index') * 100\n",
-    "    print(log_proxy)\n",
-    "    \n",
-    "    # Visualize\n",
-    "    log_proxy.plot(kind='bar', figsize=(10, 6), color=['lightcoral', 'lightgreen'])\n",
-    "    plt.title('Proxy Usage: Firewall vs Server Logs', fontsize=14, fontweight='bold')\n",
-    "    plt.xlabel('Log Source')\n",
-    "    plt.ylabel('Percentage')\n",
-    "    plt.legend(['No Proxy', 'With Proxy'])\n",
-    "    plt.xticks(rotation=0)\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Statistical significance test\n",
-    "if 'Attack Type' in df.columns and 'has_proxy' in df.columns:\n",
-    "    print(\"\\n\" + \"=\"*80)\n",
-    "    print(\"PROXY USAGE INSIGHTS BY ATTACK TYPE\")\n",
-    "    print(\"=\"*80)\n",
-    "    \n",
-    "    for attack_type in df['Attack Type'].unique():\n",
-    "        subset = df[df['Attack Type'] == attack_type]\n",
-    "        proxy_pct = (subset['has_proxy'].sum() / len(subset)) * 100\n",
-    "        \n",
-    "        print(f\"\\n{attack_type}:\")\n",
-    "        print(f\"  - Total attacks: {len(subset):,}\")\n",
-    "        print(f\"  - With proxy: {subset['has_proxy'].sum():,} ({proxy_pct:.2f}%)\")\n",
-    "        print(f\"  - Without proxy: {len(subset) - subset['has_proxy'].sum():,} ({100-proxy_pct:.2f}%)\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 📊 Key Findings - Proxy Analysis\n",
-    "\n",
-    "**Summary:**\n",
-    "- Write your key findings here after running the cells above\n",
-    "- Which attack types use proxies most?\n",
-    "- Is there a correlation with severity?\n",
-    "- Any temporal patterns?\n",
-    "\n",
-    "**Recommendation for ML Model:**\n",
-    "- The binary feature `has_proxy` appears to be a strong discriminator\n",
-    "- Consider as a key feature in your model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 3. PART 2: IP Trends and Spoofing Detection\n",
-    "\n",
-    "**Analysis Goals:**\n",
-    "1. Identify top source and destination IPs\n",
-    "2. Detect fan-out patterns (one source → many destinations = scanning/spoofing)\n",
-    "3. Detect fan-in patterns (many sources → one destination = DDoS)\n",
-    "4. Analyze bidirectional traffic\n",
-    "5. Detect private IP usage anomalies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"=\"*80)\n",
-    "print(\"IP TRENDS AND SPOOFING DETECTION\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "# Basic IP statistics\n",
-    "if 'Source IP Address' in df.columns and 'Destination IP Address' in df.columns:\n",
-    "    print(f\"\\nIP Address Statistics:\")\n",
-    "    print(f\"  - Unique Source IPs: {df['Source IP Address'].nunique():,}\")\n",
-    "    print(f\"  - Unique Destination IPs: {df['Destination IP Address'].nunique():,}\")\n",
-    "    print(f\"  - Total IP-to-IP connections: {len(df):,}\")\n",
-    "    print(f\"  - Average connections per source IP: {len(df)/df['Source IP Address'].nunique():.2f}\")\n",
-    "    print(f\"  - Average connections per destination IP: {len(df)/df['Destination IP Address'].nunique():.2f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Top Source IPs\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"TOP SOURCE IP ADDRESSES\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "top_src_ips = df['Source IP Address'].value_counts().head(20)\n",
-    "print(\"\\nTop 20 Source IPs:\")\n",
-    "print(top_src_ips)\n",
-    "\n",
-    "# Visualize\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "plt.barh(range(len(top_src_ips)), top_src_ips.values, color='steelblue')\n",
-    "plt.yticks(range(len(top_src_ips)), top_src_ips.index)\n",
-    "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n",
-    "plt.ylabel('Source IP Address', fontsize=12)\n",
-    "plt.title('Top 20 Most Active Source IP Addresses', fontsize=14, fontweight='bold')\n",
-    "plt.gca().invert_yaxis()\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Top Destination IPs\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"TOP DESTINATION IP ADDRESSES\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "top_dst_ips = df['Destination IP Address'].value_counts().head(20)\n",
-    "print(\"\\nTop 20 Destination IPs:\")\n",
-    "print(top_dst_ips)\n",
-    "\n",
-    "# Visualize\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "plt.barh(range(len(top_dst_ips)), top_dst_ips.values, color='coral')\n",
-    "plt.yticks(range(len(top_dst_ips)), top_dst_ips.index)\n",
-    "plt.xlabel('Frequency (Number of Connections)', fontsize=12)\n",
-    "plt.ylabel('Destination IP Address', fontsize=12)\n",
-    "plt.title('Top 20 Most Targeted Destination IP Addresses', fontsize=14, fontweight='bold')\n",
-    "plt.gca().invert_yaxis()\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SPOOFING DETECTION 1: Fan-out Analysis (Source IP → Multiple Destinations)\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"SPOOFING INDICATOR 1: FAN-OUT PATTERN (Source → Multiple Destinations)\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "# Count unique destinations per source IP\n",
-    "src_to_dst_mapping = df.groupby('Source IP Address')['Destination IP Address'].nunique()\n",
-    "src_to_dst_mapping = src_to_dst_mapping.sort_values(ascending=False)\n",
-    "\n",
-    "# Calculate thresholds\n",
-    "threshold_95 = src_to_dst_mapping.quantile(0.95)\n",
-    "threshold_99 = src_to_dst_mapping.quantile(0.99)\n",
-    "\n",
-    "suspicious_sources_95 = src_to_dst_mapping[src_to_dst_mapping > threshold_95]\n",
-    "suspicious_sources_99 = src_to_dst_mapping[src_to_dst_mapping > threshold_99]\n",
-    "\n",
-    "print(f\"\\nFan-out Statistics:\")\n",
-    "print(f\"  - Mean destinations per source: {src_to_dst_mapping.mean():.2f}\")\n",
-    "print(f\"  - Median destinations per source: {src_to_dst_mapping.median():.2f}\")\n",
-    "print(f\"  - 95th percentile threshold: {threshold_95:.0f} destinations\")\n",
-    "print(f\"  - 99th percentile threshold: {threshold_99:.0f} destinations\")\n",
-    "print(f\"\\nSuspicious Source IPs:\")\n",
-    "print(f\"  - IPs above 95th percentile: {len(suspicious_sources_95)} ({len(suspicious_sources_95)/len(src_to_dst_mapping)*100:.2f}%)\")\n",
-    "print(f\"  - IPs above 99th percentile: {len(suspicious_sources_99)} ({len(suspicious_sources_99)/len(src_to_dst_mapping)*100:.2f}%)\")\n",
-    "\n",
-    "print(f\"\\nTop 10 Source IPs with Highest Fan-out:\")\n",
-    "print(src_to_dst_mapping.head(10))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize fan-out distribution\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
-    "\n",
-    "# Histogram\n",
-    "axes[0].hist(src_to_dst_mapping.values, bins=50, color='red', alpha=0.7, edgecolor='black')\n",
-    "axes[0].axvline(threshold_95, color='darkred', linestyle='--', linewidth=2, \n",
-    "                label=f'95th percentile: {threshold_95:.0f}')\n",
-    "axes[0].axvline(threshold_99, color='maroon', linestyle='--', linewidth=2, \n",
-    "                label=f'99th percentile: {threshold_99:.0f}')\n",
-    "axes[0].set_xlabel('Number of Unique Destinations per Source IP', fontsize=12)\n",
-    "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n",
-    "axes[0].set_title('Source IP Fan-out Distribution\\n(Potential Scanning/Spoofing)', \n",
-    "                  fontsize=14, fontweight='bold')\n",
-    "axes[0].set_yscale('log')\n",
-    "axes[0].legend()\n",
-    "axes[0].grid(True, alpha=0.3)\n",
-    "\n",
-    "# Top suspicious IPs\n",
-    "top_suspicious = src_to_dst_mapping.head(15)\n",
-    "axes[1].barh(range(len(top_suspicious)), top_suspicious.values, color='darkred')\n",
-    "axes[1].set_yticks(range(len(top_suspicious)))\n",
-    "axes[1].set_yticklabels(top_suspicious.index)\n",
-    "axes[1].set_xlabel('Number of Unique Destinations', fontsize=12)\n",
-    "axes[1].set_ylabel('Source IP Address', fontsize=12)\n",
-    "axes[1].set_title('Top 15 Source IPs by Fan-out\\n(Most Suspicious)', \n",
-    "                  fontsize=14, fontweight='bold')\n",
-    "axes[1].invert_yaxis()\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SPOOFING DETECTION 2: Fan-in Analysis (Multiple Sources → Single Destination)\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"SPOOFING INDICATOR 2: FAN-IN PATTERN (Multiple Sources → Destination)\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "# Count unique sources per destination IP\n",
-    "dst_to_src_mapping = df.groupby('Destination IP Address')['Source IP Address'].nunique()\n",
-    "dst_to_src_mapping = dst_to_src_mapping.sort_values(ascending=False)\n",
-    "\n",
-    "# Calculate thresholds\n",
-    "threshold_95_dst = dst_to_src_mapping.quantile(0.95)\n",
-    "threshold_99_dst = dst_to_src_mapping.quantile(0.99)\n",
-    "\n",
-    "suspicious_targets_95 = dst_to_src_mapping[dst_to_src_mapping > threshold_95_dst]\n",
-    "suspicious_targets_99 = dst_to_src_mapping[dst_to_src_mapping > threshold_99_dst]\n",
-    "\n",
-    "print(f\"\\nFan-in Statistics:\")\n",
-    "print(f\"  - Mean sources per destination: {dst_to_src_mapping.mean():.2f}\")\n",
-    "print(f\"  - Median sources per destination: {dst_to_src_mapping.median():.2f}\")\n",
-    "print(f\"  - 95th percentile threshold: {threshold_95_dst:.0f} sources\")\n",
-    "print(f\"  - 99th percentile threshold: {threshold_99_dst:.0f} sources\")\n",
-    "print(f\"\\nSuspicious Target IPs (Potential DDoS Victims):\")\n",
-    "print(f\"  - IPs above 95th percentile: {len(suspicious_targets_95)} ({len(suspicious_targets_95)/len(dst_to_src_mapping)*100:.2f}%)\")\n",
-    "print(f\"  - IPs above 99th percentile: {len(suspicious_targets_99)} ({len(suspicious_targets_99)/len(dst_to_src_mapping)*100:.2f}%)\")\n",
-    "\n",
-    "print(f\"\\nTop 10 Target IPs with Highest Fan-in:\")\n",
-    "print(dst_to_src_mapping.head(10))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize fan-in distribution\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
-    "\n",
-    "# Histogram\n",
-    "axes[0].hist(dst_to_src_mapping.values, bins=50, color='purple', alpha=0.7, edgecolor='black')\n",
-    "axes[0].axvline(threshold_95_dst, color='darkviolet', linestyle='--', linewidth=2, \n",
-    "                label=f'95th percentile: {threshold_95_dst:.0f}')\n",
-    "axes[0].axvline(threshold_99_dst, color='indigo', linestyle='--', linewidth=2, \n",
-    "                label=f'99th percentile: {threshold_99_dst:.0f}')\n",
-    "axes[0].set_xlabel('Number of Unique Sources per Destination IP', fontsize=12)\n",
-    "axes[0].set_ylabel('Frequency (log scale)', fontsize=12)\n",
-    "axes[0].set_title('Destination IP Fan-in Distribution\\n(Potential DDoS Targets)', \n",
-    "                  fontsize=14, fontweight='bold')\n",
-    "axes[0].set_yscale('log')\n",
-    "axes[0].legend()\n",
-    "axes[0].grid(True, alpha=0.3)\n",
-    "\n",
-    "# Top targeted IPs\n",
-    "top_targets = dst_to_src_mapping.head(15)\n",
-    "axes[1].barh(range(len(top_targets)), top_targets.values, color='darkviolet')\n",
-    "axes[1].set_yticks(range(len(top_targets)))\n",
-    "axes[1].set_yticklabels(top_targets.index)\n",
-    "axes[1].set_xlabel('Number of Unique Sources', fontsize=12)\n",
-    "axes[1].set_ylabel('Destination IP Address', fontsize=12)\n",
-    "axes[1].set_title('Top 15 Destination IPs by Fan-in\\n(Potential DDoS Targets)', \n",
-    "                  fontsize=14, fontweight='bold')\n",
-    "axes[1].invert_yaxis()\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SPOOFING DETECTION 3: Bidirectional Traffic Analysis\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"SPOOFING INDICATOR 3: BIDIRECTIONAL TRAFFIC\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "source_ips_set = set(df['Source IP Address'].dropna())\n",
-    "dest_ips_set = set(df['Destination IP Address'].dropna())\n",
-    "bidirectional_ips = source_ips_set.intersection(dest_ips_set)\n",
-    "\n",
-    "print(f\"\\nBidirectional IP Statistics:\")\n",
-    "print(f\"  - Total unique source IPs: {len(source_ips_set):,}\")\n",
-    "print(f\"  - Total unique destination IPs: {len(dest_ips_set):,}\")\n",
-    "print(f\"  - IPs appearing as BOTH source and destination: {len(bidirectional_ips):,}\")\n",
-    "print(f\"  - Percentage of bidirectional IPs: {len(bidirectional_ips)/(len(source_ips_set.union(dest_ips_set)))*100:.2f}%\")\n",
-    "\n",
-    "# Analyze bidirectional traffic by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    df['is_bidirectional'] = (df['Source IP Address'].isin(bidirectional_ips)) | \\\n",
-    "                              (df['Destination IP Address'].isin(bidirectional_ips))\n",
-    "    \n",
-    "    print(f\"\\nBidirectional Traffic by Attack Type:\")\n",
-    "    bidir_attack = pd.crosstab(df['Attack Type'], df['is_bidirectional'], normalize='index') * 100\n",
-    "    print(bidir_attack)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize bidirectional traffic\n",
-    "if 'Attack Type' in df.columns and 'is_bidirectional' in df.columns:\n",
-    "    plt.figure(figsize=(12, 6))\n",
-    "    bidir_attack[True].sort_values().plot(kind='barh', color='teal', edgecolor='black')\n",
-    "    plt.xlabel('Percentage of Traffic with Bidirectional IPs', fontsize=12)\n",
-    "    plt.ylabel('Attack Type', fontsize=12)\n",
-    "    plt.title('Bidirectional IP Traffic by Attack Type', fontsize=14, fontweight='bold')\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SPOOFING DETECTION 4: Private IP Detection\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"SPOOFING INDICATOR 4: PRIVATE IP ADDRESS DETECTION\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "def is_private_ip(ip):\n",
-    "    \"\"\"Check if an IP is in private range (RFC 1918)\"\"\"\n",
-    "    if pd.isna(ip):\n",
-    "        return False\n",
-    "    try:\n",
-    "        parts = str(ip).split('.')\n",
-    "        if len(parts) != 4:\n",
-    "            return False\n",
-    "        first = int(parts[0])\n",
-    "        second = int(parts[1])\n",
-    "        \n",
-    "        # Private IP ranges: 10.x.x.x, 172.16-31.x.x, 192.168.x.x\n",
-    "        if first == 10:\n",
-    "            return True\n",
-    "        if first == 172 and 16 <= second <= 31:\n",
-    "            return True\n",
-    "        if first == 192 and second == 168:\n",
-    "            return True\n",
-    "        return False\n",
-    "    except:\n",
-    "        return False\n",
-    "\n",
-    "df['src_is_private'] = df['Source IP Address'].apply(is_private_ip)\n",
-    "df['dst_is_private'] = df['Destination IP Address'].apply(is_private_ip)\n",
-    "\n",
-    "print(f\"\\nPrivate IP Statistics:\")\n",
-    "print(f\"  - Source IPs from private ranges: {df['src_is_private'].sum():,} ({df['src_is_private'].sum()/len(df)*100:.2f}%)\")\n",
-    "print(f\"  - Destination IPs from private ranges: {df['dst_is_private'].sum():,} ({df['dst_is_private'].sum()/len(df)*100:.2f}%)\")\n",
-    "print(f\"  - Total connections involving private IPs: {(df['src_is_private'] | df['dst_is_private']).sum():,}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Analyze private IP usage by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    print(\"\\nPrivate IP Usage by Attack Type:\")\n",
-    "    attack_private = df.groupby('Attack Type').agg({\n",
-    "        'src_is_private': ['sum', 'mean'],\n",
-    "        'dst_is_private': ['sum', 'mean']\n",
-    "    })\n",
-    "    attack_private.columns = ['Src_Private_Count', 'Src_Private_Pct', 'Dst_Private_Count', 'Dst_Private_Pct']\n",
-    "    attack_private['Src_Private_Pct'] = attack_private['Src_Private_Pct'] * 100\n",
-    "    attack_private['Dst_Private_Pct'] = attack_private['Dst_Private_Pct'] * 100\n",
-    "    print(attack_private)\n",
-    "    \n",
-    "    # Visualize\n",
-    "    fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
-    "    \n",
-    "    attack_private['Src_Private_Pct'].plot(kind='bar', ax=axes[0], color='orange', edgecolor='black')\n",
-    "    axes[0].set_title('Source Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n",
-    "    axes[0].set_xlabel('Attack Type')\n",
-    "    axes[0].set_ylabel('Percentage')\n",
-    "    axes[0].tick_params(axis='x', rotation=45)\n",
-    "    \n",
-    "    attack_private['Dst_Private_Pct'].plot(kind='bar', ax=axes[1], color='red', edgecolor='black')\n",
-    "    axes[1].set_title('Destination Private IP Usage by Attack Type', fontsize=14, fontweight='bold')\n",
-    "    axes[1].set_xlabel('Attack Type')\n",
-    "    axes[1].set_ylabel('Percentage')\n",
-    "    axes[1].tick_params(axis='x', rotation=45)\n",
-    "    \n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Geo-location analysis (if available)\n",
-    "if 'Geo-location Data' in df.columns:\n",
-    "    print(\"\\n\" + \"=\"*80)\n",
-    "    print(\"GEO-LOCATION ANALYSIS\")\n",
-    "    print(\"=\"*80)\n",
-    "    \n",
-    "    print(f\"\\nGeo-location Statistics:\")\n",
-    "    print(f\"  - Unique locations: {df['Geo-location Data'].nunique():,}\")\n",
-    "    print(f\"  - Missing values: {df['Geo-location Data'].isna().sum():,}\")\n",
-    "    \n",
-    "    # Top locations\n",
-    "    print(f\"\\nTop 15 Geo-locations:\")\n",
-    "    top_locations = df['Geo-location Data'].value_counts().head(15)\n",
-    "    print(top_locations)\n",
-    "    \n",
-    "    # Visualize\n",
-    "    plt.figure(figsize=(14, 8))\n",
-    "    top_locations.plot(kind='barh', color='skyblue', edgecolor='black')\n",
-    "    plt.xlabel('Frequency', fontsize=12)\n",
-    "    plt.ylabel('Geo-location', fontsize=12)\n",
-    "    plt.title('Top 15 Geo-locations in Attack Traffic', fontsize=14, fontweight='bold')\n",
-    "    plt.gca().invert_yaxis()\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()\n",
-    "    \n",
-    "    # Geo-location by attack type\n",
-    "    if 'Attack Type' in df.columns:\n",
-    "        print(f\"\\nTop Geo-location by Attack Type:\")\n",
-    "        for attack in df['Attack Type'].unique():\n",
-    "            top_loc = df[df['Attack Type'] == attack]['Geo-location Data'].value_counts().head(1)\n",
-    "            if len(top_loc) > 0:\n",
-    "                print(f\"  {attack}: {top_loc.index[0]} ({top_loc.values[0]} occurrences)\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 📊 Key Findings - IP Trends & Spoofing\n",
-    "\n",
-    "**Summary:**\n",
-    "- Write your key findings here\n",
-    "- How many suspicious IPs detected (fan-out/fan-in)?\n",
-    "- Any DDoS targets identified?\n",
-    "- Private IP issues?\n",
-    "- Geographic patterns?\n",
-    "\n",
-    "**Red Flags Identified:**\n",
-    "- List specific suspicious IPs or patterns\n",
-    "\n",
-    "**Recommendation for ML Model:**\n",
-    "- Create features: source_fanout_score, dest_fanin_score, is_bidirectional, is_private_ip"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 4. PART 3: Data Bin Trends Analysis\n",
-    "\n",
-    "**Analysis Goals:**\n",
-    "1. Packet Length distribution and binning\n",
-    "2. Port usage patterns (well-known, registered, dynamic)\n",
-    "3. Protocol distribution\n",
-    "4. Anomaly score categorization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"=\"*80)\n",
-    "print(\"DATA BIN TRENDS ANALYSIS\")\n",
-    "print(\"=\"*80)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 1. PACKET LENGTH ANALYSIS\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"PACKET LENGTH DISTRIBUTION\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "if 'Packet Length' in df.columns:\n",
-    "    packet_lengths = df['Packet Length'].dropna()\n",
-    "    \n",
-    "    print(f\"\\nPacket Length Statistics:\")\n",
-    "    print(f\"  - Mean: {packet_lengths.mean():.2f} bytes\")\n",
-    "    print(f\"  - Median: {packet_lengths.median():.2f} bytes\")\n",
-    "    print(f\"  - Std Dev: {packet_lengths.std():.2f} bytes\")\n",
-    "    print(f\"  - Min: {packet_lengths.min():.2f} bytes\")\n",
-    "    print(f\"  - Max: {packet_lengths.max():.2f} bytes\")\n",
-    "    print(f\"  - 25th percentile: {packet_lengths.quantile(0.25):.2f} bytes\")\n",
-    "    print(f\"  - 75th percentile: {packet_lengths.quantile(0.75):.2f} bytes\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create packet length bins\n",
-    "bins_packet = [0, 64, 128, 256, 512, 1024, 2048, float('inf')]\n",
-    "labels_packet = ['0-64', '64-128', '128-256', '256-512', '512-1024', '1024-2048', '2048+']\n",
-    "df['packet_length_bin'] = pd.cut(df['Packet Length'], bins=bins_packet, labels=labels_packet)\n",
-    "\n",
-    "packet_bin_dist = df['packet_length_bin'].value_counts().sort_index()\n",
-    "print(f\"\\nPacket Length Bins Distribution:\")\n",
-    "print(packet_bin_dist)\n",
-    "print(f\"\\nPercentage Distribution:\")\n",
-    "print((packet_bin_dist / packet_bin_dist.sum() * 100).round(2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize packet length\n",
-    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
-    "\n",
-    "# Histogram\n",
-    "axes[0, 0].hist(packet_lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7)\n",
-    "axes[0, 0].set_xlabel('Packet Length (bytes)', fontsize=11)\n",
-    "axes[0, 0].set_ylabel('Frequency (log scale)', fontsize=11)\n",
-    "axes[0, 0].set_title('Packet Length Distribution', fontsize=13, fontweight='bold')\n",
-    "axes[0, 0].set_yscale('log')\n",
-    "axes[0, 0].grid(True, alpha=0.3)\n",
-    "\n",
-    "# Binned distribution\n",
-    "packet_bin_dist.plot(kind='bar', ax=axes[0, 1], color='coral', edgecolor='black')\n",
-    "axes[0, 1].set_xlabel('Packet Length Bins (bytes)', fontsize=11)\n",
-    "axes[0, 1].set_ylabel('Count', fontsize=11)\n",
-    "axes[0, 1].set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold')\n",
-    "axes[0, 1].tick_params(axis='x', rotation=45)\n",
-    "\n",
-    "# Box plot by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    df.boxplot(column='Packet Length', by='Attack Type', ax=axes[1, 0])\n",
-    "    axes[1, 0].set_xlabel('Attack Type', fontsize=11)\n",
-    "    axes[1, 0].set_ylabel('Packet Length (bytes)', fontsize=11)\n",
-    "    axes[1, 0].set_title('Packet Length by Attack Type', fontsize=13, fontweight='bold')\n",
-    "    axes[1, 0].get_figure().suptitle('')  # Remove default title\n",
-    "    plt.sca(axes[1, 0])\n",
-    "    plt.xticks(rotation=45, ha='right')\n",
-    "\n",
-    "# Bins by attack type (stacked bar)\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    bin_attack = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100\n",
-    "    bin_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='tab10')\n",
-    "    axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n",
-    "    axes[1, 1].set_ylabel('Percentage', fontsize=11)\n",
-    "    axes[1, 1].set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold')\n",
-    "    axes[1, 1].legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
-    "    axes[1, 1].tick_params(axis='x', rotation=45)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 2. PORT ANALYSIS\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"PORT USAGE ANALYSIS\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "if 'Source Port' in df.columns and 'Destination Port' in df.columns:\n",
-    "    # Source ports\n",
-    "    print(f\"\\nTop 10 Source Ports:\")\n",
-    "    top_src_ports = df['Source Port'].value_counts().head(10)\n",
-    "    print(top_src_ports)\n",
-    "    \n",
-    "    # Destination ports\n",
-    "    print(f\"\\nTop 10 Destination Ports:\")\n",
-    "    top_dst_ports = df['Destination Port'].value_counts().head(10)\n",
-    "    print(top_dst_ports)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create port categories\n",
-    "def categorize_port(port):\n",
-    "    \"\"\"Categorize ports into well-known, registered, or dynamic\"\"\"\n",
-    "    if pd.isna(port):\n",
-    "        return 'Unknown'\n",
-    "    try:\n",
-    "        port = int(port)\n",
-    "        if 0 <= port <= 1023:\n",
-    "            return 'Well-known (0-1023)'\n",
-    "        elif 1024 <= port <= 49151:\n",
-    "            return 'Registered (1024-49151)'\n",
-    "        elif 49152 <= port <= 65535:\n",
-    "            return 'Dynamic (49152-65535)'\n",
-    "        else:\n",
-    "            return 'Unknown'\n",
-    "    except:\n",
-    "        return 'Unknown'\n",
-    "\n",
-    "df['dst_port_category'] = df['Destination Port'].apply(categorize_port)\n",
-    "df['src_port_category'] = df['Source Port'].apply(categorize_port)\n",
-    "\n",
-    "print(f\"\\nDestination Port Categories:\")\n",
-    "print(df['dst_port_category'].value_counts())\n",
-    "print(f\"\\nPercentage:\")\n",
-    "print((df['dst_port_category'].value_counts() / len(df) * 100).round(2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize port analysis\n",
-    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
-    "\n",
-    "# Top source ports\n",
-    "top_src_ports_15 = df['Source Port'].value_counts().head(15)\n",
-    "axes[0, 0].barh(range(len(top_src_ports_15)), top_src_ports_15.values, color='lightgreen')\n",
-    "axes[0, 0].set_yticks(range(len(top_src_ports_15)))\n",
-    "axes[0, 0].set_yticklabels(top_src_ports_15.index)\n",
-    "axes[0, 0].set_xlabel('Frequency', fontsize=11)\n",
-    "axes[0, 0].set_ylabel('Port Number', fontsize=11)\n",
-    "axes[0, 0].set_title('Top 15 Source Ports', fontsize=13, fontweight='bold')\n",
-    "axes[0, 0].invert_yaxis()\n",
-    "\n",
-    "# Top destination ports\n",
-    "top_dst_ports_15 = df['Destination Port'].value_counts().head(15)\n",
-    "axes[0, 1].barh(range(len(top_dst_ports_15)), top_dst_ports_15.values, color='lightcoral')\n",
-    "axes[0, 1].set_yticks(range(len(top_dst_ports_15)))\n",
-    "axes[0, 1].set_yticklabels(top_dst_ports_15.index)\n",
-    "axes[0, 1].set_xlabel('Frequency', fontsize=11)\n",
-    "axes[0, 1].set_ylabel('Port Number', fontsize=11)\n",
-    "axes[0, 1].set_title('Top 15 Destination Ports', fontsize=13, fontweight='bold')\n",
-    "axes[0, 1].invert_yaxis()\n",
-    "\n",
-    "# Port category pie chart\n",
-    "port_cat_dist = df['dst_port_category'].value_counts()\n",
-    "axes[1, 0].pie(port_cat_dist.values, labels=port_cat_dist.index, autopct='%1.1f%%', startangle=90)\n",
-    "axes[1, 0].set_title('Destination Port Categories', fontsize=13, fontweight='bold')\n",
-    "\n",
-    "# Port categories by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    port_attack = pd.crosstab(df['Attack Type'], df['dst_port_category'])\n",
-    "    port_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='Set3')\n",
-    "    axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n",
-    "    axes[1, 1].set_ylabel('Count', fontsize=11)\n",
-    "    axes[1, 1].set_title('Port Categories by Attack Type', fontsize=13, fontweight='bold')\n",
-    "    axes[1, 1].legend(title='Port Category', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
-    "    axes[1, 1].tick_params(axis='x', rotation=45)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 3. PROTOCOL ANALYSIS\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"PROTOCOL DISTRIBUTION\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "if 'Protocol' in df.columns:\n",
-    "    protocol_dist = df['Protocol'].value_counts()\n",
-    "    print(f\"\\nProtocol Distribution:\")\n",
-    "    print(protocol_dist)\n",
-    "    print(f\"\\nPercentage:\")\n",
-    "    print((protocol_dist / protocol_dist.sum() * 100).round(2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize protocol analysis\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
-    "\n",
-    "# Protocol pie chart\n",
-    "axes[0].pie(protocol_dist.values, labels=protocol_dist.index, autopct='%1.1f%%', startangle=90)\n",
-    "axes[0].set_title('Protocol Distribution', fontsize=14, fontweight='bold')\n",
-    "\n",
-    "# Protocol by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    protocol_attack = pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100\n",
-    "    protocol_attack.plot(kind='bar', stacked=True, ax=axes[1], colormap='viridis')\n",
-    "    axes[1].set_xlabel('Attack Type', fontsize=12)\n",
-    "    axes[1].set_ylabel('Percentage', fontsize=12)\n",
-    "    axes[1].set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold')\n",
-    "    axes[1].legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
-    "    axes[1].tick_params(axis='x', rotation=45)\n",
-    "    \n",
-    "    print(f\"\\nProtocol Usage by Attack Type (%):\")\n",
-    "    print(protocol_attack.round(2))\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 4. ANOMALY SCORES ANALYSIS\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"ANOMALY SCORES DISTRIBUTION\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "if 'Anomaly Scores' in df.columns:\n",
-    "    anomaly_scores = df['Anomaly Scores'].dropna()\n",
-    "    \n",
-    "    print(f\"\\nAnomaly Score Statistics:\")\n",
-    "    print(f\"  - Mean: {anomaly_scores.mean():.4f}\")\n",
-    "    print(f\"  - Median: {anomaly_scores.median():.4f}\")\n",
-    "    print(f\"  - Std Dev: {anomaly_scores.std():.4f}\")\n",
-    "    print(f\"  - Min: {anomaly_scores.min():.4f}\")\n",
-    "    print(f\"  - Max: {anomaly_scores.max():.4f}\")\n",
-    "    print(f\"  - 25th percentile: {anomaly_scores.quantile(0.25):.4f}\")\n",
-    "    print(f\"  - 50th percentile: {anomaly_scores.quantile(0.50):.4f}\")\n",
-    "    print(f\"  - 75th percentile: {anomaly_scores.quantile(0.75):.4f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create anomaly score categories based on quartiles\n",
-    "anomaly_bins = [anomaly_scores.min(), \n",
-    "                anomaly_scores.quantile(0.25),\n",
-    "                anomaly_scores.quantile(0.5),\n",
-    "                anomaly_scores.quantile(0.75),\n",
-    "                anomaly_scores.max()]\n",
-    "anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)']\n",
-    "\n",
-    "df['anomaly_category'] = pd.cut(df['Anomaly Scores'], bins=anomaly_bins, \n",
-    "                                 labels=anomaly_labels, include_lowest=True)\n",
-    "\n",
-    "anomaly_cat_dist = df['anomaly_category'].value_counts().sort_index()\n",
-    "print(f\"\\nAnomaly Score Categories:\")\n",
-    "print(anomaly_cat_dist)\n",
-    "print(f\"\\nPercentage:\")\n",
-    "print((anomaly_cat_dist / anomaly_cat_dist.sum() * 100).round(2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize anomaly scores\n",
-    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
-    "\n",
-    "# Histogram\n",
-    "axes[0, 0].hist(anomaly_scores, bins=50, color='purple', alpha=0.7, edgecolor='black')\n",
-    "axes[0, 0].set_xlabel('Anomaly Score', fontsize=11)\n",
-    "axes[0, 0].set_ylabel('Frequency', fontsize=11)\n",
-    "axes[0, 0].set_title('Anomaly Score Distribution', fontsize=13, fontweight='bold')\n",
-    "axes[0, 0].grid(True, alpha=0.3)\n",
-    "\n",
-    "# Category bar chart\n",
-    "colors = ['green', 'yellow', 'orange', 'red']\n",
-    "axes[0, 1].bar(range(len(anomaly_cat_dist)), anomaly_cat_dist.values, \n",
-    "               color=colors, edgecolor='black')\n",
-    "axes[0, 1].set_xticks(range(len(anomaly_cat_dist)))\n",
-    "axes[0, 1].set_xticklabels(anomaly_cat_dist.index, rotation=45, ha='right')\n",
-    "axes[0, 1].set_xlabel('Anomaly Category', fontsize=11)\n",
-    "axes[0, 1].set_ylabel('Count', fontsize=11)\n",
-    "axes[0, 1].set_title('Anomaly Score Categories', fontsize=13, fontweight='bold')\n",
-    "\n",
-    "# Box plot by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    df.boxplot(column='Anomaly Scores', by='Attack Type', ax=axes[1, 0])\n",
-    "    axes[1, 0].set_xlabel('Attack Type', fontsize=11)\n",
-    "    axes[1, 0].set_ylabel('Anomaly Score', fontsize=11)\n",
-    "    axes[1, 0].set_title('Anomaly Scores by Attack Type', fontsize=13, fontweight='bold')\n",
-    "    axes[1, 0].get_figure().suptitle('')\n",
-    "    plt.sca(axes[1, 0])\n",
-    "    plt.xticks(rotation=45, ha='right')\n",
-    "\n",
-    "# Category by attack type\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    anomaly_attack = pd.crosstab(df['Attack Type'], df['anomaly_category'], normalize='index') * 100\n",
-    "    anomaly_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], color=colors)\n",
-    "    axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n",
-    "    axes[1, 1].set_ylabel('Percentage', fontsize=11)\n",
-    "    axes[1, 1].set_title('Anomaly Categories by Attack Type (%)', fontsize=13, fontweight='bold')\n",
-    "    axes[1, 1].legend(title='Anomaly Level', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
-    "    axes[1, 1].tick_params(axis='x', rotation=45)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 📊 Key Findings - Data Bin Trends\n",
-    "\n",
-    "**Summary:**\n",
-    "- Write your key findings here\n",
-    "- What are the dominant packet sizes per attack type?\n",
-    "- Which ports are most targeted?\n",
-    "- Protocol preferences?\n",
-    "- Anomaly score patterns?\n",
-    "\n",
-    "**Attack Signatures Identified:**\n",
-    "- DDoS: [packet size pattern, protocol, ports]\n",
-    "- Malware: [packet size pattern, protocol, ports]\n",
-    "- etc.\n",
-    "\n",
-    "**Recommendation for ML Model:**\n",
-    "- Use binned features: packet_length_bin, port_category, anomaly_category"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 5. COMPREHENSIVE SUMMARY & INSIGHTS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"COMPREHENSIVE EDA SUMMARY\")\n",
-    "print(\"=\"*80)\n",
-    "\n",
-    "print(f\"\\n📊 DATASET OVERVIEW\")\n",
-    "print(\"-\" * 80)\n",
-    "print(f\"Total Records: {len(df):,}\")\n",
-    "print(f\"Total Features: {df.shape[1]}\")\n",
-    "print(f\"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\")\n",
-    "\n",
-    "if 'Attack Type' in df.columns:\n",
-    "    print(f\"\\n🎯 ATTACK TYPE DISTRIBUTION\")\n",
-    "    print(\"-\" * 80)\n",
-    "    attack_dist = df['Attack Type'].value_counts()\n",
-    "    for attack, count in attack_dist.items():\n",
-    "        print(f\"  {attack}: {count:,} ({count/len(df)*100:.2f}%)\")\n",
-    "\n",
-    "print(f\"\\n🔍 KEY STATISTICS\")\n",
-    "print(\"-\" * 80)\n",
-    "\n",
-    "# Proxy\n",
-    "if 'has_proxy' in df.columns:\n",
-    "    proxy_pct = (df['has_proxy'].sum() / len(df)) * 100\n",
-    "    print(f\"  - Proxy Usage Rate: {proxy_pct:.2f}%\")\n",
-    "\n",
-    "# IPs\n",
-    "if 'Source IP Address' in df.columns:\n",
-    "    print(f\"  - Unique Source IPs: {df['Source IP Address'].nunique():,}\")\n",
-    "    print(f\"  - Unique Destination IPs: {df['Destination IP Address'].nunique():,}\")\n",
-    "\n",
-    "# Packet Length\n",
-    "if 'Packet Length' in df.columns:\n",
-    "    print(f\"  - Average Packet Size: {df['Packet Length'].mean():.2f} bytes\")\n",
-    "\n",
-    "# Protocol\n",
-    "if 'Protocol' in df.columns:\n",
-    "    top_protocol = df['Protocol'].value_counts().index[0]\n",
-    "    top_protocol_pct = (df['Protocol'].value_counts().values[0] / len(df)) * 100\n",
-    "    print(f\"  - Most Common Protocol: {top_protocol} ({top_protocol_pct:.2f}%)\")\n",
-    "\n",
-    "# Port\n",
-    "if 'Destination Port' in df.columns:\n",
-    "    top_port = df['Destination Port'].value_counts().index[0]\n",
-    "    top_port_count = df['Destination Port'].value_counts().values[0]\n",
-    "    print(f\"  - Most Targeted Port: {top_port} ({top_port_count:,} times)\")\n",
-    "\n",
-    "print(\"\\n\" + \"=\"*80)\n",
-    "print(\"✅ EDA ANALYSIS COMPLETE!\")\n",
-    "print(\"=\"*80)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 6. EXPORT ENGINEERED FEATURES (Optional)\n",
-    "\n",
-    "Create new features based on EDA insights for ML model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create a summary of engineered features\n",
-    "engineered_features = df[[\n",
-    "    'has_proxy',                    # Binary: 0/1\n",
-    "    'is_bidirectional',            # Binary: 0/1  \n",
-    "    'src_is_private',              # Binary: 0/1\n",
-    "    'dst_is_private',              # Binary: 0/1\n",
-    "    'packet_length_bin',           # Categorical: 7 categories\n",
-    "    'dst_port_category',           # Categorical: 3 categories\n",
-    "    'src_port_category',           # Categorical: 3 categories\n",
-    "    'anomaly_category'             # Categorical: 4 categories\n",
-    "]].copy()\n",
-    "\n",
-    "print(\"Engineered Features Summary:\")\n",
-    "print(engineered_features.head(10))\n",
-    "print(f\"\\nShape: {engineered_features.shape}\")\n",
-    "print(f\"\\nFeature Data Types:\")\n",
-    "print(engineered_features.dtypes)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Optional: Save engineered features to CSV\n",
-    "# engineered_features.to_csv('engineered_features.csv', index=False)\n",
-    "# print(\"✓ Engineered features saved to 'engineered_features.csv'\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## 7. CONCLUSIONS & RECOMMENDATIONS\n",
-    "\n",
-    "### Key Findings:\n",
-    "1. **Proxy Usage:**\n",
-    "   - [Your findings here]\n",
-    "   \n",
-    "2. **IP Spoofing Indicators:**\n",
-    "   - [Your findings here]\n",
-    "   \n",
-    "3. **Data Bin Patterns:**\n",
-    "   - [Your findings here]\n",
-    "\n",
-    "### Recommendations for ML Model:\n",
-    "1. Binary features: `has_proxy`, `is_bidirectional`, `src_is_private`, `dst_is_private`\n",
-    "2. Categorical features: `packet_length_bin`, `port_category`, `anomaly_category`\n",
-    "3. Numerical features: Consider creating fan-out/fan-in scores\n",
-    "4. Attack-specific patterns identified can guide feature importance analysis\n",
-    "\n",
-    "### Next Steps:\n",
-    "1. Data preprocessing (handle missing values, encode categoricals)\n",
-    "2. Feature scaling/normalization\n",
-    "3. Address class imbalance if needed\n",
-    "4. Model selection and training\n",
-    "5. Hyperparameter tuning\n",
-    "6. Model evaluation"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/data_loader.py b/data_loader.py
new file mode 100644
index 0000000..930311e
--- /dev/null
+++ b/data_loader.py
@@ -0,0 +1,113 @@
+"""
+Data Loading and Basic Validation Module
+Handles CSV loading, initial validation, and basic cleaning
+"""
+
+import pandas as pd
+import numpy as np
+
+
+def load_dataset(filepath, verbose=True):
+    """
+    Read the cybersecurity attacks CSV into a DataFrame
+
+    Parameters:
+    -----------
+    filepath : str
+        Path of the CSV file, handed unchanged to pd.read_csv
+    verbose : bool
+        If True, print the shape, record count and feature count
+
+    Returns:
+    --------
+    pd.DataFrame
+        The frame as parsed by pd.read_csv
+    """
+    dataset = pd.read_csv(filepath)
+    if not verbose:
+        return dataset
+    n_records, n_features = dataset.shape
+    print("✓ Dataset loaded successfully!")
+    print(f"  - Shape: {dataset.shape}")
+    print(f"  - Total Records: {n_records:,}")
+    print(f"  - Total Features: {n_features}")
+    return dataset
+
+
+def get_missing_value_summary(df):
+    """
+    Summarise missing and distinct values for every column
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Frame to profile
+
+    Returns:
+    --------
+    pd.DataFrame
+        One row per column of df, sorted by Missing_Count (descending)
+    """
+    n_rows, null_counts, distinct_counts = len(df), df.isnull().sum(), df.nunique()
+    columns = {
+        'Missing_Count': null_counts,
+        'Missing_Percentage': (null_counts / n_rows) * 100,
+        'Distinct_Count': distinct_counts,
+        'Distinct_Percentage': (distinct_counts / n_rows) * 100,
+    }
+    return pd.DataFrame(columns).sort_values('Missing_Count', ascending=False)
+
+
+def validate_required_columns(df, required_columns):
+    """
+    Report which of the required columns are absent from the dataframe
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Frame whose columns are inspected
+    required_columns : list
+        Column names expected to be present
+
+    Returns:
+    --------
+    tuple
+        (True, []) when nothing is absent, else (False, absent names in input order)
+    """
+    absent = list(filter(lambda name: name not in df.columns, required_columns))
+    if not absent:
+        # Nothing is missing: report success with a fresh empty list.
+        return True, []
+
+    print(f"⚠️  Missing required columns: {absent}")
+    return False, absent
+
+
+def get_dataset_info(df):
+    """
+    Print an overview of the dataframe: size, dtype mix and missing cells
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Frame to describe; nothing is returned
+    """
+    n_rows, n_cols = len(df), df.shape[1]
+    megabytes = df.memory_usage(deep=True).sum() / 1024**2
+    total_missing = df.isnull().sum().sum()
+
+    print("\n" + "="*80)
+    print("DATASET INFORMATION")
+    print("="*80)
+    print("\n📊 Basic Stats:")
+    print(f"  - Total Records: {n_rows:,}")
+    print(f"  - Total Features: {n_cols}")
+    print(f"  - Memory Usage: {megabytes:.2f} MB")
+    print("\n📋 Data Types:")
+    print(df.dtypes.value_counts())
+    print("\n❌ Missing Values:")
+    if total_missing <= 0:
+        print("  - No missing values found ✓")
+        return
+    print(f"  - Total missing values: {total_missing:,}")
+    print(f"  - Percentage: {(total_missing / (n_rows * n_cols)) * 100:.2f}%")
diff --git a/features.py b/features.py
new file mode 100644
index 0000000..4fcdda0
--- /dev/null
+++ b/features.py
@@ -0,0 +1,269 @@
+"""
+Feature Engineering Module
+Functions for creating derived features from raw data
+"""
+
+import pandas as pd
+import numpy as np
+
+
+def create_proxy_features(df):
+    """
+    Add a binary 'has_proxy' flag derived from the 'Proxy Information' column.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Dataframe to extend; the new column is written onto this object
+
+    Returns:
+    --------
+    pd.DataFrame
+        The same dataframe; 'has_proxy' is 1 where a proxy value is present
+    """
+    if 'Proxy Information' not in df.columns:
+        print("⚠️  'Proxy Information' column not found, skipping proxy features")
+        return df
+
+    df['has_proxy'] = df['Proxy Information'].notna().astype(int)
+    proxied = df['has_proxy'].sum()
+    print("✓ Created 'has_proxy' feature")
+    print(f"  - Records with proxy: {proxied:,} ({proxied/len(df)*100:.2f}%)")
+    return df
+
+
+def create_ip_features(df):
+    """
+    Create IP-related features: first octet, private-range flags and a
+    bidirectional-traffic flag. Missing or malformed addresses are tolerated.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with IP address columns (new columns are added to it)
+
+    Returns:
+    --------
+    pd.DataFrame
+        Dataframe with added IP features
+    """
+    src_col, dst_col = 'Source IP Address', 'Destination IP Address'
+    if src_col not in df.columns or dst_col not in df.columns:
+        print("⚠️  IP address columns not found, skipping IP features")
+        return df
+
+    def first_octet(addresses):
+        # Bad or missing addresses become NA; clean columns keep plain ints.
+        octet = pd.to_numeric(addresses.str.split('.').str[0], errors='coerce')
+        return octet.astype('Int64') if octet.isna().any() else octet.astype(int)
+
+    df['src_ip_class'] = first_octet(df[src_col])
+    df['dst_ip_class'] = first_octet(df[dst_col])
+    df['src_is_private'] = df[src_col].apply(is_private_ip)
+    df['dst_is_private'] = df[dst_col].apply(is_private_ip)
+
+    # Drop missing values first so NaN never counts as a shared address.
+    bidirectional_ips = set(df[src_col].dropna()) & set(df[dst_col].dropna())
+    df['is_bidirectional'] = (df[src_col].isin(bidirectional_ips) |
+                              df[dst_col].isin(bidirectional_ips))
+
+    print(f"✓ Created IP features:")
+    print(f"  - IP class features (src_ip_class, dst_ip_class)")
+    print(f"  - Private IP indicators (src_is_private, dst_is_private)")
+    print(f"  - Bidirectional traffic indicator")
+    print(f"  - Bidirectional IPs found: {len(bidirectional_ips):,}")
+    return df
+
+
+def is_private_ip(ip):
+    """
+    Check if an IPv4 address is in a private range (RFC 1918)
+
+    Parameters:
+    -----------
+    ip : str
+        Dotted-quad IPv4 address string; missing values are accepted
+
+    Returns:
+    --------
+    bool
+        True for 10.x.x.x, 172.16-31.x.x and 192.168.x.x; False for public,
+        missing or malformed addresses (wrong octet count, non-numeric or
+        out-of-range octets)
+    """
+    if pd.isna(ip):
+        return False
+    parts = str(ip).split('.')
+    if len(parts) != 4:
+        return False
+    try:
+        octets = [int(part) for part in parts]
+    except ValueError:
+        # Non-numeric octet: not a valid address, so not private
+        return False
+    # Validate every octet, not only the two used for the range check
+    if any(octet < 0 or octet > 255 for octet in octets):
+        return False
+    first, second = octets[0], octets[1]
+    if first == 10 or (first == 192 and second == 168):
+        return True
+    return first == 172 and 16 <= second <= 31
+
+
+def create_port_features(df):
+    """
+    Create port category features (warns when no port column is available)
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with 'Source Port' and/or 'Destination Port' columns
+
+    Returns:
+    --------
+    pd.DataFrame
+        Dataframe with a port category column for each port column present
+    """
+    targets = {'Source Port': 'src_port_category', 'Destination Port': 'dst_port_category'}
+    present = [column for column in targets if column in df.columns]
+    if not present:
+        print("⚠️  Port columns not found, skipping port features")
+        return df
+    for column in present:
+        df[targets[column]] = df[column].apply(categorize_port)
+    print(f"✓ Created port category features")
+    return df
+
+
+def categorize_port(port):
+    """
+    Categorize ports into well-known, registered, or dynamic
+
+    Parameters:
+    -----------
+    port : int
+        Port number; values convertible with int() are accepted
+
+    Returns:
+    --------
+    str
+        Port category, or 'Unknown' for missing, invalid or out-of-range values
+    """
+    if pd.isna(port):
+        return 'Unknown'
+    try:
+        port = int(port)
+    except (TypeError, ValueError, OverflowError):
+        # Conversion failures only; interrupts and other errors propagate
+        return 'Unknown'
+    if 0 <= port <= 1023:
+        return 'Well-known (0-1023)'
+    if 1024 <= port <= 49151:
+        return 'Registered (1024-49151)'
+    if 49152 <= port <= 65535:
+        return 'Dynamic (49152-65535)'
+    return 'Unknown'
+
+
+def create_packet_features(df):
+    """
+    Create packet-related features
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with 'Packet Length' column
+
+    Returns:
+    --------
+    pd.DataFrame
+        Dataframe with added packet features ('packet_length_bin')
+    """
+    if 'Packet Length' not in df.columns:
+        print("⚠️  'Packet Length' column not found, skipping packet features")
+        return df
+
+    # Bins are right-closed, so include_lowest=True is needed to place a
+    # zero-length packet in 'Tiny (0-100)' instead of leaving it as NaN.
+    df['packet_length_bin'] = pd.cut(
+        df['Packet Length'],
+        bins=[0, 100, 500, 1000, 1500, float('inf')],
+        labels=['Tiny (0-100)', 'Small (100-500)', 'Medium (500-1000)',
+                'Large (1000-1500)', 'Jumbo (>1500)'],
+        include_lowest=True
+    )
+    print(f"✓ Created packet length bins")
+    return df
+
+
+def create_anomaly_features(df):
+    """
+    Create anomaly score-related features
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with 'Anomaly Scores' column
+
+    Returns:
+    --------
+    pd.DataFrame
+        Dataframe with added 'anomaly_category' (quartile bucket of the score);
+        returned unchanged when the quartile edges cannot form valid bins
+    """
+    if 'Anomaly Scores' not in df.columns:
+        print("⚠️  'Anomaly Scores' column not found, skipping anomaly features")
+        return df
+
+    anomaly_scores = df['Anomaly Scores'].dropna()
+    # Bin edges: minimum, the three quartiles, maximum
+    anomaly_bins = [
+        anomaly_scores.min(),
+        *anomaly_scores.quantile([0.25, 0.5, 0.75]),
+        anomaly_scores.max()
+    ]
+    anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)']
+
+    # pd.cut needs strictly increasing edges; an all-missing column (NaN
+    # edges) or heavily repeated scores (equal quartiles) would make it raise.
+    if not all(low < high for low, high in zip(anomaly_bins, anomaly_bins[1:])):
+        print("⚠️  'Anomaly Scores' quartiles are not distinct, skipping anomaly features")
+        return df
+
+    df['anomaly_category'] = pd.cut(
+        df['Anomaly Scores'], bins=anomaly_bins, labels=anomaly_labels, include_lowest=True
+    )
+
+    print(f"✓ Created anomaly score categories")
+    return df
+
+
+def create_all_features(df):
+    """
+    Run every feature-engineering step in sequence and report the result.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Raw input dataframe
+
+    Returns:
+    --------
+    pd.DataFrame
+        Dataframe returned by the final step, carrying every feature the
+        individual steps were able to create
+    """
+    banner = "=" * 80
+    print("\n" + banner, "FEATURE ENGINEERING", banner, sep="\n")
+
+    # Each step receives the dataframe returned by the previous one.
+    steps = (create_proxy_features, create_ip_features, create_port_features,
+             create_packet_features, create_anomaly_features)
+    for step in steps:
+        df = step(df)
+
+    print("\n✓ Feature engineering complete!")
+    print(f"  - New shape: {df.shape}")
+    print(f"  - Total features: {df.shape[1]}")
+
+    return df
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..303117b
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,338 @@
+"""
+Utility Functions Module
+Plotting, logging, metrics, and helper functions
+"""
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from scipy import stats
+from scipy.stats import chi2_contingency, f_oneway
+
+
+def setup_plotting_style():
+    """Apply the shared seaborn grid style and default figure/font sizes."""
+    sns.set_style("whitegrid")
+    # Global matplotlib defaults: figure size and base font size.
+    plt.rcParams.update({'figure.figsize': (15, 8), 'font.size': 10})
+
+
+def plot_attack_distribution(df, column='Attack Type'):
+    """
+    Draw a bar chart of category counts and print counts and percentages.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Dataframe holding the column to summarise
+    column : str
+        Name of the categorical column (attack type by default)
+    """
+    if column not in df.columns:
+        print(f"⚠️  Column '{column}' not found")
+        return
+
+    counts = df[column].value_counts()
+    plt.figure(figsize=(12, 6))
+    counts.plot(kind='bar', color='steelblue', edgecolor='black')
+    plt.title(f'{column} Distribution', fontsize=14, fontweight='bold')
+    plt.xlabel(column)
+    plt.ylabel('Count')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.show()
+
+    print(f"\n{column} Distribution:")
+    print(counts)
+    print("\nPercentages:")
+    print((counts / len(df) * 100).round(2))
+
+
+def plot_proxy_analysis(df):
+    """
+    Comprehensive proxy usage visualization
+
+    Draws a 2x2 grid: overall proxy share (pie) plus proxy usage broken down
+    by attack type (%), severity level (counts) and log source (%).
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with proxy features ('has_proxy'); the breakdown
+        panels are drawn only for the columns that exist
+    """
+    if 'has_proxy' not in df.columns:
+        print("⚠️  'has_proxy' feature not found. Run feature engineering first.")
+        return
+
+    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
+    labels = ['No Proxy', 'With Proxy']
+    colors = ['lightcoral', 'lightgreen']
+
+    # 1. Overall proxy usage pie chart. value_counts() orders by frequency,
+    # so reindex to [0, 1] to keep every wedge under its own label and colour.
+    proxy_counts = df['has_proxy'].value_counts().reindex([0, 1], fill_value=0)
+    # With no rows at all there is nothing to draw; leave the panel blank.
+    if proxy_counts.sum() > 0:
+        axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%',
+                       colors=colors, startangle=90)
+    axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold')
+
+    # 2-4. Breakdowns: (axis, column, show percentages?, title, tick rotation)
+    panels = [
+        (axes[0, 1], 'Attack Type', True, 'Proxy Usage by Attack Type (%)', 45),
+        (axes[1, 0], 'Severity Level', False, 'Proxy Usage by Severity Level', 45),
+        (axes[1, 1], 'Log Source', True, 'Proxy Usage: Firewall vs Server', 0),
+    ]
+    for ax, column, as_percent, title, rotation in panels:
+        if column not in df.columns:
+            continue
+
+        table = pd.crosstab(df[column], df['has_proxy'],
+                            normalize='index' if as_percent else False)
+        # Always expose both classes, in [0, 1] order, so the legend and
+        # colours stay correct even if one class is absent from the data.
+        table = table.reindex(columns=[0, 1], fill_value=0)
+        if as_percent:
+            table = table * 100
+        table.plot(kind='bar', ax=ax, color=colors)
+        ax.set_title(title, fontsize=14, fontweight='bold')
+        ax.set_xlabel(column)
+        ax.set_ylabel('Percentage' if as_percent else 'Count')
+        ax.legend(labels)
+        ax.tick_params(axis='x', rotation=rotation)
+
+    plt.tight_layout()
+    plt.show()
+
+
+def plot_ip_analysis(df):
+    """
+    Visualize IP-related patterns
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe with IP features ('src_ip_class'); the heatmap panel
+        is drawn only when 'Attack Type' is also present
+    """
+    if 'src_ip_class' not in df.columns:
+        print("⚠️  IP features not found. Run feature engineering first.")
+        return
+
+    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
+    class_counts = df['src_ip_class'].value_counts()
+
+    # Source IP class distribution
+    top_20_src_classes = class_counts.head(20)
+    axes[0].bar(range(len(top_20_src_classes)), top_20_src_classes.values,
+                color='steelblue', edgecolor='black')
+    axes[0].set_xticks(range(len(top_20_src_classes)))
+    axes[0].set_xticklabels(top_20_src_classes.index, rotation=45)
+    axes[0].set_xlabel('IP Class (First Octet)', fontsize=12)
+    axes[0].set_ylabel('Count', fontsize=12)
+    axes[0].set_title('Top 20 Source IP Classes', fontsize=14, fontweight='bold')
+    axes[0].grid(axis='y', alpha=0.3)
+
+    # IP class vs attack type heatmap
+    if 'Attack Type' in df.columns:
+        ip_attack_matrix = pd.crosstab(df['src_ip_class'], df['Attack Type'], normalize='index') * 100
+
+        # Top 15 classes for readability. A class whose attack type is always
+        # missing is absent from the crosstab, so keep only classes present in
+        # the matrix; a plain .loc on the full list would raise KeyError.
+        top_15_classes = class_counts.head(15).index
+        top_15_classes = top_15_classes[top_15_classes.isin(ip_attack_matrix.index)]
+        ip_attack_subset = ip_attack_matrix.loc[top_15_classes]
+
+        sns.heatmap(ip_attack_subset, annot=True, fmt='.1f', cmap='YlOrRd',
+                    ax=axes[1], cbar_kws={'label': 'Percentage'})
+        axes[1].set_xlabel('Attack Type', fontsize=12)
+        axes[1].set_ylabel('IP Class (First Octet)', fontsize=12)
+        axes[1].set_title('Attack Type Distribution by IP Class (%)', fontsize=14, fontweight='bold')
+
+    plt.tight_layout()
+    plt.show()
+
+
+def plot_protocol_analysis(df):
+    """
+    Chart the protocol mix overall and, when available, per attack type.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Dataframe with a 'Protocol' column; the per-attack panel is drawn
+        only when 'Attack Type' is also present
+    """
+    if 'Protocol' not in df.columns:
+        print("⚠️  'Protocol' column not found")
+        return
+
+    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(16, 6))
+
+    # Left: share of each protocol across all records
+    counts = df['Protocol'].value_counts()
+    pie_ax.pie(counts.values, labels=counts.index,
+               autopct='%1.1f%%', startangle=90)
+    pie_ax.set_title('Protocol Distribution', fontsize=14, fontweight='bold')
+
+    # Right: row-normalised protocol percentages for every attack type
+    if 'Attack Type' in df.columns:
+        shares = pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100
+        shares.plot(kind='bar', stacked=True, ax=bar_ax, colormap='viridis')
+        bar_ax.set_xlabel('Attack Type', fontsize=12)
+        bar_ax.set_ylabel('Percentage', fontsize=12)
+        bar_ax.set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold')
+        bar_ax.legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')
+        bar_ax.tick_params(axis='x', rotation=45)
+
+    plt.tight_layout()
+    plt.show()
+
+
+def plot_packet_analysis(df):
+    """
+    Plot packet length distributions in a 2x2 grid of charts.
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Dataframe with a 'Packet Length' column
+    """
+    if 'Packet Length' not in df.columns:
+        print("⚠️  'Packet Length' column not found")
+        return
+
+    has_bins = 'packet_length_bin' in df.columns
+    has_attack = 'Attack Type' in df.columns
+    fig, ((hist_ax, bins_ax), (box_ax, share_ax)) = plt.subplots(2, 2, figsize=(16, 12))
+
+    # Top-left: histogram of observed lengths with log-scaled counts
+    hist_ax.hist(df['Packet Length'].dropna(), bins=50, color='skyblue', edgecolor='black', alpha=0.7)
+    hist_ax.set_xlabel('Packet Length (bytes)', fontsize=11)
+    hist_ax.set_ylabel('Frequency (log scale)', fontsize=11)
+    hist_ax.set_title('Packet Length Distribution', fontsize=13, fontweight='bold')
+    hist_ax.set_yscale('log')
+    hist_ax.grid(True, alpha=0.3)
+
+    # Top-right: record count per length bin, in bin order
+    if has_bins:
+        per_bin = df['packet_length_bin'].value_counts().sort_index()
+        per_bin.plot(kind='bar', ax=bins_ax, color='coral', edgecolor='black')
+        bins_ax.set_xlabel('Packet Length Bins', fontsize=11)
+        bins_ax.set_ylabel('Count', fontsize=11)
+        bins_ax.set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold')
+        bins_ax.tick_params(axis='x', rotation=45)
+
+    # Bottom-left: packet length spread per attack type
+    if has_attack:
+        df.boxplot(column='Packet Length', by='Attack Type', ax=box_ax)
+        box_ax.set_xlabel('Attack Type', fontsize=11)
+        box_ax.set_ylabel('Packet Length (bytes)', fontsize=11)
+        box_ax.set_title('Packet Length by Attack Type', fontsize=13, fontweight='bold')
+        box_ax.get_figure().suptitle('')  # clear the figure-level title
+        plt.sca(box_ax)
+        plt.xticks(rotation=45, ha='right')
+
+    # Bottom-right: share of each length bin within every attack type
+    if has_attack and has_bins:
+        shares = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100
+        shares.plot(kind='bar', stacked=True, ax=share_ax, colormap='tab10')
+        share_ax.set_xlabel('Attack Type', fontsize=11)
+        share_ax.set_ylabel('Percentage', fontsize=11)
+        share_ax.set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold')
+        share_ax.legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left')
+        share_ax.tick_params(axis='x', rotation=45)
+
+    plt.tight_layout()
+    plt.show()
+
+
+def statistical_test_packet_length(df):
+    """
+    Perform ANOVA test on packet length across attack types
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe; rows missing either value are ignored
+    """
+    if 'Attack Type' not in df.columns or 'Packet Length' not in df.columns:
+        print("⚠️  Required columns not found")
+        return
+
+    # A NaN attack type would yield an empty group and a NaN test result
+    observed = df[['Attack Type', 'Packet Length']].dropna()
+    groups = [g for _, g in observed.groupby('Attack Type')['Packet Length'] if len(g)]
+    if len(groups) < 2:
+        print("⚠️  Need at least two attack types with packet lengths for ANOVA")
+        return
+
+    f_stat, p_value = f_oneway(*groups)
+    print("\n" + "="*80 + "\nPACKET LENGTH ANOVA TEST\n" + "="*80)
+    print(f"F-statistic: {f_stat:.2f}")
+    print(f"p-value: {p_value:.4e}")
+    if p_value < 0.001:
+        print("✓ Packet Length is HIGHLY discriminative across attack types!")
+    elif p_value < 0.05:
+        print("✓ Packet Length shows significant differences across attack types")
+    else:
+        print("⚠️  Packet Length may not be strongly discriminative")
+
+
+def print_comprehensive_summary(df):
+    """
+    Print a comprehensive summary of the dataset
+
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        Input dataframe; statistics whose columns are absent are skipped
+    """
+    print("\n" + "="*80)
+    print("COMPREHENSIVE DATASET SUMMARY")
+    print("="*80)
+
+    print(f"\n📊 DATASET OVERVIEW")
+    print("-" * 80)
+    print(f"Total Records: {len(df):,}")
+    print(f"Total Features: {df.shape[1]}")
+    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
+
+    if 'Attack Type' in df.columns:
+        print(f"\n🎯 ATTACK TYPE DISTRIBUTION")
+        print("-" * 80)
+        for attack, count in df['Attack Type'].value_counts().items():
+            print(f"  {attack}: {count:,} ({count/len(df)*100:.2f}%)")
+
+    print(f"\n🔍 KEY STATISTICS")
+    print("-" * 80)
+
+    # Proxy
+    if 'has_proxy' in df.columns:
+        proxy_pct = (df['has_proxy'].sum() / len(df)) * 100
+        print(f"  - Proxy Usage Rate: {proxy_pct:.2f}%")
+
+    # IPs: each column is checked on its own, since either may be absent
+    for side, column in (('Source', 'Source IP Address'), ('Destination', 'Destination IP Address')):
+        if column in df.columns:
+            print(f"  - Unique {side} IPs: {df[column].nunique():,}")
+
+    # Packet Length
+    if 'Packet Length' in df.columns:
+        print(f"  - Average Packet Size: {df['Packet Length'].mean():.2f} bytes")
+
+    # Protocol (value_counts is empty when every value is missing)
+    if 'Protocol' in df.columns:
+        protocol_counts = df['Protocol'].value_counts()
+        if not protocol_counts.empty:
+            top_protocol_pct = (protocol_counts.iloc[0] / len(df)) * 100
+            print(f"  - Most Common Protocol: {protocol_counts.index[0]} ({top_protocol_pct:.2f}%)")
+
+    # Port (same empty-column guard as for the protocol)
+    if 'Destination Port' in df.columns:
+        port_counts = df['Destination Port'].value_counts()
+        if not port_counts.empty:
+            print(f"  - Most Targeted Port: {port_counts.index[0]} ({port_counts.iloc[0]:,} times)")