# %% 1. Setup
# Import our custom modules.
import sys

# The `src` package is imported from the parent directory; the entry is
# relative, so it resolves against the kernel's working directory.
PROJECT_ROOT = '..'
if PROJECT_ROOT not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

from src.data_loader import load_dataset, get_missing_value_summary, get_dataset_info
from src.features import create_all_features
from src.utils import (
    setup_plotting_style,
    plot_attack_distribution,
    plot_proxy_analysis,
    plot_ip_analysis,
    plot_protocol_analysis,
    plot_packet_analysis,
    statistical_test_packet_length,
    print_comprehensive_summary
)

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Setup plotting (implementation lives in src/utils, not visible here).
setup_plotting_style()

print("✓ Modules imported successfully!")

# %% 2. Load Data
# Single place to change the input file; the path is relative to the notebook.
DATA_PATH = '../data/cybersecurity_attacks.csv'
df = load_dataset(DATA_PATH)

# %%
# Quick peek at the data
df.head()
# %% 3. Data Quality Check
# Get comprehensive dataset info
get_dataset_info(df)

# %%
# Check missing values; only the first ten rows of the summary are displayed.
missing_summary = get_missing_value_summary(df)
missing_summary.head(10)

# %% 4. Feature Engineering
# Create all features at once.
# NOTE(review): `df` is rebound to the helper's result, so re-running this cell
# feeds already-processed data back in — confirm create_all_features tolerates that.
df = create_all_features(df)

# %%
# Verify that every derived column this notebook expects is present.
# Columns that are missing are reported explicitly instead of being skipped,
# so a partially failed feature step is visible here.
new_cols = ['has_proxy', 'src_ip_class', 'dst_ip_class', 'src_is_private',
            'dst_is_private', 'is_bidirectional', 'src_port_category',
            'dst_port_category', 'packet_length_bin', 'anomaly_category']
created_cols = [col for col in new_cols if col in df.columns]
absent_cols = [col for col in new_cols if col not in df.columns]

print("New columns added:")
for col in created_cols:
    print(f" ✓ {col}")

if absent_cols:
    print("Expected columns NOT found:")
    for col in absent_cols:
        print(f" ✗ {col}")
# %% 5.1 Attack Type Distribution
# Each cell in this section is a single call into src.utils with the
# feature-enriched frame; the helpers' implementations are not visible here.
plot_attack_distribution(df)

# %% 5.2 Proxy Usage Patterns
# presumably relies on the `has_proxy` column checked in section 4 — TODO confirm
plot_proxy_analysis(df)

# %% 5.3 IP Address Patterns
plot_ip_analysis(df)

# %% 5.4 Protocol Analysis
plot_protocol_analysis(df)

# %% 5.5 Packet Length Analysis
plot_packet_analysis(df)

# %% 6. Statistical Tests
# Test if packet length differs significantly across attack types
# NOTE(review): which test is applied is decided inside the helper — verify in src/utils.
statistical_test_packet_length(df)
# %% 7. Custom Analysis
# Example: deep dive into a specific attack type.
# Filters the frame to one 'Attack Type' value and prints its most common
# destination ports and its protocol mix.
attack_type = 'DDoS'  # Change this to analyze different attacks
subset = df[df['Attack Type'] == attack_type]

print(f"\nAnalysis of {attack_type} attacks:")
print(f"Total records: {len(subset):,}")

if subset.empty:
    # A typo in `attack_type` would otherwise just print empty tables.
    known_types = sorted(df['Attack Type'].dropna().astype(str).unique())
    print(f"No rows match '{attack_type}'. Available attack types: {known_types}")
else:
    print("\nTop 5 destination ports:")
    print(subset['Destination Port'].value_counts().head())
    print("\nProtocol distribution:")
    print(subset['Protocol'].value_counts())
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cybersecurity_eda.ipynb b/cybersecurity_eda.ipynb deleted file mode 100644 index fae9856..0000000 --- a/cybersecurity_eda.ipynb +++ /dev/null @@ -1,1271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cybersecurity Attack Type Detection - EDA\n", - "## Focus: Proxy + IP Trends, Spoofing Detection, and Data Bin Trends\n", - "\n", - "**Team Member:** [Your Name] \n", - "**Date:** January 31, 2026 \n", - "**Dataset:** 40,000 rows, 25 features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "## 1. 
# %% 1. Setup and Data Loading
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# NOTE(review): hides every warning for the whole session.
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 8)
plt.rcParams['font.size'] = 10

print("✓ Libraries imported successfully!")

# %%
# Load dataset
# TODO: Update the filepath to your actual CSV file location
DATA_PATH = 'your_dataset.csv'
df = pd.read_csv(DATA_PATH)

n_rows, n_features = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Total Records: {n_rows:,}")
print(f"Total Features: {n_features}")

# %%
# Display first few rows
df.head()

# %%
# Data types and basic info
df.info()

# %%
# Missing values analysis: per-column null count and distinct count, each also
# expressed as a percentage of the row count, sorted with the most incomplete
# columns first.
null_counts = df.isnull().sum()      # computed once, reused for the percentage
distinct_counts = df.nunique()       # nunique() skips NaN by default
total_rows = len(df)

missing_df = pd.DataFrame({
    'Missing_Count': null_counts,
    'Percentage': (null_counts / total_rows) * 100,
    'Distinct_Count': distinct_counts,
    'Distinct_Percentage': (distinct_counts / total_rows) * 100
}).sort_values('Missing_Count', ascending=False)

print("\n" + "="*80)
print("MISSING VALUES AND DISTINCTNESS ANALYSIS")
print("="*80)
print(missing_df)
# %%
# Attack Type distribution: printed counts plus a bar chart of the same counts.
if 'Attack Type' in df.columns:
    attack_counts = df['Attack Type'].value_counts()
    print("\nAttack Type Distribution:")
    print(attack_counts)

    plt.figure(figsize=(12, 6))
    attack_counts.plot(kind='bar', color='steelblue', edgecolor='black')
    plt.title('Attack Type Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Attack Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# %% 2. PART 1: Proxy Information Analysis
print("="*80)
print("PROXY INFORMATION ANALYSIS")
print("="*80)

if 'Proxy Information' in df.columns:
    # Basic statistics
    total_records = len(df)
    proxy_present = df['Proxy Information'].notna().sum()
    proxy_missing = df['Proxy Information'].isna().sum()
    unique_proxies = df['Proxy Information'].nunique()

    print("\nProxy Information Statistics:")
    print(f" - Total records: {total_records:,}")
    print(f" - Records WITH proxy info: {proxy_present:,} ({proxy_present/total_records*100:.2f}%)")
    print(f" - Records WITHOUT proxy info: {proxy_missing:,} ({proxy_missing/total_records*100:.2f}%)")
    print(f" - Unique proxy values: {unique_proxies:,}")

    # Binary feature: 1 when a proxy value is recorded, 0 when it is missing.
    df['has_proxy'] = df['Proxy Information'].notna().astype(int)

    print("\nProxy Usage Distribution:")
    print(df['has_proxy'].value_counts())
else:
    print("Warning: 'Proxy Information' column not found!")

# %%
# Visualize proxy usage patterns in a 2x2 grid:
# overall share, by attack type, by severity level, and daily trend.
if 'has_proxy' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Overall proxy usage pie chart.
    # value_counts() orders by frequency, so the counts are re-ordered to
    # [0, 1] explicitly; otherwise the fixed labels below are swapped whenever
    # "With Proxy" is the majority class.
    proxy_counts = df['has_proxy'].value_counts().reindex([0, 1], fill_value=0)
    labels = ['No Proxy', 'With Proxy']
    colors = ['lightcoral', 'lightgreen']
    axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%',
                   colors=colors, startangle=90)
    axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold')

    # 2. Proxy usage by Attack Type (row-normalised to percentages).
    # NOTE(review): the legend labels assume both has_proxy classes appear as
    # crosstab columns in 0, 1 order — holds when both values are present.
    if 'Attack Type' in df.columns:
        proxy_attack = pd.crosstab(df['Attack Type'], df['has_proxy'], normalize='index') * 100
        proxy_attack.plot(kind='bar', ax=axes[0, 1], stacked=False,
                          color=['lightcoral', 'lightgreen'])
        axes[0, 1].set_title('Proxy Usage by Attack Type (%)', fontsize=14, fontweight='bold')
        axes[0, 1].set_xlabel('Attack Type')
        axes[0, 1].set_ylabel('Percentage')
        axes[0, 1].legend(['No Proxy', 'With Proxy'])
        axes[0, 1].tick_params(axis='x', rotation=45)

        # Print the table behind the chart
        print("\nProxy Usage by Attack Type:")
        print(proxy_attack)

    # 3. Proxy usage by Severity Level (raw counts, not percentages)
    if 'Severity Level' in df.columns:
        proxy_severity = pd.crosstab(df['Severity Level'], df['has_proxy'])
        proxy_severity.plot(kind='bar', ax=axes[1, 0], color=['lightcoral', 'lightgreen'])
        axes[1, 0].set_title('Proxy Usage by Severity Level', fontsize=14, fontweight='bold')
        axes[1, 0].set_xlabel('Severity Level')
        axes[1, 0].set_ylabel('Count')
        axes[1, 0].legend(['No Proxy', 'With Proxy'])
        axes[1, 0].tick_params(axis='x', rotation=45)

    # 4. Proxy usage over time: daily share of records that carry proxy info.
    if 'Timestamp' in df.columns:
        df_temp = df.copy()
        # Unparseable timestamps become NaT and are dropped from the trend.
        df_temp['Timestamp'] = pd.to_datetime(df_temp['Timestamp'], errors='coerce')
        df_temp = df_temp.dropna(subset=['Timestamp'])
        df_temp['Date'] = df_temp['Timestamp'].dt.date

        proxy_time = df_temp.groupby('Date')['has_proxy'].agg(['sum', 'count'])
        proxy_time['percentage'] = (proxy_time['sum'] / proxy_time['count']) * 100

        axes[1, 1].plot(proxy_time.index, proxy_time['percentage'],
                        marker='o', color='steelblue', linewidth=2)
        axes[1, 1].set_title('Proxy Usage Trend Over Time', fontsize=14, fontweight='bold')
        axes[1, 1].set_xlabel('Date')
        axes[1, 1].set_ylabel('Percentage Using Proxy')
        axes[1, 1].tick_params(axis='x', rotation=45)
        axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# %%
# Analyze relationship with Log Source (Firewall vs Server)
if 'Log Source' in df.columns and 'has_proxy' in df.columns:
    print("\nProxy Usage by Log Source:")
    log_proxy = pd.crosstab(df['Log Source'], df['has_proxy'], normalize='index') * 100
    print(log_proxy)

    # Visualize
    log_proxy.plot(kind='bar', figsize=(10, 6), color=['lightcoral', 'lightgreen'])
    plt.title('Proxy Usage: Firewall vs Server Logs', fontsize=14, fontweight='bold')
    plt.xlabel('Log Source')
    plt.ylabel('Percentage')
    plt.legend(['No Proxy', 'With Proxy'])
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

# %%
# Descriptive breakdown of proxy usage per attack type.
# This prints counts and shares only; no significance test is performed here.
if 'Attack Type' in df.columns and 'has_proxy' in df.columns:
    print("\n" + "="*80)
    print("PROXY USAGE INSIGHTS BY ATTACK TYPE")
    print("="*80)

    # dropna(): a NaN label never equals itself, so it would select zero rows.
    for attack_type in df['Attack Type'].dropna().unique():
        subset = df[df['Attack Type'] == attack_type]
        total_attacks = len(subset)
        with_proxy = subset['has_proxy'].sum()
        proxy_pct = (with_proxy / total_attacks) * 100

        print(f"\n{attack_type}:")
        print(f" - Total attacks: {total_attacks:,}")
        print(f" - With proxy: {with_proxy:,} ({proxy_pct:.2f}%)")
        print(f" - Without proxy: {total_attacks - with_proxy:,} ({100-proxy_pct:.2f}%)")
# %% 3. PART 2: IP Trends and Spoofing Detection
print("="*80)
print("IP TRENDS AND SPOOFING DETECTION")
print("="*80)

# Basic IP statistics
if 'Source IP Address' in df.columns and 'Destination IP Address' in df.columns:
    print("\nIP Address Statistics:")
    print(f" - Unique Source IPs: {df['Source IP Address'].nunique():,}")
    print(f" - Unique Destination IPs: {df['Destination IP Address'].nunique():,}")
    print(f" - Total IP-to-IP connections: {len(df):,}")
    print(f" - Average connections per source IP: {len(df)/df['Source IP Address'].nunique():.2f}")
    print(f" - Average connections per destination IP: {len(df)/df['Destination IP Address'].nunique():.2f}")

# %%
def plot_top_ips(frame, column, role, color, title, top_n=20):
    """Print and plot the most frequent values of one IP-address column.

    Args:
        frame: DataFrame that contains `column`.
        column: name of the IP-address column to count.
        role: 'Source' or 'Destination'; used in the banner, the list heading
            and the y-axis label.
        color: bar colour of the horizontal bar chart.
        title: chart title.
        top_n: how many of the most frequent addresses to keep (default 20).

    Returns:
        Series of counts for the `top_n` most frequent addresses, most
        frequent first.
    """
    print("\n" + "="*80)
    print(f"TOP {role.upper()} IP ADDRESSES")
    print("="*80)

    top_ips = frame[column].value_counts().head(top_n)
    print(f"\nTop {top_n} {role} IPs:")
    print(top_ips)

    fig, ax = plt.subplots(figsize=(12, 8))
    positions = range(len(top_ips))
    ax.barh(positions, top_ips.values, color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels(top_ips.index)
    ax.set_xlabel('Frequency (Number of Connections)', fontsize=12)
    ax.set_ylabel(f'{role} IP Address', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.invert_yaxis()  # most frequent address at the top
    fig.tight_layout()
    plt.show()
    return top_ips

# %%
# Top Source IPs
top_src_ips = plot_top_ips(df, 'Source IP Address', 'Source', 'steelblue',
                           'Top 20 Most Active Source IP Addresses')

# %%
# Top Destination IPs
top_dst_ips = plot_top_ips(df, 'Destination IP Address', 'Destination', 'coral',
                           'Top 20 Most Targeted Destination IP Addresses')

# %%
# SPOOFING DETECTION 1: Fan-out Analysis (Source IP → Multiple Destinations)
print("\n" + "="*80)
print("SPOOFING INDICATOR 1: FAN-OUT PATTERN (Source → Multiple Destinations)")
print("="*80)

# Number of distinct destinations contacted by each source, largest first.
src_to_dst_mapping = df.groupby('Source IP Address')['Destination IP Address'].nunique()
src_to_dst_mapping = src_to_dst_mapping.sort_values(ascending=False)

# Sources strictly above the 95th / 99th percentile are flagged as suspicious.
threshold_95 = src_to_dst_mapping.quantile(0.95)
threshold_99 = src_to_dst_mapping.quantile(0.99)

suspicious_sources_95 = src_to_dst_mapping[src_to_dst_mapping > threshold_95]
suspicious_sources_99 = src_to_dst_mapping[src_to_dst_mapping > threshold_99]

print("\nFan-out Statistics:")
print(f" - Mean destinations per source: {src_to_dst_mapping.mean():.2f}")
print(f" - Median destinations per source: {src_to_dst_mapping.median():.2f}")
print(f" - 95th percentile threshold: {threshold_95:.0f} destinations")
print(f" - 99th percentile threshold: {threshold_99:.0f} destinations")
print("\nSuspicious Source IPs:")
print(f" - IPs above 95th percentile: {len(suspicious_sources_95)} ({len(suspicious_sources_95)/len(src_to_dst_mapping)*100:.2f}%)")
print(f" - IPs above 99th percentile: {len(suspicious_sources_99)} ({len(suspicious_sources_99)/len(src_to_dst_mapping)*100:.2f}%)")

print("\nTop 10 Source IPs with Highest Fan-out:")
print(src_to_dst_mapping.head(10))

# %%
# Visualize fan-out distribution: histogram with both thresholds (left) and
# the 15 sources with the most distinct destinations (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Histogram
axes[0].hist(src_to_dst_mapping.values, bins=50, color='red', alpha=0.7, edgecolor='black')
axes[0].axvline(threshold_95, color='darkred', linestyle='--', linewidth=2,
                label=f'95th percentile: {threshold_95:.0f}')
axes[0].axvline(threshold_99, color='maroon', linestyle='--', linewidth=2,
                label=f'99th percentile: {threshold_99:.0f}')
axes[0].set_xlabel('Number of Unique Destinations per Source IP', fontsize=12)
axes[0].set_ylabel('Frequency (log scale)', fontsize=12)
axes[0].set_title('Source IP Fan-out Distribution\n(Potential Scanning/Spoofing)',
                  fontsize=14, fontweight='bold')
axes[0].set_yscale('log')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Top suspicious IPs
top_suspicious = src_to_dst_mapping.head(15)
axes[1].barh(range(len(top_suspicious)), top_suspicious.values, color='darkred')
axes[1].set_yticks(range(len(top_suspicious)))
axes[1].set_yticklabels(top_suspicious.index)
axes[1].set_xlabel('Number of Unique Destinations', fontsize=12)
axes[1].set_ylabel('Source IP Address', fontsize=12)
axes[1].set_title('Top 15 Source IPs by Fan-out\n(Most Suspicious)',
                  fontsize=14, fontweight='bold')
axes[1].invert_yaxis()

plt.tight_layout()
plt.show()
# %%
# SPOOFING DETECTION 2: Fan-in Analysis (Multiple Sources → Single Destination)
print("\n" + "="*80)
print("SPOOFING INDICATOR 2: FAN-IN PATTERN (Multiple Sources → Destination)")
print("="*80)

# Distinct source addresses seen per destination, largest first.
dst_to_src_mapping = (
    df.groupby('Destination IP Address')['Source IP Address']
    .nunique()
    .sort_values(ascending=False)
)

# Percentile cut-offs; destinations strictly above them are flagged.
threshold_95_dst = dst_to_src_mapping.quantile(0.95)
threshold_99_dst = dst_to_src_mapping.quantile(0.99)

suspicious_targets_95 = dst_to_src_mapping.loc[dst_to_src_mapping.gt(threshold_95_dst)]
suspicious_targets_99 = dst_to_src_mapping.loc[dst_to_src_mapping.gt(threshold_99_dst)]

print("\nFan-in Statistics:")
print(f" - Mean sources per destination: {dst_to_src_mapping.mean():.2f}")
print(f" - Median sources per destination: {dst_to_src_mapping.median():.2f}")
for pct_label, cutoff in ((95, threshold_95_dst), (99, threshold_99_dst)):
    print(f" - {pct_label}th percentile threshold: {cutoff:.0f} sources")

print("\nSuspicious Target IPs (Potential DDoS Victims):")
for pct_label, flagged in ((95, suspicious_targets_95), (99, suspicious_targets_99)):
    print(f" - IPs above {pct_label}th percentile: {len(flagged)} ({len(flagged)/len(dst_to_src_mapping)*100:.2f}%)")

print("\nTop 10 Target IPs with Highest Fan-in:")
print(dst_to_src_mapping.head(10))

# %%
# Visualize fan-in distribution: histogram with both cut-offs on the left,
# the 15 destinations with the most distinct sources on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
hist_ax, bar_ax = axes

hist_ax.hist(dst_to_src_mapping.values, bins=50, color='purple', alpha=0.7, edgecolor='black')
for line_color, pct_label, cutoff in (('darkviolet', 95, threshold_95_dst),
                                      ('indigo', 99, threshold_99_dst)):
    hist_ax.axvline(cutoff, color=line_color, linestyle='--', linewidth=2,
                    label=f'{pct_label}th percentile: {cutoff:.0f}')
hist_ax.set_xlabel('Number of Unique Sources per Destination IP', fontsize=12)
hist_ax.set_ylabel('Frequency (log scale)', fontsize=12)
hist_ax.set_title('Destination IP Fan-in Distribution\n(Potential DDoS Targets)',
                  fontsize=14, fontweight='bold')
hist_ax.set_yscale('log')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Top targeted IPs
top_targets = dst_to_src_mapping.head(15)
bar_positions = range(len(top_targets))
bar_ax.barh(bar_positions, top_targets.values, color='darkviolet')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_targets.index)
bar_ax.set_xlabel('Number of Unique Sources', fontsize=12)
bar_ax.set_ylabel('Destination IP Address', fontsize=12)
bar_ax.set_title('Top 15 Destination IPs by Fan-in\n(Potential DDoS Targets)',
                 fontsize=14, fontweight='bold')
bar_ax.invert_yaxis()

plt.tight_layout()
plt.show()
def is_private_ip(ip):
    """Check if an IP is in a private IPv4 range (RFC 1918).

    Args:
        ip: dotted-quad IPv4 address; any scalar is accepted and converted
            with ``str()``. Missing values (``None``/NaN) are allowed.

    Returns:
        True when the address lies in 10.0.0.0/8, 172.16.0.0/12 or
        192.168.0.0/16. False for every other address, for missing values,
        and for anything that is not four decimal octets in the range 0-255.
    """
    if pd.isna(ip):
        return False

    parts = str(ip).strip().split('.')
    if len(parts) != 4:
        return False

    # Validate all four octets, not only the two that decide the range, so a
    # malformed value such as '10.0.0.999' or '10.0.x.1' is not called private.
    if not all(part.isascii() and part.isdigit() for part in parts):
        return False
    octets = [int(part) for part in parts]
    if any(octet > 255 for octet in octets):
        return False

    first, second = octets[0], octets[1]
    # Private IP ranges: 10.x.x.x, 172.16-31.x.x, 192.168.x.x
    return (
        first == 10
        or (first == 172 and 16 <= second <= 31)
        or (first == 192 and second == 168)
    )
# %%
# Geo-location analysis (runs only when the column exists): summary counts,
# the 15 most frequent locations as a bar chart, and the single most frequent
# location for each attack type.
if 'Geo-location Data' in df.columns:
    geo_values = df['Geo-location Data']

    print("\n" + "="*80)
    print("GEO-LOCATION ANALYSIS")
    print("="*80)

    print("\nGeo-location Statistics:")
    print(f" - Unique locations: {geo_values.nunique():,}")
    print(f" - Missing values: {geo_values.isna().sum():,}")

    # Top locations
    top_locations = geo_values.value_counts().head(15)
    print("\nTop 15 Geo-locations:")
    print(top_locations)

    # Visualize
    plt.figure(figsize=(14, 8))
    top_locations.plot(kind='barh', color='skyblue', edgecolor='black')
    plt.title('Top 15 Geo-locations in Attack Traffic', fontsize=14, fontweight='bold')
    plt.xlabel('Frequency', fontsize=12)
    plt.ylabel('Geo-location', fontsize=12)
    plt.gca().invert_yaxis()  # most frequent location at the top
    plt.tight_layout()
    plt.show()

    # Geo-location by attack type
    if 'Attack Type' in df.columns:
        print("\nTop Geo-location by Attack Type:")
        for attack in df['Attack Type'].unique():
            rows_for_attack = df[df['Attack Type'] == attack]
            top_loc = rows_for_attack['Geo-location Data'].value_counts().head(1)
            if len(top_loc) == 0:
                # No location recorded for this attack type: nothing to report.
                continue
            print(f" {attack}: {top_loc.index[0]} ({top_loc.values[0]} occurrences)")
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### šŸ“Š Key Findings - IP Trends & Spoofing\n", - "\n", - "**Summary:**\n", - "- Write your key findings here\n", - "- How many suspicious IPs detected (fan-out/fan-in)?\n", - "- Any DDoS targets identified?\n", - "- Private IP issues?\n", - "- Geographic patterns?\n", - "\n", - "**Red Flags Identified:**\n", - "- List specific suspicious IPs or patterns\n", - "\n", - "**Recommendation for ML Model:**\n", - "- Create features: source_fanout_score, dest_fanin_score, is_bidirectional, is_private_ip" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "## 4. PART 3: Data Bin Trends Analysis\n", - "\n", - "**Analysis Goals:**\n", - "1. Packet Length distribution and binning\n", - "2. Port usage patterns (well-known, registered, dynamic)\n", - "3. Protocol distribution\n", - "4. Anomaly score categorization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"=\"*80)\n", - "print(\"DATA BIN TRENDS ANALYSIS\")\n", - "print(\"=\"*80)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 1. 
PACKET LENGTH ANALYSIS\n", - "print(\"\\n\" + \"=\"*80)\n", - "print(\"PACKET LENGTH DISTRIBUTION\")\n", - "print(\"=\"*80)\n", - "\n", - "if 'Packet Length' in df.columns:\n", - " packet_lengths = df['Packet Length'].dropna()\n", - " \n", - " print(f\"\\nPacket Length Statistics:\")\n", - " print(f\" - Mean: {packet_lengths.mean():.2f} bytes\")\n", - " print(f\" - Median: {packet_lengths.median():.2f} bytes\")\n", - " print(f\" - Std Dev: {packet_lengths.std():.2f} bytes\")\n", - " print(f\" - Min: {packet_lengths.min():.2f} bytes\")\n", - " print(f\" - Max: {packet_lengths.max():.2f} bytes\")\n", - " print(f\" - 25th percentile: {packet_lengths.quantile(0.25):.2f} bytes\")\n", - " print(f\" - 75th percentile: {packet_lengths.quantile(0.75):.2f} bytes\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create packet length bins\n", - "bins_packet = [0, 64, 128, 256, 512, 1024, 2048, float('inf')]\n", - "labels_packet = ['0-64', '64-128', '128-256', '256-512', '512-1024', '1024-2048', '2048+']\n", - "df['packet_length_bin'] = pd.cut(df['Packet Length'], bins=bins_packet, labels=labels_packet)\n", - "\n", - "packet_bin_dist = df['packet_length_bin'].value_counts().sort_index()\n", - "print(f\"\\nPacket Length Bins Distribution:\")\n", - "print(packet_bin_dist)\n", - "print(f\"\\nPercentage Distribution:\")\n", - "print((packet_bin_dist / packet_bin_dist.sum() * 100).round(2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize packet length\n", - "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", - "\n", - "# Histogram\n", - "axes[0, 0].hist(packet_lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7)\n", - "axes[0, 0].set_xlabel('Packet Length (bytes)', fontsize=11)\n", - "axes[0, 0].set_ylabel('Frequency (log scale)', fontsize=11)\n", - "axes[0, 0].set_title('Packet Length Distribution', fontsize=13, 
fontweight='bold')\n", - "axes[0, 0].set_yscale('log')\n", - "axes[0, 0].grid(True, alpha=0.3)\n", - "\n", - "# Binned distribution\n", - "packet_bin_dist.plot(kind='bar', ax=axes[0, 1], color='coral', edgecolor='black')\n", - "axes[0, 1].set_xlabel('Packet Length Bins (bytes)', fontsize=11)\n", - "axes[0, 1].set_ylabel('Count', fontsize=11)\n", - "axes[0, 1].set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold')\n", - "axes[0, 1].tick_params(axis='x', rotation=45)\n", - "\n", - "# Box plot by attack type\n", - "if 'Attack Type' in df.columns:\n", - " df.boxplot(column='Packet Length', by='Attack Type', ax=axes[1, 0])\n", - " axes[1, 0].set_xlabel('Attack Type', fontsize=11)\n", - " axes[1, 0].set_ylabel('Packet Length (bytes)', fontsize=11)\n", - " axes[1, 0].set_title('Packet Length by Attack Type', fontsize=13, fontweight='bold')\n", - " axes[1, 0].get_figure().suptitle('') # Remove default title\n", - " plt.sca(axes[1, 0])\n", - " plt.xticks(rotation=45, ha='right')\n", - "\n", - "# Bins by attack type (stacked bar)\n", - "if 'Attack Type' in df.columns:\n", - " bin_attack = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100\n", - " bin_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='tab10')\n", - " axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n", - " axes[1, 1].set_ylabel('Percentage', fontsize=11)\n", - " axes[1, 1].set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold')\n", - " axes[1, 1].legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1, 1].tick_params(axis='x', rotation=45)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2. 
PORT ANALYSIS\n", - "print(\"\\n\" + \"=\"*80)\n", - "print(\"PORT USAGE ANALYSIS\")\n", - "print(\"=\"*80)\n", - "\n", - "if 'Source Port' in df.columns and 'Destination Port' in df.columns:\n", - " # Source ports\n", - " print(f\"\\nTop 10 Source Ports:\")\n", - " top_src_ports = df['Source Port'].value_counts().head(10)\n", - " print(top_src_ports)\n", - " \n", - " # Destination ports\n", - " print(f\"\\nTop 10 Destination Ports:\")\n", - " top_dst_ports = df['Destination Port'].value_counts().head(10)\n", - " print(top_dst_ports)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create port categories\n", - "def categorize_port(port):\n", - " \"\"\"Categorize ports into well-known, registered, or dynamic\"\"\"\n", - " if pd.isna(port):\n", - " return 'Unknown'\n", - " try:\n", - " port = int(port)\n", - " if 0 <= port <= 1023:\n", - " return 'Well-known (0-1023)'\n", - " elif 1024 <= port <= 49151:\n", - " return 'Registered (1024-49151)'\n", - " elif 49152 <= port <= 65535:\n", - " return 'Dynamic (49152-65535)'\n", - " else:\n", - " return 'Unknown'\n", - " except:\n", - " return 'Unknown'\n", - "\n", - "df['dst_port_category'] = df['Destination Port'].apply(categorize_port)\n", - "df['src_port_category'] = df['Source Port'].apply(categorize_port)\n", - "\n", - "print(f\"\\nDestination Port Categories:\")\n", - "print(df['dst_port_category'].value_counts())\n", - "print(f\"\\nPercentage:\")\n", - "print((df['dst_port_category'].value_counts() / len(df) * 100).round(2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize port analysis\n", - "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", - "\n", - "# Top source ports\n", - "top_src_ports_15 = df['Source Port'].value_counts().head(15)\n", - "axes[0, 0].barh(range(len(top_src_ports_15)), top_src_ports_15.values, color='lightgreen')\n", - "axes[0, 
0].set_yticks(range(len(top_src_ports_15)))\n", - "axes[0, 0].set_yticklabels(top_src_ports_15.index)\n", - "axes[0, 0].set_xlabel('Frequency', fontsize=11)\n", - "axes[0, 0].set_ylabel('Port Number', fontsize=11)\n", - "axes[0, 0].set_title('Top 15 Source Ports', fontsize=13, fontweight='bold')\n", - "axes[0, 0].invert_yaxis()\n", - "\n", - "# Top destination ports\n", - "top_dst_ports_15 = df['Destination Port'].value_counts().head(15)\n", - "axes[0, 1].barh(range(len(top_dst_ports_15)), top_dst_ports_15.values, color='lightcoral')\n", - "axes[0, 1].set_yticks(range(len(top_dst_ports_15)))\n", - "axes[0, 1].set_yticklabels(top_dst_ports_15.index)\n", - "axes[0, 1].set_xlabel('Frequency', fontsize=11)\n", - "axes[0, 1].set_ylabel('Port Number', fontsize=11)\n", - "axes[0, 1].set_title('Top 15 Destination Ports', fontsize=13, fontweight='bold')\n", - "axes[0, 1].invert_yaxis()\n", - "\n", - "# Port category pie chart\n", - "port_cat_dist = df['dst_port_category'].value_counts()\n", - "axes[1, 0].pie(port_cat_dist.values, labels=port_cat_dist.index, autopct='%1.1f%%', startangle=90)\n", - "axes[1, 0].set_title('Destination Port Categories', fontsize=13, fontweight='bold')\n", - "\n", - "# Port categories by attack type\n", - "if 'Attack Type' in df.columns:\n", - " port_attack = pd.crosstab(df['Attack Type'], df['dst_port_category'])\n", - " port_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='Set3')\n", - " axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n", - " axes[1, 1].set_ylabel('Count', fontsize=11)\n", - " axes[1, 1].set_title('Port Categories by Attack Type', fontsize=13, fontweight='bold')\n", - " axes[1, 1].legend(title='Port Category', bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1, 1].tick_params(axis='x', rotation=45)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 3. 
PROTOCOL ANALYSIS\n", - "print(\"\\n\" + \"=\"*80)\n", - "print(\"PROTOCOL DISTRIBUTION\")\n", - "print(\"=\"*80)\n", - "\n", - "if 'Protocol' in df.columns:\n", - " protocol_dist = df['Protocol'].value_counts()\n", - " print(f\"\\nProtocol Distribution:\")\n", - " print(protocol_dist)\n", - " print(f\"\\nPercentage:\")\n", - " print((protocol_dist / protocol_dist.sum() * 100).round(2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize protocol analysis\n", - "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", - "\n", - "# Protocol pie chart\n", - "axes[0].pie(protocol_dist.values, labels=protocol_dist.index, autopct='%1.1f%%', startangle=90)\n", - "axes[0].set_title('Protocol Distribution', fontsize=14, fontweight='bold')\n", - "\n", - "# Protocol by attack type\n", - "if 'Attack Type' in df.columns:\n", - " protocol_attack = pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100\n", - " protocol_attack.plot(kind='bar', stacked=True, ax=axes[1], colormap='viridis')\n", - " axes[1].set_xlabel('Attack Type', fontsize=12)\n", - " axes[1].set_ylabel('Percentage', fontsize=12)\n", - " axes[1].set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold')\n", - " axes[1].legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1].tick_params(axis='x', rotation=45)\n", - " \n", - " print(f\"\\nProtocol Usage by Attack Type (%):\")\n", - " print(protocol_attack.round(2))\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 4. 
ANOMALY SCORES ANALYSIS\n", - "print(\"\\n\" + \"=\"*80)\n", - "print(\"ANOMALY SCORES DISTRIBUTION\")\n", - "print(\"=\"*80)\n", - "\n", - "if 'Anomaly Scores' in df.columns:\n", - " anomaly_scores = df['Anomaly Scores'].dropna()\n", - " \n", - " print(f\"\\nAnomaly Score Statistics:\")\n", - " print(f\" - Mean: {anomaly_scores.mean():.4f}\")\n", - " print(f\" - Median: {anomaly_scores.median():.4f}\")\n", - " print(f\" - Std Dev: {anomaly_scores.std():.4f}\")\n", - " print(f\" - Min: {anomaly_scores.min():.4f}\")\n", - " print(f\" - Max: {anomaly_scores.max():.4f}\")\n", - " print(f\" - 25th percentile: {anomaly_scores.quantile(0.25):.4f}\")\n", - " print(f\" - 50th percentile: {anomaly_scores.quantile(0.50):.4f}\")\n", - " print(f\" - 75th percentile: {anomaly_scores.quantile(0.75):.4f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create anomaly score categories based on quartiles\n", - "anomaly_bins = [anomaly_scores.min(), \n", - " anomaly_scores.quantile(0.25),\n", - " anomaly_scores.quantile(0.5),\n", - " anomaly_scores.quantile(0.75),\n", - " anomaly_scores.max()]\n", - "anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)']\n", - "\n", - "df['anomaly_category'] = pd.cut(df['Anomaly Scores'], bins=anomaly_bins, \n", - " labels=anomaly_labels, include_lowest=True)\n", - "\n", - "anomaly_cat_dist = df['anomaly_category'].value_counts().sort_index()\n", - "print(f\"\\nAnomaly Score Categories:\")\n", - "print(anomaly_cat_dist)\n", - "print(f\"\\nPercentage:\")\n", - "print((anomaly_cat_dist / anomaly_cat_dist.sum() * 100).round(2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize anomaly scores\n", - "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n", - "\n", - "# Histogram\n", - "axes[0, 0].hist(anomaly_scores, bins=50, color='purple', alpha=0.7, edgecolor='black')\n", 
- "axes[0, 0].set_xlabel('Anomaly Score', fontsize=11)\n", - "axes[0, 0].set_ylabel('Frequency', fontsize=11)\n", - "axes[0, 0].set_title('Anomaly Score Distribution', fontsize=13, fontweight='bold')\n", - "axes[0, 0].grid(True, alpha=0.3)\n", - "\n", - "# Category bar chart\n", - "colors = ['green', 'yellow', 'orange', 'red']\n", - "axes[0, 1].bar(range(len(anomaly_cat_dist)), anomaly_cat_dist.values, \n", - " color=colors, edgecolor='black')\n", - "axes[0, 1].set_xticks(range(len(anomaly_cat_dist)))\n", - "axes[0, 1].set_xticklabels(anomaly_cat_dist.index, rotation=45, ha='right')\n", - "axes[0, 1].set_xlabel('Anomaly Category', fontsize=11)\n", - "axes[0, 1].set_ylabel('Count', fontsize=11)\n", - "axes[0, 1].set_title('Anomaly Score Categories', fontsize=13, fontweight='bold')\n", - "\n", - "# Box plot by attack type\n", - "if 'Attack Type' in df.columns:\n", - " df.boxplot(column='Anomaly Scores', by='Attack Type', ax=axes[1, 0])\n", - " axes[1, 0].set_xlabel('Attack Type', fontsize=11)\n", - " axes[1, 0].set_ylabel('Anomaly Score', fontsize=11)\n", - " axes[1, 0].set_title('Anomaly Scores by Attack Type', fontsize=13, fontweight='bold')\n", - " axes[1, 0].get_figure().suptitle('')\n", - " plt.sca(axes[1, 0])\n", - " plt.xticks(rotation=45, ha='right')\n", - "\n", - "# Category by attack type\n", - "if 'Attack Type' in df.columns:\n", - " anomaly_attack = pd.crosstab(df['Attack Type'], df['anomaly_category'], normalize='index') * 100\n", - " anomaly_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], color=colors)\n", - " axes[1, 1].set_xlabel('Attack Type', fontsize=11)\n", - " axes[1, 1].set_ylabel('Percentage', fontsize=11)\n", - " axes[1, 1].set_title('Anomaly Categories by Attack Type (%)', fontsize=13, fontweight='bold')\n", - " axes[1, 1].legend(title='Anomaly Level', bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " axes[1, 1].tick_params(axis='x', rotation=45)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### šŸ“Š Key Findings - Data Bin Trends\n", - "\n", - "**Summary:**\n", - "- Write your key findings here\n", - "- What are the dominant packet sizes per attack type?\n", - "- Which ports are most targeted?\n", - "- Protocol preferences?\n", - "- Anomaly score patterns?\n", - "\n", - "**Attack Signatures Identified:**\n", - "- DDoS: [packet size pattern, protocol, ports]\n", - "- Malware: [packet size pattern, protocol, ports]\n", - "- etc.\n", - "\n", - "**Recommendation for ML Model:**\n", - "- Use binned features: packet_length_bin, port_category, anomaly_category" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "## 5. COMPREHENSIVE SUMMARY & INSIGHTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"\\n\" + \"=\"*80)\n", - "print(\"COMPREHENSIVE EDA SUMMARY\")\n", - "print(\"=\"*80)\n", - "\n", - "print(f\"\\nšŸ“Š DATASET OVERVIEW\")\n", - "print(\"-\" * 80)\n", - "print(f\"Total Records: {len(df):,}\")\n", - "print(f\"Total Features: {df.shape[1]}\")\n", - "print(f\"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\")\n", - "\n", - "if 'Attack Type' in df.columns:\n", - " print(f\"\\nšŸŽÆ ATTACK TYPE DISTRIBUTION\")\n", - " print(\"-\" * 80)\n", - " attack_dist = df['Attack Type'].value_counts()\n", - " for attack, count in attack_dist.items():\n", - " print(f\" {attack}: {count:,} ({count/len(df)*100:.2f}%)\")\n", - "\n", - "print(f\"\\nšŸ” KEY STATISTICS\")\n", - "print(\"-\" * 80)\n", - "\n", - "# Proxy\n", - "if 'has_proxy' in df.columns:\n", - " proxy_pct = (df['has_proxy'].sum() / len(df)) * 100\n", - " print(f\" - Proxy Usage Rate: {proxy_pct:.2f}%\")\n", - "\n", - "# IPs\n", - "if 'Source IP Address' in df.columns:\n", - " print(f\" - Unique Source IPs: {df['Source IP Address'].nunique():,}\")\n", - " print(f\" - Unique Destination IPs: {df['Destination IP 
Address'].nunique():,}\")\n", - "\n", - "# Packet Length\n", - "if 'Packet Length' in df.columns:\n", - " print(f\" - Average Packet Size: {df['Packet Length'].mean():.2f} bytes\")\n", - "\n", - "# Protocol\n", - "if 'Protocol' in df.columns:\n", - " top_protocol = df['Protocol'].value_counts().index[0]\n", - " top_protocol_pct = (df['Protocol'].value_counts().values[0] / len(df)) * 100\n", - " print(f\" - Most Common Protocol: {top_protocol} ({top_protocol_pct:.2f}%)\")\n", - "\n", - "# Port\n", - "if 'Destination Port' in df.columns:\n", - " top_port = df['Destination Port'].value_counts().index[0]\n", - " top_port_count = df['Destination Port'].value_counts().values[0]\n", - " print(f\" - Most Targeted Port: {top_port} ({top_port_count:,} times)\")\n", - "\n", - "print(\"\\n\" + \"=\"*80)\n", - "print(\"āœ… EDA ANALYSIS COMPLETE!\")\n", - "print(\"=\"*80)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "## 6. EXPORT ENGINEERED FEATURES (Optional)\n", - "\n", - "Create new features based on EDA insights for ML model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a summary of engineered features\n", - "engineered_features = df[[\n", - " 'has_proxy', # Binary: 0/1\n", - " 'is_bidirectional', # Binary: 0/1 \n", - " 'src_is_private', # Binary: 0/1\n", - " 'dst_is_private', # Binary: 0/1\n", - " 'packet_length_bin', # Categorical: 7 categories\n", - " 'dst_port_category', # Categorical: 3 categories\n", - " 'src_port_category', # Categorical: 3 categories\n", - " 'anomaly_category' # Categorical: 4 categories\n", - "]].copy()\n", - "\n", - "print(\"Engineered Features Summary:\")\n", - "print(engineered_features.head(10))\n", - "print(f\"\\nShape: {engineered_features.shape}\")\n", - "print(f\"\\nFeature Data Types:\")\n", - "print(engineered_features.dtypes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "# Optional: Save engineered features to CSV\n", - "# engineered_features.to_csv('engineered_features.csv', index=False)\n", - "# print(\"āœ“ Engineered features saved to 'engineered_features.csv'\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "## 7. CONCLUSIONS & RECOMMENDATIONS\n", - "\n", - "### Key Findings:\n", - "1. **Proxy Usage:**\n", - " - [Your findings here]\n", - " \n", - "2. **IP Spoofing Indicators:**\n", - " - [Your findings here]\n", - " \n", - "3. **Data Bin Patterns:**\n", - " - [Your findings here]\n", - "\n", - "### Recommendations for ML Model:\n", - "1. Binary features: `has_proxy`, `is_bidirectional`, `src_is_private`, `dst_is_private`\n", - "2. Categorical features: `packet_length_bin`, `port_category`, `anomaly_category`\n", - "3. Numerical features: Consider creating fan-out/fan-in scores\n", - "4. Attack-specific patterns identified can guide feature importance analysis\n", - "\n", - "### Next Steps:\n", - "1. Data preprocessing (handle missing values, encode categoricals)\n", - "2. Feature scaling/normalization\n", - "3. Address class imbalance if needed\n", - "4. Model selection and training\n", - "5. Hyperparameter tuning\n", - "6. 
Model evaluation" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/data_loader.py b/data_loader.py new file mode 100644 index 0000000..930311e --- /dev/null +++ b/data_loader.py @@ -0,0 +1,113 @@ +""" +Data Loading and Basic Validation Module +Handles CSV loading, initial validation, and basic cleaning +""" + +import pandas as pd +import numpy as np + + +def load_dataset(filepath, verbose=True): + """ + Load the cybersecurity attacks dataset from CSV + + Parameters: + ----------- + filepath : str + Path to the CSV file + verbose : bool + If True, print dataset info + + Returns: + -------- + pd.DataFrame + Loaded dataset + """ + df = pd.read_csv(filepath) + + if verbose: + print(f"āœ“ Dataset loaded successfully!") + print(f" - Shape: {df.shape}") + print(f" - Total Records: {df.shape[0]:,}") + print(f" - Total Features: {df.shape[1]}") + + return df + + +def get_missing_value_summary(df): + """ + Generate comprehensive missing value analysis + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + + Returns: + -------- + pd.DataFrame + Summary of missing values and distinctness + """ + missing_df = pd.DataFrame({ + 'Missing_Count': df.isnull().sum(), + 'Missing_Percentage': (df.isnull().sum() / len(df)) * 100, + 'Distinct_Count': df.nunique(), + 'Distinct_Percentage': (df.nunique() / len(df)) * 100 + }).sort_values('Missing_Count', ascending=False) + + return missing_df + + +def validate_required_columns(df, required_columns): + """ + Check if required columns exist in dataframe + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + required_columns : 
list + List of column names that must be present + + Returns: + -------- + tuple + (bool: all_present, list: missing_columns) + """ + missing = [col for col in required_columns if col not in df.columns] + + if missing: + print(f"āš ļø Missing required columns: {missing}") + return False, missing + + return True, [] + + +def get_dataset_info(df): + """ + Print comprehensive dataset information + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + """ + print("\n" + "="*80) + print("DATASET INFORMATION") + print("="*80) + + print(f"\nšŸ“Š Basic Stats:") + print(f" - Total Records: {len(df):,}") + print(f" - Total Features: {df.shape[1]}") + print(f" - Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") + + print(f"\nšŸ“‹ Data Types:") + print(df.dtypes.value_counts()) + + print(f"\nāŒ Missing Values:") + missing = df.isnull().sum().sum() + if missing > 0: + print(f" - Total missing values: {missing:,}") + print(f" - Percentage: {(missing / (df.shape[0] * df.shape[1])) * 100:.2f}%") + else: + print(" - No missing values found āœ“") diff --git a/features.py b/features.py new file mode 100644 index 0000000..4fcdda0 --- /dev/null +++ b/features.py @@ -0,0 +1,269 @@ +""" +Feature Engineering Module +Functions for creating derived features from raw data +""" + +import pandas as pd +import numpy as np + + +def create_proxy_features(df): + """ + Create proxy-related features + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with 'Proxy Information' column + + Returns: + -------- + pd.DataFrame + Dataframe with added proxy features + """ + if 'Proxy Information' in df.columns: + # Binary feature: has_proxy + df['has_proxy'] = df['Proxy Information'].notna().astype(int) + print(f"āœ“ Created 'has_proxy' feature") + print(f" - Records with proxy: {df['has_proxy'].sum():,} ({df['has_proxy'].sum()/len(df)*100:.2f}%)") + else: + print("āš ļø 'Proxy Information' column not found, skipping proxy features") + + return df + + +def 
create_ip_features(df): + """ + Create IP-related features + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with IP address columns + + Returns: + -------- + pd.DataFrame + Dataframe with added IP features + """ + if 'Source IP Address' not in df.columns or 'Destination IP Address' not in df.columns: + print("āš ļø IP address columns not found, skipping IP features") + return df + + # IP Class (first octet) + df['src_ip_class'] = df['Source IP Address'].str.split('.').str[0].astype(int) + df['dst_ip_class'] = df['Destination IP Address'].str.split('.').str[0].astype(int) + + # Private IP detection + df['src_is_private'] = df['Source IP Address'].apply(is_private_ip) + df['dst_is_private'] = df['Destination IP Address'].apply(is_private_ip) + + # Bidirectional traffic detection + source_ips_set = set(df['Source IP Address']) + dest_ips_set = set(df['Destination IP Address']) + bidirectional_ips = source_ips_set.intersection(dest_ips_set) + + df['is_bidirectional'] = (df['Source IP Address'].isin(bidirectional_ips)) | \ + (df['Destination IP Address'].isin(bidirectional_ips)) + + print(f"āœ“ Created IP features:") + print(f" - IP class features (src_ip_class, dst_ip_class)") + print(f" - Private IP indicators (src_is_private, dst_is_private)") + print(f" - Bidirectional traffic indicator") + print(f" - Bidirectional IPs found: {len(bidirectional_ips):,}") + + return df + + +def is_private_ip(ip): + """ + Check if an IP address is in private range (RFC 1918) + + Parameters: + ----------- + ip : str + IP address string + + Returns: + -------- + bool + True if IP is private, False otherwise + """ + if pd.isna(ip): + return False + try: + parts = str(ip).split('.') + if len(parts) != 4: + return False + first = int(parts[0]) + second = int(parts[1]) + + # Private IP ranges: 10.x.x.x, 172.16-31.x.x, 192.168.x.x + if first == 10: + return True + if first == 172 and 16 <= second <= 31: + return True + if first == 192 and second == 168: + return True + 
return False + except: + return False + + +def create_port_features(df): + """ + Create port-related features + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with port columns + + Returns: + -------- + pd.DataFrame + Dataframe with added port features + """ + if 'Source Port' in df.columns: + df['src_port_category'] = df['Source Port'].apply(categorize_port) + + if 'Destination Port' in df.columns: + df['dst_port_category'] = df['Destination Port'].apply(categorize_port) + + print(f"āœ“ Created port category features") + + return df + + +def categorize_port(port): + """ + Categorize ports into well-known, registered, or dynamic + + Parameters: + ----------- + port : int + Port number + + Returns: + -------- + str + Port category + """ + if pd.isna(port): + return 'Unknown' + try: + port = int(port) + if 0 <= port <= 1023: + return 'Well-known (0-1023)' + elif 1024 <= port <= 49151: + return 'Registered (1024-49151)' + elif 49152 <= port <= 65535: + return 'Dynamic (49152-65535)' + else: + return 'Unknown' + except: + return 'Unknown' + + +def create_packet_features(df): + """ + Create packet-related features + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with 'Packet Length' column + + Returns: + -------- + pd.DataFrame + Dataframe with added packet features + """ + if 'Packet Length' not in df.columns: + print("āš ļø 'Packet Length' column not found, skipping packet features") + return df + + # Packet size categories + df['packet_length_bin'] = pd.cut( + df['Packet Length'], + bins=[0, 100, 500, 1000, 1500, float('inf')], + labels=['Tiny (0-100)', 'Small (100-500)', 'Medium (500-1000)', + 'Large (1000-1500)', 'Jumbo (>1500)'] + ) + + print(f"āœ“ Created packet length bins") + + return df + + +def create_anomaly_features(df): + """ + Create anomaly score-related features + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with 'Anomaly Scores' column + + Returns: + -------- + pd.DataFrame + Dataframe with 
added anomaly features + """ + if 'Anomaly Scores' not in df.columns: + print("āš ļø 'Anomaly Scores' column not found, skipping anomaly features") + return df + + anomaly_scores = df['Anomaly Scores'].dropna() + + # Create quartile-based categories + anomaly_bins = [ + anomaly_scores.min(), + anomaly_scores.quantile(0.25), + anomaly_scores.quantile(0.5), + anomaly_scores.quantile(0.75), + anomaly_scores.max() + ] + anomaly_labels = ['Low (0-25%)', 'Medium (25-50%)', 'High (50-75%)', 'Critical (75-100%)'] + + df['anomaly_category'] = pd.cut( + df['Anomaly Scores'], + bins=anomaly_bins, + labels=anomaly_labels, + include_lowest=True + ) + + print(f"āœ“ Created anomaly score categories") + + return df + + +def create_all_features(df): + """ + Create all engineered features at once + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + + Returns: + -------- + pd.DataFrame + Dataframe with all features added + """ + print("\n" + "="*80) + print("FEATURE ENGINEERING") + print("="*80) + + df = create_proxy_features(df) + df = create_ip_features(df) + df = create_port_features(df) + df = create_packet_features(df) + df = create_anomaly_features(df) + + print(f"\nāœ“ Feature engineering complete!") + print(f" - New shape: {df.shape}") + print(f" - Total features: {df.shape[1]}") + + return df diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..303117b --- /dev/null +++ b/utils.py @@ -0,0 +1,338 @@ +""" +Utility Functions Module +Plotting, logging, metrics, and helper functions +""" + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats +from scipy.stats import chi2_contingency, f_oneway + + +def setup_plotting_style(): + """Set up consistent plotting style across all visualizations""" + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (15, 8) + plt.rcParams['font.size'] = 10 + + +def plot_attack_distribution(df, column='Attack Type'): + """ + Plot the 
distribution of attack types + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + column : str + Column name containing attack types + """ + if column not in df.columns: + print(f"āš ļø Column '{column}' not found") + return + + plt.figure(figsize=(12, 6)) + df[column].value_counts().plot(kind='bar', color='steelblue', edgecolor='black') + plt.title(f'{column} Distribution', fontsize=14, fontweight='bold') + plt.xlabel(column) + plt.ylabel('Count') + plt.xticks(rotation=45, ha='right') + plt.tight_layout() + plt.show() + + # Print statistics + print(f"\n{column} Distribution:") + print(df[column].value_counts()) + print(f"\nPercentages:") + print((df[column].value_counts() / len(df) * 100).round(2)) + + +def plot_proxy_analysis(df): + """ + Comprehensive proxy usage visualization + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with proxy features + """ + if 'has_proxy' not in df.columns: + print("āš ļø 'has_proxy' feature not found. Run feature engineering first.") + return + + fig, axes = plt.subplots(2, 2, figsize=(16, 12)) + + # 1. Overall proxy usage pie chart + proxy_counts = df['has_proxy'].value_counts() + labels = ['No Proxy', 'With Proxy'] + colors = ['lightcoral', 'lightgreen'] + axes[0, 0].pie(proxy_counts.values, labels=labels, autopct='%1.1f%%', + colors=colors, startangle=90) + axes[0, 0].set_title('Overall Proxy Usage Distribution', fontsize=14, fontweight='bold') + + # 2. Proxy usage by Attack Type + if 'Attack Type' in df.columns: + proxy_attack = pd.crosstab(df['Attack Type'], df['has_proxy'], normalize='index') * 100 + proxy_attack.plot(kind='bar', ax=axes[0, 1], stacked=False, + color=['lightcoral', 'lightgreen']) + axes[0, 1].set_title('Proxy Usage by Attack Type (%)', fontsize=14, fontweight='bold') + axes[0, 1].set_xlabel('Attack Type') + axes[0, 1].set_ylabel('Percentage') + axes[0, 1].legend(['No Proxy', 'With Proxy']) + axes[0, 1].tick_params(axis='x', rotation=45) + + # 3. 
Proxy usage by Severity Level + if 'Severity Level' in df.columns: + proxy_severity = pd.crosstab(df['Severity Level'], df['has_proxy']) + proxy_severity.plot(kind='bar', ax=axes[1, 0], color=['lightcoral', 'lightgreen']) + axes[1, 0].set_title('Proxy Usage by Severity Level', fontsize=14, fontweight='bold') + axes[1, 0].set_xlabel('Severity Level') + axes[1, 0].set_ylabel('Count') + axes[1, 0].legend(['No Proxy', 'With Proxy']) + axes[1, 0].tick_params(axis='x', rotation=45) + + # 4. Proxy by Log Source + if 'Log Source' in df.columns: + log_proxy = pd.crosstab(df['Log Source'], df['has_proxy'], normalize='index') * 100 + log_proxy.plot(kind='bar', ax=axes[1, 1], color=['lightcoral', 'lightgreen']) + axes[1, 1].set_title('Proxy Usage: Firewall vs Server', fontsize=14, fontweight='bold') + axes[1, 1].set_xlabel('Log Source') + axes[1, 1].set_ylabel('Percentage') + axes[1, 1].legend(['No Proxy', 'With Proxy']) + axes[1, 1].tick_params(axis='x', rotation=0) + + plt.tight_layout() + plt.show() + + +def plot_ip_analysis(df): + """ + Visualize IP-related patterns + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe with IP features + """ + if 'src_ip_class' not in df.columns: + print("āš ļø IP features not found. 
Run feature engineering first.") + return + + fig, axes = plt.subplots(1, 2, figsize=(16, 6)) + + # Source IP class distribution + top_20_src_classes = df['src_ip_class'].value_counts().head(20) + axes[0].bar(range(len(top_20_src_classes)), top_20_src_classes.values, + color='steelblue', edgecolor='black') + axes[0].set_xticks(range(len(top_20_src_classes))) + axes[0].set_xticklabels(top_20_src_classes.index, rotation=45) + axes[0].set_xlabel('IP Class (First Octet)', fontsize=12) + axes[0].set_ylabel('Count', fontsize=12) + axes[0].set_title('Top 20 Source IP Classes', fontsize=14, fontweight='bold') + axes[0].grid(axis='y', alpha=0.3) + + # IP class vs attack type heatmap + if 'Attack Type' in df.columns: + ip_attack_matrix = pd.crosstab( + df['src_ip_class'], + df['Attack Type'], + normalize='index' + ) * 100 + + # Get top 15 IP classes for readability + top_15_classes = df['src_ip_class'].value_counts().head(15).index + ip_attack_subset = ip_attack_matrix.loc[top_15_classes] + + sns.heatmap(ip_attack_subset, annot=True, fmt='.1f', cmap='YlOrRd', + ax=axes[1], cbar_kws={'label': 'Percentage'}) + axes[1].set_xlabel('Attack Type', fontsize=12) + axes[1].set_ylabel('IP Class (First Octet)', fontsize=12) + axes[1].set_title('Attack Type Distribution by IP Class (%)', + fontsize=14, fontweight='bold') + + plt.tight_layout() + plt.show() + + +def plot_protocol_analysis(df): + """ + Visualize protocol distribution and patterns + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + """ + if 'Protocol' not in df.columns: + print("āš ļø 'Protocol' column not found") + return + + fig, axes = plt.subplots(1, 2, figsize=(16, 6)) + + protocol_dist = df['Protocol'].value_counts() + + # Protocol pie chart + axes[0].pie(protocol_dist.values, labels=protocol_dist.index, + autopct='%1.1f%%', startangle=90) + axes[0].set_title('Protocol Distribution', fontsize=14, fontweight='bold') + + # Protocol by attack type + if 'Attack Type' in df.columns: + protocol_attack = 
pd.crosstab(df['Attack Type'], df['Protocol'], normalize='index') * 100 + protocol_attack.plot(kind='bar', stacked=True, ax=axes[1], colormap='viridis') + axes[1].set_xlabel('Attack Type', fontsize=12) + axes[1].set_ylabel('Percentage', fontsize=12) + axes[1].set_title('Protocol Distribution by Attack Type (%)', fontsize=14, fontweight='bold') + axes[1].legend(title='Protocol', bbox_to_anchor=(1.05, 1), loc='upper left') + axes[1].tick_params(axis='x', rotation=45) + + plt.tight_layout() + plt.show() + + +def plot_packet_analysis(df): + """ + Visualize packet length patterns + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + """ + if 'Packet Length' not in df.columns: + print("āš ļø 'Packet Length' column not found") + return + + fig, axes = plt.subplots(2, 2, figsize=(16, 12)) + + packet_lengths = df['Packet Length'].dropna() + + # Histogram + axes[0, 0].hist(packet_lengths, bins=50, color='skyblue', edgecolor='black', alpha=0.7) + axes[0, 0].set_xlabel('Packet Length (bytes)', fontsize=11) + axes[0, 0].set_ylabel('Frequency (log scale)', fontsize=11) + axes[0, 0].set_title('Packet Length Distribution', fontsize=13, fontweight='bold') + axes[0, 0].set_yscale('log') + axes[0, 0].grid(True, alpha=0.3) + + # Binned distribution + if 'packet_length_bin' in df.columns: + packet_bin_dist = df['packet_length_bin'].value_counts().sort_index() + packet_bin_dist.plot(kind='bar', ax=axes[0, 1], color='coral', edgecolor='black') + axes[0, 1].set_xlabel('Packet Length Bins', fontsize=11) + axes[0, 1].set_ylabel('Count', fontsize=11) + axes[0, 1].set_title('Packet Length Binned Distribution', fontsize=13, fontweight='bold') + axes[0, 1].tick_params(axis='x', rotation=45) + + # Box plot by attack type + if 'Attack Type' in df.columns: + df.boxplot(column='Packet Length', by='Attack Type', ax=axes[1, 0]) + axes[1, 0].set_xlabel('Attack Type', fontsize=11) + axes[1, 0].set_ylabel('Packet Length (bytes)', fontsize=11) + axes[1, 0].set_title('Packet Length by 
Attack Type', fontsize=13, fontweight='bold') + axes[1, 0].get_figure().suptitle('') + plt.sca(axes[1, 0]) + plt.xticks(rotation=45, ha='right') + + # Bins by attack type + if 'Attack Type' in df.columns and 'packet_length_bin' in df.columns: + bin_attack = pd.crosstab(df['Attack Type'], df['packet_length_bin'], normalize='index') * 100 + bin_attack.plot(kind='bar', stacked=True, ax=axes[1, 1], colormap='tab10') + axes[1, 1].set_xlabel('Attack Type', fontsize=11) + axes[1, 1].set_ylabel('Percentage', fontsize=11) + axes[1, 1].set_title('Packet Length Bins by Attack Type (%)', fontsize=13, fontweight='bold') + axes[1, 1].legend(title='Packet Size', bbox_to_anchor=(1.05, 1), loc='upper left') + axes[1, 1].tick_params(axis='x', rotation=45) + + plt.tight_layout() + plt.show() + + +def statistical_test_packet_length(df): + """ + Perform ANOVA test on packet length across attack types + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + """ + if 'Attack Type' not in df.columns or 'Packet Length' not in df.columns: + print("āš ļø Required columns not found") + return + + attack_types = df['Attack Type'].unique() + groups = [df[df['Attack Type'] == attack]['Packet Length'].dropna() for attack in attack_types] + + f_stat, p_value = f_oneway(*groups) + + print("\n" + "="*80) + print("PACKET LENGTH ANOVA TEST") + print("="*80) + print(f"F-statistic: {f_stat:.2f}") + print(f"p-value: {p_value:.4e}") + + if p_value < 0.001: + print("āœ“ Packet Length is HIGHLY discriminative across attack types!") + elif p_value < 0.05: + print("āœ“ Packet Length shows significant differences across attack types") + else: + print("āš ļø Packet Length may not be strongly discriminative") + + +def print_comprehensive_summary(df): + """ + Print a comprehensive summary of the dataset + + Parameters: + ----------- + df : pd.DataFrame + Input dataframe + """ + print("\n" + "="*80) + print("COMPREHENSIVE DATASET SUMMARY") + print("="*80) + + print(f"\nšŸ“Š DATASET OVERVIEW") + 
print("-" * 80) + print(f"Total Records: {len(df):,}") + print(f"Total Features: {df.shape[1]}") + print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") + + if 'Attack Type' in df.columns: + print(f"\nšŸŽÆ ATTACK TYPE DISTRIBUTION") + print("-" * 80) + attack_dist = df['Attack Type'].value_counts() + for attack, count in attack_dist.items(): + print(f" {attack}: {count:,} ({count/len(df)*100:.2f}%)") + + print(f"\nšŸ” KEY STATISTICS") + print("-" * 80) + + # Proxy + if 'has_proxy' in df.columns: + proxy_pct = (df['has_proxy'].sum() / len(df)) * 100 + print(f" - Proxy Usage Rate: {proxy_pct:.2f}%") + + # IPs + if 'Source IP Address' in df.columns: + print(f" - Unique Source IPs: {df['Source IP Address'].nunique():,}") + print(f" - Unique Destination IPs: {df['Destination IP Address'].nunique():,}") + + # Packet Length + if 'Packet Length' in df.columns: + print(f" - Average Packet Size: {df['Packet Length'].mean():.2f} bytes") + + # Protocol + if 'Protocol' in df.columns: + top_protocol = df['Protocol'].value_counts().index[0] + top_protocol_pct = (df['Protocol'].value_counts().values[0] / len(df)) * 100 + print(f" - Most Common Protocol: {top_protocol} ({top_protocol_pct:.2f}%)") + + # Port + if 'Destination Port' in df.columns: + top_port = df['Destination Port'].value_counts().index[0] + top_port_count = df['Destination Port'].value_counts().values[0] + print(f" - Most Targeted Port: {top_port} ({top_port_count:,} times)")