-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseline_model.py
More file actions
108 lines (87 loc) · 3.63 KB
/
baseline_model.py
File metadata and controls
108 lines (87 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# baseline_model.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')
# ================================
# 🔷 1. Load dataset
# ================================
df = pd.read_csv("login_dataset.csv")
print("✅ Dataset loaded. Shape:", df.shape)
# ================================
# 🔷 2. Encode categorical features
# ================================
le_device = LabelEncoder()
df['Device novelty'] = le_device.fit_transform(df['Device novelty']) # Known=0, New=1
# Encode Country and City
le_country = LabelEncoder()
le_city = LabelEncoder()
df['Country_encoded'] = le_country.fit_transform(df['Country'])
df['City_encoded'] = le_city.fit_transform(df['City'])
# ================================
# 🔷 3. Feature engineering
# ================================
# Drop unprocessed columns
X = df.drop(columns=[
'Label',
'UserID',
'DeviceID',
'IP Address',
'Timestamp',
'Country',
'City'
])
# Add encoded country and city to X (already done above)
y = df['Label']
print("✅ Features for modeling:", list(X.columns))
# ================================
# 🔷 4. Train-Test split with stratification
# ================================
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print("✅ Data split completed. Train size:", X_train.shape[0], "Test size:", X_test.shape[0])
# ================================
# 🔷 5. Feature scaling (StandardScaler)
# ================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ================================
# 🔷 6. Handle class imbalance using SMOTE
# ================================
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)
print("✅ SMOTE applied.")
print("Class distribution before SMOTE:\n", y_train.value_counts())
print("Class distribution after SMOTE:\n", pd.Series(y_train_resampled).value_counts())
# ================================
# 🔷 7. Train Logistic Regression (Baseline linear model)
# ================================
lr_model = LogisticRegression(random_state=42, class_weight='balanced')
lr_model.fit(X_train_resampled, y_train_resampled)
y_pred_lr = lr_model.predict(X_test_scaled)
print("\n🔷 Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, lr_model.predict_proba(X_test_scaled)[:,1]))
# ================================
# 🔷 8. Train Random Forest (Non-linear ensemble model)
# ================================
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_train_resampled, y_train_resampled)
y_pred_rf = rf_model.predict(X_test_scaled)
print("\n🔷 Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:,1]))
# ================================
# 🔷 9. Confusion Matrix comparison
# ================================
print("\n🔷 Confusion Matrix (Logistic Regression):\n", confusion_matrix(y_test, y_pred_lr))
print("\n🔷 Confusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf))
print("\n✅ Baseline model training complete.")