diff --git a/titanic.ipynb b/titanic.ipynb index cde5079..8f28030 100644 --- a/titanic.ipynb +++ b/titanic.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{},"source":["
\n","# Ignore this"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis.
\n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":null,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the 
dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[],"source":["train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[],"source":["train_df.describe()"]},{"cell_type":"markdown","metadata":{},"source":["### 2. Without looking above, try checking the first few rows of test_df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["# Variable Description\n","
    \n","
  1. PassengerId: unique id number to each passenger
  2. \n","
  3. Survived: passenger survive(1) or died(0)
  4. \n","
  5. Pclass: passenger class
  6. \n","
  7. Name: name
  8. \n","
  9. Sex: gender of passenger
  10. \n","
  11. Age: age of passenger
  12. \n","
  13. SibSp: number of siblings/spouses
  14. \n","
  15. Parch: number of parents/children
  16. \n","
  17. Ticket: ticket number
  18. \n","
  19. Fare: amount of money spent on ticket
  20. \n","
  21. Cabin: cabin category
  22. \n","
  23. Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
  24. \n","
\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columsn of DF (Assigmennt)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[],"source":["# Printing the Second Row\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the 5th Row"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[],"source":["# Print the Name Column"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- 
Survived"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it 
yourself:"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[],"source":["# Create the plot below"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null Values"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["index_nan_age = list(train_df[\"Age\"][train_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = train_df[\"Age\"][((train_df[\"SibSp\"] == train_df.iloc[i][\"SibSp\"]) &(train_df[\"Parch\"] == train_df.iloc[i][\"Parch\"])& (train_df[\"Pclass\"] == train_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = train_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," train_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," train_df[\"Age\"].iloc[i] = age_med\n","\n","index_nan_age = list(test_df[\"Age\"][test_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = test_df[\"Age\"][((test_df[\"SibSp\"] == test_df.iloc[i][\"SibSp\"]) &(test_df[\"Parch\"] == test_df.iloc[i][\"Parch\"])& (test_df[\"Pclass\"] == test_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = test_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," 
test_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," test_df[\"Age\"].iloc[i] = age_med"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the higher the Fare, the lower the Class number (Remember that Class 1 < Class 2 < Class 3 in face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Embarked\", data = 
train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[],"source":["example_ticket = \"A/5. 
2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","train_df.head(10)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","test_df.head(10)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Pclass"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.843395Z","iopub.status.busy":"2024-04-01T06:27:55.842833Z","iopub.status.idle":"2024-04-01T06:27:56.089225Z","shell.execute_reply":"2024-04-01T06:27:56.087578Z","shell.execute_reply.started":"2024-04-01T06:27:55.843168Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Pclass\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.092270Z","iopub.status.busy":"2024-04-01T06:27:56.091722Z","iopub.status.idle":"2024-04-01T06:27:56.162888Z","shell.execute_reply":"2024-04-01T06:27:56.161841Z","shell.execute_reply.started":"2024-04-01T06:27:56.092186Z"},"trusted":true},"outputs":[],"source":["train_df[\"Pclass\"] = train_df[\"Pclass\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns= [\"Pclass\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Pclass\"] = test_df[\"Pclass\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns= [\"Pclass\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Sex"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.164709Z","iopub.status.busy":"2024-04-01T06:27:56.164391Z","iopub.status.idle":"2024-04-01T06:27:56.205775Z","shell.execute_reply":"2024-04-01T06:27:56.204761Z","shell.execute_reply.started":"2024-04-01T06:27:56.164639Z"},"trusted":true},"outputs":[],"source":["train_df[\"Sex\"] = train_df[\"Sex\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns=[\"Sex\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = True)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[],"source":["train_df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the columns of the test 
set"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Modeling"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[],"source":["train_df_len = len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number]) \n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train 
= numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio\n","X_train, X_test, y_train, y_test = # Use the train_test_split function here\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = round(logreg.score(X_train, y_train)*100,2) \n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n"," SVC(random_state = random_state),\n"," RandomForestClassifier(random_state = random_state),\n"," LogisticRegression(random_state = random_state),\n"," KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n"," \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n"," \"gamma\": [0.001, 0.01, 0.1, 1],\n"," \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n"," \"min_samples_split\":[2,3,10],\n"," \"min_samples_leaf\":[1,3,10],\n"," \"bootstrap\":[False],\n"," \"n_estimators\":[100,300],\n"," \"criterion\":[\"gini\"]}\n","\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n"," \"penalty\": [\"l1\",\"l2\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n"," \"weights\": [\"uniform\",\"distance\"],\n"," \"metric\":[\"euclidean\",\"manhattan\"]}\n","classifier_param = [dt_param_grid,\n"," svc_param_grid,\n"," rf_param_grid,\n"," logreg_param_grid,\n"," 
knn_param_grid]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[],"source":["cv_result = []\n","best_estimators = []\n","for i in range(len(classifier)):\n"," clf = GridSearchCV(classifier[i], param_grid=classifier_param[i], cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n"," clf.fit(X_train,y_train)\n"," cv_result.append(clf.best_score_)\n"," best_estimators.append(clf.best_estimator_)\n"," print(cv_result[i])"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[],"source":["votingC = 
VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Prediction and Submission"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[],"source":["test_survived = pd.Series(votingC.predict(numeric_test), name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.7"}},"nbformat":4,"nbformat_minor":4} +{"cells":[{"cell_type":"markdown","metadata":{},"source":["
\n","# Ignore this"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis.
\n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":null,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the 
dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[],"source":["train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[],"source":["train_df.describe()"]},{"cell_type":"markdown","metadata":{},"source":["### 2. Without looking above, try checking the first few rows of test_df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.describe()"]},{"cell_type":"markdown","metadata":{},"source":["# Variable Description\n","
    \n","
  1. PassengerId: unique id number to each passenger
  2. \n","
  3. Survived: passenger survive(1) or died(0)
  4. \n","
  5. Pclass: passenger class
  6. \n","
  7. Name: name
  8. \n","
  9. Sex: gender of passenger
  10. \n","
  11. Age: age of passenger
  12. \n","
  13. SibSp: number of siblings/spouses
  14. \n","
  15. Parch: number of parents/children
  16. \n","
  17. Ticket: ticket number
  18. \n","
  19. Fare: amount of money spent on ticket
  20. \n","
  21. Cabin: cabin category
  22. \n","
  23. Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
  24. \n","
\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columns of DF (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[],"source":["# Print the second row (iloc is zero-based, so position 1 is the 2nd row)\n","train_df.iloc[1]"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the 5th row (iloc is zero-based, so position 4 is the 5th row)\n","train_df.iloc[4]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[],"source":["# Print the Name Column\n","train_df['Name']"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization 
(Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- Survived"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","sns.countplot(x='Pclass', data=train_df)\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","sns.countplot(x='Embarked', data= train_df)\n","plt.title('Count of Embarked')\n","plt.xlabel('Embarked')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['S', 'C', 'Q']) # Setting 
custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","sns.histplot(x='Fare', data= train_df)\n","plt.title('Count of Fare')\n","plt.xlabel('Fare')\n","plt.ylabel('Count')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it yourself:"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[],"source":["# Create the plot below\n","plt.figure(figsize=(8, 6))\n","\n","sns.histplot(train_df['Age'], bins=20, color='pink')\n","plt.title('Count of Age')\n","plt.xlabel('Age')\n","plt.ylabel('Count')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fill the missing Age values"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["def fill_missing_age(df):\n","    \"\"\"Fill the missing Age values of df in place.\n","\n","    A missing age becomes the median age of the passengers that share the same\n","    SibSp, Parch and Pclass; if that group has no known age, the overall\n","    median age is used instead.\n","    \"\"\"\n","    for i in df[df[\"Age\"].isnull()].index:\n","        same_group = ((df[\"SibSp\"] == df.loc[i, \"SibSp\"])\n","                      & (df[\"Parch\"] == df.loc[i, \"Parch\"])\n","                      & (df[\"Pclass\"] == df.loc[i, \"Pclass\"]))\n","        age_pred = df.loc[same_group, \"Age\"].median()\n","        age_med = df[\"Age\"].median()\n","        # Write through df.loc: df[\"Age\"].iloc[i] = ... is chained assignment,\n","        # which can update a temporary copy and leave df unchanged.\n","        df.loc[i, \"Age\"] = age_pred if not np.isnan(age_pred) else age_med\n","\n","\n","fill_missing_age(train_df)\n","fill_missing_age(test_df)"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the Fare, the lower the Class number (Remember that Class 1 < Class 2 < Class 3 in face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Embarked\", data = 
train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[],"source":["example_ticket = \"A/5. 
2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set\n","tickets = []\n","for i in list(test_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","test_df[\"Ticket\"] = tickets"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns= [\"Ticket\"], prefix = 
\"TcktName\")\n","train_df.head(10)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","test_df.head(10)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Pclass"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.843395Z","iopub.status.busy":"2024-04-01T06:27:55.842833Z","iopub.status.idle":"2024-04-01T06:27:56.089225Z","shell.execute_reply":"2024-04-01T06:27:56.087578Z","shell.execute_reply.started":"2024-04-01T06:27:55.843168Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Pclass\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.092270Z","iopub.status.busy":"2024-04-01T06:27:56.091722Z","iopub.status.idle":"2024-04-01T06:27:56.162888Z","shell.execute_reply":"2024-04-01T06:27:56.161841Z","shell.execute_reply.started":"2024-04-01T06:27:56.092186Z"},"trusted":true},"outputs":[],"source":["train_df[\"Pclass\"] = train_df[\"Pclass\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns= [\"Pclass\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Pclass\"] = test_df[\"Pclass\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns= [\"Pclass\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Sex"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.164709Z","iopub.status.busy":"2024-04-01T06:27:56.164391Z","iopub.status.idle":"2024-04-01T06:27:56.205775Z","shell.execute_reply":"2024-04-01T06:27:56.204761Z","shell.execute_reply.started":"2024-04-01T06:27:56.164639Z"},"trusted":true},"outputs":[],"source":["train_df[\"Sex\"] = train_df[\"Sex\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns=[\"Sex\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = True)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[],"source":["train_df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set\n","test_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = True)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# 
Print the columns of the test set\n","print(test_df.columns)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Modeling"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[],"source":["train_df_len = len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number])\n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train = numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio.\n","# random_state is fixed so that Restart & Run All reproduces the same split.\n","X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = round(logreg.score(X_train, y_train)*100,2)\n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set\n","print(acc_log_train)\n","print(acc_log_test)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ML classifiers and evaluate the mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n","              SVC(random_state = random_state),\n","              RandomForestClassifier(random_state = random_state),\n","              LogisticRegression(random_state = random_state),\n","              KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n","                 \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n","                  \"gamma\": [0.001, 0.01, 0.1, 1],\n","                  \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n","                 \"min_samples_split\":[2,3,10],\n","                 \"min_samples_leaf\":[1,3,10],\n","                 \"bootstrap\":[False],\n","                 \"n_estimators\":[100,300],\n","                 \"criterion\":[\"gini\"]}\n","\n","# The default lbfgs solver does not support the l1 penalty, so every l1 candidate\n","# would fail to fit; liblinear handles both l1 and l2.\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n","                     \"penalty\": [\"l1\",\"l2\"],\n","                     \"solver\": [\"liblinear\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n","                  \"weights\": [\"uniform\",\"distance\"],\n","                  \"metric\":[\"euclidean\",\"manhattan\"]}\n","# Same order as `classifier`: the two lists are paired position by position.\n","classifier_param = [dt_param_grid,\n","                    svc_param_grid,\n","                    rf_param_grid,\n","                    logreg_param_grid,\n","                    knn_param_grid]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[],"source":["# Grid-search every classifier with 10-fold stratified CV, keeping the best\n","# score and the refitted best estimator of each (same order as `classifier`).\n","cv_result = []\n","best_estimators = []\n","for estimator, param_grid in zip(classifier, classifier_param):\n","    clf = GridSearchCV(estimator, param_grid=param_grid, cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n","    clf.fit(X_train,y_train)\n","    cv_result.append(clf.best_score_)\n","    best_estimators.append(clf.best_estimator_)\n","    print(clf.best_score_)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[],"source":["votingC = 
VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier on the held-out split\n","print(accuracy_score(y_test, votingC.predict(X_test)))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell\n","\n","# Assuming votingC is your trained VotingClassifier and numeric_test is your test data\n","# Assuming test_PassengerId is the corresponding PassengerId for each test data row\n","\n","# Drop null values from the test data\n","numeric_test_dropped = numeric_test.dropna()\n","\n","# Make predictions on the non-null test data. The Series reuses the row labels of\n","# the kept rows; with a fresh 0..n-1 index the concat below would pair predictions\n","# with the wrong PassengerId as soon as a row has been dropped.\n","test_survived = pd.Series(votingC.predict(numeric_test_dropped), index=numeric_test_dropped.index, name=\"Survived\").astype(int)\n","\n","# Get the PassengerId corresponding to the non-null test data rows\n","test_PassengerId_dropped = test_PassengerId.loc[numeric_test_dropped.index]\n","\n","# Concatenate PassengerId and predicted survival outcomes for non-null data\n","results = pd.concat([test_PassengerId_dropped, test_survived], axis=1)\n","\n","# Save the results to a CSV file\n","results.to_csv(\"titanic.csv\", index=False)\n","\n","# Print the results DataFrame\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Prediction and Submission"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[],"source":["# The submission needs a prediction for every test passenger, so fill the\n","# remaining missing values with the training medians instead of dropping rows\n","# (predicting directly on the nulls raises an error).\n","numeric_test_filled = numeric_test.fillna(X_train.median())\n","test_survived = pd.Series(votingC.predict(numeric_test_filled), index=numeric_test_filled.index, name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.7"}},"nbformat":4,"nbformat_minor":4}