-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlogistic_regression.py
More file actions
164 lines (116 loc) · 5.63 KB
/
logistic_regression.py
File metadata and controls
164 lines (116 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Logistic Regression
# ------------- Logistic Regression on the Titanic dataset ------------------
# Pipeline: load data -> explore -> clean/impute -> encode categoricals ->
# train/test split -> fit logistic regression -> evaluate -> backward
# elimination -> refit on the significant features -> evaluate again.

# Import libraries
import pandas as pd
import seaborn as sns

# Import data (local path; adjust to your environment)
titanic_data = pd.read_csv("/home/deepak/analytics/titanic.csv")
titanic_data.head(10)
print("# of passengers in original dataset:" +str(len(titanic_data.index)))

# Analyzing data: survival counts overall and split by sex / passenger class
sns.countplot(x="survived", data=titanic_data)
sns.countplot(x="survived", hue="sex", data=titanic_data)
sns.countplot(x="survived", hue="pclass", data=titanic_data)

# Convert "age" from object to float; non-numeric entries become NaN
titanic_data["age"] = pd.to_numeric(titanic_data.age, errors='coerce')
titanic_data.info()
titanic_data["age"].plot.hist()
# Convert "fare" from object to float
titanic_data["fare"] = pd.to_numeric(titanic_data.fare, errors='coerce')
titanic_data.info()
titanic_data["fare"].plot.hist()

# Data wrangling: inspect missing values
titanic_data.isnull().sum()
sns.heatmap(titanic_data.isnull(), yticklabels=False, cmap="viridis")
sns.boxplot(x="pclass", y="age", data=titanic_data)

# Handling missing values
titanic_data.head(5)
# Drop the rows that have a missing value in column "fare"
titanic_data.dropna(subset=['fare'], inplace=True)
sns.heatmap(titanic_data.isnull(), yticklabels=False)
# Mean-impute "age".
# FIX: the original used chained inplace fillna
# (titanic_data["age"].fillna(..., inplace=True)), which operates on an
# intermediate Series and is deprecated in modern pandas; assign instead.
titanic_data["age"] = titanic_data["age"].fillna(titanic_data["age"].mean())
sns.heatmap(titanic_data.isnull(), yticklabels=False)
# Hence, we do not have any missing values in the dataset now.
titanic_data.isnull().sum()

# Encode categorical string variables as numeric dummy variables so the
# ML algorithm can consume them. drop_first=True avoids the dummy-variable
# trap (perfect collinearity with the intercept).
sex_dummy = pd.get_dummies(titanic_data["sex"], drop_first=True)
sex_dummy.head(5)
# FIX: variable was misspelled "Embardked_Dummy" in the original
embarked_dummy = pd.get_dummies(titanic_data["embarked"], drop_first=True)
embarked_dummy.head(5)
pclass_dummy = pd.get_dummies(titanic_data["pclass"], drop_first=True)
pclass_dummy.head(5)
# Concatenate the dummy columns onto the dataset
titanic_data = pd.concat([titanic_data, sex_dummy, pclass_dummy, embarked_dummy], axis=1)
titanic_data.head(5)
# Drop the original categorical columns plus identifier columns that carry
# no predictive signal
titanic_data.drop(["sex", "embarked", "pclass", "Passenger_id", "name", "ticket"],
                  axis=1, inplace=True)
titanic_data.head(5)

# Splitting the dataset into train & test sets
x = titanic_data.drop("survived", axis=1)
y = titanic_data["survived"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# Fitting Logistic Regression to the training set
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(random_state=0)
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)
# Accuracy = (139+69)/(139+16+37+69) = 77.5%
# (FIX: the original comment wrote the fraction with '\' instead of '/')

# -------------------------- Backward Elimination ---------------------------
# Feature-selection step: remove independent variables that have no
# significant effect (p-value > 0.05) on the dependent variable, then
# rebuild the logistic model with the remaining features.
import statsmodels.api as sm
import numpy as nm

titanic_data_1 = titanic_data
titanic_data_1.head(5)
x1 = titanic_data_1.drop("survived", axis=1)
y1 = titanic_data_1["survived"]
# Prepend an intercept column of ones to the feature matrix.
# FIX: the row count was hard-coded as 1291; derive it from the data so the
# script does not break if the number of rows after cleaning changes.
x1 = nm.append(arr=nm.ones((x1.shape[0], 1)).astype(int), values=x1, axis=1)

# Start with the full feature vector (intercept + all 10 predictors), fit
# OLS, and read the p-values from the summary table.
x_opt = x1[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
regressor_OLS = sm.OLS(endog=y1, exog=x_opt).fit()
regressor_OLS.summary()
# Remove the independent variables whose p-value exceeds 0.05 and refit.
# FIX: the original passed endog=y here instead of y1; both alias the same
# Series in this script, but y1 is the variable this section defines.
x_opt = x1[:, [0, 1, 2, 5, 6, 7, 10]]
regressor_OLS = sm.OLS(endog=y1, exog=x_opt).fit()
regressor_OLS.summary()
# Hence, age, fare, sex, pclass & embarked are the significant predictors.

# ----- Rebuild logistic regression using only the significant variables ----
from sklearn.model_selection import train_test_split
x_BE_train, x_BE_test, y_BE_train, y_BE_test = train_test_split(
    x_opt, y1, test_size=0.25, random_state=0)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(random_state=0)
logmodel.fit(x_BE_train, y_BE_train)
predictions = logmodel.predict(x_BE_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_BE_test, predictions)
# Accuracy = (141+71)/(141+14+35+71) = 80%