-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpca.py
More file actions
149 lines (78 loc) · 3.3 KB
/
pca.py
File metadata and controls
149 lines (78 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D

# Load the diabetes dataset: columns 0-7 are the features, column 8 is the
# binary outcome (1 = tested positive for diabetes).
dataset = pd.read_csv("data/diabetes.csv")
X = dataset.iloc[:, 0:8]
y = dataset.iloc[:, 8]

# Standardize feature space to mean 0 and variance 1.
# BUG FIX: the original computed (X + mean)/std, so the data was never
# centered and every downstream covariance/eigen result was wrong.
X_std = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Covariance matrix for the standardized features (8x8).
cov_matrix = np.cov(X_std, rowvar=False)

# Eigenvalues (8,) and eigenvectors (8x8; eigenvector i is column i).
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Pair each eigenvalue with its eigenvector column.
eig_pairs = [(eigenvalues[i], eigenvectors[:, i]) for i in range(len(eigenvalues))]

# BUG FIX: the original claimed a descending sort but never sorted.
# np.linalg.eig returns eigenvalues in no particular order, so without this
# sort the "first" components are not guaranteed to be the largest ones.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigvalues_sort = [pair[0] for pair in eig_pairs]
eigvectors_sort = [pair[1] for pair in eig_pairs]

# Cumulative proportion of variance explained by the leading components.
var_comp_sum = np.cumsum(eigvalues_sort) / sum(eigvalues_sort)

# Scree-style plot: cumulative explained variance vs components kept.
num_comp = range(1, len(eigvalues_sort) + 1)
plt.title('Cum. Prop. Variance Explained and Components Kept')
plt.xlabel('Principal Components')
plt.ylabel('Cum. Prop. Variance Explained')
plt.scatter(num_comp, var_comp_sum)
plt.show()

# --- Project data onto 2D: keep the first two principal components ---
# P_reduce is an 8 x 2 matrix; the projected data is n x 2.
P_reduce = np.array(eigvectors_sort[0:2]).T
Proj_data_2D = np.dot(X_std, P_reduce)

# Scatter the projection, split by outcome class.
negative = plt.scatter(Proj_data_2D[:, 0][y == 0], Proj_data_2D[:, 1][y == 0])
positive = plt.scatter(Proj_data_2D[:, 0][y == 1], Proj_data_2D[:, 1][y == 1], color="red")
plt.title('PCA Dimensionality Reduction to 2D')
plt.ylabel('Principal Component 2')
plt.xlabel('Principal Component 1')
plt.legend([negative, positive], ["No Diabetes", "Have Diabetes"])
plt.show()

# --- Project data onto 3D: keep the first three principal components ---
# P_reduce is an 8 x 3 matrix; the projected data is n x 3.
P_reduce = np.array(eigvectors_sort[0:3]).T
Proj_data_3D = np.dot(X_std, P_reduce)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# BUG FIX: the original indexed `[:0,]` (empty slice) and `[:1,]` (first row)
# instead of `[:, 0]` / `[:, 1]`, and masked the positive series' x and y
# coordinates with `y == 0` instead of `y == 1`.
negative = ax.scatter(Proj_data_3D[:, 0][y == 0], Proj_data_3D[:, 1][y == 0],
                      Proj_data_3D[:, 2][y == 0], label="No Diabetes")
positive = ax.scatter(Proj_data_3D[:, 0][y == 1], Proj_data_3D[:, 1][y == 1],
                      Proj_data_3D[:, 2][y == 1], color="red", label="Have Diabetes")
ax.set_title('PCA Reduces Data to 3D')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.legend()
plt.show()