Analysis.py
import pandas as pd
from scipy.stats import shapiro, spearmanr, kendalltau, mannwhitneyu, chi2_contingency
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plot_style import set_plot_theme, font
from cb91visuals import *
# adjust this path accordingly
data_path = "Reports/Data/"
data = pd.read_csv(data_path+"heart_failure_clinical_records_dataset.csv", delimiter=',')
data_df = data
features = data_df.columns.copy()
labels_names = ['safe', 'dead']
labels = data_df["DEATH_EVENT"].astype(int).copy()
#Setting the default theme for the plots
set_plot_theme()
#change the colnames and value names for better clarity
data_df = data_df.set_axis(
    ["Age", "Anaemia", "CPK", "Diabetes", "EF", "hBP", "Platelets", "SC", "sNa", "Sex", "Smoking", "Time", "Death Event"],
    axis = 1
)
measures = ["years","1000 mcg/L","Percentage","1000 kiloplatelets/mL","mg/dL","mEq/L", "Days"]
# %% Correlation
#quick check for missing values
missing = data.isnull().values.any()
#there are no missing values: 'missing' evaluates to False
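#as a quick follow-up sketch, a per-column breakdown confirms the same thing
#(uses only the pandas built-ins already imported above)
print(data.isnull().sum())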
#remove binary data
bincols = data.columns[data.isin([0,1]).all()]
non_bincols = [col for col in data.columns if col not in bincols]
data_cont = data[non_bincols]
#data_cont.to_csv(data_path+"heart_failure_data__continuous-only.csv")
#data_blood = data_cont[["creatinine_phosphokinase", "serum_creatinine", "serum_sodium", "ejection_fraction", "platelets"]]
#data_other = data_cont[["age", "time"]]
#test for normality (Shapiro-Wilk)
shapiro_rows = []
for col in data.columns:
    stat, pval = shapiro(data[col])
    shapiro_rows.append([col, stat, pval])
shapiro_df = pd.DataFrame(
    shapiro_rows,
    columns = ["Parameter", "Test Statistic", "p-value"]
)
#it seems that none of the parameters are normally distributed
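#a small sketch to make that explicit: every parameter falls below the usual 0.05 cut-off
non_normal = shapiro_df.loc[shapiro_df["p-value"] < 0.05, "Parameter"].tolist()
print("non-normal parameters: %d of %d" % (len(non_normal), len(shapiro_df)))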
#test for normalized data
norm_data = (data - data.mean())/data.std()
shapiro_norm_rows = []
for col in norm_data.columns:
    stat, pval = shapiro(norm_data[col])
    shapiro_norm_rows.append([col, stat, pval])
shapiro_norm_df = pd.DataFrame(
    shapiro_norm_rows,
    columns = ["Parameter", "Test Statistic", "p-value"]
)
#same results, as expected: standardization is a linear transformation, so it does not affect normality
#test correlation
#since none of the parameters are normally distributed, Pearson correlation is not appropriate here
#rank-based alternatives for non-normally distributed data are Spearman's rho and Kendall's tau
corr_rows = []
for par1, par2 in itertools.combinations(data.columns, 2):
    rho_stat, rho_pval = spearmanr(data[par1], data[par2])
    tau_stat, tau_pval = kendalltau(data[par1], data[par2])
    corr_rows.append([par1, par2, rho_pval, rho_stat, tau_pval, tau_stat])
#note the column names: p-values and coefficients are stored in separate columns
corr_df = pd.DataFrame(
    corr_rows,
    columns = ["Par1", "Par2", "Spearman pVal", "Spearman's Rho", "Kendall pVal", "Kendall's Tau"]
)
corr_df = corr_df.sort_values("Spearman pVal")
#interestingly, serum creatinine and CPK are not really correlated
corr_df_sig = corr_df[corr_df["Spearman pVal"] <= 0.05]
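#aside: 78 variable pairs are tested at once, so a Bonferroni cut-off is one
#optional, stricter version of the same table (a sketch, not part of the replication)
corr_df_bonf = corr_df[corr_df["Spearman pVal"] <= 0.05 / len(corr_df)]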
#replicating results of Chicco and Jurman (2020) and selecting interesting correlations:
# - serum creatinine and sodium
# - age and serum creatinine
# - ejection fraction and serum creatinine
###look for correlations related to EF
#(check both columns, since itertools.combinations can put EF in either position)
ef_corr_df = corr_df[(corr_df["Par1"] == "ejection_fraction") | (corr_df["Par2"] == "ejection_fraction")]
#the interesting ones involve serum_sodium, smoking, platelets and high blood pressure
#test for further relationships between all variable pairs, with a focus on serum creatinine and EF
stat_rows = []
for par1, par2 in itertools.combinations(data.columns, 2):
    mwu_stat, mwu_pval = mannwhitneyu(data[par1], data[par2])
    conting_tab = pd.crosstab(data[par1], data[par2])
    chi2_stat, chi2_pval, chi2_dof, chi2_exp = chi2_contingency(conting_tab)
    stat_rows.append([par1, par2, mwu_pval, chi2_pval])
stat_df = pd.DataFrame(
    stat_rows,
    columns = ["Par1", "Par2", "M-W-U pVal", "chi-sq pVal"]
)
stat_df = stat_df.sort_values("chi-sq pVal")
#again, the results are reproduced (although the exact p-values differ)
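#as an aside (not part of the replication): the textbook use of Mann-Whitney U is to
#compare one continuous variable across the two outcome groups, e.g. serum creatinine
#in survivors vs. deceased patients
mwu_sc_stat, mwu_sc_pval = mannwhitneyu(
    data.loc[data["DEATH_EVENT"] == 0, "serum_creatinine"],
    data.loc[data["DEATH_EVENT"] == 1, "serum_creatinine"]
)
print("serum creatinine by outcome: U = %.1f, p = %.4g" % (mwu_sc_stat, mwu_sc_pval))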
#slice the dataframe to get (significant) relationships between serum creatinine or EF and the other attributes
#(again checking both Par1 and Par2)
int_stat_df = stat_df.loc[
    stat_df["Par1"].isin(["serum_creatinine", "ejection_fraction"])
    | stat_df["Par2"].isin(["serum_creatinine", "ejection_fraction"])
]
#Plot a heatmap to see the correlation among the features
#(Spearman, to stay consistent with the rank-based tests above)
heatmap = sns.heatmap(data_df.corr(method = "spearman"), vmin=-1, vmax=1, annot=True, cmap='Blues', fmt='.2f')
locs, labs = plt.yticks()
plt.setp(labs, fontsize = 15)
locs, labs = plt.xticks()
plt.setp(labs, rotation=45, ha = "right", fontsize = 15)
# Give the heatmap a title; 'pad' sets the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Coefficients', fontdict={'fontsize':23}, pad=12)
plt.show()
#plot correlation between sodium and creatinine only
fig = plt.figure(figsize = (15, 10))
ax = fig.add_subplot()
sns.regplot(data = data, x = "serum_sodium", y = "serum_creatinine", ax = ax)
ax.set(
    title = "Basic linear regression",
    xlabel = "Serum sodium (mEq/L)",
    ylabel = "Serum creatinine (mg/dL)"
)
plt.show()
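#for reference, a quick sketch of the rank correlation behind this scatter
rho_sc, p_sc = spearmanr(data["serum_sodium"], data["serum_creatinine"])
print("sodium vs. creatinine: rho = %.3f, p = %.4g" % (rho_sc, p_sc))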
#%% Data Normalization
#standardize all columns so the PCA below works on z-scores;
#'columns_to_standardize' lists the genuinely continuous attributes for reference
columns_to_standardize = [
    "Age", "CPK", "EF", "Platelets", "SC", "sNa", "Time"
]
data_df_norm = data_df.copy()
for column in data_df.columns:
    data_df_norm[column] = (data_df[column] - data_df[column].mean()) / data_df[column].std()
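#sketch of a sanity check: every standardized column should have ~0 mean and unit std
assert np.allclose(data_df_norm.mean(), 0, atol = 1e-10)
assert np.allclose(data_df_norm.std(), 1)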
#convert to array
X = data_df_norm.to_numpy()
#%% 3.2 Principal Component Analysis
# Run PCA via the SVD of the standardized data matrix
U, S, VT = np.linalg.svd(X, full_matrices=False)
#fraction of variance explained by each principal component
rho = (S*S) / (S*S).sum()
threshold = 0.95
fig = plt.figure(figsize = (20, 10))
ax = fig.add_subplot()
ax.plot(range(1,len(rho)+1),rho,'x-', markersize = 20, color=CB91_Red)
ax.plot(range(1,len(rho)+1),np.cumsum(rho),'o-', markersize = 20, color=CB91_Green)
ax.plot([1,len(rho)],[threshold, threshold],'k--')
ax.set(
    title = 'Variance explained by Principal Components',
    xlabel = 'Number of Principal Components',
    ylabel = 'Variance explained'
)
ax.legend(['Individual', 'Cumulative', '95% Threshold'])
plt.show()
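#sketch: read the threshold off numerically rather than from the plot
n_pc = int(np.argmax(np.cumsum(rho) >= threshold)) + 1
print("components needed for %.0f%% of the variance: %d" % (threshold * 100, n_pc))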
#project the original dataset on the new subspace
X_tilde = X @ VT.T
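#sketch: VT is square and orthonormal here, so projecting back recovers X exactly
assert np.allclose(X, X_tilde @ VT)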
# select the values projected on the first, second, and third principal components
pc1 = X_tilde[:, 0]
pc2 = X_tilde[:, 1]
pc3 = X_tilde[:, 2]
#%% 2-D plot
fig = plt.figure(figsize = (11, 7))
ax = fig.add_subplot()
ax.scatter(pc1[labels==0], pc2[labels==0], label = "safe", marker = 'o', color = CB91_Green, edgecolors = 'black', linewidths = 1)
ax.scatter(pc1[labels==1], pc2[labels==1], label = "dead", marker = 'o', color = CB91_Red, edgecolors = 'black', linewidths = 1)
ax.set(
    title = 'Projection on the First Two Principal Components',
    xlabel = 'PC 1',
    ylabel = 'PC 2'
)
ax.legend(loc = 1)
plt.show()
#%% Bar plot of the weights of the original features in the new feature space
fig, ax = plt.subplots(1, 2, figsize = (16, 8))
for i in range(2):
    ax[i].bar(data_df_norm.columns, VT.T[:, i])
    ax[i].set_title(f"Principal Component {i+1}", fontsize = 24)
    ax[i].axhline(0)
    ax[i].set_ylim(-0.6, 0.6)
    ax[i].set_xticklabels(data_df_norm.columns, rotation = 45, ha = "right", fontsize = 20)
fig_title = fig.suptitle(
    "Weights of the First Two Principal Components", fontsize = 32
)
fig_title.set_position([.5, 1.02])
plt.show()
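#sketch: the same information as the left bar plot, as a ranked list
pc1_weights = pd.Series(VT[0], index = data_df_norm.columns)
print(pc1_weights.abs().sort_values(ascending = False).head())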
# %% Attempt to make a classification based on the first principal component values
plt.figure()
labels_predicted = np.empty(len(pc1), dtype = int)
labels_predicted[pc1 >= -0.7] = 0
labels_predicted[pc1 < -0.7] = 1
true_0 = np.sum(np.logical_and(data['DEATH_EVENT'] == 0, labels_predicted == 0))
false_0 = np.sum(np.logical_and(data['DEATH_EVENT'] == 1, labels_predicted == 0))
true_1 = np.sum(np.logical_and(data['DEATH_EVENT'] == 1, labels_predicted == 1))
false_1 = np.sum(np.logical_and(data['DEATH_EVENT'] == 0, labels_predicted == 1))
accuracy = (true_0 + true_1)/(true_0+true_1+false_0+false_1)
recall = true_1/(true_1+false_0)
precision = true_1/(true_1+false_1)
print('true survived: %d' % true_0)
print('false survived: %d' % false_0)
print('true dead: %d' % true_1)
print('false dead: %d' % false_1)
print('accuracy = %1.2f %%' % (accuracy * 100))
print('precision = %1.2f %%' % (precision * 100))
print('recall = %1.2f %%' % (recall * 100))
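#sketch: F1 summarizes the precision/recall trade-off printed above
f1 = 2 * precision * recall / (precision + recall)
print('F1 = %1.2f %%' % (f1 * 100))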
cf_matrix = np.array([[true_0, false_1], [false_0, true_1]])
values = cf_matrix / np.sum(cf_matrix)
group_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in values.flatten()]
#use a separate name so the 'labels' Series from the PCA plots is not shadowed
annot_labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
annot_labels = np.asarray(annot_labels).reshape(2, 2)
ax = sns.heatmap(cf_matrix, annot=annot_labels, fmt='', cmap='Blues')
ax.set_title('Confusion Matrix', size=28)
ax.set_xlabel('\nPredicted Values', size=18)
ax.set_ylabel('Actual Values', size=18)
## Tick labels, in the same 0/1 order as the matrix rows and columns (0 = safe, 1 = dead)
ax.xaxis.set_ticklabels(labels_names, size=16)
ax.yaxis.set_ticklabels(labels_names, size=16)
## Display the visualization of the Confusion Matrix.
plt.show()