-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathkmeans.py
62 lines (48 loc) · 1.99 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import matplotlib.pylab as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
# Synthetic example: 1000 (X, Y) points drawn uniformly from [0, 1)
X = np.random.uniform(low=0, high=1, size=1000)
Y = np.random.uniform(low=0, high=1, size=1000)
df_xy = pd.DataFrame({"X": X, "Y": Y})

# Scatter of the raw points, before any clustering
df_xy.plot(kind="scatter", x="X", y="Y")

# Partition the points into 5 clusters
model1 = KMeans(n_clusters=5)
model1.fit(df_xy)
model1.labels_  # cluster label of every point (echoed in interactive sessions)
model1.cluster_centers_  # centroid coordinates (echoed in interactive sessions)

# Same scatter, coloured by the cluster each point was assigned to
df_xy.plot(
    kind="scatter",
    x="X",
    y="Y",
    c=model1.labels_,
    s=10,
    cmap=plt.cm.coolwarm,
)
# K-means on the Universities data set.
# NOTE(review): the path is an absolute Windows location on drive E:, so the
# script only runs on a machine where this exact file exists.
Univ = pd.read_csv(
    "E:\\Bokey\\Excelr Data\\Python Codes\\all_py\\Clustering\\Universities.csv"
)
# Normalization function
def norm_func(i):
    """Rescale *i* to the [0, 1] range using min-max normalisation.

    Args:
        i: Object exposing ``min()`` / ``max()`` and element-wise arithmetic,
            e.g. a pandas DataFrame or Series. For a DataFrame the minimum
            and maximum are taken per column, so each column is scaled
            independently.

    Returns:
        ``(i - min) / (max - min)`` with the same shape as *i*. Where the
        range is zero (all values identical) the result is 0 rather than
        NaN.
    """
    lowest = i.min()
    span = i.max() - lowest
    # A constant column has max == min; dividing by that zero range would
    # give 0/0 = NaN, which KMeans cannot fit. Dividing by 1 instead maps
    # every value of such a column to 0.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1
    return (i - lowest) / span
# Min-max scale the numerical part of the data (every column after the first)
df_norm = Univ.iloc[:, 1:].pipe(norm_func)
df_norm.head(10)  # first 10 normalised rows
###### scree plot or elbow curve ############
# Candidate numbers of clusters to evaluate
k = list(range(2, 15))
k
TWSS = []  # total within-cluster sum of squares, one entry per value in k
for n_clusters in k:
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(df_norm)
    # inertia_ is the sum of *squared* distances of every sample to its
    # closest centroid, i.e. the total within sum of squares. Summing
    # cdist(..., "euclidean") per cluster would add up unsquared distances
    # and so would not match the "total_within_SS" label on the plot.
    TWSS.append(kmeans.inertia_)
# Scree plot
plt.plot(k, TWSS, 'ro-')
plt.xlabel("No_of_Clusters")
plt.ylabel("total_within_SS")
plt.xticks(k)
# Fit the final model with 5 clusters, the number chosen from the scree plot
model = KMeans(n_clusters=5)
model.fit(df_norm)
model.labels_  # cluster label assigned to each row
md = pd.Series(model.labels_)  # numpy label array as a pandas Series
Univ['clust'] = md  # attach the labels as a new column (position 7)
df_norm.head()
# Move 'clust' to the front; the original columns 0-6 keep their order
Univ = Univ.iloc[:, [7, 0, 1, 2, 3, 4, 5, 6]]
# Per-cluster mean of the numeric features. After the reorder, column 0 is
# 'clust' and column 1 is the original first column, which was excluded from
# normalisation as non-numeric. The numeric features therefore start at
# column 2; slicing 1:7 would include that non-numeric column (mean() raises
# TypeError on it in pandas >= 2.0) and drop the last feature.
Univ.iloc[:, 2:].groupby(Univ.clust).mean()
# NOTE(review): "Univsersity.csv" looks like a misspelling; the name is kept
# unchanged because other tooling may already read this file - confirm.
Univ.to_csv("Univsersity.csv")