-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathk-medoids-clustering.R
192 lines (161 loc) · 5.29 KB
/
k-medoids-clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# k-medoids-clustering.R
# Contains k-medoids-clustering algorithm and helper functions
#
# Purpose of Partitioning Around Medoids is to clusterize input
# dataset into clusters. Each cluster is represented by its medoid.
# Algorithm is searching best combination of clustering in order
# to minimize cost function, which is simply a sum of distances
# between elements inside clusters and medoids.
#
# Usage:
# data <- replicate(2, rnorm(20))
# nclusters <- 3
# metric <- "euclidean"
# kMedoidsClustering(data, nclusters, metric)
#
# Author: akowalew
# Calculates cost for particular cluster, as a sum of distances
# between its elements and medoid.
calcCost <- function(data, cluster, medoid, metric) {
if(length(cluster) == 0)
return(NA)
v <- data[medoid, ]
distances <- sapply(cluster, function(elem) {
x <- data[elem, ]
distance <- dist(rbind(x, v), method=metric)
})
sum(distances)
}
# Calculates cost for every cluster.
calcCosts <- function(data, clusters, medoids, metric) {
costs <- sapply(1:length(clusters), function(k) {
cluster <- clusters[[k]]
medoid <- medoids[k]
cost <- calcCost(data, cluster, medoid, metric)
})
}
# Calculates distances between given point 'x' and medoids.
calcDistances <- function(data, x, medoids, metric) {
sapply(medoids, function(medoid) {
v <- data[medoid, ]
dist(rbind(x, v), method=metric)
})
}
# Returns number of medoid, to which given element has shortest distance
getNearestMedoidNumber <- function(data, elem, medoids, metric) {
x <- data[elem, ]
distances <- calcDistances(data, x, medoids, metric)
k <- which.min(distances)
}
# Creates list, where each node contains vector of data elements
# which are assigned to particular clusters
makeClusters <- function(data, elements, medoids, metric) {
clusters <- vector("list", length(medoids))
for(elem in elements) {
k <- getNearestMedoidNumber(data, elem, medoids, metric)
clusters[[k]] <- c(clusters[[k]], elem)
}
return(clusters)
}
# Checks isolation of given cluster.
# Cluster is isolated, when after replacing medoid with its center
# we are getting lower cost value.
isClusterIsolated <- function(data, cluster, medoid, metric) {
if(length(cluster) == 0)
return(TRUE)
currentCost <- calcCost(data, cluster, medoid, metric)
# get mean point of whole group
group <- data[c(cluster, medoid), ]
xmean <- colMeans(group)
# replace medoid with mean and compare its cost to current one
newCost <- sum(sapply(cluster, function(elem) {
x <- data[elem, ]
distance <- dist(rbind(x, xmean), method=metric)
}))
isolated <- (newCost < currentCost)
}
# Returns vector representing an isolation of all clusters
calcIsolations <- function(data, clusters, medoids, metric) {
isolations <- sapply(1:length(clusters), function(k) {
cluster <- clusters[[k]]
medoid <- medoids[k]
isolated <- isClusterIsolated(data, cluster, medoid, metric)
})
}
# Generates vector: membership of data with particular clusters
makeClustering <- function(data, clusters, medoids) {
clustering <- vector(mode="integer", nrow(data))
for(i in 1:length(clusters)) {
cluster <- clusters[[i]]
# if there is any element inside a cluster, assign it to data
if(length(cluster) > 0)
clustering[cluster] <- rep(i, length(cluster))
# medoid is always assigned to cluster
clustering[medoids[i]] <- i
}
return(clustering)
}
# Implements PAM algorithm. It takes `data` matrix and is clustering them
# into `nclusters` partitions. All costs and distances are evaluated using
# specified `metric`.
# To improve speed, clustering results are represented by indices to exact
# rows inside `data` matrix.
kMedoidsClustering <- function(data, nclusters=5, metric="euclidean") {
stopifnot(is.matrix(data))
stopifnot(nclusters %% nclusters == 0) # nclusters must be int
stopifnot(is.character(metric))
stopifnot(nrow(data) > nclusters)
# get initial random sample medoids
medoids <- sample(1:nrow(data), nclusters)
elements <- (1:nrow(data))[-medoids]
repeat {
clusters <- makeClusters(data, elements, medoids, metric)
# find best medoid inside each cluster
hasChanged <- FALSE
for(k in 1:nclusters) {
medoid <- medoids[k]
cluster <- clusters[[k]]
# if cluster is empty, skip iteration
if(length(cluster) == 0)
next
bestCost <- calcCost(data, cluster, medoid, metric)
bestIdx <- -1
# calculate costs for each replacement of cluster element and medoid
for(i in 1:length(cluster)) {
# swap
newMedoid <- cluster[i]
cluster[i] <- medoid
medoid <- newMedoid
# test, whether new combination gives lower cost
cost <- calcCost(data, cluster, medoid, metric)
if(cost < bestCost) {
bestCost <- cost
bestIdx <- i
}
# un-swap
oldMedoid <- cluster[i]
cluster[i] <- medoid
medoid <- oldMedoid
}
# if better option was found, replace permamently
if(bestIdx != -1) {
hasChanged <- TRUE
newMedoid <- cluster[bestIdx]
clusters[[k]][bestIdx] <- medoid
medoids[[k]] <- newMedoid
}
}
if(!hasChanged)
break
elements <- unlist(clusters)
}
clustering <- makeClustering(data, clusters, medoids)
costs <- calcCosts(data, clusters, medoids, metric)
isolations <- calcIsolations(data, clusters, medoids, metric)
return(list(
"medoids"=medoids,
"clusters"=clusters,
"clustering"=clustering,
"costs"=costs,
"isolations"=isolations))
}