Skip to content

Commit

Permalink
CONT-334: Think time merging fails for new clusters.
Browse files: browse the repository at this point in the history
  • Loading branch information
Henning-Schulz committed Mar 20, 2020
1 parent ecec8bd commit a45794e
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
16 changes: 13 additions & 3 deletions clustinator/minimum_distance_appender.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,7 +58,7 @@ def _find_cluster(self, reduced_matrix, indices_unassigned, max_radius):

return new_indices

def _cluster_remainder(self, reduced_matrix, indices_unassigned, labels):
def _cluster_remainder(self, reduced_matrix, indices_unassigned, labels, prev_labels):
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Checking whether to group the remaining sessions into a new cluster...')

max_radius = max([r for (mid, r) in self.prev_radiuses.items() if mid != '-1']) * self.radius_factor
Expand All @@ -71,7 +71,7 @@ def _cluster_remainder(self, reduced_matrix, indices_unassigned, labels):
largest_cluster = new_cluster

if len(largest_cluster) >= self.min_samples:
new_label = str(max([ int(s) if s.isdigit() else 0 for s in labels ]) + 1)
new_label = str(max([ int(s) if s.isdigit() else 0 for s in prev_labels ]) + 1)
labels[largest_cluster] = new_label

print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Found', len(largest_cluster), 'sessions fitting into one cluster. Assigning them to label', new_label)
Expand Down Expand Up @@ -125,7 +125,7 @@ def _assign_sessions(self, csr_matrix):
else:
indices_unassigned.append(i)

self._cluster_remainder(reduced_matrix, indices_unassigned, labels)
self._cluster_remainder(reduced_matrix, indices_unassigned, labels, unique_labels)

unique, counts = np.unique(labels, return_counts = True)

Expand All @@ -144,11 +144,21 @@ def append(self, csr_matrix):
new_cluster_means = self._calculate_cluster_means(csr_matrix, self.labels)
self.cluster_means = { mid: self._recalculate_mean(mid, new_mean, num_sessions[mid]) for mid, new_mean in new_cluster_means.items() }

for (old_mid, old_mean) in self.prev_markov_chains.items():
if old_mid not in self.cluster_means:
self.cluster_means[old_mid] = old_mean
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Added old cluster mean of group', old_mid, 'as no new session belongs to it.')

print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Mean calculation done.')

print("Calculating the cluster radiuses...")
new_cluster_radiuses = self._calculate_cluster_radiuses(csr_matrix, self.labels, self.cluster_means)

for (old_mid, old_radius) in self.prev_radiuses.items():
if old_mid not in new_cluster_radiuses:
new_cluster_radiuses[old_mid] = old_radius
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Added old cluster radius of group', old_mid, 'as no new session belongs to it.')

self.cluster_radiuses = { mid: max(new_radius, self.prev_radiuses.get(mid, 0)) for mid, new_radius in new_cluster_radiuses.items() }
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Radius calculation and appending done. Found the following radiuses:', self.cluster_radiuses)

14 changes: 12 additions & 2 deletions clustinator/thinktime_matrix.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,10 +105,15 @@ def mean_1d_dict(self, prev_behavior_model=None, new_num_sessions=None):
new_mean = [ self._thinktime_mean(tt) for tt in thinktimes ]

if prev_behavior_model:
result_dict[beh_id] = self._recalculate_mean(new_mean, new_num_sessions[beh_id], prev_means[beh_id], prev_num_sessions[beh_id])
result_dict[beh_id] = self._recalculate_mean(new_mean, new_num_sessions[beh_id], prev_means.get(beh_id, new_mean), prev_num_sessions.get(beh_id, 0))
else:
result_dict[beh_id] = new_mean

for (old_mid, old_mean) in prev_means.items():
if old_mid not in result_dict:
result_dict[old_mid] = old_mean
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Added old think time mean of group', old_mid, 'as no new session belongs to it.')

return result_dict

def variance_1d_dict(self, prev_behavior_model=None, new_num_sessions=None):
Expand All @@ -128,9 +133,14 @@ def variance_1d_dict(self, prev_behavior_model=None, new_num_sessions=None):
new_variance = [ self._thinktime_variance(tt) for tt in thinktimes ]

if prev_behavior_model:
result_dict[beh_id] = self._recalculate_variance(new_variance, new_num_sessions[beh_id], prev_variances[beh_id], prev_num_sessions[beh_id])
result_dict[beh_id] = self._recalculate_variance(new_variance, new_num_sessions[beh_id], prev_variances.get(beh_id, new_variance), prev_num_sessions.get(beh_id, 0))
else:
result_dict[beh_id] = new_variance

for (old_mid, old_variance) in prev_variances.items():
if old_mid not in result_dict:
result_dict[old_mid] = old_variance
print(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), 'Added old think time variance of group', old_mid, 'as no new session belongs to it.')

return result_dict

0 comments on commit a45794e

Please sign in to comment.