-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathk_modes.py
executable file
·141 lines (112 loc) · 4.32 KB
/
k_modes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python
"""
Find the k most frequent elements i) in an array and ii) in a stream.
"""
from collections import defaultdict
def k_modes_array(arr, k):
"""
Implementation with an array to keep track of most frequent elements.
>>> k_modes_array([3, 2, 9, 4, 5, 1, 2, 3, 5, 7, 9, 8, 9, 0, 9, 8, 7, 9, 5, 6, 2, 3, 4, 6, 5, 4, 5, 6, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0], 3)
[0, 5, 9]
"""
k_modes = list()
cur_min = None
counter = defaultdict(int)
for i in range(len(arr)):
x = arr[i]
counter[x] += 1
# 1. k_modes array not full yet, add current value to k_modes
if len(k_modes) < k and x not in k_modes:
k_modes.append(x)
# 2. k_modes array full and current value count > current min,
# remove min from k_modes and add current value to k_modes
elif counter[x] > cur_min and x not in k_modes:
to_del = filter(lambda y: counter[y] == cur_min, k_modes)[0]
k_modes.remove(to_del)
k_modes.append(x)
# 3. Recompute current min
idx = reduce(lambda x, y: x if counter[x] <= counter[y] else y, k_modes)
cur_min = counter[idx]
return sorted(k_modes)
def heappush(heap, x):
cur = len(heap)
heap.append(x)
parent = cur//2
while parent > 0 and heap[cur] < heap[parent]:
heap[parent], heap[cur] = heap[cur], heap[parent]
parent, cur = parent//2, parent
def heappop(heap):
# Pop root and replace with last element.
root = heap[1]
heap[1] = heap[len(heap)-1]
heap.pop()
# Heapify.
def _min_child(heap, child1, child2):
if child2 > len(heap)-1:
if child1 > len(heap)-1:
return
else:
return child1
if heap[child1] <= heap[child2]:
return child1
else:
return child2
cur = 1
children = 2*cur, 2*cur+1
min_child = _min_child(heap, *children)
while min_child and heap[cur] > heap[min_child]:
heap[cur], heap[min_child] = heap[min_child], heap[cur]
cur, children = min_child, (2*min_child, 2*min_child+1)
min_child = _min_child(heap, *children)
return root
def k_modes_array2(arr, k):
"""
Alternate implementation with a min-heap to keep track of most frequent elements.
The min-heap keeps track of the min more efficiently than the array approach.
>>> k_modes_array([3, 2, 9, 4, 5, 1, 2, 3, 5, 7, 9, 8, 9, 0, 9, 8, 7, 9, 5, 6, 2, 3, 4, 6, 5, 4, 5, 6, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0], 3)
[0, 5, 9]
"""
k_modes = list() # min-heap
k_modes.append(0) # 1-indexed
counter = defaultdict(int)
for i in range(len(arr)):
x = arr[i]
counter[x] += 1
# 1. k_modes min-heap not full yet, add current value to k_modes
if len(k_modes)-1 < k and x not in [y[1] for y in k_modes[1:]]:
heappush(k_modes, (counter[x], x))
# 2. k_modes min-heap full and current value count > current min,
# remove min from k_modes and add current value to k_modes
elif counter[x] > k_modes[1] and x not in [y[1] for y in k_modes[1:]]:
heappop(k_modes)
heappush(k_modes, (counter[x], x))
return sorted(k_modes[1:])
arr = [9, 5, 3, 5, 9, 8, 9, 0, 9, 8, 9, 5, 6, 2, 3, 5, 4, 5, 6, 0, 1, 0, 0, 0, 0, 0, 0]
stream = (x for x in arr)
def k_modes_stream(stream, k):
"""
The above solutions for a fixed input array can be generalized for a stream.
"""
k_modes = list()
cur_min = None
counter = defaultdict(int)
x = stream.next()
while x:
counter[x] += 1
# 1. k_modes array not full yet, add current value to k_modes
if len(k_modes) < k and x not in k_modes:
k_modes.append(x)
# 2. k_modes array full and current value count > current min,
# remove min from k_modes and add current value to k_modes
elif counter[x] > cur_min and x not in k_modes:
to_del = filter(lambda y: counter[y] == cur_min, k_modes)[0]
k_modes.remove(to_del)
k_modes.append(x)
# 3. Recompute current min
idx = reduce(lambda x, y: x if counter[x] <= counter[y] else y, k_modes)
cur_min = counter[idx]
x = stream.next()
return sorted(k_modes)
if __name__ == "__main__":
import doctest
doctest.testmod()