forked from bwilder0/clusternet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_data.py
executable file
·132 lines (116 loc) · 4.45 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from shutil import copyfile
import os
import sys
# Names of the point layouts this script can generate. The first command-line
# argument is validated against this list, and the usage text enumerates it.
cluster_shapes = [
    'circle',
    'halfmoon',
    'filled_circle',
    'line',
]
def usage():
    """Print the command-line usage message and terminate the process.

    The message names this script (basename of ``__file__``) and lists the
    accepted values of ``<cluster_shape>`` separated by ``|``.

    Raises:
        SystemExit: always, with exit status 1.
    """
    program_name = os.path.basename(__file__)
    print('''
Usage: python %s <cluster_shape>
cluster_shape: %s
''' % (program_name, "|".join(cluster_shapes)))
    # Use sys.exit rather than the bare exit() builtin: exit() is injected by
    # the site module for interactive sessions and is not guaranteed to exist
    # (e.g. when running with `python -S` or in an embedded interpreter).
    sys.exit(1)
def _sample_edges(class_vertices, num_edges, prob_between):
    """Draw ``num_edges`` distinct undirected edges with no self-loops.

    Args:
        class_vertices: mapping from class id (0 or 1) to an array of the
            vertex ids belonging to that class.
        num_edges: number of edges to generate.
        prob_between: probability that an edge connects the two classes
            instead of staying inside one class.

    Returns:
        A tuple ``(edges, count_between_classes)`` where ``edges`` is a list
        of ``(source, dest)`` vertex-id pairs in generation order and
        ``count_between_classes`` is how many edges were drawn as
        between-class edges.
    """
    edges = []
    # A set gives O(1) duplicate checks; scanning the growing edge list on
    # every draw made the generation quadratic in the number of edges.
    seen = set()
    count_between_classes = 0
    for _ in range(num_edges):
        source_class = np.random.binomial(1, 0.5)
        if np.random.rand() <= prob_between:
            dest_class = 1 - source_class
            count_between_classes += 1
        else:
            dest_class = source_class
        # Redraw both endpoints until the pair is new in either orientation
        # and is not a self-loop.
        while True:
            edge = (np.random.choice(class_vertices[source_class]),
                    np.random.choice(class_vertices[dest_class]))
            if (edge[0] != edge[1]
                    and edge not in seen
                    and (edge[1], edge[0]) not in seen):
                break
        seen.add(edge)
        edges.append(edge)
    return edges, count_between_classes


def _sample_point(cluster_shape, c, radius1, radius2, noise):
    """Sample one noisy 2-D point for a vertex of class ``c``.

    Args:
        cluster_shape: one of 'circle', 'halfmoon', 'filled_circle', 'line'.
        c: class label of the vertex (0 or 1).
        radius1: radius used for class 0 (and for both classes in the
            'halfmoon' and 'filled_circle' layouts).
        radius2: radius used for class 1 in the 'circle' and 'line' layouts.
        noise: half-width of the uniform jitter added to each coordinate.

    Returns:
        The ``(x, y)`` coordinates of the sampled point.
    """
    dx = 0.
    dy = 0.
    side = c * 2 - 1  # -1 for class 0, +1 for class 1
    if cluster_shape == 'halfmoon':
        r = radius1
        # Shift the two half circles in opposite directions.
        dx = - r / 8 * side
        dy = r / 2 * side
    elif cluster_shape == 'filled_circle':
        # A random radius per point fills the disc instead of tracing a ring;
        # the two classes are separated horizontally by 4 * radius1.
        r = np.random.rand() * radius1
        dx = side * 2 * radius1
    else:
        r = radius1 if c == 0 else radius2

    # Uniform jitter in [-noise, noise) on each axis.
    nx0 = noise * (np.random.rand() - 0.5) * 2
    ny0 = noise * (np.random.rand() - 0.5) * 2

    if cluster_shape in ('circle', 'filled_circle'):
        # x uniform in [-r, r); y on the circle of radius r with random sign.
        x0 = (np.random.rand() - 0.5) * 2 * r
        y0 = ((r ** 2) - (x0 ** 2)) ** 0.5 * np.random.choice([-1, 1])
    elif cluster_shape == 'halfmoon':
        # x restricted to one side of the axis, so each class is a half circle.
        x0 = np.random.rand() * r * side
        y0 = ((r ** 2) - (x0 ** 2)) ** 0.5 * np.random.choice([-1, 1])
    else:
        # 'line': a vertical segment at x = r with y uniform in [0, 2r).
        y0 = np.random.rand() * 2 * r
        x0 = r
    return x0 + nx0 + dx, y0 + ny0 + dy


def _write_edges(filename, edges):
    """Write one ``"<source> <dest>"`` line per edge to ``filename``."""
    with open(filename, 'w') as cites:
        for e in edges:
            cites.write("%d %d\n" % e)


def main(args):
    """Generate a synthetic two-class graph dataset for a cluster shape.

    Randomly labels the vertices, samples edges within and between the two
    classes, assigns every vertex a noisy 2-D position according to the
    requested shape, and writes the result under ``data/<cluster_shape>/``:

    * ``<shape>.cites`` -- all edges, one ``"src dst"`` pair per line;
    * ``<shape>.content`` -- one ``"vertex x y class"`` line per vertex;
    * ``<shape>_{train,test,valid}_<train_pct>.content`` -- copies of the
      content file;
    * ``<shape>_{train,test,valid}_<train_pct>.cites`` -- consecutive slices
      of the edge list.

    A scatter plot of the points is shown before the split files are written,
    and summary statistics are printed at the end.

    Args:
        args: command-line arguments without the program name; ``args[0]``
            must be one of ``cluster_shapes``.

    Raises:
        SystemExit: via ``usage()`` when the argument is missing or unknown.

    Note:
        The process working directory is changed to the output directory and
        is not restored.
    """
    if not args or args[0] not in cluster_shapes:
        usage()
    cluster_shape = args[0]

    num_classes = 2  # the code supports only 2 classes
    radius1 = 20
    radius2 = 4
    noise = 1
    vertex_class_prob = 0.5
    V = 2000
    E = 10000
    prob_edge_between_classes = 0.3
    train_pct = 0.4
    test_pct = 0.5

    labels = np.array([np.random.binomial(num_classes - 1, vertex_class_prob) for _ in range(V)])
    # Vertex ids grouped by class label.
    class_vertices_dict = {c: np.flatnonzero(labels == c) for c in range(num_classes)}

    edges, count_between_classes = _sample_edges(
        class_vertices_dict, E, prob_edge_between_classes)

    directory = "data/%s" % cluster_shape
    Path(directory).mkdir(parents=True, exist_ok=True)
    # All files below are written relative to the dataset directory.
    os.chdir(directory)

    _write_edges("%s.cites" % cluster_shape, edges)

    content_name = "%s.content" % cluster_shape
    all_x = []
    all_y = []
    with open(content_name, 'w') as content:
        for v, c in enumerate(labels):
            px0, py0 = _sample_point(cluster_shape, c, radius1, radius2, noise)
            all_x.append(px0)
            all_y.append(py0)
            content.write("%d %f %f %d\n" % (v, px0, py0, c))

    # Colour each point by its class label.
    plt.scatter(np.array(all_x), np.array(all_y), c=labels, s=5)
    plt.show()

    # Consecutive slices of the edge list: train, then test, and whatever
    # remains becomes the validation split.
    train_pivot = int(E * train_pct)
    test_pivot = int(E * test_pct + train_pivot)
    splits = {
        'train': edges[:train_pivot],
        'test': edges[train_pivot:test_pivot],
        'valid': edges[test_pivot:],
    }
    for split_name, split_edges in splits.items():
        # Every split shares the full vertex content; note that all file
        # names carry train_pct as their numeric suffix.
        copyfile(content_name, "%s_%s_%.2f.content" % (cluster_shape, split_name, train_pct))
        _write_edges("%s_%s_%.2f.cites" % (cluster_shape, split_name, train_pct), split_edges)

    print("%s vertices in class 0" % len(class_vertices_dict[0]))
    print("%s vertices in class 1" % len(class_vertices_dict[1]))
    print("%s edges between classes" % count_between_classes)
    print("The generated dataset name is %s" % cluster_shape)
# Script entry point: hand the command-line arguments, without the program
# name, to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)