cugraph_sampler.py
"""
cugraph sampling test on Benchmark Datasets
"""
import numpy as np
import pandas as pd
# Import the modules
import cugraph
import cudf
import dgl
# system and other
import gc
import os
import time
import random
import cupy
# MTX file reader
from scipy.io import mmread
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
# dlpack can only used for pytorch > 1.10 and cupy > 10
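
# A minimal sketch of the zero-copy handoff the dlpack imports above enable
# (not executed here; `t` and `a` are illustrative names only):
#   t = from_dlpack(cupy_array.toDlpack())   # CuPy array -> torch tensor
#   a = cupy.fromDlpack(to_dlpack(t))        # torch tensor -> CuPy array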


def read_and_create(datafile):
    """Read an MTX file and build an unrenumbered cugraph.Graph from it."""
    M = mmread(datafile).asfptype()
    _gdf = cudf.DataFrame()
    _gdf['src'] = M.row
    _gdf['dst'] = M.col
    _gdf['wt'] = 1.0  # uniform edge weight
    _g = cugraph.Graph()
    _g.from_cudf_edgelist(_gdf, source='src', destination='dst',
                          edge_attr='wt', renumber=False)
    # print("\t{:,} nodes, {:,} edges".format(_g.number_of_nodes(),
    #                                         _g.number_of_edges()))
    return _g
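
# Example usage (path mirrors the __main__ driver below):
#   g = read_and_create('./data/preferentialAttachment.mtx')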


def cugraphSampler(g, nodes, fanouts, edge_dir='in', prob=None, replace=False,
                   copy_ndata=True, copy_edata=True, _dist_training=False,
                   exclude_edges=None):
    """Fanout-limited neighborhood sampling via batched ego graphs.

    The keyword arguments mirror DGL's sampler signature; only `g`, `nodes`,
    and `fanouts` are used so far.
    """
    num_nodes = len(nodes)
    current_seeds = nodes.reindex(index=np.arange(0, num_nodes))
    blocks = []
    for fanout in fanouts:
        # one radius-1 ego net per seed; seeds_offsets delimits each seed's
        # slice of the returned edge list
        ego_edge_list, seeds_offsets = cugraph.community.egonet.batched_ego_graphs(
            g, current_seeds, radius=1)
        print("fanout", fanout)
        all_parents = cupy.ndarray(0)
        all_children = cupy.ndarray(0)
        # filter each seed's ego net down to at most `fanout` in-edges
        for i in range(1, len(seeds_offsets)):
            pos0 = seeds_offsets[i - 1]
            pos1 = seeds_offsets[i]
            edge_list = ego_edge_list[pos0:pos1]
            # keep only edges pointing at this seed
            filtered_list = edge_list[edge_list['dst'] == current_seeds[i - 1]]
            # randomly downsample to the requested fanout
            if len(filtered_list) > fanout:
                sampled_indices = random.sample(
                    filtered_list.index.to_arrow().to_pylist(), fanout)
                filtered_list = filtered_list.reindex(index=sampled_indices)
            # append the src/dst columns to the accumulated cupy arrays
            children = cupy.asarray(filtered_list['src'])
            parents = cupy.asarray(filtered_list['dst'])
            all_parents = cupy.append(all_parents, parents)
            all_children = cupy.append(all_children, children)
        # build a dgl.graph from the sampled edges (dgl may require torch
        # tensors here; the dlpack imports above can convert if needed)
        sampled_graph = dgl.graph((all_children, all_parents))
        # TODO: convert to a message-flow block; kept from the original draft:
        # eid = sampled_graph.edata[dgl.EID]
        # block = dgl.to_block(sampled_graph, current_seeds.to_array())
        # block.edata[dgl.EID] = eid
        # current_seeds = block.srcdata[dgl.NID]
        # current_seeds = cudf.Series(current_seeds.cpu().detach().numpy())
        # blocks.insert(0, block)
    # only the graph from the last fanout is returned until blocks are wired up
    return sampled_graph
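
# Example call, matching the driver below (`seed_series` is an illustrative
# name for a cudf.Series of seed node IDs):
#   sg = cugraphSampler(G_cu, seed_series, [5, 10])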


if __name__ == '__main__':
    data = ['preferentialAttachment']
    # other candidates: 'as-Skitter', 'citationCiteseer', 'caidaRouterLevel',
    # 'coAuthorsDBLP', 'coPapersDBLP'
    for file_name in data:
        G_cu = read_and_create('./data/' + file_name + '.mtx')
        nodes = G_cu.nodes()
        num_nodes = G_cu.number_of_nodes()
        # num_seeds_ = [1000, 3000, 5000, 10000]
        # just test one epoch
        batch_size = 1000
        num_batch = num_nodes // batch_size
        print(num_batch)
        # shuffle the nodes at the start of each epoch
        shuffled_nodes = np.arange(num_nodes)
        np.random.shuffle(shuffled_nodes)
        print(type(nodes))
        shuffled_nodes = cudf.Series(shuffled_nodes)
        new_nodes = nodes.reindex(index=shuffled_nodes)  # currently unused
        for i in range(num_batch - 1):
            blocks = cugraphSampler(
                G_cu, shuffled_nodes[i * batch_size:(i + 1) * batch_size],
                [5, 10])