Merge pull request #160 from tigergraph/GML-1812-specialized_testing
Gml 1812 specialized testing
RobRossmiller-TG authored Jul 16, 2024
2 parents 8514ecc + 362024f commit ca797fc
Showing 12 changed files with 290 additions and 54 deletions.
Binary file added tests/data/baseline/ml/fastRP.json.gz
7 changes: 3 additions & 4 deletions tests/run.sh
@@ -1,5 +1,4 @@
-clear
-python3 test/create_baseline.py &&
python3 test/setup.py &&
-pytest test/test_centrality.py::TestCentrality
-# pytest
+python3 test/baseline/create_baselines.py &&
+pytest test/test_centrality.py test/test_ml.py
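
In the new flow, setup.py loads the data, create_baselines.py writes the expected outputs, and pytest then compares the algorithm results (including the new test_ml.py) against them.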
Empty file added tests/test/baseline/__init__.py
2 changes: 2 additions & 0 deletions tests/test/baseline/algos/__init__.py
@@ -0,0 +1,2 @@
from .degree_cent import *
from .fastrp import fastrp_wrapper as fastrp
46 changes: 46 additions & 0 deletions tests/test/baseline/algos/degree_cent.py
@@ -0,0 +1,46 @@
from collections import Counter

import networkx as nx


def run_degree_baseline_complete(g: nx.Graph, _):
    s = 1.0 / (len(g) - 1.0)

    # d-1 because nx will double count the self-edge
    res = {n: (d - 1) * s for n, d in g.degree()}

    out = []
    for k, v in res.items():
        out.append({"Vertex_ID": k, "score": v})

    out = [{"top_scores": out}]
    return out


def run_degree_baseline(g: nx.Graph, metric):
    res = metric(g)

    out = []
    for k, v in res.items():
        out.append({"Vertex_ID": k, "score": v})

    out = [{"top_scores": out}]
    return out


def weighted_deg_cent(
    g: nx.Graph,
    dir: str = "",
):
    res = Counter()
    for e in g.edges:
        a = g.get_edge_data(e[0], e[1])["weight"]
        match dir:
            case "in":
                res[e[1]] += a
            case "out":
                res[e[0]] += a
            case _:
                res[e[0]] += a
                res[e[1]] += a
    return res
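
For reference, a minimal usage sketch of these baseline helpers; the toy graph and the metric choices are illustrative, not part of this commit:

# Hypothetical usage sketch; the toy graph is illustrative.
from functools import partial

import networkx as nx

g = nx.Graph()
g.add_weighted_edges_from([("a", "b", 2.0), ("b", "c", 1.0)])

# Unweighted baseline: delegate to networkx's built-in degree centrality.
unweighted = run_degree_baseline(g, nx.degree_centrality)

# Weighted baseline: sum edge weights per endpoint with weighted_deg_cent;
# partial(...) shows how a direction-specific variant can be passed as the metric.
weighted = run_degree_baseline(g, weighted_deg_cent)
weighted_in = run_degree_baseline(g, partial(weighted_deg_cent, dir="in"))

Both calls return the [{"top_scores": [...]}] shape that the tests compare against.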
107 changes: 107 additions & 0 deletions tests/test/baseline/algos/fastrp.py
@@ -0,0 +1,107 @@
# source: https://github.com/GTmac/FastRP/blob/master/fastrp.py

import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, spdiags
from sklearn import random_projection
from sklearn.preprocessing import normalize, scale


# projection method: choose from Gaussian and Sparse
# input matrix: choose from adjacency and transition matrix
# alpha adjusts the weighting of nodes according to their degree
def fastrp_projection(
    A, q=3, dim=128, projection_method="gaussian", input_matrix="adj", alpha=None
):
    assert input_matrix == "adj" or input_matrix == "trans"
    assert projection_method == "gaussian" or projection_method == "sparse"

    N = A.shape[0]  # defined up front: the degree scaling below needs it for both input types
    if input_matrix == "adj":
        M = A
    else:
        normalizer = spdiags(np.squeeze(1.0 / csc_matrix.sum(A, axis=1)), 0, N, N)
        M = normalizer @ A
    # Gaussian projection matrix
    if projection_method == "gaussian":
        transformer = random_projection.GaussianRandomProjection(
            n_components=dim, random_state=42
        )
    # Sparse projection matrix
    else:
        transformer = random_projection.SparseRandomProjection(
            n_components=dim, random_state=42
        )
    Y = transformer.fit(M)
    # Random projection for A
    if alpha is not None:
        Y.components_ = Y.components_ @ spdiags(
            np.squeeze(np.power(csc_matrix.sum(A, axis=1), alpha)), 0, N, N
        )
    cur_U = transformer.transform(M)
    U_list = [cur_U]

    for _ in range(2, q + 1):
        cur_U = M @ cur_U
        U_list.append(cur_U)
    return U_list


# When weights is None, concatenate instead of linearly combining the embeddings from different powers of A
def fastrp_merge(U_list, weights, normalization=False):
    dense_U_list = (
        [_U.todense() for _U in U_list] if type(U_list[0]) == csc_matrix else U_list
    )
    _U_list = (
        [normalize(_U, norm="l2", axis=1) for _U in dense_U_list]
        if normalization
        else dense_U_list
    )

    if weights is None:
        return np.concatenate(_U_list, axis=1)
    U = np.zeros_like(_U_list[0])
    for cur_U, weight in zip(_U_list, weights):
        U += cur_U * weight
    # U = scale(U.todense())
    # U = normalize(U.todense(), norm='l2', axis=1)
    return scale(U.toarray()) if type(U) == csr_matrix else scale(U)


# A is always the adjacency matrix
# the choice between adj matrix and trans matrix is decided in the conf
def fastrp_wrapper(A, conf):
    U_list = fastrp_projection(
        A,
        q=len(conf["weights"]),
        dim=conf["dim"],
        projection_method=conf["projection_method"],
        input_matrix=conf["input_matrix"],
        alpha=conf["alpha"],
    )
    U = fastrp_merge(U_list, conf["weights"], conf["normalization"])
    return U


def get_emb_filename(prefix, conf):
    return (
        prefix
        + "-dim="
        + str(conf["dim"])
        + ",projection_method="
        + conf["projection_method"]
        + ",input_matrix="
        + conf["input_matrix"]
        + ",normalization="
        + str(conf["normalization"])
        + ",weights="
        + (
            ",".join(map(str, conf["weights"]))
            if conf["weights"] is not None
            else "None"
        )
        + ",alpha="
        + (str(conf["alpha"]) if "alpha" in conf else "")
        + ",C="
        + (str(conf["C"]) if "C" in conf else "1.0")
        + ".mat"
    )
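
In short, fastrp_projection random-projects the (optionally degree-scaled, via alpha) adjacency or transition matrix and its higher powers, and fastrp_merge combines the per-power embeddings with the given weights. A minimal usage sketch of the wrapper; the toy graph and conf values are illustrative, not the ones the tests use:

# Hypothetical usage sketch for fastrp_wrapper; all values are illustrative.
import networkx as nx

g = nx.cycle_graph(4)               # tiny toy graph
A = nx.adjacency_matrix(g)          # sparse adjacency matrix
conf = {
    "weights": [1, 2, 4],           # one weight per power of the input matrix
    "dim": 4,                       # embedding dimension
    "projection_method": "gaussian",
    "input_matrix": "trans",        # row-normalize A into a transition matrix
    "alpha": -0.5,                  # degree-scaling exponent
    "normalization": False,
}
U = fastrp_wrapper(A, conf)         # one dim-4 embedding per node
assert U.shape == (4, 4)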
6 changes: 6 additions & 0 deletions tests/test/baseline/create_baselines.py
@@ -0,0 +1,6 @@
import degree_cent_baseline
import fast_rp_baseline

if __name__ == "__main__":
    degree_cent_baseline.run()
    fast_rp_baseline.run()
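
Note: the bare imports resolve because Python puts the script's own directory (tests/test/baseline/) at the front of sys.path when run.sh invokes create_baselines.py directly.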
tests/test/baseline/degree_cent_baseline.py
@@ -1,59 +1,16 @@
import csv
import json
-from collections import Counter
from functools import partial

import networkx as nx
import numpy as np
+from algos import run_degree_baseline, run_degree_baseline_complete, weighted_deg_cent
from tqdm import tqdm

data_path_root = "data/"
baseline_path_root = f"{data_path_root}/baseline/"


-def weighted_deg_cent(
-    g: nx.Graph,
-    dir: str = "",
-):
-    res = Counter()
-    for e in g.edges:
-        a = g.get_edge_data(e[0], e[1])["weight"]
-        match dir:
-            case "in":
-                res[e[1]] += a
-            case "out":
-                res[e[0]] += a
-            case _:
-                res[e[0]] += a
-                res[e[1]] += a
-    return res
-
-
-def run_degree_baseline_complete(g: nx.Graph, _):
-    s = 1.0 / (len(g) - 1.0)
-
-    # d-1 because nx will double count the self-edge
-    res = {n: (d - 1) * s for n, d in g.degree()}
-
-    out = []
-    for k, v in res.items():
-        out.append({"Vertex_ID": k, "score": v})
-
-    out = [{"top_scores": out}]
-    return out
-
-
-def run_degree_baseline(g: nx.Graph, metric):
-    res = metric(g)
-
-    out = []
-    for k, v in res.items():
-        out.append({"Vertex_ID": k, "score": v})
-
-    out = [{"top_scores": out}]
-    return out
-
-
def create_graph(edges, weights=False, directed=False):
    if directed:
        g = nx.DiGraph()
@@ -90,7 +47,7 @@ def create_degree_baseline(paths):
        json.dump(res, f)  # , indent=2)


-if __name__ == "__main__":
+def run():
    # (data, output_path, fun, metric)
    paths = [
        # unweighted
44 changes: 44 additions & 0 deletions tests/test/baseline/fast_rp_baseline.py
@@ -0,0 +1,44 @@
import gzip
import json

import networkx as nx
import numpy as np
import pandas as pd
from algos import fastrp
from dotenv import load_dotenv
from pyTigerGraph.datasets import Datasets

load_dotenv()
data_path_root = "data"
baseline_path_root = f"{data_path_root}/baseline"


def run(ds_name="Cora"):
    dataset = Datasets(ds_name)
    edges = pd.read_csv(dataset.tmp_dir + f"/{ds_name}/edges.csv", header=None)
    edges.columns = ["src", "tgt"]

    g = nx.Graph()
    g.add_edges_from(edges.to_numpy())
    node_ids = sorted(list(g.nodes))
    A = nx.adjacency_matrix(g, nodelist=node_ids)
    conf = {
        "weights": [1, 2, 4],
        "dim": 8,
        # "projection_method": "sparse",
        "projection_method": "gaussian",
        "input_matrix": "trans",
        "alpha": -0.628,
        "normalization": False,
    }

    vecs = fastrp(A, conf)

    assert len(vecs) == len(node_ids)

    res = {str(k): list(v) for k, v in zip(node_ids, vecs)}
    with gzip.open(f"{baseline_path_root}/ml/fastRP.json.gz", "wb") as f:
        f.write(json.dumps(res).encode())

    # Read the archive back as a round-trip sanity check.
    with gzip.open(f"{baseline_path_root}/ml/fastRP.json.gz", "rb") as f:
        d = json.load(f)
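
A test consuming this baseline might read it back as in the sketch below; the loop body is an assumption about how a comparison would start, not code from this commit:

# Hypothetical consumer sketch: load the gzipped baseline and sanity-check it.
import gzip
import json

with gzip.open("data/baseline/ml/fastRP.json.gz", "rb") as f:
    baseline = json.load(f)

for vertex_id, emb in baseline.items():
    assert len(emb) == 8  # matches "dim": 8 in the conf above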
25 changes: 23 additions & 2 deletions tests/test/setup.py
@@ -1,17 +1,31 @@
import json
import os
+import re
+import time

import pyTigerGraph as tg
-import util
from dotenv import load_dotenv
from pyTigerGraph.datasets import Datasets
-from tqdm import tqdm
+from tqdm import tqdm, trange

+import util
+
load_dotenv()
graph_name = "graph_algorithms_testing"
+pattern = re.compile(r'"name":\s*"tg_.*"')
+
+
+def add_reverse_edge(ds: Datasets):
+    with open(f"{ds.tmp_dir}/{ds.name}/create_schema.gsql") as f:
+        schema: str = f.read()
+    with open(f"{ds.tmp_dir}/{ds.name}/create_schema.gsql", "w") as f:
+        schema = schema.replace(
+            "ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool);",
+            'ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool) WITH REVERSE_EDGE="reverse_Cite";',
+        )
+        f.write(schema)
+

if __name__ == "__main__":
    host_name = os.getenv("HOST_NAME")
    user_name = os.getenv("USER_NAME")
@@ -28,9 +42,14 @@
    if res["error"]:
        exit(1)
    # load the data
+    dataset = Datasets("Cora")
+    add_reverse_edge(dataset)
+    conn.ingestDataset(dataset, getToken=True)
+
    dataset = Datasets("graph_algorithms_testing")
    conn.ingestDataset(dataset, getToken=True)

+    conn.graphname = graph_name
    # install the queries
    feat = conn.gds.featurizer()
    installed_queries = util.get_installed_queries(conn)
@@ -43,3 +62,5 @@
            print(q)
            feat.installAlgorithm(q)

+    for _ in trange(30, desc="Sleeping while data loads"):
+        time.sleep(1)
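
The trailing trange loop gives the freshly ingested data 30 seconds to finish loading before the test suite starts querying it.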