Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gml 1812 specialized testing #160

Merged
merged 5 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tests/data/baseline/ml/fastRP.json.gz
Binary file not shown.
7 changes: 3 additions & 4 deletions tests/run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
clear
python3 test/create_baseline.py &&
python3 test/setup.py &&
pytest test/test_centrality.py::TestCentrality
# pytest
python3 test/setup.py &&
python3 test/baseline/create_baselines.py &&
pytest test/test_centrality.py test/test_ml.py
Empty file added tests/test/baseline/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions tests/test/baseline/algos/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .degree_cent import *
from .fastrp import fastrp_wrapper as fastrp
46 changes: 46 additions & 0 deletions tests/test/baseline/algos/degree_cent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from collections import Counter

import networkx as nx


def run_degree_baseline_complete(g: nx.Graph, _):
    """Build the degree-centrality baseline from raw node degrees.

    Each node's score is ``(degree - 1) / (len(g) - 1)``. The second
    positional argument is accepted only so the signature lines up with
    ``run_degree_baseline``; it is ignored.

    Returns a one-element list holding ``{"top_scores": [...]}``, where every
    entry is ``{"Vertex_ID": node, "score": value}``.
    """
    scale_factor = 1.0 / (len(g) - 1.0)

    # Subtract one from the degree because nx double counts the self-edge.
    top_scores = [
        {"Vertex_ID": node, "score": (degree - 1) * scale_factor}
        for node, degree in g.degree()
    ]
    return [{"top_scores": top_scores}]


def run_degree_baseline(g: nx.Graph, metric):
    """Run ``metric`` on ``g`` and reshape its result into the baseline layout.

    ``metric`` is called with the graph and must return a mapping of
    vertex id to score. The result is a one-element list holding
    ``{"top_scores": [...]}``, where every entry is
    ``{"Vertex_ID": vertex, "score": value}`` in the mapping's iteration order.
    """
    scores = metric(g)
    top_scores = [
        {"Vertex_ID": vertex, "score": value} for vertex, value in scores.items()
    ]
    return [{"top_scores": top_scores}]


def weighted_deg_cent(
    g: nx.Graph,
    dir: str = "",
):
    """Sum the ``"weight"`` attribute of the edges touching each node.

    ``dir`` selects which endpoint of every edge is credited:

    * ``"in"``  - only the second endpoint of the edge,
    * ``"out"`` - only the first endpoint of the edge,
    * anything else - both endpoints.

    Returns a ``Counter`` mapping node to accumulated weight.
    """
    totals = Counter()
    for edge in g.edges:
        source, target = edge[0], edge[1]
        weight = g.get_edge_data(source, target)["weight"]
        if dir == "in":
            totals[target] += weight
        elif dir == "out":
            totals[source] += weight
        else:
            totals[source] += weight
            totals[target] += weight
    return totals
107 changes: 107 additions & 0 deletions tests/test/baseline/algos/fastrp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# source: https://github.com/GTmac/FastRP/blob/master/fastrp.py

import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, spdiags
from sklearn import random_projection
from sklearn.preprocessing import normalize, scale


# projection method: choose from Gaussian and Sparse
# input matrix: choose from adjacency and transition matrix
# alpha adjusts the weighting of nodes according to their degree
def fastrp_projection(
    A, q=3, dim=128, projection_method="gaussian", input_matrix="adj", alpha=None
):
    """Randomly project ``A`` and propagate the projection through ``q`` steps.

    Parameters:
        A: square adjacency matrix (presumably a scipy sparse matrix - it is
            summed with ``csc_matrix.sum`` and multiplied with ``@``).
        q: number of matrices to return; the first is the projection of ``M``
            and each following one is ``M @`` the previous one.
        dim: number of components of the random projection.
        projection_method: ``"gaussian"`` or ``"sparse"``.
        input_matrix: ``"adj"`` to use ``A`` as is, ``"trans"`` to use ``A``
            with every row divided by its row sum.
        alpha: when not ``None``, the projection components are
            right-multiplied by ``diag(row_sums ** alpha)`` before projecting.

    Returns:
        A list with the projection followed by its successive products
        with ``M``.

    Raises:
        AssertionError: if ``input_matrix`` or ``projection_method`` is not
            one of the supported values.
    """
    assert input_matrix == "adj" or input_matrix == "trans"
    assert projection_method == "gaussian" or projection_method == "sparse"

    # N is needed by the row normalizer AND by the alpha reweighting below, so
    # it must be defined for both input_matrix modes (it used to be set only
    # in the "trans" branch, which made "adj" + alpha fail with NameError).
    N = A.shape[0]

    if input_matrix == "adj":
        M = A
    else:
        normalizer = spdiags(np.squeeze(1.0 / csc_matrix.sum(A, axis=1)), 0, N, N)
        M = normalizer @ A
    # Gaussian projection matrix
    if projection_method == "gaussian":
        transformer = random_projection.GaussianRandomProjection(
            n_components=dim, random_state=42
        )
    # Sparse projection matrix
    else:
        transformer = random_projection.SparseRandomProjection(
            n_components=dim, random_state=42
        )
    Y = transformer.fit(M)
    # Random projection for A
    if alpha is not None:
        # Reweight the fitted components by (row sum of A) ** alpha.
        Y.components_ = Y.components_ @ spdiags(
            np.squeeze(np.power(csc_matrix.sum(A, axis=1), alpha)), 0, N, N
        )
    cur_U = transformer.transform(M)
    U_list = [cur_U]

    for _ in range(2, q + 1):
        cur_U = M @ cur_U
        U_list.append(cur_U)
    return U_list


# When weights is None, concatenate instead of linearly combines the embeddings from different powers of A
def fastrp_merge(U_list, weights, normalization=False):
    """Merge the per-step projections in ``U_list`` into one embedding matrix.

    If the first entry is exactly a ``csc_matrix`` every entry is densified
    first. With ``normalization`` set, each matrix is L2-normalized row-wise.

    When ``weights`` is ``None`` the matrices are concatenated along axis 1
    and returned as is. Otherwise they are combined as a weighted sum (one
    weight per matrix, paired with ``zip``) and the sum is passed through
    ``scale`` before being returned.
    """
    if type(U_list[0]) is csc_matrix:
        matrices = [sparse_U.todense() for sparse_U in U_list]
    else:
        matrices = U_list

    if normalization:
        matrices = [normalize(dense_U, norm="l2", axis=1) for dense_U in matrices]

    if weights is None:
        return np.concatenate(matrices, axis=1)

    combined = np.zeros_like(matrices[0])
    for matrix, weight in zip(matrices, weights):
        combined += matrix * weight

    if type(combined) is csr_matrix:
        return scale(combined.toarray())
    return scale(combined)


# A is always the adjacency matrix
# the choice between adj matrix and trans matrix is decided in the conf
def fastrp_wrapper(A, conf):
    """Compute FastRP embeddings for adjacency matrix ``A`` as configured.

    ``conf`` must provide the keys ``"weights"``, ``"dim"``,
    ``"projection_method"``, ``"input_matrix"``, ``"alpha"`` and
    ``"normalization"``. The number of propagation steps is the number of
    weights. Returns whatever ``fastrp_merge`` produces for the projections.
    """
    projection_kwargs = {
        "q": len(conf["weights"]),
        "dim": conf["dim"],
        "projection_method": conf["projection_method"],
        "input_matrix": conf["input_matrix"],
        "alpha": conf["alpha"],
    }
    projections = fastrp_projection(A, **projection_kwargs)
    return fastrp_merge(projections, conf["weights"], conf["normalization"])


def get_emb_filename(prefix, conf):
    """Build the ``.mat`` file name that encodes an embedding configuration.

    Parameters:
        prefix: leading part of the file name.
        conf: configuration mapping. ``"dim"``, ``"projection_method"``,
            ``"input_matrix"``, ``"normalization"`` and ``"weights"`` are
            required; ``"alpha"`` and ``"C"`` are optional.

    Returns:
        ``<prefix>-dim=...,projection_method=...,input_matrix=...,``
        ``normalization=...,weights=...,alpha=...,C=....mat``. ``weights`` is
        rendered comma-joined (or ``None``), a missing ``alpha`` is rendered
        as an empty string and a missing ``C`` as ``1.0``.

    Raises:
        KeyError: if one of the required keys is missing from ``conf``.
    """
    weights = conf["weights"]
    weights_part = ",".join(map(str, weights)) if weights is not None else "None"
    alpha_part = str(conf["alpha"]) if "alpha" in conf else ""
    # Guard on the "C" key itself: guarding on "alpha" raised KeyError for any
    # conf that sets alpha without C.
    c_part = str(conf["C"]) if "C" in conf else "1.0"

    return (
        f"{prefix}-dim={conf['dim']}"
        f",projection_method={conf['projection_method']}"
        f",input_matrix={conf['input_matrix']}"
        f",normalization={conf['normalization']}"
        f",weights={weights_part}"
        f",alpha={alpha_part}"
        f",C={c_part}"
        ".mat"
    )
6 changes: 6 additions & 0 deletions tests/test/baseline/create_baselines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import degree_cent_baseline
import fast_rp_baseline

if __name__ == "__main__":
    # Regenerate every baseline in a fixed order: degree centrality first,
    # then FastRP.
    for baseline_module in (degree_cent_baseline, fast_rp_baseline):
        baseline_module.run()
Original file line number Diff line number Diff line change
@@ -1,59 +1,16 @@
import csv
import json
from collections import Counter
from functools import partial

import networkx as nx
import numpy as np
from algos import run_degree_baseline, run_degree_baseline_complete, weighted_deg_cent
from tqdm import tqdm

data_path_root = "data/"
baseline_path_root = f"{data_path_root}/baseline/"


def weighted_deg_cent(
    g: nx.Graph,
    dir: str = "",
):
    """Sum the ``"weight"`` attribute of the edges touching each node.

    ``dir`` selects which endpoint of every edge is credited:

    * ``"in"``  - only the second endpoint of the edge,
    * ``"out"`` - only the first endpoint of the edge,
    * anything else - both endpoints.

    Returns a ``Counter`` mapping node to accumulated weight.
    """
    totals = Counter()
    for edge in g.edges:
        source, target = edge[0], edge[1]
        weight = g.get_edge_data(source, target)["weight"]
        if dir == "in":
            totals[target] += weight
        elif dir == "out":
            totals[source] += weight
        else:
            totals[source] += weight
            totals[target] += weight
    return totals


def run_degree_baseline_complete(g: nx.Graph, _):
    """Build the degree-centrality baseline from raw node degrees.

    Each node's score is ``(degree - 1) / (len(g) - 1)``. The second
    positional argument is accepted only so the signature lines up with
    ``run_degree_baseline``; it is ignored.

    Returns a one-element list holding ``{"top_scores": [...]}``, where every
    entry is ``{"Vertex_ID": node, "score": value}``.
    """
    scale_factor = 1.0 / (len(g) - 1.0)

    # Subtract one from the degree because nx double counts the self-edge.
    top_scores = [
        {"Vertex_ID": node, "score": (degree - 1) * scale_factor}
        for node, degree in g.degree()
    ]
    return [{"top_scores": top_scores}]


def run_degree_baseline(g: nx.Graph, metric):
    """Run ``metric`` on ``g`` and reshape its result into the baseline layout.

    ``metric`` is called with the graph and must return a mapping of
    vertex id to score. The result is a one-element list holding
    ``{"top_scores": [...]}``, where every entry is
    ``{"Vertex_ID": vertex, "score": value}`` in the mapping's iteration order.
    """
    scores = metric(g)
    top_scores = [
        {"Vertex_ID": vertex, "score": value} for vertex, value in scores.items()
    ]
    return [{"top_scores": top_scores}]


def create_graph(edges, weights=False, directed=False):
if directed:
g = nx.DiGraph()
Expand Down Expand Up @@ -90,7 +47,7 @@ def create_degree_baseline(paths):
json.dump(res, f) # , indent=2)


if __name__ == "__main__":
def run():
# (data, output_path, fun, metric)
paths = [
# unweighted
Expand Down
44 changes: 44 additions & 0 deletions tests/test/baseline/fast_rp_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import gzip
import json

import networkx as nx
import numpy as np
import pandas as pd
from algos import fastrp
from dotenv import load_dotenv
from pyTigerGraph.datasets import Datasets

load_dotenv()
data_path_root = "data"
baseline_path_root = f"{data_path_root}/baseline"


def run(ds_name="Cora"):
    """Create the gzipped FastRP baseline for the ``ds_name`` dataset.

    Reads ``<tmp_dir>/<ds_name>/edges.csv`` (no header, two columns) from the
    dataset's temporary directory, builds an undirected graph from it, and
    computes embeddings with a fixed configuration. The result is written as
    a ``{node id as str: embedding list}`` JSON document to
    ``<baseline_path_root>/ml/fastRP.json.gz``, which is then read back once.
    """
    dataset = Datasets(ds_name)
    edge_frame = pd.read_csv(dataset.tmp_dir + f"/{ds_name}/edges.csv", header=None)
    edge_frame.columns = ["src", "tgt"]

    graph = nx.Graph()
    graph.add_edges_from(edge_frame.to_numpy())
    # Sorted node order fixes the row order of the adjacency matrix, so row i
    # of the embeddings belongs to node_ids[i].
    node_ids = sorted(graph.nodes)
    adjacency = nx.adjacency_matrix(graph, nodelist=node_ids)

    conf = {
        "weights": [1, 2, 4],
        "dim": 8,
        # "projection_method": "sparse",
        "projection_method": "gaussian",
        "input_matrix": "trans",
        "alpha": -0.628,
        "normalization": False,
    }
    embeddings = fastrp(adjacency, conf)

    assert len(embeddings) == len(node_ids)

    baseline = {
        str(node_id): list(vector) for node_id, vector in zip(node_ids, embeddings)
    }
    output_path = f"{baseline_path_root}/ml/fastRP.json.gz"
    with gzip.open(output_path, "wb") as f:
        f.write(json.dumps(baseline).encode())

    # Read the file back; presumably a sanity check that it decodes as JSON,
    # since the parsed value is not used afterwards.
    with gzip.open(output_path, "rb") as f:
        d = json.load(f)
25 changes: 23 additions & 2 deletions tests/test/setup.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,31 @@
import json
import os
import re
import time

import pyTigerGraph as tg
import util
from dotenv import load_dotenv
from pyTigerGraph.datasets import Datasets
from tqdm import tqdm
from tqdm import tqdm, trange

import util

load_dotenv()
graph_name = "graph_algorithms_testing"
pattern = re.compile(r'"name":\s*"tg_.*"')


def add_reverse_edge(ds: Datasets):
    """Patch the dataset's schema script so the ``Cite`` edge gets a reverse edge.

    Rewrites ``<ds.tmp_dir>/<ds.name>/create_schema.gsql`` in place, replacing
    the plain ``ADD DIRECTED EDGE Cite (...)`` statement with the same
    statement plus ``WITH REVERSE_EDGE="reverse_Cite"``. If the statement is
    not present the file content is written back unchanged.

    Parameters:
        ds: dataset whose ``tmp_dir`` and ``name`` locate the schema file.
    """
    # Use the dataset that was passed in. The previous code read ``tmp_dir``
    # from a module-level ``dataset`` global, which only exists when this file
    # is run as a script and could point at a different dataset.
    schema_path = f"{ds.tmp_dir}/{ds.name}/create_schema.gsql"

    with open(schema_path) as f:
        schema: str = f.read()

    schema = schema.replace(
        "ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool);",
        'ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool) WITH REVERSE_EDGE="reverse_Cite";',
    )

    with open(schema_path, "w") as f:
        f.write(schema)


if __name__ == "__main__":
host_name = os.getenv("HOST_NAME")
user_name = os.getenv("USER_NAME")
Expand All @@ -28,9 +42,14 @@
if res["error"]:
exit(1)
# load the data
dataset = Datasets("Cora")
add_reverse_edge(dataset)
conn.ingestDataset(dataset, getToken=True)

dataset = Datasets("graph_algorithms_testing")
conn.ingestDataset(dataset, getToken=True)

conn.graphname = graph_name
# install the queries
feat = conn.gds.featurizer()
installed_queries = util.get_installed_queries(conn)
Expand All @@ -43,3 +62,5 @@
print(q)
feat.installAlgorithm(q)

for _ in trange(30, desc="Sleeping while data loads"):
time.sleep(1)
Loading
Loading