# dev_graph.py
from scholar_classes import MyScholarlyScraper, get_cited_by
from scholarly import scholarly
import networkx as nx
import random
import time
import pickle
import datetime
import concurrent.futures  # the submodule must be imported explicitly for ThreadPoolExecutor
'''
Works.
TODO:
1) EXTEND the list of papers with the list of related items.
2) Run `get_citations` and `fill` over all entries from 1).
Important: everything operates only on the list from 1).
'''
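# Rough, commented-out sketch of the TODO above (hypothetical and untested; it
# assumes `get_citations` and `fill` from scholar_classes accept a single item,
# which this file does not confirm):
#
# papers = list(sc.active_items)
# # 1) extend the list of papers with all successfully fetched related items
# papers.extend(item for rel in sc.related if rel[2] for item in rel[1])
# # 2) run get_citations and fill over every entry of the combined list only
# for paper in papers:
#     get_citations(paper)
#     fill(paper)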
def subsequent_iterator(a=0, n=1e7):
    # yield up to n consecutive integers starting at a (helper, currently unused)
    n = int(n)
    for i in range(a, a + n):
        yield i
def get_hash(scholarly_item):
    # node id: hash of the whitespace-stripped title
    return hash(scholarly_item['bib']['title'].replace(' ', ''))
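# Stripping the spaces makes the node id robust against minor whitespace
# differences between records for the same paper. A tiny illustration with
# made-up records:
_a = {'bib': {'title': 'Biomass  Valorization'}}
_b = {'bib': {'title': 'Biomass Valorization'}}
assert get_hash(_a) == get_hash(_b)
# Caveat: Python salts string hashes per process (see PYTHONHASHSEED), so these
# ids are only comparable within a single run, not across pickled graphs from
# different runs.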
sterm = 'biomass valorization'
sc = MyScholarlyScraper(search_term=sterm)
sc.load_n_items(50)
g = nx.Graph()

print(100 * '=')
print('Started NX Generation')
print(100 * '=')

for i, active_item in enumerate(sc.active_items):
    active_item_id = get_hash(active_item[1])
    g.add_nodes_from([(active_item_id, {'meta': active_item})])
    # the related and citing articles for the current active article (they are
    # already sorted, so simple indexing is enough). Each entry looks like
    # (index, [{}, {}, ...], success flag)
    related_items = sc.related[i]
    citing_items = sc.cited_by[i]

    # connect the active article to each article that cites it
    if citing_items[2]:
        for citing_item in citing_items[1]:
            if not isinstance(citing_item, dict):
                print(f'Skipping citing item of unexpected type: {type(citing_item)}')
                continue
            # node id for the current citing article
            citing_item_id = get_hash(citing_item)
            g.add_nodes_from([(citing_item_id, {'meta': citing_item})])
            g.add_edge(active_item_id, citing_item_id)

    # add the related items, then connect each of them to its own citing articles
    if related_items[2]:
        print(100 * '-')
        print('Started iteration through related items')
        print(100 * '-')
        if related_items[1]:
            for related_item in related_items[1]:
                if not isinstance(related_item, dict):
                    print(f'Skipping related item of unexpected type: {type(related_item)}')
                    continue
                # node id for the current related article
                related_item_id = get_hash(related_item)
                g.add_nodes_from([(related_item_id, {'meta': related_item})])
            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                # a bit hacky, but the index (related_items[0]) has to be added to
                # each call, although it is the same for all items
                related_citations_list = list(
                    executor.map(lambda item: get_cited_by((related_items[0], item)),
                                 related_items[1])
                )
            # related_citations_list looks like [(index, [{}, {}, ...], success), ...].
            # executor.map preserves input order, so each result can be zipped with
            # the related item it belongs to
            for related_item, related_citing_item in zip(related_items[1], related_citations_list):
                if not isinstance(related_item, dict):
                    continue
                related_item_id = get_hash(related_item)
                if related_citing_item[2]:  # success
                    if related_citing_item[1]:  # list could be empty
                        for rel_item in related_citing_item[1]:
                            if not isinstance(rel_item, dict):
                                print(f'Skipping citing item of unexpected type: {type(rel_item)}')
                                continue
                            # node id for the current citing article
                            related_citing_item_id = get_hash(rel_item)
                            g.add_nodes_from([(related_citing_item_id, {'meta': rel_item})])
                            g.add_edge(related_item_id, related_citing_item_id)
    # checkpoint: persist the graph after every active item; ':' is avoided in
    # the timestamp because it is not a legal file name character on Windows
    with open(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + f'_{sterm}.pickle', 'wb') as f:
        pickle.dump(g, f)
print(100 * '=')
print('Stopped NX Generation')
print(100 * '=')

# final save after the full run
with open(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + f'_{sterm}.pickle', 'wb') as f:
    pickle.dump(g, f)
# TODO: instead of saving sc, save sc.search_result
with open(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + f'_{sterm}_search_results.pickle', 'wb') as f:
    pickle.dump(list(sc.search_result), f)
with open(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + f'_{sterm}_sc.pickle', 'wb') as f:
    pickle.dump(sc, f)
# presumably advances each search-result iterator by one entry
_l = [next(item) for item in sc.search_result]
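# Commented-out sketch for reloading and sanity-checking a saved graph later
# (the actual file name depends on the timestamp of the run):
#
# with open('<timestamp>_biomass valorization.pickle', 'rb') as f:
#     g_loaded = pickle.load(f)
# print(g_loaded.number_of_nodes(), g_loaded.number_of_edges())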