-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_utils.py
98 lines (77 loc) · 3.34 KB
/
read_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import pathlib as pl
import pandas as pd
import networkx as nx
from networkx.readwrite import json_graph
def get_team_no_from_log_filename(filename):
'''extract team no from a log filename of format *_<team_no>.*'''
return int(str(pl.Path(filename).stem).split('_')[-1])
def get_team_no_from_transcript_filename(filename):
'''extract team no from a transcript filename of format *_<team_no>.*'''
return int(str(pl.Path(filename).stem).split('_')[-1])
def get_team_no_from_corpus_filename(filename):
'''extract team no from a corpus filename of format *_<team_no>.*'''
return int(str(pl.Path(filename).stem).split('_')[-1])
def get_team_no_from_routine_filename(filename):
'''extract team no from a routine filename of format *_<team_no>.*'''
return int(str(pl.Path(filename).stem).split('_')[-1])
def read_tables(input_dir, form='transcript', verbose=True):
'''read transcript, log or corpus tables at a directory'''
if form == 'transcript':
parser = get_team_no_from_log_filename
elif form == 'log':
parser = get_team_no_from_transcript_filename
elif form == 'corpus':
parser = get_team_no_from_corpus_filename
elif form == 'routine':
parser = get_team_no_from_routine_filename
else:
raise ValueError
if verbose:
print('Reading {} files from {}.'.format(form, input_dir))
# Make a list of the available files.
filelist = sorted([f for f in input_dir.glob('*.csv')])
if verbose:
print('{} {} files found.'.format(form, len(filelist)))
# Organize the file list as a dictionary keyed by team number.
files = {parser(f): f for f in filelist}
if verbose:
for team_no, file in files.items():
print('File {} belongs to team {:2d}'.format(file.stem, team_no))
# Read the files.
dfs = dict()
for team_no, f in files.items():
# Read the file into a table.
df = pd.read_csv(f, sep='\t')
dfs[team_no] = df
# Make type conversions depending on the form.
if form == 'routine':
from ast import literal_eval
for team_no, df in dfs.items():
# Convert utterance_no_list strings to python list objects.
df.utterances = df.utterances.apply(literal_eval)
if verbose:
for team_no, df in dfs.items():
if form == 'log':
print('{} of {:2d} contains {:4d} events'.format(
form.title(), team_no, len(df)))
elif form == 'transcript':
print('Transcript of {:2d} has {:4d} utterances'.format(
team_no, len(df)))
return dfs
def read_network(graph_file):
# Load from file, formatted as a node-link JSON.
# c.f. https://networkx.org/documentation/stable/reference/readwrite/json_graph.html
# in particular https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_graph.html
try:
with pl.Path(graph_file).open() as f:
data = json.load(f)
graph = json_graph.node_link_graph(data)
for u, v, d in graph.edges(data=True):
u_node = graph.nodes[u]
v_node = graph.nodes[v]
return graph
except IOError as e:
# The file was missing.
load_text = "File {} not found".format(graph_file)
return nx.Graph()