-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopic_reader.py
executable file
·87 lines (66 loc) · 2.95 KB
/
topic_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
import re
import subprocess
class TopicReader:
    """Read a topics file in the Robust04 ``<top>`` markup and preprocess titles.

    On construction the file named by ``topics_file_name`` is parsed into a
    list of dicts with the keys ``'number'``, ``'title'``, ``'description'``
    and ``'narrative'``.  Every title is then replaced by the standard output
    of the external ``ProcessQuery`` program.

    Attributes:
        filename: Path of the topics file as passed to the constructor.
        file: File object the topics are read from; it is closed once
            parsing has finished.
        topics: List of parsed topic dicts, in file order.
    """

    # External program run once per topic; the title words are its arguments.
    _PROCESS_QUERY_CMD = "./olddog/target/appassembler/bin/nl.ru.preprocess.ProcessQuery"
    _TITLE_TAG = '<title>'

    def __init__(self, topics_file_name):
        """Parse ``topics_file_name`` and preprocess all topic titles.

        Raises:
            OSError: If the topics file cannot be opened.
            ValueError: If the topics file is truncated or malformed.
        """
        self.filename = topics_file_name
        self.file = open(self.filename)
        self.topics = []
        try:
            print("Read topics...", flush=True)
            self._read_topics_file()
        finally:
            # The handle is only needed while parsing; do not leak it.
            self.file.close()
        print("Preprocess topics...", flush=True)
        self._preprocess_titles()

    def _read_topics_file(self):
        """Append one dict to ``self.topics`` per ``<top>`` section in ``self.file``.

        Raises:
            ValueError: If the file ends inside a topic or a topic has no
                ``Number: <digits>`` entry on its ``<num>`` line.
        """
        while True:
            raw = self.file.readline()
            if not raw:
                break
            # Anything between topics (blank lines, stray text) is ignored.
            if raw.strip().startswith('<top>'):
                self.topics.append(self._read_topic(raw))

    def _read_topic(self, line):
        """Parse one topic, starting at its ``<top>`` line, and return it as a dict."""
        line = self._seek(line, '<num>')
        match = re.search(r'Number: (\d+)', line)
        if match is None:
            raise ValueError("No topic number found in line: {!r}".format(line))
        topic_no = int(match.group(1))
        print("Parsing topic {}".format(topic_no), flush=True)

        line = self._seek(line, self._TITLE_TAG)
        # The title starts on the tag line itself and may continue below it.
        title_parts = [line[len(self._TITLE_TAG):].strip()]
        more_title, line = self._gather(('</title>', '<desc>'))
        topic_title = self._join(title_parts + more_title)

        # The text on the <desc> / <narr> tag lines themselves is discarded;
        # only the lines that follow are kept.
        line = self._seek(line, '<desc>')
        desc_parts, line = self._gather(('</desc>', '<narr>'))

        line = self._seek(line, '<narr>')
        narr_parts, line = self._gather(('</narr>', '</top>'))

        self._seek(line, '</top>')
        return {
            'number': topic_no,
            'title': topic_title,
            'description': self._join(desc_parts),
            'narrative': self._join(narr_parts),
        }

    def _next_line(self, expected):
        """Return the next line stripped; raise ``ValueError`` at end of file."""
        raw = self.file.readline()
        if not raw:
            # readline() returns '' only at EOF; blank lines are '\n'.
            raise ValueError(
                "Unexpected end of topics file while looking for {}".format(expected)
            )
        return raw.strip()

    def _seek(self, line, tag):
        """Return the first stripped line, beginning with ``line``, that starts with ``tag``."""
        line = line.strip()
        while not line.startswith(tag):
            line = self._next_line(tag)
        return line

    def _gather(self, stop_tags):
        """Collect non-empty stripped lines until one starts with any of ``stop_tags``.

        Returns:
            A ``(parts, stop_line)`` tuple; ``stop_line`` is the stripped
            line that ended the collection.
        """
        parts = []
        line = self._next_line(" or ".join(stop_tags))
        while not line.startswith(stop_tags):
            if line:
                parts.append(line)
            line = self._next_line(" or ".join(stop_tags))
        return parts, line

    @staticmethod
    def _join(parts):
        """Join text fragments with single spaces so wrapped lines keep word boundaries."""
        return " ".join(part for part in parts if part)

    def _preprocess_titles(self):
        """Replace every topic title with the output of the ProcessQuery program.

        The title is split on whitespace and each word is passed as a
        separate command-line argument; the program's standard output,
        decoded as UTF-8 and stripped, becomes the new title.
        """
        for topic in self.topics:
            argv = [self._PROCESS_QUERY_CMD] + topic['title'].split()
            completed = subprocess.run(argv, stdout=subprocess.PIPE)
            topic['title'] = completed.stdout.decode("utf-8").strip()

    def get_topics(self):
        """Return the list of topic dicts."""
        return self.topics