-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinit.py
executable file
·47 lines (35 loc) · 1.28 KB
/
init.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
# Given a question number ### (as a string or int), finds the matching
# corpus/top_docs.### file and reads the <docno> and <text> content of
# each <doc> into a dictionary.
from BeautifulSoup import BeautifulStoneSoup
import os
import glob
# Module-wide cache mapping document number -> document text.  get_corpus()
# adds an entry for every <docno>/<text> pair it parses.
global_doc_dict = {}


def get_doc(doc_num):
    """Return the cached text stored under ``doc_num``.

    Only documents already loaded through ``get_corpus`` are present in the
    cache; an unknown ``doc_num`` raises ``KeyError``.
    """
    doc_text = global_doc_dict[doc_num]
    return doc_text
def get_corpus(qNum=0):
    """Load <docno>/<text> pairs from the ``corpus/top_docs.*`` files.

    Args:
        qNum: question number (string or int) selecting the single file
            ``corpus/top_docs.<qNum>``.  The default ``0`` means "no
            question given" and loads every ``corpus/top_docs.*`` file.

    Returns:
        A new dict mapping each document number to its text, with newlines
        replaced by spaces.  Every pair is also stored in the module-level
        ``global_doc_dict`` so ``get_doc`` can look it up later.  The dict
        is empty when no corpus file matches.
    """
    path = 'corpus'
    # if no question # is passed will loop through the entire corpus
    if qNum == 0:
        fname = 'top_docs.*'
    else:
        fname = 'top_docs.' + str(qNum)

    docs = {}
    # loop through corpus file(s)
    for infile in glob.glob(os.path.join(path, fname)):
        # ``with`` guarantees the file is closed, even if parsing raises
        with open(infile) as f:
            soup = BeautifulStoneSoup(f)
            # each <docno>,<text> pair is inside a parent <doc>
            for d in soup('doc'):
                # find() only looks inside this <doc>; findNext() keeps
                # scanning past its end and could pair this entry with a
                # later document's tag.
                doc_no_tag = d.find('docno')
                doc_text_tag = d.find('text')
                # file 372 ends with a <doc> that has a <docno> but no
                # <text>; skip any such incomplete entry
                if doc_no_tag is None or doc_text_tag is None:
                    continue
                doc_num = doc_no_tag.renderContents().strip()
                doc_text = doc_text_tag.renderContents().strip()
                # flatten the text onto a single line
                doc_text = doc_text.replace("\n", " ")
                docs[doc_num] = doc_text
                global_doc_dict[doc_num] = doc_text
    return docs
if __name__ == "__main__":
    # Script entry point: parse only the file for question 204.  The
    # returned dict is not used here.
    get_corpus(204)