-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBSBI.py
107 lines (89 loc) · 3.39 KB
/
BSBI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import logging
import json
from collections import defaultdict
from itertools import groupby
from CACMIndex import CACMIndex
from CS276Index import CS276Index
from config import index_path
from BSBIndex import BSBIndex
# Configure the root logger once at import time: timestamped messages, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s : %(message)s',
)
def get_key(item):
    """
    Return the first element of ``item``.

    Intended as a ``key=`` function for sorting/grouping sequences by their
    leading element.
    """
    leading = item[0]
    return leading
class BSBI:
    """
    Build a Block Sort-based index.

    The collection is segmented into blocks, an intermediate index is built
    for each block, and the intermediate indexes are merged into one inverted
    index mapping each term id to a list of ``(doc_id, total)`` postings
    sorted by ``doc_id``. ``total`` is the sum of the second field of every
    posting found for that term/document pair (presumably an occurrence
    count -- confirm against BSBIndex).
    """

    # Number of blocks the CS276 collection is processed in. The block number
    # is handed to ``get_document_dict``; presumably one block per corpus
    # sub-folder -- confirm against CS276Index.
    CS276_BLOCK_COUNT = 10

    def __init__(self, collection):
        """
        :param collection: name of the collection to index
                           (``'CACM'`` or ``'CS276'``)
        """
        self.collection = collection
        self.blocks = []
        self.intermediate_results = []
        self.index = {}
        self.terms = {}
        self.documents = {}

    def build(self):
        """
        Run the whole pipeline (segment, per-block indexing, merge) and write
        the index, the documents and the terms to JSON files.
        """
        self.segment()
        self.build_inverted_index()
        self.merge()
        self.write("index", self.index)
        self.write("documents", self.documents)
        self.write("terms", self.terms)
        logging.info("Finished building inverted index.")

    def segment(self):
        """
        Segment collection in blocks.

        Builds the collection index, stores the resulting blocks in
        ``self.blocks`` (replacing any previous content, so the step can be
        re-run) and the term dictionary in ``self.terms``.

        :raises ValueError: if ``self.collection`` is not a supported name
        """
        if self.collection == 'CACM':
            collection_index = CACMIndex()
            collection_index.build()
            # CACM is handled as a single block.
            blocks = [collection_index]
        elif self.collection == 'CS276':
            collection_index = CS276Index()
            collection_index.build()
            # The same collection object is referenced once per block; the
            # block number selects the documents in build_inverted_index().
            blocks = [collection_index] * self.CS276_BLOCK_COUNT
        else:
            # Fail early with a clear message instead of an UnboundLocalError
            # on ``collection_index`` further down.
            raise ValueError(
                "Unsupported collection {!r}; expected 'CACM' or 'CS276'".format(self.collection)
            )
        self.blocks = blocks
        self.terms = collection_index.get_term_dict()

    def build_inverted_index(self):
        """
        Build the intermediate index for each block.

        Each block index is created from the collection name, the term
        dictionary and the documents returned by ``block.get_document_dict(i)``
        where ``i`` is the position of the block. Previous intermediate
        results are discarded first so the step can be re-run without
        duplicating them.
        """
        logging.info("Building inverted index...")
        self.intermediate_results = []
        for i, block in enumerate(self.blocks):
            block_index = BSBIndex(self.collection, self.terms, block.get_document_dict(i))
            block_index.build()
            logging.info("Built block %s index", i)
            self.intermediate_results.append(block_index)

    def merge(self):
        """
        Merge intermediate results to build the final BSBIndex on the whole collection.

        Document dictionaries of all blocks are merged into ``self.documents``.
        For every term, postings coming from all blocks are grouped by
        ``doc_id`` and their second field is summed; the resulting
        ``(doc_id, total)`` pairs are stored in ``self.index`` sorted by
        ``doc_id``. The posting lists owned by the intermediate indexes are
        only read, never modified.
        """
        logging.info("Merging intermediate results...")
        # term_id -> {doc_id: running total}. Accumulating into fresh dicts
        # (instead of extending the blocks' own lists) keeps the intermediate
        # results intact.
        totals_by_term = defaultdict(dict)
        for result in self.intermediate_results:
            self.documents.update(result.get_documents())
            for term_id, posting_list in result.get_index().items():
                doc_totals = totals_by_term[term_id]
                for posting in posting_list:
                    doc_id = posting[0]
                    doc_totals[doc_id] = doc_totals.get(doc_id, 0) + posting[1]
        for term_id, doc_totals in totals_by_term.items():
            # Terms whose posting lists were all empty get no entry.
            if doc_totals:
                self.index[term_id] = sorted(doc_totals.items(), key=lambda entry: entry[0])

    def get_index(self):
        """
        Return the merged inverted index (empty until ``merge`` has run).
        """
        return self.index

    def write(self, title, json_obj):
        """
        Dump ``json_obj`` as JSON into ``<index_path>/<title>_<collection>.json``.

        NOTE: JSON object keys are always strings and tuples are written as
        arrays, so the file content differs in type from the in-memory object.

        :param title: prefix of the output file name
        :param json_obj: JSON-serializable object to write
        """
        # The ``with`` statement closes the file; no explicit close() needed.
        with open(index_path + "/" + title + "_" + self.collection + ".json", "w") as f:
            json.dump(json_obj, f)

    def load(self):
        """
        Read the index JSON file of this collection and print its content.
        """
        with open(index_path + "/index_" + self.collection + ".json") as f:
            print(json.load(f))
if __name__ == "__main__":
    # Script entry point: build and persist the index for one collection.
    # Use 'CACM' instead of 'CS276' to index the CACM collection.
    # NOTE: CS276 takes at least 5 minutes to run for the 2 first folders,
    # 7 minutes for 3 first folders.
    collection_name = 'CS276'
    index = BSBI(collection_name)
    index.build()