# evaluation.py
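"""
Runs the retrieval-evaluation pipeline: extracts corpus metadata, computes
TF-IDF and BM25 term weights, clusters the documents under both weighting
schemes, analyzes the odd-numbered queries, and ranks documents with cosine
similarity, reciprocal rank fusion (RRF), and score combination, both with
and without clustering. Intermediate results are dumped to JSON so later
runs can reload them instead of recomputing (see start_evaluation(reload=True)).
"""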
import json
from preprocessing import *
from clustering import *
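# The wildcard imports are expected to provide the helpers used below:
# metadata_extractor, dict_dump, tf_idf_calculator, bm25_calculator,
# compose_clusters, query_analyzer, cos_sim_relevance_analyzer,
# evaluate_with_related_clusters, reciprocal_ranking_fusion, and
# score_combination (defined in preprocessing.py and clustering.py).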
def execute_preprocessing():
    print("Started preprocessing...")
    # Build the corpora, the term dictionary, and the average document length (avdl).
    _corpora, _dictionary, _avdl = metadata_extractor()
    print("Metadata extraction completed.")
    # Dump the intermediate structures so later runs can reload them from JSON.
    dict_dump(_corpora, "corpora")
    print("Backup corpora dumped.")
    dict_dump(_dictionary, "dictionary")
    print("Backup dictionary dumped.")
    dict_dump({"avdl": _avdl}, "avdl")
    print("Backup avdl dumped.")

    # TF-IDF weighting and clustering.
    _tf_idf_dictionary = tf_idf_calculator(_corpora, _dictionary)
    print("TF-IDF values calculated.")
    dict_dump(_tf_idf_dictionary, "tf_idf")
    print("Backup TF-IDF dumped.")
    _tf_idf_clusters, _tf_idf_cluster_doc_map = compose_clusters(_tf_idf_dictionary)
    print("Clustering of documents based on TF-IDF values completed.")
    _clusters_dump_obj = {"clusters": _tf_idf_clusters, "cluster_doc_map": _tf_idf_cluster_doc_map}
    dict_dump(_clusters_dump_obj, "clusters_dump_obj_tf_idf")
    print("Backup TF-IDF Clusters dumped.")

    # BM25 weighting and clustering.
    _bm25_dictionary = bm25_calculator(_corpora, _dictionary, _avdl)
    print("BM25 values calculated.")
    dict_dump(_bm25_dictionary, "bm25")
    print("Backup BM25 dumped.")
    _bm25_clusters, _bm25_cluster_doc_map = compose_clusters(_bm25_dictionary)
    print("Clustering of documents based on BM25 values completed.")
    _clusters_dump_obj = {"clusters": _bm25_clusters, "cluster_doc_map": _bm25_cluster_doc_map}
    dict_dump(_clusters_dump_obj, "clusters_dump_obj_bm_25")
    print("Backup BM25 Clusters dumped.")

    # Analyze the odd-numbered queries and score them against both weighting schemes.
    _query_tf_idf_dicts, _query_bm25_dicts = query_analyzer(_dictionary, len(_corpora), True, _avdl)
    print("Odd-numbered queries have been analyzed.")
    _tf_idf_relevance_analysis = cos_sim_relevance_analyzer(_query_tf_idf_dicts, _tf_idf_dictionary, "tf_idf")
    _bm25_relevance_analysis = cos_sim_relevance_analyzer(_query_bm25_dicts, _bm25_dictionary, "bm25")
    _rrf_relevance_analysis = reciprocal_ranking_fusion(_tf_idf_relevance_analysis, _bm25_relevance_analysis, "rrf")
    _combined_relevance_analysis = score_combination(_tf_idf_relevance_analysis, _bm25_relevance_analysis)

    # Repeat the relevance analysis restricted to the related clusters.
    _tf_idf_clustered_relevance_analysis = evaluate_with_related_clusters(_tf_idf_clusters, _tf_idf_cluster_doc_map, _query_tf_idf_dicts, _tf_idf_dictionary, "tf_idf_with_clustering")
    _bm25_clustered_relevance_analysis = evaluate_with_related_clusters(_bm25_clusters, _bm25_cluster_doc_map, _query_bm25_dicts, _bm25_dictionary, "bm25_with_clustering")
    _rrf_clustered_relevance_analysis = reciprocal_ranking_fusion(_tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, "rrf_with_clustering")
    _combined_clustered_relevance_analysis = score_combination(_tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, "_with_clustering")
    print("Relevance analyses have been completed with the odd-numbered queries.")

    return _tf_idf_relevance_analysis, _bm25_relevance_analysis, \
        _rrf_relevance_analysis, _combined_relevance_analysis, \
        _tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, \
        _rrf_clustered_relevance_analysis, _combined_clustered_relevance_analysis
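
# A minimal usage sketch (assumption: the returned relevance-analysis objects are
# JSON-serializable dicts like the other structures dumped above), showing how the
# eight results could be persisted with the existing dict_dump helper:
#
#     results = execute_preprocessing()
#     names = ("tf_idf", "bm25", "rrf", "combined", "tf_idf_clustered",
#              "bm25_clustered", "rrf_clustered", "combined_clustered")
#     for name, analysis in zip(names, results):
#         dict_dump(analysis, name + "_relevance_analysis")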
def load_continue_preprocessing(loadCorpora=True, loadDict=True):
    print("Loading sequence initiated.")
    if not loadCorpora and not loadDict:
        # Nothing to reload: fall back to running the full preprocessing pipeline.
        _tf_idf_relevance_analysis, _bm25_relevance_analysis, \
            _rrf_relevance_analysis, _combined_relevance_analysis, \
            _tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, \
            _rrf_clustered_relevance_analysis, _combined_clustered_relevance_analysis = execute_preprocessing()
    else:
        # Reload the dumped intermediate results instead of recomputing them.
        with open("corpora.json", "r", encoding="utf-8") as fin_corpora:
            _corpora = json.load(fin_corpora)
        with open("dictionary.json", "r", encoding="utf-8") as fin_dictionary:
            _dictionary = json.load(fin_dictionary)
        with open("avdl.json", "r", encoding="utf-8") as fin_avdl:
            _avdl = json.load(fin_avdl)["avdl"]
        with open("tf_idf.json", "r", encoding="utf-8") as fin_tf_idf:
            _tf_idf_dictionary = json.load(fin_tf_idf)
        with open("bm25.json", "r", encoding="utf-8") as fin_bm25:
            _bm25_dictionary = json.load(fin_bm25)
        with open("clusters_dump_obj_tf_idf.json", "r", encoding="utf-8") as fin_tf_idf_cluster:
            tf_idf_cluster_dump_obj = json.load(fin_tf_idf_cluster)
        _tf_idf_clusters = tf_idf_cluster_dump_obj["clusters"]
        _tf_idf_cluster_doc_map = tf_idf_cluster_dump_obj["cluster_doc_map"]
        with open("clusters_dump_obj_bm_25.json", "r", encoding="utf-8") as fin_bm25_cluster:
            bm_25_cluster_dump_obj = json.load(fin_bm25_cluster)
        _bm25_clusters = bm_25_cluster_dump_obj["clusters"]
        _bm25_cluster_doc_map = bm_25_cluster_dump_obj["cluster_doc_map"]
        print("Loading sequence completed.")

        # Rerun the relevance analysis on the reloaded data.
        _query_tf_idf_dicts, _query_bm25_dicts = query_analyzer(_dictionary, len(_corpora), True, _avdl)
        _tf_idf_relevance_analysis = cos_sim_relevance_analyzer(_query_tf_idf_dicts, _tf_idf_dictionary, "tf_idf")
        _bm25_relevance_analysis = cos_sim_relevance_analyzer(_query_bm25_dicts, _bm25_dictionary, "bm25")
        _rrf_relevance_analysis = reciprocal_ranking_fusion(_tf_idf_relevance_analysis, _bm25_relevance_analysis, "rrf")
        _combined_relevance_analysis = score_combination(_tf_idf_relevance_analysis, _bm25_relevance_analysis)
        _tf_idf_clustered_relevance_analysis = evaluate_with_related_clusters(_tf_idf_clusters, _tf_idf_cluster_doc_map, _query_tf_idf_dicts, _tf_idf_dictionary, "tf_idf_with_clustering")
        _bm25_clustered_relevance_analysis = evaluate_with_related_clusters(_bm25_clusters, _bm25_cluster_doc_map, _query_bm25_dicts, _bm25_dictionary, "bm25_with_clustering")
        _rrf_clustered_relevance_analysis = reciprocal_ranking_fusion(_tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, "rrf_with_clustering")
        _combined_clustered_relevance_analysis = score_combination(_tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, "_with_clustering")

    return _tf_idf_relevance_analysis, _bm25_relevance_analysis, \
        _rrf_relevance_analysis, _combined_relevance_analysis, \
        _tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, \
        _rrf_clustered_relevance_analysis, _combined_clustered_relevance_analysis
def start_evaluation(reload=False):
    # reload=True reloads the previously dumped JSON files; reload=False recomputes everything.
    if not reload:
        _tf_idf_relevance_analysis, _bm25_relevance_analysis, \
            _rrf_relevance_analysis, _combined_relevance_analysis, \
            _tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, \
            _rrf_clustered_relevance_analysis, _combined_clustered_relevance_analysis = execute_preprocessing()
    else:
        _tf_idf_relevance_analysis, _bm25_relevance_analysis, \
            _rrf_relevance_analysis, _combined_relevance_analysis, \
            _tf_idf_clustered_relevance_analysis, _bm25_clustered_relevance_analysis, \
            _rrf_clustered_relevance_analysis, _combined_clustered_relevance_analysis = load_continue_preprocessing()
    print("Relevance analysis completed.")
if __name__ == "__main__":
    '''
    If the preprocessing outputs already exist in the current directory
    (avdl.json, bm25.json, tf_idf.json, corpora.json, dictionary.json,
    clusters_dump_obj_bm_25.json, clusters_dump_obj_tf_idf.json),
    you can run with reload=True instead.
    '''
    start_evaluation(reload=False)
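    # Example (assuming the JSON dumps listed above already exist in the working directory):
    # start_evaluation(reload=True)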