summarize_textrank.py
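"""Generate extractive TextRank summaries for every paper in dataset.csv and
score each summary against the paper's reference abstract with BLEU and
ROUGE (1, 2, and L), on both stopword-filtered and raw text.

Environment note (an assumption about the setup, not stated in the original
file): gensim.summarization was removed in gensim 4.0, so this script needs
gensim < 4.0, and NLTK's stopword list must be downloaded once:

    import nltk
    nltk.download('stopwords')
"""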
from gensim.summarization.summarizer import summarize  # requires gensim < 4.0
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import pandas as pd

# Build these once instead of re-creating them for every word/paper.
STOPWORDS = set(stopwords.words('english'))
TOKENIZER = RegexpTokenizer(r'\w+')


def clean_tokens(text):
    """Tokenize on word characters, then drop English stopwords.
    Tokenizing first ensures stopwords with attached punctuation
    (e.g. "the,") are caught as well."""
    return [word for word in TOKENIZER.tokenize(text)
            if word.lower() not in STOPWORDS]


def summarize_doc(content, len_words):
    """Run gensim's TextRank summarizer and return the summary text
    together with its tokenized, stopword-filtered words."""
    summarized = summarize(content, word_count=len_words)
    return summarized, clean_tokens(summarized)

dataset = pd.read_csv('./dataset.csv')
textrank_res = pd.DataFrame(dataset['id'])
# One column per metric, in filtered and raw ("unfilter") variants.
score_cols = ['BLEU', 'ROUGE1_f', 'ROUGE1_p', 'ROUGE2_f', 'ROUGE2_p',
              'ROUGEl_f', 'ROUGEl_p']
for col in score_cols:
    textrank_res[col] = None
    textrank_res[col + '_unfilter'] = None
textrank_res['Summary'] = None

for index, paper in dataset.iterrows():
    try:
        content = paper['text']
        # Skip papers whose body is implausibly short relative to the abstract.
        if len(content) < len(paper['abstract']) * 3:
            print("Too small text for paper", paper['id'], index)
            raise ValueError
        # Target the summary length at the abstract's word count.
        num_words = len(paper['abstract'].split(' '))
        sum_text, filtered_words = summarize_doc(content, num_words)
        # Clean the reference abstract the same way as the summary.
        abstract = paper['abstract'].split()
        filtered_abstract = clean_tokens(paper['abstract'])
        # sentence_bleu takes a list of reference token lists, then the hypothesis.
        bleu_score = sentence_bleu([filtered_abstract], filtered_words)
        rouge = Rouge()
        rouge_score = rouge.get_scores(' '.join(filtered_words), ' '.join(filtered_abstract))
        # Scores on the stopword-filtered text.
        textrank_res.loc[index, 'Summary'] = sum_text
        textrank_res.loc[index, 'BLEU'] = bleu_score
        textrank_res.loc[index, 'ROUGE1_f'] = rouge_score[0]['rouge-1']['f']
        textrank_res.loc[index, 'ROUGE1_p'] = rouge_score[0]['rouge-1']['p']
        textrank_res.loc[index, 'ROUGE2_f'] = rouge_score[0]['rouge-2']['f']
        textrank_res.loc[index, 'ROUGE2_p'] = rouge_score[0]['rouge-2']['p']
        textrank_res.loc[index, 'ROUGEl_f'] = rouge_score[0]['rouge-l']['f']
        textrank_res.loc[index, 'ROUGEl_p'] = rouge_score[0]['rouge-l']['p']
        # Scores on the uncleaned text (reference first, hypothesis second).
        bleu_score_unfilter = sentence_bleu([abstract], sum_text.split(' '))
        rouge_score_unfilter = rouge.get_scores(sum_text, paper['abstract'])
        textrank_res.loc[index, 'BLEU_unfilter'] = bleu_score_unfilter
        textrank_res.loc[index, 'ROUGE1_f_unfilter'] = rouge_score_unfilter[0]['rouge-1']['f']
        textrank_res.loc[index, 'ROUGE1_p_unfilter'] = rouge_score_unfilter[0]['rouge-1']['p']
        textrank_res.loc[index, 'ROUGE2_f_unfilter'] = rouge_score_unfilter[0]['rouge-2']['f']
        textrank_res.loc[index, 'ROUGE2_p_unfilter'] = rouge_score_unfilter[0]['rouge-2']['p']
        textrank_res.loc[index, 'ROUGEl_f_unfilter'] = rouge_score_unfilter[0]['rouge-l']['f']
        textrank_res.loc[index, 'ROUGEl_p_unfilter'] = rouge_score_unfilter[0]['rouge-l']['p']
        print("Iteration:", index)
    except Exception:
        # Skip papers the summarizer cannot handle (e.g. too few sentences).
        pass

print(textrank_res.head(5))
textrank_res.to_csv('textrank_scores.csv', index=False)
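
# A quick way to eyeball aggregate scores afterwards (a minimal sketch; it
# assumes the column names written above and the CSV this script just produced):
#
#   scores = pd.read_csv('textrank_scores.csv')
#   print(scores[['BLEU', 'ROUGE1_f', 'ROUGE2_f', 'ROUGEl_f']].mean())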