summarizer.py
# -*- coding: utf-8 -*-
from __future__ import division
import re
import sys
import numpy as np
from goose import Goose
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# the article title and its cleaned (stemmed, stopword-free) word set,
# shared across the scoring functions
title = ''
titleWords = set()
# split a block of text into sentences
def splitToSentences(content):
    return sent_tokenize(content)
# split text into paragraphs, dropping empty lines
def splitToParagraphs(content):
    return [c for c in content.split("\n") if len(c) != 0]
# score the overlap between two cleaned sentences, counting only words
# that also appear in the cleaned article title
def getIntersection(s1, s2):
    s1 = set(s1)
    s2 = set(s2)
    # empty sentences score 0 (this also avoids a division by zero)
    if (len(s1) + len(s2)) == 0:
        return 0
    # normalized overlap: shared title words over the average sentence length
    return len(s1 & s2 & titleWords) / ((len(s1) + len(s2)) / 2)
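# Worked example (illustrative, assumed toy inputs): with cleaned sentences
# s1 = ['cat', 'sat', 'mat'] and s2 = ['cat', 'mat', 'nap'], and
# titleWords = {'cat', 'mat'}, the shared title words are {'cat', 'mat'},
# giving a score of 2 / ((3 + 3) / 2) = 2 / 3.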
# derive a dictionary key from a sentence by stripping non-word characters
def sentenceKey(sentence):
    return re.sub(r'\W+', '', sentence)
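# Illustrative example: sentenceKey("Hello, world!") returns "Helloworld",
# so the same sentence always maps to the same dictionary key regardless
# of punctuation or spacing.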
# tokenize a sentence, remove English stop words, and stem what remains
def stemAndRemoveStopWords(sentence, stemmer):
    s = word_tokenize(sentence)
    s = [w for w in s if w not in stopwords.words('english')]
    return [stemmer.stem(w) for w in s]
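# Illustrative example (assumed toy input):
#   stemAndRemoveStopWords("the cats sat on mats", PorterStemmer())
# tokenizes to ['the', 'cats', 'sat', 'on', 'mats'], drops the stop words
# 'the' and 'on', and stems the rest to ['cat', 'sat', 'mat'].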
# score every sentence in the text and derive a selection threshold
def rankSentences(content):
    global titleWords
    # collect all sentences, paragraph by paragraph
    paragraphs = splitToParagraphs(content)
    sentences = []
    for p in paragraphs:
        sentences.extend(splitToSentences(p))
    n = len(sentences)
    # stem and remove stop words
    stemmer = PorterStemmer()
    # stemmer = SnowballStemmer("english")
    cleanedSentences = [stemAndRemoveStopWords(x, stemmer) for x in sentences]
    # clean the title the same way so it is comparable with the sentences
    titleWords = set(stemAndRemoveStopWords(title, stemmer))
    # build the sentence similarity matrix
    values = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            values[i][j] = getIntersection(cleanedSentences[i], cleanedSentences[j])
    # zero the diagonal so a sentence does not score against itself
    values = values * (np.ones((n, n)) - np.eye(n))
    # a sentence's score is the sum of its similarities to all other sentences
    score = np.sum(values, axis=1)
    sentenceDictionary = {sentenceKey(sentences[i]): score[i] for i in range(n)}
    # keep only sentences scoring noticeably above the mean
    thresholdScore = np.mean(score) + 0.3 * np.std(score)
    return sentenceDictionary, thresholdScore
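# Illustrative example (assumed scores): for sentence scores
# [0.1, 0.4, 0.9, 0.2], the mean is 0.4 and the standard deviation is
# about 0.31, so the threshold is 0.4 + 0.3 * 0.31 ≈ 0.49 and only the
# sentence scoring 0.9 would make the summary.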
# collect every above-threshold sentence from a paragraph
def getBestSentences(paragraph, sentenceDictionary, thresholdScore):
    sentences = splitToSentences(paragraph)
    bestSentence = ""
    # look up each sentence's score and keep it if it clears the threshold
    for s in sentences:
        key = sentenceKey(s)
        if key and sentenceDictionary.get(key, 0) >= thresholdScore:
            bestSentence += " " + s
    # alternative: keep only the single highest-scoring sentence
    # maxScore = 0
    # for s in sentences:
    #     key = sentenceKey(s)
    #     if key and sentenceDictionary.get(key, 0) > maxScore:
    #         maxScore = sentenceDictionary.get(key, 0)
    #         bestSentence = s
    return bestSentence
# assemble the summary: the title followed by the best sentences per paragraph
def summarize(content, sentenceDictionary, thresholdScore):
    global title
    paragraphs = splitToParagraphs(content)
    summary = []
    if title:
        summary.append(title.strip())
        summary.append("")
    for p in paragraphs:
        sentence = getBestSentences(p, sentenceDictionary, thresholdScore).strip()
        if sentence:
            summary.append(sentence)
    return "\n".join(summary)
# rank the sentences, then build the summary
def doSummary(content):
    sentenceDictionary, thresholdScore = rankSentences(content)
    return summarize(content, sentenceDictionary, thresholdScore)
# use the Goose library to extract the title and main text of a URL
def getContent(url):
    g = Goose()
    article = g.extract(url=url)
    return article.title, article.cleaned_text
# fetch a URL's content and return a summarized version of it
def parseWebpage(url):
    global title
    # strip bracketed references (e.g. "[1]") and list-numbering artifacts
    pat = [r'\[(.*?)\]', r'[\n\s].*([0-9]+[.][^0-9])', r'[\n\s].*([^a-zA-Z][a-z][/)])']
    # reps = [('\xe2\x80\x93', '-')]
    title, content = getContent(url)
    for pattern in pat:
        content = re.sub(pattern, "", content)
    # for rep in reps:
    #     content = re.sub(rep[0], rep[1], content)
    print "Original text size: %d characters" % len(content)
    return doSummary(content)
if __name__ == '__main__':
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        # url = "https://medium.com/war-is-boring/one-easy-way-to-blow-7-6-billion-10b0933124f5"
        url = "http://en.wikipedia.org/wiki/Github"
    summary = parseWebpage(url)
    print "Summary size: %d characters\n" % len(summary)
    print summary.encode('UTF-8')
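# Usage (assumes Python 2 with goose, nltk, and numpy installed, and the
# NLTK 'punkt' and 'stopwords' data downloaded):
#
#   python summarizer.py http://en.wikipedia.org/wiki/Github
#
# or, from another script (hypothetical snippet):
#
#   from summarizer import parseWebpage
#   print parseWebpage("http://example.com/article").encode('UTF-8')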