-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
70 lines (49 loc) · 1.57 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup
import requests
def ScrapeURL(text):
    """Build a Google search URL restricted to stackoverflow.com.

    Args:
        text: Raw (not yet percent-encoded) search query.

    Returns:
        The full search URL as a string. The query is percent-encoded so
        that spaces and reserved characters cannot corrupt the URL.
    """
    # Function-scope import keeps the file's top-level imports untouched.
    from urllib.parse import quote_plus

    url = "https://google.com/search?q="
    # quote_plus turns spaces into '+' and escapes '&', '#', '+', etc.,
    # which would otherwise terminate or alter the q= parameter.
    result = url + quote_plus(text) + "+site:stackoverflow.com"
    # Trace the URL that is actually requested, not just the constant base.
    print(result)
    return result
def scrape_page(url):
    """Search for a query and scrape the first Stack Overflow question found.

    Args:
        url: Despite its name, this is the search text; it is handed to
            ScrapeURL to build the search-engine request.

    Returns:
        The value returned by answer_scrape for the first link that points
        at a Stack Overflow question, or an empty list when the search
        page contains no such link.
    """
    # Function-scope import keeps the file's top-level imports untouched.
    from urllib.parse import parse_qs, urlsplit

    question_prefix = "https://stackoverflow.com/questions/"

    response = requests.get(ScrapeURL(url))
    soup = BeautifulSoup(response.text, "html.parser")

    questions = []
    for link in soup.find_all("a", href=True):
        href = link.get("href")
        # Result links may be wrapped as "/url?q=<target>&...". Extract the
        # q parameter properly: str.strip() would remove a *set of
        # characters* from both ends and can eat the tail of the target.
        if href.startswith("/url?"):
            target = parse_qs(urlsplit(href).query).get("q")
            if target:
                href = target[0]
        if href.startswith(question_prefix):
            questions.append(href)

    # No matching result: report "nothing found" instead of failing on [0].
    if not questions:
        return []

    value = answer_scrape(questions[0])
    print(value)
    return value
def answer_scrape(url):
    """Fetch a question page and return the code blocks of its best answer.

    The element with class "accepted-answer" is used when present.
    Otherwise the "answer" element with the highest vote count is chosen;
    ties, or unreadable vote counts, fall back to the first answer on the
    page.

    Args:
        url: Address of the question page to download.

    Returns:
        A list of strings, one per <pre> element of the chosen answer,
        each serialized to HTML and passed through split(). The list is
        empty when the page has no answers or the answer has no <pre>.
    """
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    chosen = soup.find("div", class_="accepted-answer")
    if chosen is None:
        candidates = soup.find_all("div", class_="answer")
        if not candidates:
            return []
        # max() keeps the earliest element among equal scores, so the
        # first answer wins whenever votes tie or cannot be read.
        chosen = max(candidates, key=_answer_votes)

    return [split(str(block)) for block in chosen.find_all("pre")]


def _answer_votes(answer):
    """Return the vote count shown inside an answer element, or 0."""
    # ".js-vote-count" is a class selector; a bare "js-vote-count" would
    # look for a <js-vote-count> tag and never match.
    counter = answer.select_one(".js-vote-count")
    if counter is None:
        return 0
    # NOTE(review): assumes the count is exposed as a data-value attribute
    # or as plain integer text - confirm against the live page markup.
    raw = counter.get("data-value") or counter.get_text(strip=True)
    try:
        return int(raw)
    except (TypeError, ValueError):
        return 0
def split(n):
    """Convert line breaks to ``<br>`` tags and tabs to single spaces.

    Both the real control characters and their two-character escaped
    spellings (a backslash followed by ``n`` or ``t``) are replaced.

    Args:
        n: Text to convert.

    Returns:
        The converted text.
    """
    # Applied in order; escaped spellings are handled before the real
    # control characters, matching the sequence callers already rely on.
    substitutions = (
        ('\\n', '<br>'),
        ('\n', '<br>'),
        ('\\t', ' '),
        ('\t', ' '),
    )
    converted = n
    for needle, replacement in substitutions:
        converted = converted.replace(needle, replacement)
    return converted