scrape_finviz_articles.py
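
"""Scrape the finviz news page (https://finviz.com/news.ashx?v=2) for article
links from supported publishers and save each article's metadata and body text
to JSON files under Publishers/<publisher name>/."""
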
import json
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from scrape_finviz_article import ScraperReuters, ScraperMarketWatch


def get_driver(url):
    """Return a Selenium Chrome webdriver with the given url already loaded."""
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--log-level=3')
    chrome_options.add_argument('--enable-javascript')
    chrome_options.add_argument('--disable-infobars')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-gpu')
    serv = Service(os.path.join(os.getcwd(), 'chromedriver'))
    driver = webdriver.Chrome(options=chrome_options, service=serv)
    driver.get(url)
    return driver
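
# Note (a suggestion, not part of the original configuration): on a machine
# without a display, Chrome would additionally need
# chrome_options.add_argument('--headless=new'), or an X server such as Xvfb.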


if __name__ == "__main__":
    # Maps publisher names as they appear in the finviz HTML to the
    # corresponding Scraper class defined in scrape_finviz_article.
    publishers = {
        "Reuters": ScraperReuters,
        "MarketWatch": ScraperMarketWatch,
    }
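    # Each Scraper class is expected to take url/title/date keyword arguments,
    # expose a getBody() method returning the article text, and keep its own
    # Selenium instance in a .driver attribute (all used in the loops below).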
    driver = get_driver('https://finviz.com/news.ashx?v=2')
    # Variables prefixed with "finviz" hold live HTML DOM elements. Each element
    # in this list is the heading link for one publisher's block of articles.
    finviz_publishers = driver.find_elements(By.XPATH, value='//a[@class="nn-title-link"]')
    # All article metadata is collected before any scraping is done: leaving the
    # finviz page open too long causes it to refresh, which invalidates the DOM
    # element references.
    articles = []
    for finviz_publisher in finviz_publishers:
        file_count = 0
        publisher_name = finviz_publisher.text
        # Only scrape publishers that have a corresponding Scraper class defined.
        if publisher_name in publishers:
            publisher_dirpath = os.path.join(os.getcwd(), "Publishers", publisher_name)
            if not os.path.exists(publisher_dirpath):
                os.makedirs(publisher_dirpath)
            # Collect each article's metadata as a tuple in the articles list.
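            # The XPath below walks up six ancestors from the heading link to
            # its enclosing <tr>, then selects the following <tr class="nn">
            # siblings. Assuming finviz keeps each publisher's articles in their
            # own table, those siblings are exactly that publisher's article rows.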
            finviz_articles = finviz_publisher.find_elements(
                By.XPATH, value='./../../../../../../following-sibling::tr[@class="nn"]')
            for finviz_article in finviz_articles:
                finviz_article_link = finviz_article.find_element(By.XPATH, value='./descendant::a')
                finviz_article_date = finviz_article.find_element(By.XPATH, value='./descendant::td[@class="nn-date"]')
                article_title = finviz_article_link.text
                article_date = finviz_article_date.text
                article_link = finviz_article_link.get_attribute('href')
                article_filename = f"{publisher_name}-{file_count}.json"
                file_count += 1
                article_filepath = os.path.join(publisher_dirpath, article_filename)
                articles.append((article_title, article_date, article_link,
                                 article_filename, article_filepath, publisher_name))
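
    # Suggested addition (not in the original): once every value has been copied
    # out of the DOM, the finviz page itself is no longer needed, so its browser
    # can be closed before the per-article scrapers open their own.
    driver.quit()
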
    # Visit each article and write a JSON file holding its metadata and text.
    for article in articles:
        article_title, article_date, article_link, article_filename, article_filepath, publisher_name = article
        print("Loading link: " + article_link)
        article_scraper = publishers[publisher_name](url=article_link, title=article_title, date=article_date)
        with open(article_filepath, "w") as f:
            article_json = {
                "title": article_title,
                "total_text": article_scraper.getBody(),
                "summary": "summary to be filled here",
                "key": "Finviz_" + publisher_name,
                "date": article_date,
                "url": article_link,
            }
            json.dump(article_json, f)
        # Close the per-article Selenium instance so browsers don't accumulate.
        article_scraper.driver.close()
        del article_scraper
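
    # Resulting layout: Publishers/<publisher>/<publisher>-<n>.json, e.g.
    # Publishers/Reuters/Reuters-0.json with contents of the form
    # {"title": ..., "total_text": ..., "summary": "summary to be filled here",
    #  "key": "Finviz_Reuters", "date": ..., "url": ...}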