-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping.py
94 lines (72 loc) · 2.86 KB
/
scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException, ElementClickInterceptedException, WebDriverException
from selenium.webdriver.support.ui import Select, WebDriverWait # available since 2.4.0
from pathlib import Path
from urllib.parse import urljoin
from lxml.html import fromstring
import time
import json
from pymongo import MongoClient
from pathlib import Path
import json
# Obtain the Adidas shoe-listing markup as `page`.
#
# First run: drive Firefox to the listing, scroll down step by step so the
# lazily loaded product cards are rendered, grab the page source and cache it
# in 'adidas.html'. Later runs: reuse the cached copy and skip the browser.
# NOTE: despite its extension, the cache file holds the markup as a single
# JSON-encoded string (written with json.dump, read back with json.load).
if not Path('adidas.html').exists():
    driver_path = r'C:\dmozilla\geckodriver.exe'
    ser = Service(driver_path)
    opt = webdriver.FirefoxOptions()
    # Uncomment to run without showing the browser window.
    # opt.add_argument('-headless')
    # Create a new instance of the Firefox driver
    driver = webdriver.Firefox(service=ser, options=opt)
    try:
        print('Downloading adidas.html')
        # go to the Adidas website
        driver.get("https://www.adidas.com/us/shoes?grid=true")
        SCROLL_PAUSE_TIME = 3  # seconds to wait after each scroll step
        SCROLL_STEP = 500      # pixels scrolled per step
        START = 0              # vertical offset reached so far
        while True:
            START += SCROLL_STEP
            # Scroll vertically only: scrollTo takes (x, y), so x stays 0.
            driver.execute_script(f"window.scrollTo(0, {START});")
            # Wait for the newly revealed content to load
            time.sleep(SCROLL_PAUSE_TIME)
            # Re-read the document height on every step because it can grow
            # as more content is loaded; stop once the offset has passed it.
            last_height = driver.execute_script("return document.body.scrollHeight")
            if START >= last_height:
                break
        page = driver.page_source
        print('Downloading adidas.html completed, closing driver')
    finally:
        # Always release the browser, including when navigation or scrolling
        # raises; the error then propagates instead of leaving `page` unset.
        driver.quit()
    try:
        with open('adidas.html', 'w', encoding='utf-8') as f:
            json.dump(page, f)
    except OSError as e:
        # A failed cache write is not fatal: `page` is still held in memory.
        print(e)
else:
    with open('adidas.html', 'r', encoding='utf-8') as f:
        page = json.load(f)
# Collect the product images from the listing markup.
# `dataImg` becomes a list of single-entry dicts mapping the image file name
# (last path segment of the URL) to the full image URL.
# NOTE(review): the keys are file names and therefore contain '.', which some
# MongoDB server versions restrict in field names - confirm against the
# target server.
dataImg = []
tree = fromstring(page)
imgs = tree.xpath('//div[@class="glass-product-card__assets"]//a/img')
for img in imgs:
    # .get() returns None for an <img> without a src attribute, so such
    # elements are skipped instead of raising KeyError.
    src = img.get('src')
    if src and src.endswith('.jpg'):
        dataImg.append({src.split('/')[-1]: src})
# save to MongoDB
# Insert every collected image record into adidas.datapage on the local
# server. Failures are reported on stdout rather than aborting the script.
if dataImg:
    try:
        mongo_uri = "mongodb://localhost"
        # The context manager closes the client's connections on exit,
        # whether or not the insert succeeds.
        with MongoClient(mongo_uri) as client:
            db = client['adidas']
            collection = db['datapage']
            collection.insert_many(dataImg)
        print('Inserted to MongoDB')
    except Exception as e:
        print(e)
else:
    # insert_many rejects an empty document list, so skip the call and say why.
    print('No images found, nothing to insert')