amazon_reviews_extraction.py
# Import packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
# Headers to make the request look like it comes from a browser
headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

# URL of the Amazon review page
reviews_url = 'https://www.amazon.com/Legendary-Whitetails-Journeyman-Jacket-Tarmac/product-reviews/B013KW38RQ/'

# Number of review pages to scrape
len_page = 4
# ----- Functions -----
# Extract each Amazon review page as a parsed HTML object
def reviewsHtml(url, len_page):

    # Empty list to store the parsed HTML of every page
    soups = []

    # Loop over each page number from 1 to len_page
    for page_no in range(1, len_page + 1):

        # Query parameters for the request, including the page number
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }

        # Make the request for each page; params must be passed here,
        # otherwise every request returns the same first page
        response = requests.get(url, headers=headers, params=params)

        # Parse the HTML with BeautifulSoup using the lxml parser
        soup = BeautifulSoup(response.text, 'lxml')

        # Add this page's parsed HTML to the master list
        soups.append(soup)

    return soups
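
# Note (a hypothetical addition, not part of the original script): Amazon may
# throttle or CAPTCHA-block rapid scraping. One hedged option is to pause
# between page requests inside the loop above, using only the standard library:
#
#   import time
#   ...
#   response = requests.get(url, headers=headers, params=params)
#   time.sleep(2)  # wait ~2 seconds before fetching the next page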
# Extract the reviewer name, star rating, title, date, and description from HTML
def getReviews(html_data):

    # Empty list to hold each review's data
    data_dicts = []

    # Select every review container using a CSS selector
    boxes = html_data.select('div[data-hook="review"]')

    # Iterate over each review container
    for box in boxes:

        # Select each field with a CSS selector and clean the text with strip();
        # if a field is missing, fall back to 'N/A'.
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception:
            stars = 'N/A'

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception:
            title = 'N/A'

        try:
            # Convert the date string to dd/mm/yyyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception:
            description = 'N/A'

        # Create a dictionary with all of this review's data
        data_dict = {
            'Name': name,
            'Stars': stars,
            'Title': title,
            'Date': date,
            'Description': description
        }

        # Append the dictionary to the master list
        data_dicts.append(data_dict)

    return data_dicts
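
# The repeated try/except blocks above could be folded into one small helper.
# A hedged refactoring sketch (safe_select is a hypothetical name, not part of
# the original script):
#
#   def safe_select(box, selector, default='N/A'):
#       tag = box.select_one(selector)
#       return tag.text.strip() if tag else default
#
#   # Usage inside the loop, e.g.:
#   # name = safe_select(box, '[class="a-profile-name"]')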
# ----- Data Process -----
# Grab the parsed HTML of all pages
html_datas = reviewsHtml(reviews_url, len_page)

# Empty list to hold all review data
reviews = []

# Iterate over each parsed page
for html_data in html_datas:

    # Extract the review data from this page
    review = getReviews(html_data)

    # Add this page's reviews to the master list
    reviews += review

# Create a DataFrame from the review data
df_reviews = pd.DataFrame(reviews)
print(df_reviews)

# Save the data to a CSV file
df_reviews.to_csv('reviews.csv', index=False)
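
# A hedged follow-up sketch (not in the original script): the Stars column is
# parsed as text, so converting it to a numeric dtype makes aggregation easier.
#
#   df_reviews['Stars'] = pd.to_numeric(df_reviews['Stars'], errors='coerce')
#   print(df_reviews['Stars'].mean())  # average rating of the scraped reviews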