googleimagescollector.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json # to scrape Google ajax responses
import lxml.cssselect # to fetch data from HTML by means of CSS selectors
import lxml.html # to fetch data from HTML by means of CSS selectors
import os # to operate with files and directories
import random # to randomly select User-Agent header value
import requests # to perform specific HTTP requests
import shutil # to perform file downloading
import sys # to get command line arguments
import urllib.parse # to quote URL parameters
class GoogleImagesCollector(object):

    # set of image urls that failed to download
    bad_urls = None

    # str template for console logging
    BASE_LOG_LINE = None

    # base for ajax Google requests
    BASE_URL = \
        'https://www.google.com.ua/search?async=_id:rg_s,_pms:qs&q={query}&start=0&asearch=ichunk&tbm=isch'

    # base for ajax Google requests with the tbs parameter
    BASE_TBS_URL = \
        'https://www.google.com.ua/search?async=_id:rg_s,_pms:qs&q={query}&start=0&asearch=ichunk&tbm=isch&tbs={tbs}'

    # the same bases with the query wrapped in quotes for exact matching
    BASE_EXACT_URL = \
        'https://www.google.com.ua/search?async=_id:rg_s,_pms:qs&q="{query}"&start=0&asearch=ichunk&tbm=isch'

    BASE_EXACT_TBS_URL = \
        'https://www.google.com.ua/search?async=_id:rg_s,_pms:qs&q="{query}"&start=0&asearch=ichunk&tbm=isch&tbs={tbs}'

    # directory name to store images
    directory = None

    # set of image urls successfully downloaded
    downloaded_urls = None

    # dict of allowed MIME types and corresponding file extensions
    MIME = {
        'image/gif': 'gif',
        'image/jpeg': 'jpg',
        'image/pjpeg': 'jpg',
        'image/png': 'png',
        'image/svg+xml': 'svg',
        'image/tiff': 'tiff',
        'image/vnd.microsoft.icon': 'ico',
        'image/vnd.wap.wbmp': 'wbmp',
        'image/webp': 'webp',
    }

    # search query
    query = None

    # requests session
    session = None

    # list of possible values of the tbs parameter
    TBS = [
        'itp:photo',     # photos
        'itp:face',      # faces
        'itp:clipart',   # clip art
        'itp:lineart',   # line drawings
        'qdr:d',         # indexed in the past day
        'qdr:w',         # indexed in the past week
        'itp:animated',  # animated images
        'ic:color',      # full color
        'ic:gray',       # grayscale
        'ic:trans',      # transparent background
    ]
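
    # For illustration (values substituted by hand, not part of the original
    # script), BASE_TBS_URL.format(query='cat', tbs='itp:photo') yields a
    # photo-only image search request:
    #   https://www.google.com.ua/search?async=_id:rg_s,_pms:qs&q=cat&start=0&asearch=ichunk&tbm=isch&tbs=itp:photo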

    def __init__(self, query, directory=None):
        """
        Initializes an object with a query and a directory name
        """
        self.query = query
        # if a directory name is not provided, the directory is named after the query
        self.directory = directory or os.path.join('images', query.replace('/', '&'))
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
        self.BASE_LOG_LINE = query + ': {url}'
        self.session = requests.Session()

    def collect(self, imagenum=500):
        """
        Collects up to {imagenum} images
        into the {self.directory} directory
        """
        if len(os.listdir(self.directory)) >= imagenum:
            print('{query}: there are already enough images.'.format(query=self.query))
            return
        query = encodeURIComponent(self.query)
        self.bad_urls = set()
        self.downloaded_urls = set()
        # the first url doesn't use the tbs parameter
        page_urls = [GoogleImagesCollector.BASE_EXACT_URL.format(query=query)]
        # let's add urls with the different possible values of the tbs parameter
        page_urls.extend([GoogleImagesCollector.BASE_EXACT_TBS_URL.format(query=query, tbs=tbs)
                          for tbs in GoogleImagesCollector.TBS])
        # page_urls.append(GoogleImagesCollector.BASE_URL.format(query=query))
        # page_urls.extend([GoogleImagesCollector.BASE_TBS_URL.format(query=query, tbs=tbs)
        #                   for tbs in GoogleImagesCollector.TBS])
        # let's process urls until the job is done or there are no urls left
        for page_url in page_urls:
            print('\n')
            print(page_url)
            # let's fetch a page
            page = self.fetch_page(page_url)
            # let's process urls from this page
            for img_url in get_img_urls_from_page(page):
                if img_url not in self.downloaded_urls and img_url not in self.bad_urls:
                    # the url is new
                    if self.download_image(img_url):
                        # the download succeeded:
                        # let's memoize the url as a good one
                        self.downloaded_urls.add(img_url)
                    else:
                        # the download failed:
                        # let's memoize the url as a bad one
                        self.bad_urls.add(img_url)
                if len(self.downloaded_urls) >= imagenum:
                    # the job is done, let's finish
                    print('{success} images downloaded.'.format(success=len(self.downloaded_urls)))
                    return
        # we failed to fetch enough images
        print("I'm sorry! I've done my best: {imagenum} images.".format(imagenum=len(self.downloaded_urls)))

    def download_image(self, url, timeout=10):
        """
        Downloads an image by url
        """
        self.log(url)
        try:
            # let's request the file in stream mode
            response = self.session.get(url, stream=True, timeout=timeout)
        except Exception:
            print('Download failed because of a connection error.')
            return False
        if response.status_code == 200:
            try:
                # let's check that the file's MIME type is allowed and
                # get the corresponding file extension
                file_extension = \
                    GoogleImagesCollector.MIME[response.headers['content-type']]
            except KeyError:
                # Content-Type is missing or the MIME type is disallowed
                print('Download was cancelled because of an unexpected content type.')
                return False
            file_path = os.path.join(self.directory,
                                     get_filename(url, file_extension))
            try:
                # let's check whether the file is already downloaded,
                # comparing by the number of bytes
                if os.path.isfile(file_path) \
                        and os.stat(file_path).st_size \
                        == int(response.headers['content-length']):
                    print('This file is already present.')
                    return True
            except KeyError:
                # Content-Length is not provided,
                # so we should try to download the file again
                pass
            # let's save the image
            with open(file_path, 'wb') as file:
                try:
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, file)
                except Exception:
                    print('Download was interrupted due to a connection error.')
                    return False
            return True
        else:
            # the status code is not 200
            print('Download failed for unknown reasons.')
            return False

    def fetch_page(self, url):
        """
        Fetches a Google ajax page by url
        """
        return lxml.html.fromstring(
            json.loads(self.session.get(url, headers={'User-Agent': get_ua()})
                       .content.decode('utf-8'))[1][1])
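
    # A sketch of the payload shape fetch_page relies on (illustrative, not a
    # verbatim Google response): json.loads(...) is expected to return a list
    # whose element [1][1] is an HTML fragment containing .rg_meta elements,
    # which get_img_urls_from_page() then parses.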

    def log(self, url):
        """
        Logs an url
        """
        print(self.BASE_LOG_LINE.format(url=url))

def encodeURIComponent(input_str, quotate=urllib.parse.quote):
    """
    Python equivalent of JavaScript's encodeURIComponent
    """
    return quotate(input_str.encode('utf-8'), safe='~()*!.\'')
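
# For example (illustrative value, not from the original script):
#   encodeURIComponent('red panda & friends') == 'red%20panda%20%26%20friends'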

def get_filename(url, file_extension):
    """
    Transforms an url into a filename
    """
    # let's remove the protocol name from the url
    filename = url[url.index('//') + 2:]
    # if the url ends with an extension (a dot within the last 5 characters),
    # let's cut the extension off; otherwise keep the name as is
    last_dot_index = filename.find('.', len(filename) - 5)
    if last_dot_index != -1:
        filename = filename[:last_dot_index]
    # let's normalize the filename
    filename = slugify(filename)
    return '{filename}.{fileextension}'.format(filename=filename,
                                               fileextension=file_extension)
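
# For example (hypothetical url, for illustration only):
#   get_filename('https://example.com/img/cat.jpg', 'jpg') == 'example_com_img_cat.jpg'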

def get_img_url_from_meta(meta):
    """
    Returns an image's url from a meta JSON blob
    """
    return json.loads(meta)['ou']
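
# Each .rg_meta element carries a JSON blob in which the 'ou' key holds the
# original image url; an illustrative (not verbatim) example:
#   {"ou": "https://example.com/img/cat.jpg", ...}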

def get_img_urls_from_page(page,
                           META_SELECTOR=lxml.cssselect.CSSSelector('.rg_meta')):
    """
    Returns images' urls from a page
    """
    return [get_img_url_from_meta(meta_elem.text_content())
            for meta_elem in META_SELECTOR(page)]

def get_ua(ua_list=[
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
]):
    """
    Provides a pseudo-random User-Agent from a saved list
    """
    return random.choice(ua_list)

def slugify(value, keepcharacters=('_',)):
    """
    Replaces non-alphanumeric characters with underscores
    and truncates the result to 128 characters
    """
    return ''.join((c if c.isalnum() or c in keepcharacters else '_')
                   for c in value).rstrip()[:128]
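
# For example (illustrative input):
#   slugify('example.com/img/cat') == 'example_com_img_cat'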

if __name__ == '__main__':
    # the module is being executed as a standalone script
    if len(sys.argv) < 2:
        print('Provide some search query!')
    elif len(sys.argv) < 3:
        # no imagenum is provided, so default to 100
        GoogleImagesCollector(sys.argv[1]).collect(imagenum=100)
    else:
        GoogleImagesCollector(sys.argv[1]).collect(imagenum=int(sys.argv[2]))
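
# Example invocations (queries and counts are illustrative):
#   $ python3 googleimagescollector.py 'red pandas'        # up to 100 images
#   $ python3 googleimagescollector.py 'red pandas' 250    # up to 250 images
# Programmatic use (a sketch under the same assumptions):
#   GoogleImagesCollector('red pandas', directory='pandas').collect(imagenum=50)
# Images land in images/<query>/ unless a directory is passed explicitly.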