scraper.py
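"""Google Image Dataset Scraper.

Queries Google Image search for a term, collects the original image urls
embedded in the result page, optionally caches them to a pickle file, and
downloads each image into downloads/<query>/.
"""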
from bs4 import BeautifulSoup
import argparse, requests, json, pickle, os, sys, urllib.parse
class downloader:
    def __init__(self, search_term, verbose_mode):
        self.search_term = search_term
        self.verbose_mode = verbose_mode
        self.downloadurls = []
    def get_urls(self, cache=True):
        if self.search_term is None:
            print("The search term is empty. If you have a cache pickle file, use the -p argument")
            return
        url = 'https://www.google.com/search?tbm=isch&q='
        search_string = self.search_term.replace(' ', '+')
        search_url = url + urllib.parse.quote(search_string, safe='').replace('-', '%2D')
        # Send a desktop User-Agent so Google serves the full results markup
        response = requests.get(search_url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15'})
        text_soup = BeautifulSoup(response.content, "html.parser")
        # Each rg_meta div holds a JSON blob; its 'ou' key is the original image url
        allimages = text_soup.find_all('div', {'class': 'rg_meta'})
        for i in allimages:
            self.downloadurls.append(json.loads(i.text)['ou'])
        if self.verbose_mode:
            print(f"The search url is: {search_url}")
            print(f'Obtained {len(self.downloadurls)} image urls')
        if cache:
            if not os.path.exists('./caches/'):
                os.mkdir('./caches/')
            fname = './caches/' + search_string.replace('+', '_') + '.cache'
            print(f"Caching query to file: {fname}...")
            with open(fname, 'wb') as fp:
                pickle.dump(self.downloadurls, fp)
    def printprogress(self, number):
        # Draw a simple in-place text progress bar on stdout
        bar_len = 60
        filled_len = int(round(bar_len * number / float(len(self.downloadurls))))
        percents = round(100.0 * number / float(len(self.downloadurls)), 1)
        bar = '=' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', "Downloading"))
        sys.stdout.flush()
    def download(self, download_location='./'):
        if len(self.downloadurls) == 0:
            print("The list of urls is empty. Either call get_urls() or load data from a cache file.")
            sys.exit(1)
        # Choose the folder and file name stem for the images
        if self.search_term is not None:
            filefoldername = self.search_term.replace(' ', '_')
        else:
            # The search term is empty, so derive the name from the cache file name
            filefoldername = self.pickle_location.split('/')[-1].split('.')[0]
        final_path = download_location + 'downloads/' + filefoldername
        # Create the downloads folder if it does not exist yet
        if not os.path.exists(download_location + 'downloads'):
            os.mkdir(download_location + 'downloads')
        # Create the per-query folder inside the downloads folder
        if not os.path.exists(final_path):
            os.mkdir(final_path)
        # Build a file name for each link and download it to downloads/<query>/<img name>
        count = 0
        for no, url in enumerate(self.downloadurls):
            # Guess the extension from the url, stripping any query-string noise
            extension = url.split('.')[-1].split('?')[0].split('&')[0].split('/')[0]
            file_name = filefoldername + str(count) + '.' + extension
            self.printprogress(no)
            response = requests.get(url)
            if response.ok:
                with open(final_path + '/' + file_name, 'wb') as imagefile:
                    imagefile.write(response.content)
                count += 1
    def load_from_cache(self, p):
        if p is None:
            print("No cache location specified. Please try again")
            sys.exit(1)
        self.pickle_location = p
        with open(p, 'rb') as fp:
            self.downloadurls = pickle.load(fp)
        if self.verbose_mode:
            print(self.downloadurls)
if __name__ == '__main__':
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Google Image Dataset Scraper")
    parser.add_argument('-v', action='store_true', default=False, help='Debug/verbose mode')
    parser.add_argument('-c', action='store_true', default=False, help='Cache image urls offline in a pickle object')
    parser.add_argument('-s', dest='search_string', default=None, help='Search string to query')
    parser.add_argument('-d', dest='download_location', default='./', help='Image download location')
    parser.add_argument('-p', dest='pickle_location', default=None, help='Pickle object location')
    args = parser.parse_args()
    x = downloader(args.search_string, verbose_mode=args.v)
    if args.search_string is not None:
        x.get_urls(cache=args.c)
    elif args.pickle_location is not None:
        x.load_from_cache(args.pickle_location)
    x.download(download_location=args.download_location)
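# Example invocations (a sketch based on the argparse flags above; the search
# terms and paths are hypothetical):
#   python scraper.py -s "golden retriever" -c -v
#       search, cache the urls to ./caches/golden_retriever.cache, verbose output
#   python scraper.py -p ./caches/golden_retriever.cache
#       skip the search and re-download from the cached url list
#   python scraper.py -s "red pandas" -d ./data/
#       download into ./data/downloads/red_pandas/ (./data/ must already exist)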