-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
75 lines (63 loc) · 2.23 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Image Crawler."""
import argparse
import os
from tqdm import tqdm
from browser import extract_urls, get_driver
from constants import DEFAULT_IMG_COUNT
from logger import AppLogger
from utils import download_images
def crawl(args: argparse.Namespace) -> None:
    """Collect image URLs for every requested object, then download them.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``object`` (comma-separated search terms), ``out_dir``,
            ``headless`` and ``max_count``.

    Note:
        Logging goes through the module-level ``LOG`` name, which must be
        bound before this function is called.
    """
    # "".split(",") yields [""], so blank entries must be dropped explicitly;
    # otherwise the "nothing to search for" guard below can never trigger and
    # empty search terms would be passed on to extract_urls.
    obj_to_search = [
        name for name in (part.strip() for part in args.object.split(",")) if name
    ]
    if not obj_to_search:
        LOG.info("Please specify atleast one object")
        return

    LOG.info("%s Image Crawler %s", "*" * 10, "*" * 10)
    LOG.info("Scraping for: %s", obj_to_search)
    LOG.info("Output Dir: %s", args.out_dir)
    LOG.info("Headless: %r", args.headless)
    LOG.info("Max number of images to download: %s \n", args.max_count)

    LOG.info("Started extracting URLs")
    web_driver = get_driver(args.headless)
    try:
        links_dict = {}
        with tqdm(
            obj_to_search, desc="Extracting URLs", colour="green"
        ) as progress_bar:
            for obj in progress_bar:
                links_dict[obj] = extract_urls(
                    web_driver=web_driver, obj=obj, max_urls=args.max_count
                )
        LOG.info("URL extract complete.")

        LOG.info("Starting download")
        for obj, img_links in links_dict.items():
            # Cap the list again in case more URLs than requested came back.
            download_images(
                img_links[: args.max_count], obj_to_search=obj, out_dir=args.out_dir
            )
        LOG.info("Downloading complete.")
    finally:
        # Quit the driver even when extraction or download raises, so a
        # failed run does not leave the driver session open.
        web_driver.quit()
def main() -> None:
    """Parse command-line arguments, ensure the output directory exists, and crawl.

    Recognised options are ``--object`` (required), ``--out_dir``,
    ``--headless`` and ``--max_count``. The parsed namespace is handed to
    ``crawl`` unchanged.
    """
    parser = argparse.ArgumentParser(description="Image crawler")
    # No default here: argparse never uses one for a required option.
    parser.add_argument(
        "--object",
        type=str,
        required=True,
        help="Enter the object to search for.",
    )
    parser.add_argument(
        "--out_dir", default="./images", type=str, help="Destination path for images."
    )
    parser.add_argument("--headless", help="Runs in background.", action="store_true")
    parser.add_argument(
        "--max_count",
        default=DEFAULT_IMG_COUNT,
        type=int,
        help=f"Maximum number of images to download, defaults to {DEFAULT_IMG_COUNT}",
    )
    args = parser.parse_args()

    # makedirs creates missing parent directories (os.mkdir would fail on a
    # nested path such as ./a/b) and exist_ok avoids the check-then-create race.
    os.makedirs(args.out_dir, exist_ok=True)

    crawl(args)
if __name__ == "__main__":
    # Script entry point. LOG is bound as a module-level name here and is read
    # as a global by crawl(); it must be assigned before main() runs.
    # NOTE(review): LOG only exists when this file is executed as a script, so
    # importing the module and calling crawl() directly raises NameError.
    LOG = AppLogger().get_logger("image_crawler")
    main()