automotive_crawler_html.py
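"""Fetch and save raw HTML for automotive category URLs.

Reads the category names listed in automotive_cats.txt, then walks each
category folder under crawler_data_by_cat/automotive, downloading every URL
from that category's urls.txt into an html_files/ subfolder.
"""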
import os
import time

import requests


def fetch_and_save_html(url, save_path, max_retries=3):
    """Download `url` and write the response body to `save_path`, retrying on errors."""
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(save_path, 'w', encoding='utf-8') as file:
                    file.write(response.text)
                print(f"Saved HTML for URL: {url}")
                return True
            else:
                print(f"Failed to fetch URL: {url} with status code {response.status_code}")
                return False
        except requests.exceptions.Timeout:
            print(f"Timeout for URL: {url}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        except requests.RequestException as e:
            print(f"Error for URL: {url}: {e}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        attempts += 1
        time.sleep(2)  # Delay before retrying
    print(f"Failed to fetch URL after {max_retries} attempts: {url}")
    return False
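
# A minimal usage sketch; the URL and filename below are hypothetical placeholders:
#     fetch_and_save_html("https://example.com/listing.htm", "listing.html")
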
# Read categories to process from automotive_cats.txt
with open('automotive_cats.txt', 'r') as file:
    categories_to_crawl = {line.strip() for line in file if line.strip()}

# Directory where category folders and urls.txt files are located
base_dir = "crawler_data_by_cat/automotive"

for category in os.listdir(base_dir):
    if category not in categories_to_crawl:
        print(f"Skipping category not listed in automotive_cats.txt: {category}")
        continue

    print(f"Processing category: {category}")
    cat_dir = os.path.join(base_dir, category)
    urls_file = os.path.join(cat_dir, "urls.txt")

    # Check if urls.txt exists
    if not os.path.isfile(urls_file):
        print(f"No urls.txt found for category {category}")
        continue

    # Create a subfolder to store HTML files for each URL
    html_dir = os.path.join(cat_dir, "html_files")
    os.makedirs(html_dir, exist_ok=True)

    with open(urls_file, 'r', encoding='utf-8') as file:
        urls = [line.strip() for line in file if line.strip()]

    # Crawl each URL and save the HTML content
    for url in urls:
        # Build a filename from the last URL path segment (strip the ".htm" extension)
        filename = url.split("/")[-1].replace(".htm", "") + ".html"
        save_path = os.path.join(html_dir, filename)

        # Fetch and save HTML content if it doesn't already exist
        if not os.path.exists(save_path):
            print(f"Fetching HTML for URL: {url}")
            fetch_and_save_html(url, save_path)
        else:
            print(f"HTML for URL already exists: {url}")

print("HTML content saved in 'html_files' folder within each category directory.")