automotive_crawler_html.py
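"""Fetch and save raw HTML for automotive category URLs.

Reads the category names listed in automotive_cats.txt, then walks each
category folder under crawler_data_by_cat/automotive, downloading every URL
from that category's urls.txt into an html_files/ subfolder.
"""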
import os
import time

import requests


def fetch_and_save_html(url, save_path, max_retries=3):
    """Download `url` and write the response body to `save_path`, retrying on errors."""
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(save_path, 'w', encoding='utf-8') as file:
                    file.write(response.text)
                print(f"Saved HTML for URL: {url}")
                return True
            else:
                print(f"Failed to fetch URL: {url} with status code {response.status_code}")
                return False
        except requests.exceptions.Timeout:
            print(f"Timeout for URL: {url}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        except requests.RequestException as e:
            print(f"Error for URL: {url}: {e}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        attempts += 1
        time.sleep(2)  # Delay before retrying
    print(f"Failed to fetch URL after {max_retries} attempts: {url}")
    return False
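
# A minimal usage sketch; the URL and filename below are hypothetical placeholders:
#     fetch_and_save_html("https://example.com/listing.htm", "listing.html")
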
# Read categories to process from automotive_cats.txt
with open('automotive_cats.txt', 'r') as file:
    categories_to_crawl = {line.strip() for line in file if line.strip()}

# Directory where category folders and urls.txt files are located
base_dir = "crawler_data_by_cat/automotive"

for category in os.listdir(base_dir):
    if category not in categories_to_crawl:
        print(f"Skipping category not listed in automotive_cats.txt: {category}")
        continue

    print(f"Processing category: {category}")
    cat_dir = os.path.join(base_dir, category)
    urls_file = os.path.join(cat_dir, "urls.txt")

    # Check if urls.txt exists
    if not os.path.isfile(urls_file):
        print(f"No urls.txt found for category {category}")
        continue

    # Create a subfolder to store HTML files for each URL
    html_dir = os.path.join(cat_dir, "html_files")
    os.makedirs(html_dir, exist_ok=True)

    with open(urls_file, 'r', encoding='utf-8') as file:
        urls = [line.strip() for line in file if line.strip()]

    # Crawl each URL and save the HTML content
    for url in urls:
        # Build a filename from the last URL path segment (strip the ".htm" extension)
        filename = url.split("/")[-1].replace(".htm", "") + ".html"
        save_path = os.path.join(html_dir, filename)

        # Fetch and save HTML content if it doesn't already exist
        if not os.path.exists(save_path):
            print(f"Fetching HTML for URL: {url}")
            fetch_and_save_html(url, save_path)
        else:
            print(f"HTML for URL already exists: {url}")

print("HTML content saved in 'html_files' folder within each category directory.")