gso_crawler_url.py
import os
import time

import requests
import urllib3
from bs4 import BeautifulSoup

# Suppress the InsecureRequestWarning raised because requests are made with verify=False.
urllib3.disable_warnings()


def crawl(url):
    """Fetch a GSO category page and return the links found in its active content tab."""
    max_retries = 3
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10, verify=False)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                # The links of interest sit inside the active tab pane of the page.
                post_content_div = soup.find("div", class_="tab-pane fade fusion-clearfix in active")
                if post_content_div:
                    links = [a['href'] for a in post_content_div.find_all("a", href=True)]
                    for link in links:
                        print(link)
                    print(len(links))
                    return links
                else:
                    print("No post-content div found.")
                    return []
            # Non-200 responses fall through and are retried below.
        except requests.exceptions.Timeout:
            print(f"Timeout occurred for URL: {url}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        except requests.RequestException as e:
            print(f"Request failed for URL: {url} with error: {e}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        attempts += 1
        time.sleep(2)
    print(f"Failed to fetch {url} after {max_retries} attempts.")
    return []


# Read the list of category paths to crawl (one per line, blank lines skipped).
with open('gso_cats.txt', 'r') as file:
    cats = [line.strip() for line in file if line.strip()]

for cat in cats:
    print("Crawling URL for", cat)
    url = f"https://www.gso.gov.vn/{cat}"
    links = crawl(url)
    cat_folder = os.path.join("gso_data", cat)
    os.makedirs(cat_folder, exist_ok=True)
    # Save the crawled links for each category in urls.txt inside that category's folder.
    result_file_path = os.path.join(cat_folder, "urls.txt")
    with open(result_file_path, 'w', encoding='utf-8') as file:
        for link in links:
            file.write(f"{link}\n")