# gso_crawler_1_table.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import os
import re
from selenium.common.exceptions import NoSuchElementException

# PxWeb table page (on pxweb.gso.gov.vn) that this script exports.
url = "https://pxweb.gso.gov.vn/pxweb/vi/D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng/D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng/V02.01.px/?rxid=3b0537b5-21a6-480d-bfe9-4f3cd35438ab"

# Files are saved into the current working directory; makedirs is a no-op
# when the directory already exists.
download_folder = os.getcwd()
os.makedirs(download_folder, exist_ok=True)

# Chrome preferences: save into download_folder without a save-as prompt.
prefs = {
    "download.default_directory": download_folder,
    "download.prompt_for_download": False,
    "safebrowsing.enabled": True,
}

# Build the browser options and start the Chrome session used by the rest of
# the script.
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

print(f"Processing URL: {url}")
    
def extract_version_from_url(url):
    """Return the table version code embedded in *url*.

    The code is the first occurrence of ``/`` followed by ``V`` or ``v``,
    two digits, a dot and two more digits (for example ``V02.01``); the
    leading slash is not part of the result.

    Returns the literal string ``"default_version"`` when *url* contains no
    such code.
    """
    found = re.search(r'\/([Vv]\d{2}\.\d{2})', url)
    if found is None:
        return "default_version"
    return found.group(1)

# Drive the export end to end: open the table page, select every value, create
# a saved query in Excel format, download it, and rename the file after the
# page title.  Everything runs inside try/finally so the browser is always shut
# down, even when a step raises (e.g. a required element is missing).
try:
    # Navigate to the URL
    driver.get(url)
    time.sleep(1)

    # Extract page title
    title_element = driver.find_element(By.ID, 'ctl00_ContentPlaceHolderMain_MenuTitle')
    title_text = title_element.text.strip()

    # Sanitize title for valid filenames
    title_text = re.sub(r'\(\*\)', '', title_text).strip()  # Drop the "(*)" marker
    title_text = re.sub(r'[<>:"/\\|?*]', '', title_text).strip()  # Remove invalid characters

    # Select all values in every variable selector
    select_all_buttons = driver.find_elements(By.CLASS_NAME, 'variableselector_valuesselect_select_all_imagebutton')
    for button in select_all_buttons:
        button.click()
        time.sleep(0.5)

    # Continue to the next step
    continue_button = driver.find_element(By.CLASS_NAME, 'variableselector_continue_button')
    continue_button.click()
    time.sleep(1)

    # Dismiss the optional dialog: a <button> whose child <span> reads "Close".
    # Its absence is not an error.
    try:
        close_button = driver.find_element(By.XPATH, "//button[span[text()='Close']]")
    except NoSuchElementException:
        print("Close button not found.")
    else:
        close_button.click()
        print("Clicked the Close button.")

    # Open the "save query" panel
    save_link = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_lnkSaveQueryInformation')
    save_link.click()
    time.sleep(1)

    # Select Excel format
    dropdown = Select(driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_ddlOutputFormats'))
    dropdown.select_by_value('FileTypeExcelX')

    # Create the saved query
    finish_button = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_btnCreateSaveQuery')
    finish_button.click()
    time.sleep(1)

    # Get the download URL
    url_input = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_txtSaveQueryUrl')
    download_url = url_input.get_attribute("value")
    if not download_url:
        # Fail with a clear message instead of passing None/"" to driver.get().
        raise RuntimeError("Saved-query URL field is empty; cannot start the download.")

    # Navigate to the download URL (this triggers the file download)
    driver.get(download_url)

    # Work out the expected and final file names
    version = extract_version_from_url(url)
    if not title_text:
        # Sanitizing can leave nothing; avoid producing a file named ".xlsx".
        title_text = version
    downloaded_file = os.path.join(download_folder, f"{version}.xlsx")
    final_file = os.path.join(download_folder, f"{title_text}.xlsx")

    # Wait for the download to complete: poll instead of sleeping a fixed time,
    # so fast downloads finish early and slow ones get up to 30 seconds.
    # NOTE(review): assumes Chrome marks an in-progress download with a
    # ".crdownload" suffix on the target name - confirm for the Chrome in use.
    partial_file = downloaded_file + ".crdownload"
    deadline = time.monotonic() + 30
    while time.monotonic() < deadline:
        if os.path.exists(downloaded_file) and not os.path.exists(partial_file):
            break
        time.sleep(0.5)

    # Rename the downloaded file
    if os.path.exists(downloaded_file):
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when final_file is left from an earlier run.
        os.replace(downloaded_file, final_file)
        print(f"Downloaded and renamed: {final_file}")
    else:
        print(f"Failed to find downloaded file for URL: {url}")
finally:
    # Close the WebDriver
    driver.quit()

print("Processing complete.")