-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgso_crawler_1_table.py
102 lines (80 loc) · 3.28 KB
/
gso_crawler_1_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import os
import re
from selenium.common.exceptions import NoSuchElementException
# PxWeb table page whose data will be exported.
url = "https://pxweb.gso.gov.vn/pxweb/vi/D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng/D%C3%A2n%20s%E1%BB%91%20v%C3%A0%20lao%20%C4%91%E1%BB%99ng/V02.01.px/?rxid=3b0537b5-21a6-480d-bfe9-4f3cd35438ab"

# Files are downloaded straight into the current working directory.
download_folder = os.getcwd()
os.makedirs(download_folder, exist_ok=True)

# Configure Chrome to save downloads into download_folder without prompting.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs",
    {
        "download.default_directory": download_folder,
        "download.prompt_for_download": False,
        "safebrowsing.enabled": True,
    },
)

# Start the browser session used by the rest of the script.
driver = webdriver.Chrome(options=options)
print(f"Processing URL: {url}")
def extract_version_from_url(url):
    """Return the table code (such as ``V02.01``) found in *url*.

    The code is the first occurrence of a ``/`` followed by ``V`` or ``v``,
    two digits, a dot and two more digits; the leading slash is not part of
    the result. When the URL contains no such segment, the placeholder
    string ``"default_version"`` is returned instead.
    """
    found = re.search(r'\/([Vv]\d{2}\.\d{2})', url)
    if found is None:
        return "default_version"
    return found.group(1)
# Upper bound, in seconds, on how long to wait for the exported workbook to
# show up in the download folder before reporting a failure.
DOWNLOAD_TIMEOUT_SECONDS = 30.0


def _wait_for_file(path, timeout=DOWNLOAD_TIMEOUT_SECONDS, poll_interval=0.5):
    """Poll until *path* exists or *timeout* seconds have elapsed.

    Returns True if the file appeared in time, False otherwise.
    """
    deadline = time.monotonic() + timeout
    while not os.path.exists(path):
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)
    return True


# Everything that drives the browser runs inside try/finally so the Chrome
# session is shut down even when a lookup or click raises.
try:
    # Navigate to the URL
    driver.get(url)
    time.sleep(1)

    # The downloaded workbook is looked up under the table code from the URL.
    version = extract_version_from_url(url)

    # Extract page title
    title_element = driver.find_element(By.ID, 'ctl00_ContentPlaceHolderMain_MenuTitle')
    title_text = title_element.text.strip()

    # Sanitize title for valid filenames: drop the "(*)" marker, then any
    # character that is not allowed in a file name.
    title_text = re.sub(r'\(\*\)', '', title_text).strip()
    title_text = re.sub(r'[<>:"/\\|?*]', '', title_text).strip()
    if not title_text:
        # An empty title would produce a file literally named ".xlsx";
        # keep the table code as the name instead.
        title_text = version

    # Select all values in every variable selector, pausing after each click.
    select_all_buttons = driver.find_elements(By.CLASS_NAME, 'variableselector_valuesselect_select_all_imagebutton')
    for button in select_all_buttons:
        button.click()
        time.sleep(0.5)

    # Continue to the next step
    continue_button = driver.find_element(By.CLASS_NAME, 'variableselector_continue_button')
    continue_button.click()
    time.sleep(1)

    # A "Close" button may or may not be present at this point; its absence
    # is not an error. Only the lookup is guarded so that a failing click is
    # not mistaken for a missing button.
    try:
        close_button = driver.find_element(By.XPATH, "//button[span[text()='Close']]")
    except NoSuchElementException:
        print("Close button not found.")
    else:
        close_button.click()
        print("Clicked the Close button.")

    # Save the query
    save_link = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_lnkSaveQueryInformation')
    save_link.click()
    time.sleep(1)

    # Select Excel format
    dropdown = Select(driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_ddlOutputFormats'))
    dropdown.select_by_value('FileTypeExcelX')

    # Create the saved query
    finish_button = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_btnCreateSaveQuery')
    finish_button.click()
    time.sleep(1)

    # Get the download URL
    url_input = driver.find_element(By.ID, 'ctl00_ctl00_ContentPlaceHolderMain_SavedQueryFeature_txtSaveQueryUrl')
    download_url = url_input.get_attribute("value")

    # Navigate to the download URL, which triggers the file download.
    driver.get(download_url)

    # Rename the downloaded file from "<table code>.xlsx" to "<title>.xlsx".
    downloaded_file = os.path.join(download_folder, f"{version}.xlsx")
    final_file = os.path.join(download_folder, f"{title_text}.xlsx")

    # Poll for the file instead of sleeping a fixed time: fast downloads
    # proceed immediately, slow ones get up to DOWNLOAD_TIMEOUT_SECONDS.
    # NOTE(review): assumes Chrome only exposes the final file name once the
    # download is complete (it uses a temporary name while writing) - confirm.
    if _wait_for_file(downloaded_file):
        # os.replace overwrites an existing target on every platform, so a
        # re-run does not fail on Windows the way os.rename would.
        os.replace(downloaded_file, final_file)
        print(f"Downloaded and renamed: {final_file}")
    else:
        print(f"Failed to find downloaded file for URL: {url}")
finally:
    # Close the WebDriver
    driver.quit()

print("Processing complete.")