gso_crawler_url.py
import os
import time

import requests
import urllib3
from bs4 import BeautifulSoup

# Suppress the InsecureRequestWarning raised because requests are made with verify=False.
urllib3.disable_warnings()


def crawl(url):
    """Fetch a GSO category page and return the links found in its active content tab."""
    max_retries = 3
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10, verify=False)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                # The links of interest sit inside the active tab pane of the page.
                post_content_div = soup.find("div", class_="tab-pane fade fusion-clearfix in active")
                if post_content_div:
                    links = [a['href'] for a in post_content_div.find_all("a", href=True)]
                    for link in links:
                        print(link)
                    print(len(links))
                    return links
                else:
                    print("No post-content div found.")
                    return []
            # Non-200 responses fall through and are retried below.
        except requests.exceptions.Timeout:
            print(f"Timeout occurred for URL: {url}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        except requests.RequestException as e:
            print(f"Request failed for URL: {url} with error: {e}. Attempt {attempts + 1} of {max_retries}. Retrying...")
        attempts += 1
        time.sleep(2)
    print(f"Failed to fetch {url} after {max_retries} attempts.")
    return []


# Read the list of category paths to crawl (one per line, blank lines skipped).
with open('gso_cats.txt', 'r') as file:
    cats = [line.strip() for line in file if line.strip()]

for cat in cats:
    print("Crawling URL for", cat)
    url = f"https://www.gso.gov.vn/{cat}"
    links = crawl(url)
    cat_folder = os.path.join("gso_data", cat)
    os.makedirs(cat_folder, exist_ok=True)
    # Save the crawled links for each category in urls.txt inside that category's folder.
    result_file_path = os.path.join(cat_folder, "urls.txt")
    with open(result_file_path, 'w', encoding='utf-8') as file:
        for link in links:
            file.write(f"{link}\n")