-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path: test_scraping.py
executable file
·35 lines (28 loc) · 1.08 KB
/
test_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Development script:
Used for finding the parameters needed to successfully scrape an individual
webpage correctly before porting the code to the scrapy spider (which will
crawl the full archive).

Created on Thu Aug 6 17:10:09 2020
@author: tim
"""
from urllib.parse import urljoin

import requests
from scrapy import Selector
import pandas as pd

BASE_URL = "http://case.doe.gov.bd/"

# Sitemap page listing links into the AQI archive.
url = urljoin(BASE_URL, "index.php?option=com_xmap&sitemap=1&Itemid=14")
#url = "http://case.doe.gov.bd/index.php?option=com_content&view=category&id=8&Itemid=32"

req = requests.get(url)
# Fail loudly on HTTP errors rather than silently parsing an error page.
req.raise_for_status()
# Selector(text=...) expects str, not bytes: use req.text (decoded with the
# response's declared encoding) instead of req.content.
sel = Selector(text=req.text)

# Anchors whose href points into the AQI archive section.
course_blocks = sel.xpath('//a[contains(@href,"aqi-archives")]')
course_links = course_blocks.xpath('@href')
var = course_blocks.xpath('text()').extract()

first_link = course_links.extract_first()
if first_link is None:
    # Without this guard, joining None into the URL raises an opaque TypeError.
    raise SystemExit("No 'aqi-archives' link found on the sitemap page")

# urljoin handles both relative and absolute hrefs correctly, unlike plain
# string concatenation (which can produce double slashes or mangle an
# already-absolute URL).
url = urljoin(BASE_URL, first_link)
print(url)

req = requests.get(url)
req.raise_for_status()
sel2 = Selector(text=req.text)

# The archive page carries a date label and the AQI readings in a table.
date = sel2.xpath('//span[contains(., "Date")]')
data_table = sel2.xpath('//table[@class="mceItemTable"]')
table_html = data_table.extract_first()
if table_html is None:
    raise SystemExit("No table with class 'mceItemTable' found on the page")
# read_html returns a list of DataFrames; the page has a single data table.
df = pd.read_html(table_html)[0]