-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.py
27 lines (24 loc) · 922 Bytes
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import requests
from bs4 import BeautifulSoup
from sqlitedict import SqliteDict
# Crawl the submissions listing page by page and remember, for every
# combination of table columns 1, 2 and 3, the highest integer found in
# column 6. Results and crawl progress live in the same SQLite-backed dict, so
# an interrupted run can be restarted and picks up where it left off.
with SqliteDict('./db.sqlite', autocommit=True) as d:
    url = 'https://training.ia-toki.org/submissions/programming/all/?pageIndex={page}&orderDir=asc'
    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the crawl forever.
    timeout = 30
    # 'page' holds the last page that was fully processed. Resuming re-fetches
    # that same page, so rows added to it since the previous run are seen.
    page = d.get('page', 0)
    while True:
        response = requests.get(url.format(page=page), timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        table_body = table.find('tbody') if table is not None else None
        rows = table_body.find_all('tr') if table_body is not None else []
        if not rows:
            # A page without a table, without a body, or without rows is
            # treated as the end of the listing.
            break
        for row in rows:
            cols = [ele.text.strip() for ele in row.find_all('td')]
            if len(cols) < 7:
                # Not a data row (e.g. header or placeholder); nothing to record.
                continue
            try:
                score = int(cols[6])
            except ValueError:
                # Score cell is not an integer, so there is nothing comparable
                # to store for this row. NOTE(review): presumably ungraded or
                # pending submissions -- confirm against the site.
                continue
            key = '{}|{}|{}'.format(cols[1], cols[2], cols[3])
            # Keep only the best score per key; -1 stands in for "never seen".
            if score > d.get(key, -1):
                d[key] = score
        # Record progress only after the whole page has been processed.
        d['page'] = page
        print(page)
        page += 1