forked from RimoChan/sese-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path虫.py
49 lines (39 loc) · 1.37 KB
/
虫.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import logging
from functools import lru_cache
from urllib.parse import urlparse
from typing import Optional
from reppy.robots import Robots
import requests
from 配置 import 爬虫的名字
class LoliError(Exception):
    """Crawl failure raised by 真爬 and caught (logged) by 爬.

    In this file it is raised when robots.txt forbids the URL, when the
    server answers 404, or when the response is not ``text/html``.
    """
@lru_cache(maxsize=512)
def 萝卜(url):
    """Fetch and return the parsed robots.txt for a site root.

    ``url`` is the site root without a trailing slash (the caller in this
    file passes ``scheme://netloc``); ``/robots.txt`` is appended to it.
    Results are memoised per ``url`` by ``lru_cache`` (up to 512 entries),
    so each site is normally fetched only once.
    """
    # timeout=5 is forwarded to Robots.fetch — presumably seconds, as in requests.
    return Robots.fetch(url + '/robots.txt', timeout=5)
def 真爬(url, 乖=True, timeout=5, 大小限制=None) -> str:
    """Download ``url`` and return its HTML body decoded to text.

    Args:
        url: Absolute URL of the page to fetch.
        乖: When true, the site's robots.txt is consulted first (via ``萝卜``)
            and the fetch is refused if ``爬虫的名字`` is not allowed.
        timeout: Forwarded to ``requests.get``.
        大小限制: When truthy, only the first chunk produced by
            ``iter_content(大小限制)`` is kept instead of the whole body.

    Returns:
        The body as ``str``; bytes that cannot be decoded are dropped.

    Raises:
        LoliError: robots.txt forbids the URL, the server answered 404, or
            the Content-Type does not contain ``text/html``.
        requests.HTTPError: any other error status (from ``raise_for_status``).
    """
    q = urlparse(url)
    if 乖:
        rp = 萝卜(f'{q.scheme}://{q.netloc}')
        if not rp.allowed(url, 爬虫的名字):
            raise LoliError('被禁了,不行!')

    # stream=True keeps the connection open until the body is consumed or the
    # response is closed; the ``with`` block guarantees it is released on every
    # path, including the early raises and the truncated read below.
    with requests.get(url, timeout=timeout, headers={'user-agent': 爬虫的名字}, stream=True) as resp:
        if resp.status_code == 404:
            raise LoliError('没有!没有!')
        resp.raise_for_status()
        if 'text/html' not in resp.headers.get('Content-Type', ''):
            raise LoliError(f'类型{resp.headers.get("Content-Type")}不行!')
        if 大小限制:
            # An empty body yields no chunks; default to b'' instead of letting
            # StopIteration escape to the caller.
            data = next(resp.iter_content(大小限制), b'')
        else:
            data = resp.content
        encoding = resp.encoding

    # requests reports ISO-8859-1 when a text/* response declares no charset.
    # Guessing the real encoding is too slow, so just force UTF-8 in that case.
    if not encoding or encoding == 'ISO-8859-1':
        encoding = 'utf8'
    try:
        return data.decode(encoding, 'ignore')
    except LookupError:
        # The server declared a charset Python does not know; fall back to
        # UTF-8 rather than crashing on an exception 爬 does not catch.
        return data.decode('utf8', 'ignore')
def 爬(url, **d) -> Optional[str]:
    """Best-effort wrapper around 真爬.

    Forwards ``url`` and any keyword arguments to 真爬 and returns its
    result. If 真爬 raises ``LoliError``, the failure is logged as a warning
    and ``None`` is returned instead. Other exceptions are not caught here
    and propagate to the caller.
    """
    try:
        page = 真爬(url, **d)
    except LoliError as e:
        logging.warning(f'{url} {e}')
        page = None
    return page