-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrapPage.py
50 lines (44 loc) · 1.67 KB
/
ScrapPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import urllib
import urllib.request
from io import BytesIO
import gzip
import re
def GetPageContent(url):
    """Download *url* and return the response body.

    The request is sent with browser-like ``User-Agent``, ``Accept`` and
    ``Accept-Language`` headers. When the response declares
    ``Content-Encoding: gzip`` the body is decompressed before it is
    returned.

    Args:
        url: Address to fetch; anything ``urllib.request.Request`` accepts.

    Returns:
        The body as ``bytes`` on success, or the empty string ``""`` when
        building the request, fetching, reading or decompressing fails.
    """
    request_headers = (
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
        ("Accept-Language", "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3"),
    )
    try:
        req = urllib.request.Request(url)
        for header_name, header_value in request_headers:
            req.add_header(header_name, header_value)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(req) as response:
            is_gzipped = response.info().get('Content-Encoding') == 'gzip'
            payload = response.read()
        if is_gzipped:
            payload = gzip.decompress(payload)
        return payload
    except Exception:
        # Best-effort fetch: any failure yields "". Unlike a bare ``except``,
        # this lets KeyboardInterrupt and SystemExit propagate.
        return ""
def WriteToFile(htmlContent, filePath):
    """Write *htmlContent* to *filePath*, replacing any existing file.

    Text is written in text mode as UTF-8 so the result does not depend on
    the platform's default encoding; binary content is written unchanged
    in binary mode. Content of any other type is not written at all.

    Args:
        htmlContent: ``str`` or bytes (``bytes`` / ``bytearray``) to store.
        filePath: Destination path passed to ``open``.

    Returns:
        The string ``"Done"``, whether or not anything was written.
    """
    if isinstance(htmlContent, str):
        # Diagnostic output kept so console behaviour is unchanged.
        print("yes")
        with open(filePath, "w", encoding="utf-8") as file:
            file.write(htmlContent)
    elif isinstance(htmlContent, (bytes, bytearray)):
        # Diagnostic output kept so console behaviour is unchanged.
        print("yes byte")
        with open(filePath, "wb") as file:
            file.write(htmlContent)
    return "Done"
def url_Formattage(url):
    """Strip the Google Translate wrapper from *url*.

    Removes the two known translate prefixes (French target, German or
    English source) and the ``&prev=search&pto=aue`` suffix wherever they
    occur, leaving the wrapped address. A URL containing none of these
    fragments is returned unchanged.

    Args:
        url: The address to clean, as a ``str``.

    Returns:
        The address with the translate fragments removed.
    """
    wrapper_fragments = (
        "https://translate.google.com/translate?hl=fr&sl=de&u=",
        "https://translate.google.com/translate?hl=fr&sl=en&u=",
        "&prev=search&pto=aue",
    )
    # Diagnostic output kept so console behaviour is unchanged.
    print("yes")
    nouveau = url
    for fragment in wrapper_fragments:
        nouveau = nouveau.replace(fragment, '')
    return nouveau