-
Notifications
You must be signed in to change notification settings - Fork 2
/
cmsFinder.py
73 lines (64 loc) · 2.56 KB
/
cmsFinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup
import urllib.request
from urllib.error import HTTPError, URLError
import requests
import json
# Sites to inspect. Each entry carries its scheme; the scan loop splits on
# "//" to obtain the scheme-less address it sends to PageSpeed.
pagesToScrub = [
    "http://www.puppy.com",
    "http://cats.com",
    "http://www.kittens.com",
]

# Prefix of every PageSpeed request. It already ends with a percent-encoded
# "http://", so the scheme-less address is appended directly after it.
# NOTE(review): this targets v4 of the API - confirm that version is still served.
psiBaseApi = "https://www.googleapis.com/pagespeedonline/v4/runPagespeed?url=http%3A%2F%2F"

# Browser-like request header sent with each page fetch.
userAgent = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.181 Safari/537.36"
    ),
}

# Column header for the comma-separated rows printed by the scan loop.
print("Domain, CMS, Desktop PSI Score, Mobile PSI Score")
def psiGrade(gradeUri, timeout=30):
    """Return the PageSpeed SPEED score for a fully formed request URI.

    Args:
        gradeUri: Complete PageSpeed API request URL.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The value found at ruleGroups -> SPEED -> score in the JSON reply,
        or the string "unknown" when the request fails, the body is not
        JSON, the reply has no "id" key, or the score is missing.
    """
    try:
        # Decode the body a single time and reuse the parsed result.
        psiData = requests.get(gradeUri, timeout=timeout).json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "unknown" for this site
        # rather than aborting the whole scan.
        return "unknown"

    if not isinstance(psiData, dict) or "id" not in psiData:
        return "unknown"

    try:
        return psiData["ruleGroups"]["SPEED"]["score"]
    except (KeyError, TypeError):
        # Reply carried an id but not the expected score structure.
        return "unknown"
def cmsTextMatch(html):
    """Guess which CMS produced a page from marker substrings in its markup.

    Args:
        html: Page markup as bytes. A str is also accepted and is encoded
            to UTF-8 before matching.

    Returns:
        "Wix", "Wordpress", "Squarespace", "Weebly", "Drupal" or "CRAFT" for
        the first group that has a marker present in the markup, otherwise
        "unknown". Groups are tried in that order; matching is
        case-sensitive.
    """
    if isinstance(html, str):
        # The markers are bytes, so a str haystack would raise TypeError on
        # the `in` test; normalise it first.
        html = html.encode("utf-8", errors="replace")

    # Order matters: an earlier entry wins when several markers are present.
    cmsMarkers = (
        ("Wix", (b"static.wixstatic.com", b"wix")),
        ("Wordpress", (b"wp-content", b"wordpress", b"wp-json")),
        ("Squarespace", (b"static1.squarespace.com", b"squarespace")),
        ("Weebly", (b"weebly",)),
        ("Drupal", (b"drupal",)),
        ("CRAFT", (b"craftCMS", b"CRAFT")),
    )
    for cmsName, markers in cmsMarkers:
        if any(marker in html for marker in markers):
            return cmsName
    return "unknown"
# For each site: identify the CMS, query desktop and mobile PageSpeed scores,
# and print one comma-separated row.
for pageToScrub in pagesToScrub:
    try:
        req = urllib.request.Request(pageToScrub, headers=userAgent)
        # Download the body once and close the connection promptly; the same
        # bytes feed both the generator-tag lookup and the marker fallback.
        with urllib.request.urlopen(req) as page:
            html = page.read()
    except URLError:
        # HTTPError is a subclass of URLError, so both failure kinds land here.
        cms = "unknown"
    else:
        cmsMeta = BeautifulSoup(html, "html.parser").find(attrs={"name": "generator"})
        cms = cmsMeta.get("content") if cmsMeta else None
        if not cms:
            # No generator tag, or one without usable content: fall back to
            # scanning the raw markup for known CMS markers.
            cms = cmsTextMatch(html)

    # Drop the scheme; psiBaseApi already ends with an encoded "http://".
    # Splitting once keeps any later "//" in the address and tolerates an
    # entry that has no scheme at all.
    psiGradableUri = pageToScrub.split("//", 1)[-1]
    psiDeskReqUri = f"{psiBaseApi}{psiGradableUri}&fields=id%2CruleGroups"
    psiMobReqUri = f"{psiBaseApi}{psiGradableUri}&fields=id%2CruleGroups&strategy=mobile"
    deskPsiGrade = psiGrade(psiDeskReqUri)
    mobPsiGrade = psiGrade(psiMobReqUri)
    print(f"{pageToScrub}, {cms}, {deskPsiGrade}, {mobPsiGrade}")