getLinks.py
from bs4 import BeautifulSoup
import requests
from termcolor import colored
import re
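
# getLinks.py: fetch a page, collect every href/src reference from its
# HTML, resolve relative references against the base URL, and report
# which links answer with HTTP 200.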

def prepareUrl(baseUrl):
    # Normalize the base URL: add a scheme if missing, drop a trailing '/'
    if not re.match(r"^https?://.*", baseUrl):
        baseUrl = "http://" + baseUrl
    if baseUrl[-1] == "/":
        baseUrl = baseUrl[:-1]
    return baseUrl

def prettifyLinks(links, baseUrl):
    # If the base URL ends in a file name (e.g. /index.html), drop it so
    # relative links resolve against its directory. The slash-count check
    # keeps bare domains like http://example.com intact.
    baseUrlSplit = baseUrl.rsplit('/', 1)
    if re.match(r".*\..*$", baseUrlSplit[1]) and baseUrl.count('/') > 2:
        baseUrl = baseUrlSplit[0]
    for idx, l in enumerate(links):
        if l.startswith('//'):  # Protocol-relative URL
            links[idx] = "http:" + l
        elif l[0] == '#':
            links[idx] = baseUrl + '/' + l
        elif l[0] == '/':
            links[idx] = baseUrl + l
        elif not re.match(r"^https?://.*", l):
            links[idx] = baseUrl + "/" + l
    return links
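
# For example, with example.com as a stand-in base URL:
#   prettifyLinks(["/about", "#top", "img/logo.png"], "http://example.com")
#   -> ["http://example.com/about", "http://example.com/#top",
#       "http://example.com/img/logo.png"]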

def internalLinks(links, baseUrl):
    internal = []
    # Match links whose host is the same as the base URL's host,
    # with or without a leading "www."
    host = baseUrl.rsplit("/", 1)[1]
    if host.startswith("www."):
        host = host[4:]
    regex = r"^https?://(www\.)?" + re.escape(host) + ".*"
    for l in links:
        if re.match(regex, l):
            internal.append(l)
    return internal
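
# For example:
#   internalLinks(["http://example.com/a", "http://other.org"],
#                 "http://example.com")
#   -> ["http://example.com/a"]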

def externalLinks(links, baseUrl):
    return list(set(links) - set(internalLinks(links, baseUrl)))

def getRawLinks(baseUrl):
    try:
        result = requests.get(baseUrl)
    except requests.exceptions.MissingSchema:
        print("Invalid URL >> http://YOUR_URL")
        exit()
    except requests.exceptions.ConnectionError as e:
        print("[ERROR] Could not connect")
        print(e)
        exit()
    soup = BeautifulSoup(result.content, "lxml")
    links = []
    # Collect every tag that references another resource
    for tag in soup.find_all(["a", "area", "base", "link"], href=True):
        links.append(tag.get("href"))
    for tag in soup.find_all(["audio", "embed", "iframe", "img", "input",
                              "script", "source", "track", "video"], src=True):
        links.append(tag.get("src"))
    links = list(filter(None, links))  # Remove empty links
    # Drop e-mail addresses (mailto:); filtering with a comprehension avoids
    # removing items from the list while iterating over it
    links = [l for l in links if not re.match(r"[^@]+@[^@]+\.[^@]+", l)]
    links = list(set(links))  # Deduplicate
    print("Total links: {}".format(len(links)))
    return links

def testLinks(links):
    ok = []
    total = len(links)
    for idx, l in enumerate(links):
        print("[{}/{}] Testing {}".format(idx + 1, total, l))
        try:
            if requests.get(l).status_code == 200:
                print(colored("OK", 'green'))
                ok.append(l)
        except requests.exceptions.RequestException:
            pass  # Unreachable links count as not OK
    return ok

if __name__ == "__main__":
    baseUrl = prepareUrl("http://www.eb.mil.br")  # Normalizes scheme and trailing '/'
    links = getRawLinks(baseUrl)
    prettyLinks = prettifyLinks(links, baseUrl)
    ok = testLinks(prettyLinks)
    notOk = set(prettyLinks) - set(ok)  # Lists cannot be subtracted directly
    for l in ok:
        print(l)
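
# A possible extension (a sketch, not part of the script above): split the
# working links into internal and external sets with the helpers defined here.
#
#   internal = internalLinks(ok, baseUrl)
#   external = externalLinks(ok, baseUrl)
#   print(colored("Internal: {}".format(len(internal)), 'green'))
#   print(colored("External: {}".format(len(external)), 'yellow'))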