-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentidades.py
96 lines (79 loc) · 2.76 KB
/
entidades.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import requests
import json
import re
from unicodedata import normalize
import time
from bs4 import BeautifulSoup
import re
from selenium import webdriver #connect python with webbrowser-chrome
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
# Dandelion API token; the script sends it as the `token` query parameter of
# the entity-extraction request.
# NOTE(review): the value looks like a redacted placeholder - supply a real
# token before running, and avoid committing it to version control.
api = 'XXXXXXXXXXXXXXXXXXXXXXXXXX'
def check_exists_by_xpath(driver, xpath):
    """Return True when at least one element in the current page matches ``xpath``.

    Args:
        driver: Selenium WebDriver instance (any object exposing
            ``find_elements(by, value)`` works).
        xpath: XPath expression to look up.

    Returns:
        bool: True if the lookup yields one or more elements, False otherwise.
    """
    # ``find_elements`` returns an empty list instead of raising when nothing
    # matches, so no exception handling is needed. The generic
    # ``find_elements(by, value)`` form is used because the
    # ``find_element_by_*`` helpers are no longer available in current
    # Selenium releases; "xpath" is the strategy string behind ``By.XPATH``.
    return len(driver.find_elements("xpath", xpath)) > 0
def sort_confidence(val):
    """Sort key that selects the item at index 2 of an entity record.

    The script builds its records as ``[label, title, confidence]``, so this
    key orders them by confidence.

    Args:
        val: Indexable record with at least three items.

    Returns:
        The item stored at position 2.
    """
    confidence = val[2]
    return confidence
def cleanMe(soup):
    """Remove non-content tags from a parsed document, in place.

    Every ``script``, ``style``, ``form``, ``nav``, ``meta`` and ``source``
    tag found in ``soup`` is detached from the tree.

    Args:
        soup: Parsed BeautifulSoup document (or tag) to clean.

    Returns:
        The same ``soup`` object, after the tags have been removed.
    """
    unwanted_tags = ["script", "style", "form", 'nav', 'meta', 'source']
    for node in soup.find_all(unwanted_tags):
        node.extract()
    return soup
# Headless Firefox session used to render each page before it is scraped.
# NOTE(review): `options.headless` and `executable_path` are the Selenium 3
# spellings and are kept unchanged here; confirm they match the installed
# Selenium version.
options = Options()
options.headless = True
driver = webdriver.Firefox(executable_path=r'browsers/geckodriver', options=options)

# Compiled once, outside the loop: matches the tag names h1 and h2 only.
heading_tags = re.compile('^h[1-2]$')

try:
    # Load the target URLs, one per line. Surrounding whitespace (including
    # the trailing newline) is dropped and blank lines are skipped so they are
    # never handed to the browser.
    with open("URLS.txt", 'r') as url_file:
        urls = [line.strip() for line in url_file if line.strip()]

    # Create the final report; the context manager closes it even when a page
    # or the API call fails midway. UTF-8 keeps accented entity names writable
    # regardless of the platform's default encoding.
    with open("Entidades_Output", "w", encoding="utf-8") as file:
        for url in urls:
            print('[+] Extrayendo Informacion de : ', url)
            driver.get(url)

            # A <meta> tag has no rendered text, so the description has to be
            # read from its `content` attribute rather than from `.text`.
            if check_exists_by_xpath(driver, "//meta[@name='description']"):
                meta = driver.find_element("xpath", "//meta[@name='description']")
                metadesc = meta.get_attribute("content") or ""
                print(metadesc)
            else:
                metadesc = ""

            # `driver.title` is an empty string when the page has no <title>,
            # instead of raising like an element lookup would.
            title = driver.title or ""

            # Parse the rendered <body> and drop the non-content tags.
            body_html = driver.find_element("tag name", "body").get_attribute("innerHTML")
            soup = cleanMe(BeautifulSoup(body_html, 'html.parser'))

            lista_headings = [h.getText() for h in soup.find_all(heading_tags)]

            # Keep only the first paragraph that has more than 9
            # space-separated tokens.
            primeros_parrafos = [
                p.getText()
                for p in soup.find_all('p')
                if len(p.getText().split(" ")) > 9
            ]
            primeros_parrafos_ok = primeros_parrafos[0:1]

            # Build the text sent for analysis: title, description, headings
            # and the first long paragraph. Each non-empty piece is separated
            # by '. ' so that consecutive headings no longer run together.
            partes = [title, metadesc, *lista_headings, *primeros_parrafos_ok]
            unificar_texto = '. '.join(
                parte.strip() for parte in partes if parte and parte.strip()
            )

            # Let requests build the query string so that every character
            # (&, #, ?, newlines, accents, ...) is percent-encoded, not just
            # spaces. The timeout stops one stalled call from hanging the run.
            response = requests.get(
                "https://api.dandelion.eu/datatxt/nex/v1/",
                params={"lang": "es", "text": unificar_texto, "token": api},
                timeout=30,
            )
            json_object = response.json()

            # A response without an 'annotations' key (presumably an API error
            # payload - verify against the Dandelion docs) is reported and
            # treated as "no entities" instead of aborting with KeyError.
            annotations = json_object.get('annotations')
            if annotations is None:
                print('[!] Respuesta sin anotaciones para : ', url, json_object.get('message', json_object))
                annotations = []

            entidades_lista = [
                [obj['label'], obj['title'], obj['confidence']]
                for obj in annotations
            ]

            # Highest-confidence entities first.
            entidades_lista.sort(key=sort_confidence, reverse=True)

            file.write("##### : " + url + " : #####" + '\n')
            file.write("'LABEL' - 'TITULO' - 'CONFIDENCIA'" + '\n')
            for el in entidades_lista:
                print(el)
                # Same "'label', 'title', 0.9" layout as str(list) without its
                # brackets, but brackets inside a label or title are preserved.
                file.write(', '.join(repr(campo) for campo in el) + '\n')
finally:
    # quit() ends the whole browser session (not just the current window) and
    # runs even if the loop above raised.
    driver.quit()