# trapadvisor2.py
from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.loader.processors import MapCompose
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
class Opinion(Item):
    """Scraped record describing a single review."""

    # Headline of the review.
    titulo = Field()
    # Rating token of the review.
    calificacion = Field()
    # Body text of the review.
    contenido = Field()
    # Name of the user who wrote the review.
    autor = Field()
class TrapAdvisor(CrawlSpider):
    """Spider that walks TripAdvisor hotel listings and scrapes user reviews.

    Starting from the Rio de Janeiro hotel listing, the crawl follows
    listing pagination, hotel detail pages, review pagination inside each
    hotel and, finally, the profile page of each reviewer, where
    ``parse_opinion`` extracts one ``Opinion`` item per review container.

    NOTE(review): the "(h)" / "(v)" tags on the rules presumably mark
    horizontal (pagination) versus vertical (detail) crawling -- confirm.
    """

    name = "OpinionesTripAdvisor"
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'CLOSESPIDER_PAGECOUNT': 100
    }
    allowed_domains = ['tripadvisor.com.ar']
    start_urls = ['https://www.tripadvisor.com.ar/Hotels-g303506-Rio_de_Janeiro_State_of_Rio_de_Janeiro-Hotels.html']
    download_delay = 1

    rules = (
        # Hotel listing pagination (h)
        Rule(
            LinkExtractor(
                allow=r'-oa\d+-'
            ), follow=True
        ),
        # Hotel detail pages (v)
        Rule(
            LinkExtractor(
                allow=r'/Hotel_Review-',
                restrict_xpaths=['//div[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//a[@data-clicksource="HotelName"]']
            ), follow=True
        ),
        # Review pagination inside a hotel page (h)
        Rule(
            LinkExtractor(
                allow=r'-or\d+-'
            ), follow=True
        ),
        # Reviewer profile pages (v); the only rule that produces items
        Rule(
            LinkExtractor(
                allow=r'/Profile/',
                restrict_xpaths=['//div[@data-test-target="reviews-tab"]//a[contains(@class, "ui_header")]']
            ), follow=True, callback='parse_opinion'
        )
    )

    def obtenerCalificacion(self, texto):
        """Return the part of ``texto`` that follows its last underscore.

        Used as an input processor for the rating span's ``class``
        attribute, e.g. ``"ui_bubble_rating bubble_10"`` -> ``"10"``.
        A string without underscores is returned unchanged.
        """
        calificacion = texto.split("_")[-1]
        return calificacion

    def parse_opinion(self, response):
        """Yield one ``Opinion`` per review container on a profile page.

        Args:
            response: the downloaded profile page.

        Yields:
            Loaded ``Opinion`` items; ``autor`` is shared by every item
            of the page because it is read once from the page heading.
        """
        # The response exposes .xpath directly; no explicit Selector needed.
        opiniones = response.xpath('//div[@id="content"]/div/div')
        autor = response.xpath('//h1/span/text()').get()
        for opinion in opiniones:
            item = ItemLoader(Opinion(), opinion)
            item.add_value('autor', autor)
            # Select the text node: without /text() the field would hold
            # the serialized <div> markup instead of the title string.
            item.add_xpath('titulo', './/div[@class="_3IEJ3tAK _2K4zZcBv"]/text()')
            item.add_xpath('contenido', './/q/text()')
            item.add_xpath('calificacion', './/div[@class="_1VhUEi8g _2K4zZcBv"]/span/@class', MapCompose(self.obtenerCalificacion))
            yield item.load_item()