#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: spider_bai_si_bu_de_jie.py
@time: 2018/9/25 19:58
@description: Crawl the text and images of the budejie.com ("Bai Si Bu De
              Jie") hot list with multiple threads; the text is written to
              a CSV file and the images are downloaded locally.
"""
import csv
import os
import threading
import time
from queue import Queue, Empty
from urllib import request

import requests
from lxml import etree
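
# `fileutils` is a project-local helper module from the original repo; only
# its get_file_suffix() function is used. The except-branch below is a
# minimal fallback sketch (an assumption, not the original helper): it takes
# the extension of the image URL's path, e.g. '.jpg'.
try:
    import fileutils
except ImportError:
    from os.path import splitext
    from urllib.parse import urlparse

    class fileutils:
        @staticmethod
        def get_file_suffix(url):
            return splitext(urlparse(url).path)[1]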

# A desktop Chrome User-Agent plus a Referer from the hot list, so requests
# look like ordinary browser traffic.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'http://www.budejie.com/hot/1'
}


class BSSpider(threading.Thread):
    """
    Producer: crawls one page of results at a time.
    """

    def __init__(self, page_queue, joke_queue, name, *args, **kwargs):
        super(BSSpider, self).__init__(*args, **kwargs)
        # 1. Store the shared queues and a readable thread name
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.name = name

    def run(self):
        while True:
            # 2. Exit the loop once the page queue has been drained
            if self.page_queue.empty():
                print(self.name + ' finished')
                # while not self.joke_queue.empty():
                #     print(self.joke_queue.get())
                break
            # 3. Take the next page URL off the queue and crawl it
            page_url = self.page_queue.get()
            self.spider_page(page_url)
            # 6. Sleep 0.5 s between pages to stay polite to the server
            time.sleep(0.5)

    def spider_page(self, page_url):
        """
        Crawl a single page of data.
        :param page_url: URL of the page
        :return:
        """
        response = requests.get(page_url, headers=HEADERS)
        text_raw = response.text
        html_element = etree.HTML(text_raw)
        # 4. Parse the page with XPath
        div_elements = html_element.xpath('//div[@class="j-r-list"]')
        for div_element in div_elements:
            duan_zi_elements = div_element.xpath('./ul/li')
            for duan_zi_element in duan_zi_elements:
                # [field] username
                username = duan_zi_element.xpath('.//a[@class="u-user-name"]/text()')[0]
                # [field] publish time of the joke
                pubtime = duan_zi_element.xpath('.//span/text()')[0]
                desc_element = duan_zi_element.xpath('.//div[@class="j-r-list-c-desc"]')[0]
                # [field] joke text
                content = desc_element.xpath('./a/text()')[0]
                img_div_element = duan_zi_element.xpath('.//div[@class="j-r-list-c-img"]')[0]
                img = img_div_element.xpath('.//img/@data-original')[0]
                alt = img_div_element.xpath('.//img/@alt')[0]
                # 5. Put the parsed fields onto the queue as one tuple
                self.joke_queue.put((username, content, img, alt, pubtime))


class BSWriter(threading.Thread):
    """
    Consumer: writes the text fields of each joke to the CSV file.
    """

    def __init__(self, page_queue, joke_queue, writer, gLock, name, *args, **kwargs):
        super(BSWriter, self).__init__(*args, **kwargs)
        # 1. Store the shared queues, CSV writer, lock, and thread name
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.writer = writer
        self.gLock = gLock
        self.name = name

    def run(self):
        while True:
            if self.joke_queue.empty() and self.page_queue.empty():
                print(self.name + ' finished')
                break
            # 2. Take one parsed joke off joke_queue; the emptiness check
            #    above is racy, so guard the blocking get() with a timeout
            try:
                joke_info = self.joke_queue.get(timeout=40)
            except Empty:
                break
            username, content, img, alt, pubtime = joke_info
            # 3. Acquire the lock: the CSV writer is shared between threads
            self.gLock.acquire()
            # 4. Write one row to the CSV file
            self.writer.writerow((username, content, img, alt, pubtime))
            # 5. Image downloading moved to BSDownImg (kept for reference):
            # file_name = alt + fileutils.get_file_suffix(img)
            # request.urlretrieve(img, './imgs/%s' % file_name)
            # 6. Release the lock
            self.gLock.release()
            print('Wrote one row to the CSV file')


class BSDownImg(threading.Thread):
    """
    Consumer: downloads the image of each joke.
    """

    def __init__(self, page_queue, joke_queue, gLock, name, *args, **kwargs):
        super(BSDownImg, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.gLock = gLock
        self.name = name

    def run(self):
        while True:
            if self.joke_queue.empty() and self.page_queue.empty():
                print(self.name + ' finished')
                break
            # Guard the blocking get() in case the queue drains first
            try:
                username, content, img, alt, pubtime = self.joke_queue.get(timeout=40)
            except Empty:
                break
            # Download the image while holding the lock (note: this also
            # serializes downloads across all BSDownImg threads)
            self.gLock.acquire()
            file_name = alt + fileutils.get_file_suffix(img)
            request.urlretrieve(img, './imgs/%s' % file_name)
            self.gLock.release()
            print('Downloaded one image')
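
# Design note: image downloading originally lived inside BSWriter (see the
# commented-out lines there) and appears to have been split into the dedicated
# BSDownImg threads above so slow downloads do not stall the CSV writers.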


def spider():
    """
    Crawl the first 10 pages of the budejie.com hot list.
    :return:
    """
    # 1. Build the queues shared by producers and consumers
    page_queue = Queue(20)
    joke_queue = Queue(200)
    # 2. One lock protects the shared CSV writer (and the image downloads)
    gLock = threading.Lock()
    # 3. Open the output CSV file (it stays open on purpose: the writer
    #    threads outlive this function)
    fp = open('jokes.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(fp)
    # 4. Write the CSV header row
    writer.writerow(['username', 'content', 'img', 'alt', 'pubtime'])
    # Make sure the image output directory exists before any download starts
    os.makedirs('./imgs', exist_ok=True)
    # 5. Queue up the URLs of the first 10 pages
    for page_num in range(1, 11):
        page_url = 'http://www.budejie.com/hot/%d' % page_num
        page_queue.put(page_url)
    # 6. Start 5 producer threads to crawl the pages
    for x in range(1, 6):
        t = BSSpider(page_queue, joke_queue, name='Producer-%d' % x)
        t.start()
    # 7. Start 20 consumer threads to write rows to the CSV file
    for x in range(1, 21):
        t = BSWriter(page_queue, joke_queue, writer, gLock, name='CSV-Consumer-%d' % x)
        t.start()
    # 8. Start 50 consumer threads to download the images
    for x in range(1, 51):
        t = BSDownImg(page_queue, joke_queue, gLock, name='Image-Consumer-%d' % x)
        t.start()


if __name__ == '__main__':
    spider()
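
# Usage sketch (assumes Python 3 with requests and lxml installed):
#   pip install requests lxml
#   python spider_bai_si_bu_de_jie.py
# Output: text rows are appended to ./jokes.csv and images are saved under
# ./imgs/. The site dates from 2018, so the URLs may no longer be reachable.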