-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawluser.py
226 lines (192 loc) · 6.08 KB
/
crawluser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from weibo import APIClient
import json
import random
import threading
import datetime
import time
import MySQLdb
import Queue
import os
import sys
import datetime
def timediff(timestart, timestop):
    """Format the time elapsed between two datetimes as ``"<days>天 HH:MM:SS"``.

    Args:
        timestart: ``datetime.datetime`` marking the start of the interval.
        timestop: ``datetime.datetime`` marking the end of the interval.

    Returns:
        A string such as ``"2天 01:02:03"``; sub-second precision is dropped.
    """
    elapsed = timestop - timestart
    # timedelta keeps whole days in .days and only the remainder (< 86400) in
    # .seconds, so the day count has to be read from .days, not derived from
    # .seconds.
    hours, remainder = divmod(elapsed.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "%d天 %02d:%02d:%02d" % (elapsed.days, hours, minutes, seconds)
# ---- API application settings ---------------------------------------------
APP_KEY = '1234567'  # app key
APP_SECRET = 'abcdefghijklmn'  # app secret
CALLBACK_URL = 'http://www.example.com/callback'  # callback url
expires_in = 99999999999

# Access tokens; crawler thread number i authenticates with tokenlist[i].
# NOTE(review): the first two entries are identical -- confirm this is
# intended, since two threads would then share one token.
tokenlist = [
    '2.00Z21hZC01klB_a5eeb2ccbewGBDSE',
    '2.00Z21hZC01klB_a5eeb2ccbewGBDSE',
    '2.00Z21hZCwRUA3D1ac7184899RDEsHD',
    '2.00Z21hZC9ononDb0371b97760MEIX6',
    '2.00Z21hZCc3QVwDfe836ce906V_5HBE',
    '2.00Z21hZCZIFV3Cdad8ac8b48ruTRRC',
    '2.00Z21hZCQUkaKC8d06e925fdcm_SIB',
    '2.00Z21hZCbv2fNCf81de27045IRMMIC',
    '2.00Z21hZCut1A9B6be8328459JVavOD',
    '2.00Z21hZC0RNIfCa4abbb84dctPdI_E',
]

# Month abbreviations in calendar order; list position + 1 is the month number.
month = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

# ---- State shared by all crawler threads ----------------------------------
userlist = []  # uids to crawl, filled in by the main program
now = total = error = crawlTime = 0  # next userlist index / progress counters
starttime = datetime.datetime.now()  # reference point for run time and quotas

# ---- Output channels ------------------------------------------------------
screenFile = sys.stdout  # remembers the real stdout while sys.stdout is swapped
# A previous revision logged to /home/ray/code/NewDB/LOG instead.
logFile = open('./LOG', 'w+')

# ---- Semaphores (each starts with a single permit) ------------------------
sp, sp2, sp3 = (threading.Semaphore(1) for _unused in range(3))
class ThreadCrawl(threading.Thread):
    """Worker thread that pages through user timelines and stores them in MySQL.

    Each worker authenticates with ``tokenlist[i]``, repeatedly claims the next
    uid from the shared ``userlist`` (guarded by the ``sp`` semaphore) and
    fetches that user's statuses 100 at a time. Rows go into the table
    ``data<uid % 10>``. A user is finished once a page is shorter than 100
    entries or a status older than ``CUTOFF`` is seen.

    The module-level counters ``now``, ``total``, ``error`` and ``crawlTime``
    are shared between all workers; only ``now`` is updated under ``sp``.
    """

    # Timestamps are kept as 'YYYY-MM-DD HH:MM:SS' strings, which sort
    # chronologically, so plain string comparison against this value works.
    CUTOFF = '2012-11-00 00:00:00'

    # Request budgets: every started hour since ``starttime`` adds this many
    # requests, and the margin is kept in reserve.
    THREAD_HOURLY_QUOTA = 150
    THREAD_QUOTA_MARGIN = 10
    TOTAL_HOURLY_QUOTA = 1000
    TOTAL_QUOTA_MARGIN = 50

    def __init__(self, i):
        """Create the worker.

        Args:
            i: index into ``tokenlist`` selecting this worker's access token.
        """
        self.i = i
        threading.Thread.__init__(self)
        print('%s  start!' % self.getName())

    @staticmethod
    def _hourly_quota(elapsed, per_hour, margin):
        """Return the request budget available after ``elapsed`` (a timedelta)."""
        started_hours = elapsed.days * 24 + elapsed.seconds // 3600 + 1
        return started_hours * per_hour - margin

    @staticmethod
    def _format_created_at(created_at):
        """Convert an API ``created_at`` value to 'YYYY-MM-DD HH:MM:SS'.

        The slicing assumes the year is the last four characters, the month
        abbreviation sits at [4:7], the day at [8:10] and the time at [11:19].
        """
        return "%s-%02d-%s %s" % (
            created_at[-4:],
            month.index(created_at[4:7]) + 1,
            created_at[8:10],
            created_at[11:19],
        )

    def _show_status(self, thisCrawl):
        """Redraw the progress display on the real stdout.

        Must be called while holding ``sp``: it swaps ``sys.stdout`` to the
        screen for the duration and leaves it pointing at the log file.
        """
        sys.stdout = screenFile
        os.system('clear')
        nowtime = datetime.datetime.now()
        elapsed = nowtime - starttime
        print("Run Time: %s" % timediff(starttime, nowtime))
        print("%0.4f%s\tnow:\t%d\ttot:\t%d"
              % ((now + 0.0) / len(userlist), '%', now, len(userlist)))
        print(self.getName() + '\ttotal:' + str(total))
        print('crawlTime:  %s' % crawlTime)
        seconds_left = 3600 - elapsed.seconds % 3600
        # These two notices stay on the same line as the ERROR counter below.
        if thisCrawl > self._hourly_quota(
                elapsed, self.THREAD_HOURLY_QUOTA, self.THREAD_QUOTA_MARGIN):
            sys.stdout.write('\r%s Limit!wait %d seconds~~ '
                             % (self.getName(), seconds_left))
        if crawlTime > self._hourly_quota(
                elapsed, self.TOTAL_HOURLY_QUOTA, self.TOTAL_QUOTA_MARGIN):
            sys.stdout.write('\rTotal Limit! wait %d seconds~~ ' % seconds_left)
        print('ERROR:  %s' % error)
        sys.stdout = logFile

    def _wait_for_quota(self, thisCrawl):
        """Sleep until both the per-thread and the global budget allow a request."""
        while thisCrawl > self._hourly_quota(
                datetime.datetime.now() - starttime,
                self.THREAD_HOURLY_QUOTA, self.THREAD_QUOTA_MARGIN):
            time.sleep(1)
        while crawlTime > self._hourly_quota(
                datetime.datetime.now() - starttime,
                self.TOTAL_HOURLY_QUOTA, self.TOTAL_QUOTA_MARGIN):
            time.sleep(1)

    def _crawl(self, client, con, cur):
        """Claim users one by one and store their statuses until none are left."""
        global now, total, error, crawlTime

        need_user = True   # True when the current user is done
        tu = 0             # uid currently being crawled
        ti = 0             # last digit of that uid; selects the data<ti> table
        thisCrawl = 0      # requests issued by this thread

        while True:
            sp.acquire()
            try:
                if need_user:
                    if now >= len(userlist):
                        # Leaving through ``finally`` releases ``sp``; the
                        # original loop exited while still holding it, which
                        # blocked every other worker forever. Exiting only
                        # when a *new* user is needed also lets the last
                        # claimed users finish instead of being cut short.
                        break
                    tu = userlist[now]
                    now += 1
                    ti = tu % 10
                    need_user = False
                self._show_status(thisCrawl)
            finally:
                sp.release()

            self._wait_for_quota(thisCrawl)

            # Resume from the smallest status id already stored for this user.
            cur.execute('select min(id) from data%d where uid = %%s' % ti, (tu,))
            con.commit()
            oldest_id = cur.fetchone()[0]
            maxId = 0
            minData = '2013-12-12 00:00:00'
            if oldest_id is not None:
                maxId = oldest_id
                cur.execute(
                    'select createdAt from data%d where id = %%s' % ti, (maxId,))
                con.commit()
                minData = str(cur.fetchone()[0])
                print(minData)
            if minData < self.CUTOFF:
                need_user = True
                print('从数据库中读出的消息比11月的早,退出!!!')
                continue

            try:
                messages = client.statuses.user_timeline.get(
                    uid=tu, count=100, max_id=maxId)
            except Exception as exc:
                # Only ordinary errors are treated as a failed request, so
                # KeyboardInterrupt / SystemExit still propagate.
                # NOTE(review): the same user is retried without bound.
                error += 1
                sys.stdout = screenFile
                print(self.getName() + ' HTTP ERROR!!' + str(error)
                      + ' uid: ' + str(tu) + ' max_id: ' + str(maxId)
                      + ' reason: ' + repr(exc))
                crawlTime += 1
                thisCrawl += 1
                continue

            crawlTime += 1
            thisCrawl += 1
            statuses = messages['statuses']
            if len(statuses) < 100:
                need_user = True
                print('这次抓取到的消息数目为%d,少于100,退出!' % len(statuses))

            # The table name cannot be a bound parameter, but ``ti`` is an
            # integer computed locally; all values are passed as parameters so
            # the driver does the quoting.
            insert_sql = (
                'insert ignore into data%d values (%%s, %%s, %%s, %%s, %%s)' % ti)
            for message in statuses:
                total += 1
                dt = self._format_created_at(message['created_at'])
                if dt < self.CUTOFF:
                    need_user = True
                    print('这次抓取到了%s,在11月之前的消息,退出!' % dt)
                    break
                if 'deleted' in message:
                    message['reposts_count'] = -1
                print('insert ignore into data%d: id=%d uid=%d createdAt=%s '
                      'reposts=%d text=%s'
                      % (ti, message['id'], tu, dt,
                         message['reposts_count'], message['text']))
                cur.execute(insert_sql, (message['id'], tu, message['text'],
                                         dt, message['reposts_count']))
            con.commit()

    def run(self):
        """Connect to MySQL and the API, crawl, and always signal completion."""
        print(self.getName())
        queue.get()
        con = None
        try:
            con = MySQLdb.connect(host='localhost', user='root', passwd='131',
                                  charset='utf8')
            con.ping(True)
            # Select the database.
            con.select_db('nwb')
            # Obtain a cursor.
            cur = con.cursor()
            cur.execute("SET NAMES utf8")
            cur.execute("SET CHARACTER_SET_CLIENT=utf8")
            cur.execute("SET CHARACTER_SET_RESULTS=utf8")
            con.commit()

            client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET,
                               redirect_uri=CALLBACK_URL)
            client.set_access_token(tokenlist[self.i], expires_in)
            self._crawl(client, con, cur)
        finally:
            # Runs even when the crawl raises, so queue.join() in the main
            # program cannot hang on a crashed worker.
            if con is not None:
                con.close()
            queue.task_done()
###### Main program ######
# Load every uid to crawl, then start one crawler thread per access token and
# wait until all of them have reported completion.
conn = MySQLdb.connect(host='localhost', user='root', passwd='131', charset='utf8')
try:
    conn.ping(True)
    cursor = conn.cursor()
    try:
        conn.select_db('nwb')
        cursor.execute("SET NAMES utf8")
        cursor.execute("SET CHARACTER_SET_CLIENT=utf8")
        cursor.execute("SET CHARACTER_SET_RESULTS=utf8")
        cursor.execute('select * from user order by uid desc')
        conn.commit()
        # NOTE(review): relies on the first column of ``user`` being the uid.
        for row in cursor.fetchall():
            userlist.append(row[0])
    finally:
        # Close the cursor before its connection (the original closed it after).
        cursor.close()
finally:
    # The workers open their own connections, so this one is no longer needed.
    conn.close()

# Each worker indexes tokenlist with its number, so the worker count follows
# the number of tokens instead of a hard-coded 10.
worker_count = len(tokenlist)
queue = Queue.Queue()
for i in range(worker_count):
    queue.put(i)
for i in range(worker_count):
    t = ThreadCrawl(i)
    t.setDaemon(True)
    t.start()
queue.join()

# The workers leave sys.stdout pointing at the log file; restore the real
# stdout before closing the log. (The original called close() on an undefined
# name ``crawledUser`` here, which raised NameError.)
sys.stdout = screenFile
logFile.close()