-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsf_warc_lister.py
123 lines (104 loc) · 4.74 KB
/
sf_warc_lister.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import warc
import urlparse
import argparse
import socialfeedtools.utils as utils
class ResponseRecord:
    """A WARC response record reduced to the fields the lister prints."""

    def __init__(self, record_id, record_url, date):
        """Store the record's id, target URL and date unchanged."""
        # All three values are kept exactly as passed in; no parsing or
        # validation happens here.
        self.record_id, self.record_url, self.date = record_id, record_url, date
class ApiResponseRecord(ResponseRecord):
    """A response record whose URL was recognised as a social media API call.

    On top of the base record fields it carries the service name, the API
    method that was invoked and the arguments passed to that method.
    """

    def __init__(self, record_id, record_url, date, service, api_method, api_args):
        """Store the base record fields plus the parsed API call details."""
        # Explicit base-class call (rather than super()) so this keeps working
        # whether or not the base is a new-style class.
        ResponseRecord.__init__(self, record_id, record_url, date)
        self.service, self.api_method, self.api_args = service, api_method, api_args
def parse_tumblr_url(url):
    """Split a Tumblr v2 API URL into service, API method and arguments.

    Example URL (credentials elided):
    http://api.tumblr.com/v2/blog/justinlittman-dev.tumblr.com/posts?limit=20&offset=0&api_key=...

    The path is expected to look like /v2/<resource>/<hostname>/<action>;
    this is enforced with assertions, so a differently shaped path raises
    AssertionError.

    Returns a tuple ("tumblr", "<resource>.<action>", api_args). api_args is
    the parsed query string (each value is a list of strings) without any
    oauth_* parameters or the api_key, and with the blog hostname from the
    path stored under "base-hostname".
    """
    split_url = urlparse.urlsplit(url)
    segments = split_url.path.split("/")
    assert len(segments) == 5
    assert segments[1] == "v2"
    api_method = ".".join((segments[2], segments[4]))

    # Keep only the query parameters that describe the request itself:
    # signing parameters (oauth_*) and the api_key are dropped.
    api_args = {
        name: values
        for name, values in urlparse.parse_qs(split_url.query).items()
        if not name.startswith("oauth_") and name != "api_key"
    }
    # The blog is part of the path rather than the query; record it as an arg.
    api_args["base-hostname"] = [segments[3]]
    return "tumblr", api_method, api_args
def parse_flickr_url(url):
    """Split a Flickr REST API URL into service, API method and arguments.

    Example URL:
    https://api.flickr.com/services/rest/?nojsoncallback=1&user_id=131866249%40N02&method=flickr.people.getInfo&format=json

    The query string must contain exactly one "method" parameter; this is
    enforced with assertions, so a violation raises AssertionError.

    Returns a tuple ("flickr", api_method, api_args). api_method is the
    "method" parameter with a leading "flickr." removed when present.
    api_args is the parsed query string (each value is a list of strings)
    without the method, nojsoncallback, format and secret parameters.
    """
    api_args = urlparse.parse_qs(urlparse.urlsplit(url).query)
    assert "method" in api_args
    assert len(api_args["method"]) == 1

    # Taking the method out of the dict also removes it from the arguments.
    api_method = api_args.pop("method")[0]
    prefix = "flickr."
    if api_method.startswith(prefix):
        api_method = api_method[len(prefix):]

    # Drop response-format plumbing and the secret; absent keys are ignored.
    for unwanted in ("nojsoncallback", "format", "secret"):
        api_args.pop(unwanted, None)
    return "flickr", api_method, api_args
def parse_twitter_url(url):
    """Split a Twitter REST 1.1 API URL into service, API method and arguments.

    Example URL:
    https://api.twitter.com/1.1/statuses/user_timeline.json?page=1&screen_name=justin_littman

    The path is expected to look like /1.1/<resource>/<action>[.json]; this
    is enforced with assertions, so a differently shaped path raises
    AssertionError.

    Returns a tuple ("twitter", "<resource>.<action>", api_args) where
    api_args is the parsed query string (each value is a list of strings).
    """
    split_url = urlparse.urlsplit(url)
    path_parts = split_url.path.split("/")
    assert len(path_parts) == 4
    assert path_parts[1] == "1.1"

    # Strip the ".json" extension only when it is really there; slicing off
    # a fixed five characters would mangle an extensionless action name.
    action = path_parts[3]
    extension = ".json"
    if action.endswith(extension):
        action = action[:-len(extension)]

    api_method = "%s.%s" % (path_parts[2], action)
    api_args = urlparse.parse_qs(split_url.query)
    return "twitter", api_method, api_args
# Maps a URL predicate to the parser for URLs that the predicate accepts.
# to_response_record() iterates over the keys, calls each with the record's
# URL and hands the URL to the associated parser on the first match.
# NOTE(review): the predicates come from socialfeedtools.utils; presumably each
# returns a truthy value when the URL belongs to that service's API -- confirm.
api_func_dict = {
utils.is_tumblr_url: parse_tumblr_url,
utils.is_flickr_url: parse_flickr_url,
utils.is_twitter_rest_url: parse_twitter_url
}
def to_response_record(record):
    """Build a ResponseRecord (or ApiResponseRecord) from a WARC record.

    The record's WARC-Target-URI is offered to every predicate registered in
    api_func_dict. The first predicate that accepts it selects the parser,
    and the result is an ApiResponseRecord carrying the parsed service, API
    method and arguments. When no predicate accepts the URL a plain
    ResponseRecord is returned.
    """
    header = record.header
    target_url = header["WARC-Target-URI"]

    for accepts, parse in api_func_dict.items():
        if not accepts(target_url):
            continue
        service, api_method, api_args = parse(target_url)
        return ApiResponseRecord(header.record_id, target_url, header.date,
                                 service, api_method, api_args)

    # Not a recognised API call.
    return ResponseRecord(header.record_id, target_url, header.date)
def list_records(filepath, services=()):
    """Print a summary of the response records contained in a WARC file.

    filepath -- path of the WARC file; it is opened with warc.open().
    services -- optional collection of service names used as a filter. When
                empty, every response record is listed. Otherwise an API
                record is listed when its service is in the collection and
                any other record is listed when "other" is in the collection.

    For each listed record the id, URL and date are printed; API records
    additionally get their service, API method and arguments printed.
    The file is closed even if reading it fails.
    """
    # Single-argument print(...) emits the same text whether print is a
    # statement or a function.
    print("File %s" % filepath)
    warc_file = warc.open(filepath)
    try:
        for record in warc_file:
            if record.type != 'response':
                continue

            resp_record = to_response_record(record)
            is_api = isinstance(resp_record, ApiResponseRecord)

            if services:
                # Non-API records are filed under the pseudo-service "other".
                label = resp_record.service if is_api else "other"
                if label not in services:
                    continue

            print("Record %s" % resp_record.record_id)
            print("Url: %s" % resp_record.record_url)
            print("Date: %s" % resp_record.date)
            if is_api:
                print("Service: %s" % resp_record.service)
                print("API method: %s (%s)" % (resp_record.api_method, resp_record.api_args))
    finally:
        warc_file.close()
if __name__ == '__main__':
    # Command line entry point: list the response records of one or more
    # WARC files, optionally restricted to the given services.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--services",
        help="A comma separated list of services to limit the results to. "
             "Services are: twitter, tumblr, flickr, other.")
    arg_parser.add_argument("filepath", nargs="+", help="Filepath of the warc.")
    cli_args = arg_parser.parse_args()

    # No --services (or an empty value) means "do not filter".
    if cli_args.services:
        selected_services = cli_args.services.split(",")
    else:
        selected_services = ()

    for warc_path in cli_args.filepath:
        list_records(warc_path, services=selected_services)