-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjson2xml_reddit.py
210 lines (146 loc) · 6.73 KB
/
json2xml_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# python json2xml_reddit.py reddit_data/reddit_openingposts/RS_2014-12 reddit_data/2014/RC_2014-12.1000000
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import json
import dicttoxml
import re
import fileinput
from collections import defaultdict
class Thread:
    """A discussion thread: an opening post followed by its comments.

    Attributes:
        threadid: Identifier of the thread (the script passes the submission id).
        posts: The posts of the thread, in the order they were added.
        category: Category label (the script passes the subreddit name).
        ttype: Thread type; stored but not written to the XML output.
        title: Title of the thread.
    """

    def __init__(self, threadid, category, ttype, title):
        self.threadid = threadid
        self.posts = []
        self.category = category
        self.ttype = ttype
        self.title = title

    def addPost(self, post):
        """Append ``post`` to the thread.

        ``post`` must offer a ``printXML(out)`` method, because
        ``Thread.printXML`` calls it for every stored post.
        """
        self.posts.append(post)

    def getNrOfPosts(self):
        """Return the number of posts added to the thread so far."""
        return len(self.posts)

    def printXML(self, out):
        """Write the thread, including all of its posts, as XML to ``out``.

        Args:
            out: Object with a ``write(text)`` method, e.g. an open file.

        The id, category and title are XML-escaped so that characters such
        as ``&`` and ``<`` in a title cannot produce malformed XML.
        """
        # Function-scope import: keeps this class independent of the
        # module-level import block.
        from xml.sax.saxutils import escape

        out.write("".join([
            # The id is an attribute value, so double quotes are escaped too.
            "<thread id=\"", escape(self.threadid, {'"': "&quot;"}), "\">\n",
            "<category>", escape(self.category), "</category>\n",
            "<title>", escape(self.title), "</title>\n",
            "<posts>\n",
        ]))
        for post in self.posts:
            post.printXML(out)
        out.write("</posts>\n</thread>\n")
class Post:
    """A single post in a thread: the opening post or a comment.

    Attributes:
        postid: Identifier of the post.
        author: Name of the author.
        timestamp: Creation time; written to the XML via ``str()``.
        body: Text of the post.
        parentid: Identifier of the post this one replies to (the script
            passes an empty string for opening posts).
        ups: Number of upvotes.
        downs: Number of downvotes.
    """

    def __init__(self, postid, author, timestamp, body, parentid, ups, downs):
        self.postid = postid
        self.author = author
        self.timestamp = timestamp
        self.body = body
        self.parentid = parentid
        self.ups = ups
        self.downs = downs

    def printXML(self, out):
        """Write this post as a ``<post>`` XML element to ``out``.

        Args:
            out: Object with a ``write(text)`` method, e.g. an open file.

        The id, author, parent id and body are XML-escaped; post bodies are
        free-form text and would otherwise break the XML as soon as they
        contain ``&`` or ``<``.
        """
        # Function-scope import: keeps this class independent of the
        # module-level import block.
        from xml.sax.saxutils import escape

        out.write("".join([
            # The id is an attribute value, so double quotes are escaped too.
            "<post id=\"", escape(self.postid, {'"': "&quot;"}), "\">\n",
            "<author>", escape(self.author), "</author>\n",
            "<timestamp>", str(self.timestamp), "</timestamp>\n",
            "<parentid>", escape(self.parentid), "</parentid>\n",
            "<body>", escape(self.body), "</body>\n",
            "<upvotes>", str(self.ups), "</upvotes>\n",
            "<downvotes>", str(self.downs), "</downvotes>\n",
            "</post>\n",
        ]))
######### READING RS FILE: FILE WITH OPENING POSTS (SUBMISSIONS) ###########
# The first command-line argument names the submissions file; every line of
# it is parsed as one JSON object describing an opening post.
json_S_file = sys.argv[1]
sys.stderr.write("Reading "+json_S_file+"\n")

thread_ids = set()  # ids of the threads for which we have the opening post
threads = dict()  # thread id -> Thread object

i = 0  # number of lines read so far; stays 0 when the file is empty
for i, line in enumerate(fileinput.input([json_S_file]), 1):
    # Progress report on stderr every 10000 lines.
    if i % 10000 == 0:
        sys.stderr.write(str(i)+" lines read\n")

    submission = json.loads(line)
    thread_id = submission['id']
    subreddit = submission['subreddit']
    title = submission['title']

    # Register the thread under its id; the thread type is left empty.
    thread = Thread(thread_id, subreddit, "", title)
    thread_ids.add(thread_id)
    threads[thread_id] = thread

    # The opening post becomes the first post of its own thread. It has no
    # parent, and a missing author is stored as the empty string.
    author = submission.get('author', "")
    timestamp = submission['created_utc']
    body = submission['selftext']
    parent_id = ""
    downs = submission['downs']
    ups = submission['ups']
    thread.addPost(Post(thread_id, author, timestamp, body, parent_id, ups, downs))
######### READING RC FILE: FILE WITH COMMENTS ###########
# The second command-line argument names the comments file; every line of it
# is parsed as one JSON object describing a comment.
json_C_file = sys.argv[2]
sys.stderr.write("Reading "+json_C_file+"\n")

postcountperthread = dict()  # key is thread_id, value is # of posts in thread
jsonforlinenr = dict()  # key is line number (starting at 1), value is parsed json

# For efficiency reasons, we first read the comments file once and count the
# nr of posts per thread (the posts are ordered by timestamp, not by thread).
# When we then go over all comments in a second round, we can print a thread
# as soon as we have seen all of its posts.
i = 0  # nr of comment lines read
submissionpresent = 0  # nr of comments for which we have the opening post
for json_string in fileinput.input([json_C_file]):
    i = i + 1
    if i % 10000 == 0:
        sys.stderr.write(str(i)+" lines read\n")
    parsed_json = json.loads(json_string)
    # Every comment is kept, also those without a known submission, because
    # the second round looks comments up by line number.
    jsonforlinenr[i] = parsed_json
    # link_id carries a type prefix such as "t3_"; strip it to get the thread id.
    thread_id = re.sub("t[0-9]_", "", parsed_json['link_id'])
    if thread_id in thread_ids:
        submissionpresent = submissionpresent + 1
        if submissionpresent % 1000 == 0:
            sys.stderr.write(str(submissionpresent)+" comments found for which we have the original submission\n")
        # The opening post is the first post, so the first comment brings the
        # count to 2. Looking the key up in the dict itself (instead of in
        # .keys()) avoids building a list of all keys for every comment
        # under Python 2.
        postcountperthread[thread_id] = postcountperthread.get(thread_id, 1) + 1

# Guard against an empty comments file, which would otherwise divide by zero.
if i > 0:
    percwithsubmission = 100 * float(submissionpresent)/float(i)
else:
    percwithsubmission = 0.0
sys.stderr.write("Total nr of comments: "+str(i)+". For "+str(percwithsubmission)+"% of the comments we have the original submission\n")
######### SECOND ROUND OVER THE COMMENTS: BUILD THE THREADS AND PRINT COMPLETE ONES ###########
# Walks over the comments stored in jsonforlinenr, attaches each comment to
# its thread and writes a thread to
# reddit_data/per_subreddit/<subreddit>/<thread_id>.xml as soon as it holds
# as many posts as were counted for it in postcountperthread.
sys.stderr.write("Parsing and printing comment threads\n")
# Comment line numbers run from 1 up to and including i, hence i+1 here;
# stopping at i would drop the last comment and leave its thread unprinted.
for j in range(1, i+1):
    if j % 100 == 0:
        sys.stderr.write(str(j)+" items of "+str(i)+" parsed\n")
    parsed_json = jsonforlinenr[j]
    thread_id = re.sub("t[0-9]_", "", parsed_json['link_id'])
    if thread_id not in thread_ids:
        # We only add a post to a thread if we have the opening post
        # (from the RS file).
        continue

    subreddit = parsed_json['subreddit']
    # A missing author becomes the empty string, as for the opening posts.
    author = parsed_json.get('author', "")
    timestamp = parsed_json['created_utc']
    post_id = parsed_json['id']
    body = parsed_json['body'].encode('utf-8').strip()
    # parent_id carries a type prefix such as "t1_" or "t3_"; strip it.
    parent_id = re.sub("t[0-9]_", "", parsed_json['parent_id'])
    downs = parsed_json['downs']
    ups = parsed_json['ups']
    post = Post(post_id, author, timestamp, body, parent_id, ups, downs)
    thread = threads[thread_id]
    thread.addPost(post)

    sys.stderr.write("TOTAL nr of posts in thread "+thread_id+":"+str(postcountperthread[thread_id])+"\n")
    sys.stderr.write("CURRENT nr of posts in thread: "+thread_id+":"+str(thread.getNrOfPosts())+"\n")
    if thread.getNrOfPosts() >= postcountperthread[thread_id]:
        # All posts counted for this thread have been seen: write it out.
        sys.stderr.write("print thread "+thread_id+"\n")
        outdir = "reddit_data/per_subreddit/"+subreddit
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        # The with-statement closes the file even when writing fails.
        with open(outdir+"/"+thread_id+".xml", "w") as out:
            out.write("<?xml version=\"1.0\"?>\n")
            out.write("<forum type=\"reddit\">\n")
            thread.printXML(out)
            out.write("</forum>\n")