-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathplot_messages.py
139 lines (124 loc) · 5.28 KB
/
plot_messages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
This script analyses the message data downloaded from Facebook and shows the trends in how
messages are received by a person, along with a visualization. The graphs are shown one at a time: close one graph
to see the next. Download the messages data from Facebook and pass the path of the messages directory to the script.
Command:
python plot_messages.py --msg ./messages
Author:
Farhaan Bukhsh <[email protected]>
Anubhav Singh
"""
import pandas as pd
import json
import os
import datetime
import matplotlib.pyplot as plt
import operator
import argparse
import re
class FacebookMessageAnalyser:
    """Analyse and visualize the message data of a Facebook export.

    Calling an instance with the path of a messages directory loads every
    conversation found there, counts the messages per year, per month number
    and per day of the month, and shows one cumulative plot per aggregate.
    """

    # Names recognised as message files: "message.json" or
    # "message_<digits>.json". Used with fullmatch so that names such as
    # "message.json.bak" are not mistaken for message files.
    _MESSAGE_FILE_PATTERN = re.compile(r'message(_\d+)?\.json')

    def __init__(self):
        self.conversations = []
        self.all_messages = []
        self.monthly_aggregate = None
        self.yearly_aggregate = None
        self.daily_aggregate = None
        # Fallback file name: only used by populate_all_messages for a
        # conversation directory that holds no file matching the pattern.
        self.message_filename = "message.json"

    def __call__(self, messages_dir):
        """
        :param messages_dir: The location of the messages directory
        Run the whole pipeline: discover the conversations, load their
        messages, build the aggregates and show the three plots in turn.
        """
        self.conversations = self.get_all_conversation(messages_dir)
        self.populate_all_messages(messages_dir)
        self.create_manipulate_dataframes()
        self.visualize_data(self.monthly_aggregate, "Number of Months",
                            "Cummulative Messages", "Cummulative Monthly Messages", "Monthly Plot")
        self.visualize_data(self.daily_aggregate, "Number of Days",
                            "Cummulative Messages", "Cummulative Daily Messages", "Daily Plot")
        self.visualize_data(self.yearly_aggregate, "Years", "Cummulative Messages",
                            "Cummulative Yearly Messages", "Yearly Plot")

    def _message_files(self, convo_dir):
        """
        :param convo_dir: The path of one conversation directory
        :returns: the sorted names of the message files in that directory
        """
        return sorted(
            name for name in os.listdir(convo_dir)
            if self._MESSAGE_FILE_PATTERN.fullmatch(name)
            and os.path.isfile(os.path.join(convo_dir, name))
        )

    def get_all_conversation(self, messages_dir):
        """
        :param messages_dir: The location of the messages directory
        :returns: a sorted list of the sub-directory names (conversations)
                  that contain at least one message file
        Every file of a conversation directory is checked, so the result does
        not depend on the order in which the operating system lists files.
        """
        conversations = []
        for entry in sorted(os.listdir(messages_dir)):
            convo_dir = os.path.join(messages_dir, entry)
            if not os.path.isdir(convo_dir):
                continue
            message_files = self._message_files(convo_dir)
            if message_files:
                # Kept for backward compatibility with code reading this
                # attribute; loading no longer relies on a single file name.
                self.message_filename = message_files[0]
                conversations.append(entry)
        return conversations

    def populate_all_messages(self, messages_dir):
        """
        :param messages_dir: The location of the messages directory
        Append the messages of every conversation in self.conversations to
        self.all_messages. All message files of a conversation are read; when
        a directory has no file matching the message-file pattern, the name
        in self.message_filename is tried instead.
        """
        for convo in self.conversations:
            convo_dir = os.path.join(messages_dir, convo)
            filenames = self._message_files(convo_dir) or [self.message_filename]
            for filename in filenames:
                path = os.path.join(convo_dir, filename)
                # Explicit encoding so loading does not depend on the locale.
                with open(path, encoding="utf-8") as msg_json_f:
                    msg_json = json.load(msg_json_f)
                self.all_messages.extend(msg_json["messages"])

    def create_manipulate_dataframes(self):
        """
        Build the yearly, monthly and daily aggregates from self.all_messages.
        Each aggregate is a pandas Series mapping a year, a month number or a
        day of the month (taken from the local time of "timestamp_ms") to the
        number of messages. Without any message the aggregates are empty
        Series.
        """
        if not self.all_messages:
            self.yearly_aggregate = pd.Series(dtype="int64")
            self.monthly_aggregate = pd.Series(dtype="int64")
            self.daily_aggregate = pd.Series(dtype="int64")
            return
        msgdf = pd.DataFrame.from_dict(self.all_messages)
        # Work on a copy so that adding columns never touches a view.
        msgdf = msgdf[["timestamp_ms", "sender_name"]].copy()
        # timestamp_ms is in milliseconds; fromtimestamp expects seconds.
        msgdf["time"] = msgdf["timestamp_ms"].apply(
            lambda x: datetime.datetime.fromtimestamp(x / 1000))
        msgdf["year"] = msgdf["time"].dt.year
        msgdf["month"] = msgdf["time"].dt.month
        msgdf["day"] = msgdf["time"].dt.day
        self.yearly_aggregate = msgdf["year"].value_counts()
        self.monthly_aggregate = msgdf["month"].value_counts()
        self.daily_aggregate = msgdf["day"].value_counts()

    def cumulative_list(self, lists):
        """
        :param lists: The list of values that has to be accumulated
        :returns: The list of running totals, e.g. [1, 2, 3] -> [1, 3, 6]
        Turn the discrete values into cumulative values in a single pass.
        """
        cu_list = []
        running_total = 0
        for value in lists:
            running_total += value
            cu_list.append(running_total)
        return cu_list

    def _cumulative_points(self, visualize_points):
        """
        :param visualize_points: A pandas Series of counts indexed by x value
        :returns: a tuple (x, y) of tuples, x in ascending order and y the
                  running total of the counts along x
        The counts are ordered by x before they are accumulated; accumulating
        first and sorting afterwards would scramble the running totals.
        """
        ordered = visualize_points.sort_index()
        x = tuple(ordered.index.tolist())
        y = tuple(self.cumulative_list(ordered.tolist()))
        return x, y

    def visualize_data(self, visualize_points, xlable, ylable, title, msg):
        """
        :param visualize_points: A pandas Series of counts indexed by x value
        :param xlable: The label of the x axis
        :param ylable: The label of the y axis
        :param title: The title of the plot
        :param msg: A message printed before the plot is shown
        Plot the cumulative counts and show the labelled graph.
        """
        print(msg)
        x, y = self._cumulative_points(visualize_points)
        plt.xlabel(xlable)
        plt.ylabel(ylable)
        plt.title(title)
        plt.plot(x, y)
        plt.show()
if __name__ == "__main__":
    # Command-line entry point: take the messages directory from --msg, or
    # ask for the location of the extracted archive when it is not given.
    parser = argparse.ArgumentParser()
    parser.add_argument('--msg', help="Message directory location")
    args = parser.parse_args()
    if args.msg is None:
        loc = input('Enter facebook archive extracted location: ')
        # currently only focused on inbox
        loc = os.path.join(loc, "messages", "inbox")
    else:
        loc = args.msg
    if not os.path.isdir(loc):
        print("The provided location doesn't seem to be right")
        # SystemExit instead of exit(): exit() is added by the site module
        # for interactive use and is not guaranteed to exist in a program.
        raise SystemExit(1)
    facebook_analysis = FacebookMessageAnalyser()
    facebook_analysis(loc)