combine_praise_datasets.py
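"""Combine the scraped Token Engineering Commons praise data with the Google
Sheets praise-bot log for the 2021-11-01 to 2022-01-31 window, writing the
outer-joined result to joined_dataset.csv and the rows that only matched
partially to only_incomplete.csv."""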
import json

import pandas as pd

# Load the praise sample export; json.load returns the JSON object as a dictionary.
with open('praise_samples_810180621930070088_1.json') as f:
    data = json.load(f)

unchecked = []
# Load the two praise exports: the Discord scrape and the Google Sheets praise-bot log.
f_praiseDataScrape = open('Token_Engineering_Commons_praise_2.csv')
f_praiseDataGoogle = open('GoogleDiscord Praise Bot Sheet - Sheet1.csv')
praiseDataScrape = pd.read_csv(f_praiseDataScrape)
praiseDataGoogle = pd.read_csv(f_praiseDataGoogle)

# Skip the first row of the Google sheet, keep its first six columns, and
# align its column names with the scraped dataset.
praiseDataGoogle = praiseDataGoogle.iloc[1:, :6].copy()
praiseDataGoogle.rename(
    columns={'FROM': 'From', 'DATE': 'Date', 'ROOM': 'Channel', 'TO': 'To'},
    inplace=True)

# Keep only praise that was not given on the Commons Stack server.
praiseDataGoogle = praiseDataGoogle[praiseDataGoogle['SERVER'] != 'Commons Stack']
# Normalize different usernames that refer to the same person.
username_aliases = {
    'griff (💜, 💜)#8888': 'griff#3281',
    'Zeptimus (⏳,⏳)#3359': 'Zeptimus#3359',
    'Jolie_Ze#0295': 'aka_roro#0295',
}
praiseDataScrape = praiseDataScrape.replace(username_aliases)
praiseDataGoogle = praiseDataGoogle.replace(username_aliases)
print(praiseDataScrape.head())
print(praiseDataGoogle.head())

# Parse the date columns so both datasets can be filtered on the same window.
praiseDataScrape["Date"] = pd.to_datetime(praiseDataScrape['Date'])
praiseDataGoogle["Date"] = pd.to_datetime(praiseDataGoogle['Date'])

# Restrict both datasets to the same praise period: 2021-11-01 up to,
# but not including, 2022-01-31.
scrape_filtered = praiseDataScrape.loc[
    (praiseDataScrape['Date'] >= '2021-11-01') & (praiseDataScrape['Date'] < '2022-01-31')]
google_filtered = praiseDataGoogle.loc[
    (praiseDataGoogle['Date'] >= '2021-11-01') & (praiseDataGoogle['Date'] < '2022-01-31')]
# Outer-join the two datasets on sender, date, and recipient so that praise
# appearing in only one source is still kept.
new_df = pd.merge(scrape_filtered, google_filtered,
                  on=['From', 'Date', 'To'], how='outer')

# Rows with at least one missing value, i.e. entries that did not fully match
# across the two sources.
only_incomplete = new_df.loc[~new_df.index.isin(new_df.dropna().index)]
print(only_incomplete.head())
only_incomplete.to_csv("only_incomplete.csv", index=False, header=False)

print(new_df.head())
new_df.to_csv("joined_dataset.csv", index=False, header=False)

# Close the CSV file handles.
f_praiseDataScrape.close()
f_praiseDataGoogle.close()