-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcuration_backend.py
219 lines (196 loc) · 14.6 KB
/
curation_backend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# uvicorn curation_backend:app --reload --port 5000
import copy
import json
from utils import get_config, join_sets, load_model, print_log, save_model, load_model_dict, record_existing_votes
from collections import Counter, defaultdict
import random
import numpy as np
import pandas as pd
from superdebug import debug
from process_data import get_model_input
from model import get_best_model
import time
import re
from train import evaluate_model, train_model
from fastapi import FastAPI
# FastAPI application object; the header comment shows the uvicorn command used to serve it.
app = FastAPI()

# Load the deployment configuration once at import time.
CONFIG_PATH = "configs/deploy_CURIO_full_data.yml"
print(CONFIG_PATH)
config = get_config(CONFIG_PATH, "_curation", print_config = False)
batch_size = config["batch_size"]
upvote_confidence_thres = config["upvote_confidence_thres"] # 0.5

# Build the model inputs and restore the best model; it is only switched to eval mode here,
# the endpoints below fine-tune it one step at a time.
target, original_feature_map, categorical_features, string_features, train_data, test_data, test_data_info, train_submission_upvote_df, num_all_users = get_model_input(config)
model, token_embedding = get_best_model(config, categorical_features, string_features, original_feature_map)
model = model.to(model.device); model.eval()
all_users = list(range(max(max(train_data["USERNAME"]), max(test_data["USERNAME"])) + 1))

# Collect the votes already present in the data and keep only the non-zero ones.
existing_votes, _, _, _, _ = record_existing_votes(pd.concat([train_data, test_data], axis=0))
for user_submission_id in list(existing_votes.keys()):  # iterate over a copy: entries are deleted in the loop
    if existing_votes[user_submission_id] == 0:
        del existing_votes[user_submission_id]
# existing_votes: {"user-submission_id": 1}

# Inverse of original_feature_map["USERNAME"]: its values become the keys of this lookup.
inverse_username_map = {original_feature_map["USERNAME"][username]:username for username in original_feature_map["USERNAME"].keys()}

# Community -> list of curator usernames; the context manager closes the file handle deterministically.
with open("data/Curio/community_curators.json", "r") as curators_file:
    community_curators = json.load(curators_file)
def construct_submission_info(submission_id, community, author, title, content, username = "", vote = 1.0):
    """Build one submission record (a ``pd.Series``) for a single post.

    Args:
        submission_id: Identifier stored under ``SUBMISSION_ID``.
        community: Community name; spaces are replaced by ``|`` and the result is prefixed with ``r/``.
        author: Either a username string (looked up in ``inverse_username_map``) or an
            already-encoded value, which is stored unchanged.
        title: Post title.
        content: Post body; when empty, ``SUBMISSION_TEXT`` is the title alone.
        username: Voting user's name; the empty string (default) is stored as ``0``.
        vote: Value stored under ``VOTE`` (defaults to ``1.0``).

    Returns:
        A ``pd.Series`` with the submission fields; counters and vote lists are
        filled with fixed placeholder values.

    Raises:
        KeyError: If ``username`` or a string ``author`` is missing from ``inverse_username_map``.
    """
    # time.ctime() includes an "HH:MM:SS " clock part; strip it so only the date remains.
    created_time = re.sub(r"[0-9][0-9]:[0-9][0-9]:[0-9][0-9] ", "", time.ctime(time.time()))
    # isinstance (not type(...) == str) so str subclasses such as numpy.str_ are also mapped.
    encoded_author = inverse_username_map[author] if isinstance(author, str) else author
    return pd.Series({
        'SUBMISSION_ID': submission_id,
        'SUBREDDIT': f"r/{community.replace(' ', '|')}",
        'CREATED_TIME': created_time,
        'USERNAME': 0 if username == "" else inverse_username_map[username],
        'VOTE': vote,
        'TITLE': title,
        'AUTHOR': encoded_author,
        '#_COMMENTS': 0,
        'NSFW': 'false',
        'SCORE': 0,
        'UPVOTED_%': 0.5,
        'LINK': '',
        'SUBMISSION_TEXT': (title + " [SEP] " + content) if content != "" else title,
        'UPVOTED_USERS': [],
        'DOWNVOTED_USERS': []}
    )
def predict_curator_upvote_rate_with_model(community: str, author: str, title: str, content: str, submission_id: str, model, new_existing_votes = None):
    """Return the fraction of a community's curators expected to upvote a post.

    For every curator of ``community`` the vote is taken from ``new_existing_votes``
    when a ``"<user>-<submission_id>"`` entry exists (counted as an upvote when the
    stored value is >= 0.5); otherwise it is predicted with ``model`` and counted as
    an upvote when the score is >= ``upvote_confidence_thres``.

    Args:
        community: Community name (with spaces); looked up in ``community_curators``
            with spaces replaced by ``|``.
        author, title, content, submission_id: Post fields forwarded to
            ``construct_submission_info``.
        model: Model passed to ``evaluate_model``.
        new_existing_votes: Optional vote table to use instead of the module-level
            ``existing_votes``.

    Returns:
        Upvote rate in [0, 1] as a float. ``1.0`` when the community has no known
        curators or its name contains ``"TEST"`` / ``"test"``.
    """
    # Curation threshold: if we set it as one, then the posts posted by a curator will always be curated since they will automatically receive one curator upvote. And if we set it as two, then the posts posted by a curator and echoed by another curator will also always get curated.
    if new_existing_votes is None:
        new_existing_votes = existing_votes
    submission_info = construct_submission_info(submission_id, community, author, title, content)

    community_key = community.replace(" ", "|")
    is_test_community = "TEST" in community or "test" in community
    if community_key in community_curators and not is_test_community:
        curator_usernames = community_curators[community_key]
    else:
        curator_usernames = []
    curator_users = {inverse_username_map[username] for username in curator_usernames}
    if not curator_users:
        return 1.0

    print(f"Predicting {len(curator_users)} curators")
    # One copy of the submission per curator, differing only in USERNAME.
    unique_submissions = pd.DataFrame([submission_info])
    per_curator_frames = []
    for user in curator_users:
        submissions = unique_submissions.copy(deep=True)
        submissions["USERNAME"] = [user] * len(submissions) # it doesn't matter whether the user itself is in UPVOTED_USERS / DOWNVOTED_USERS: we will substitute it with real votes
        per_curator_frames.append(submissions)
    group_users_submissions_data = pd.concat(per_curator_frames, axis=0)

    # predict unseen votes
    predicted_group_users_submissions_votes = evaluate_model(config, model, data=group_users_submissions_data, weights = None, batch_size=config["batch_size"], sample_voted_users=False, extra_input = (categorical_features, string_features, target), ret = "prediction") # ndarray size: (n_rows, 1)

    # Count upvotes in one pass; each curator contributes exactly one row.
    usernames = group_users_submissions_data["USERNAME"].to_numpy()
    upvote_count = 0
    for row_i, username in enumerate(usernames):
        vote_key = f'{username}-{submission_id}'
        if vote_key in new_existing_votes: # use existing votes if available
            is_upvote = new_existing_votes[vote_key] >= 0.5
        else:
            is_upvote = predicted_group_users_submissions_votes[row_i, 0] >= upvote_confidence_thres
        upvote_count += int(is_upvote)

    # calculate %upvotes
    return upvote_count / len(usernames)
def predict_curator_upvote_rate(community: str, author: str, title: str, content: str, submission_id: str = "custom"):
    """Predict the curator upvote rate for a post using the module-level model.

    Thin convenience wrapper: forwards all arguments, plus the global ``model``,
    to ``predict_curator_upvote_rate_with_model`` and returns its result.
    """
    curator_upvote_rate = predict_curator_upvote_rate_with_model(
        community, author, title, content, submission_id, model
    )
    return curator_upvote_rate
def finetune_model_1step(model, submission_id, community, author, title, content, username, vote = 1.0):
    """Fine-tune ``model`` on a single (user, submission, vote) example.

    Communities whose name contains ``"TEST"`` or ``"test"`` are skipped and the
    model is returned as passed in. Otherwise a one-row DataFrame is built from the
    arguments and the first item yielded by ``train_model`` is returned.

    NOTE: input copy.deepcopy(model) to avoid modifying the original model
    """
    for marker in ("TEST", "test"):
        if marker in community:
            print("Do not finetune model on test community")
            return model

    single_vote_row = construct_submission_info(submission_id, community, author, title, content, username, vote)
    training_frame = pd.DataFrame([single_vote_row])
    training_steps = train_model(
        config,
        model,
        data=training_frame,
        weights=None,
        batch_size=1,
        epochs=1,
        verbose=2,
        validation_split=False,
        step_generator=True,
        n_step_per_sample=1,
        extra_input=(categorical_features, string_features, target),
    )
    # Only the first yielded model is consumed.
    new_model = next(training_steps)
    print(f"Model finetuned on {username} {'upvote' if vote == 1.0 else 'downvote'} on {submission_id} in {community} ({title})")
    return new_model
@app.get("/predict_curator_upvote_rate_with_new_upvote")
def predict_curator_upvote_rate_with_new_upvote(community: str, username: str, author: str, title: str, content: str, submission_id: str):
    """Record a new upvote, fine-tune the global model one step, and re-predict.

    Usage: when a user posted a post (and upvoted it, so username = author), or
    when a user reacted to a post.

    Returns the curator upvote rate of the post in ``community``.
    Raises ``KeyError`` if ``username`` is not in ``inverse_username_map``.
    """
    global model
    vote_key = f'{inverse_username_map[username]}-{submission_id}'
    already_upvoted = vote_key in existing_votes and existing_votes[vote_key] == 1
    # NOTE(review): the source lost its indentation; fine-tuning is assumed to happen
    # only when the upvote is newly recorded -- confirm against the original file.
    if not already_upvoted:
        existing_votes[vote_key] = 1
        # train model
        model = finetune_model_1step(model, submission_id, community, author, title, content, username)
    # predict
    return predict_curator_upvote_rate_with_model(community, author, title, content, submission_id, model)
@app.get("/predict_curator_upvote_rate_with_new_downvote")
def predict_curator_upvote_rate_with_new_downvote(community: str, username: str, author: str, title: str, content: str, submission_id: str):
    """Remove a recorded upvote, fine-tune the global model one step, and re-predict.

    Usage: when a user removed their reaction to a post. The fine-tune step uses
    ``vote = 0.0`` for that user and submission.

    Returns the curator upvote rate of the post in ``community``.
    Raises ``KeyError`` if ``username`` is not in ``inverse_username_map``.
    """
    global model
    vote_key = f'{inverse_username_map[username]}-{submission_id}'
    has_recorded_upvote = vote_key in existing_votes and existing_votes[vote_key] == 1
    # NOTE(review): the source lost its indentation; fine-tuning is assumed to happen
    # only when a recorded upvote was actually removed -- confirm against the original file.
    if has_recorded_upvote:
        del existing_votes[vote_key]
        # train model
        model = finetune_model_1step(model, submission_id, community, author, title, content, username, vote = 0.0)
    # predict
    return predict_curator_upvote_rate_with_model(community, author, title, content, submission_id, model)
# def train_model_predict_curator_upvote_rate_with_new_echo(orig_community: str, new_community: str, username: str, author: str, title: str, content: str, model, orig_submission_id: str = "custom", echoed_submission_id: str = "custom", new_existing_votes = None):
# # when a user is attempting to echo a post or when the user has already echoed the post, we need to finetune the model further (the user (echoauthor) will upvote on this post in the original community, the original author will also upvote on the post in the new community it is echoed to) and then predict the curator upvote rate
# # train model
# new_model = finetune_model_1step(copy.deepcopy(model), orig_submission_id, orig_community, author, title, content, username) # username is echoauthor
# new_model = finetune_model_1step(new_model, echoed_submission_id, new_community, author, title, content, author) # author is original post author
# # predict
# curator_upvote_rate = predict_curator_upvote_rate_with_model(new_community, author, title, content, echoed_submission_id, new_model, new_existing_votes)
# return new_model, curator_upvote_rate
@app.get("/predict_curator_upvote_rate_with_echo_attempt")
def predict_curator_upvote_rate_with_echo_attempt(orig_community: str, new_community: str, username: str, author: str, title: str, content: str, orig_submission_id: str, echoed_submission_id: str = "custom"):
    """Preview the curator upvote rate an echo would get, without committing anything.

    Usage: when a user is attempting to echo a post. Works on deep copies of the
    vote table and of the model, so the module-level ``existing_votes`` and
    ``model`` are not rebound or written to by this function.

    Returns the curator upvote rate of the echoed post in ``new_community``.
    """
    hypothetical_votes = copy.deepcopy(existing_votes)
    echoer_id = inverse_username_map[username]
    # The echoing user upvotes both the original and the echoed post ...
    for voted_submission in (orig_submission_id, echoed_submission_id):
        hypothetical_votes[f'{echoer_id}-{voted_submission}'] = 1
    # ... and the original author upvotes the echoed post.
    hypothetical_votes[f'{inverse_username_map[author]}-{echoed_submission_id}'] = 1

    # train model (on a throwaway copy)
    trial_model = copy.deepcopy(model)
    trial_model = finetune_model_1step(trial_model, orig_submission_id, orig_community, author, title, content, username) # username is echoauthor
    trial_model = finetune_model_1step(trial_model, echoed_submission_id, new_community, author, title, content, author) # author is original post author

    # predict
    return predict_curator_upvote_rate_with_model(new_community, author, title, content, echoed_submission_id, trial_model, hypothetical_votes)
@app.get("/predict_curator_upvote_rate_with_echo_proposal")
def predict_curator_upvote_rate_with_echo_proposal(orig_community: str, username: str, author: str, title: str, content: str, orig_submission_id: str):
    """Record an echo proposal as an upvote on the original post and re-predict.

    Usage: when a user has proposed an echo (to an ancestor community). The
    proposing user's upvote on the original submission is stored in
    ``existing_votes`` and the global model is fine-tuned one step on it.

    Returns the curator upvote rate of the original post in ``orig_community``.
    """
    global model, existing_votes
    proposer_vote_key = f'{inverse_username_map[username]}-{orig_submission_id}'
    existing_votes[proposer_vote_key] = 1
    # train model
    model = finetune_model_1step(model, orig_submission_id, orig_community, author, title, content, username) # username is echoauthor
    # predict
    return predict_curator_upvote_rate_with_model(orig_community, author, title, content, orig_submission_id, model)
@app.get("/predict_curator_upvote_rate_with_echo_consent")
def predict_curator_upvote_rate_with_echo_consent(orig_community: str, new_community: str, username: str, author: str, title: str, content: str, orig_submission_id: str, echoed_submission_id: str):
    """Commit an echo the author has consented to and re-predict both communities.

    Usage: when the author has consented to the echo (to an ancestor community).

    The function is named after its route. It previously reused the name
    ``predict_curator_upvote_rate_with_new_echo``, which a later definition in this
    module rebinds, so that name keeps resolving to the later function.

    Both the echoing user's and the author's upvotes on the echoed submission are
    recorded in ``existing_votes``; the global model is fine-tuned one step on the
    author's upvote only.

    Returns:
        ``{"orig_community": rate, "new_community": rate}`` with the curator upvote
        rate of the original post and of the echoed post.
    """
    global model, existing_votes
    existing_votes[f'{inverse_username_map[username]}-{echoed_submission_id}'] = 1
    existing_votes[f'{inverse_username_map[author]}-{echoed_submission_id}'] = 1
    # train model
    model = finetune_model_1step(model, echoed_submission_id, new_community, author, title, content, author) # author is original post author
    # predict
    curator_upvote_rate_orig_community = predict_curator_upvote_rate_with_model(orig_community, author, title, content, orig_submission_id, model)
    curator_upvote_rate_new_community = predict_curator_upvote_rate_with_model(new_community, author, title, content, echoed_submission_id, model)
    return {"orig_community": curator_upvote_rate_orig_community, "new_community": curator_upvote_rate_new_community}
@app.get("/predict_curator_upvote_rate_with_new_echo")
def predict_curator_upvote_rate_with_new_echo(orig_community: str, new_community: str, username: str, author: str, title: str, content: str, orig_submission_id: str, echoed_submission_id: str):
    """Commit a completed echo and re-predict the rates in both communities.

    Usage: when a user has already echoed the post (to a descendant community).
    Records the echoing user's upvotes on the original and the echoed submission
    and the author's upvote on the echoed submission, then fine-tunes the global
    model one step on the echoer's vote and one step on the author's vote.

    Returns:
        ``{"orig_community": rate, "new_community": rate}``.
    """
    global model, existing_votes
    echoer_id = inverse_username_map[username]
    existing_votes[f'{echoer_id}-{orig_submission_id}'] = 1
    existing_votes[f'{echoer_id}-{echoed_submission_id}'] = 1
    existing_votes[f'{inverse_username_map[author]}-{echoed_submission_id}'] = 1

    # train model: (submission, community, voter) for each of the two steps, in order
    finetune_steps = (
        (orig_submission_id, orig_community, username),  # username is echoauthor
        (echoed_submission_id, new_community, author),   # author is original post author
    )
    for step_submission_id, step_community, step_voter in finetune_steps:
        model = finetune_model_1step(model, step_submission_id, step_community, author, title, content, step_voter)

    # predict
    rates = {}
    rates["orig_community"] = predict_curator_upvote_rate_with_model(orig_community, author, title, content, orig_submission_id, model)
    rates["new_community"] = predict_curator_upvote_rate_with_model(new_community, author, title, content, echoed_submission_id, model)
    return rates
if __name__ == "__main__":
    # Manual smoke test: predict the curator upvote rate for one hard-coded post
    # and report whether it clears the configured ratio threshold.
    custom_submission_id = f"custom_{random.randint(10**15, 10**16 - 1)}"
    custom_community = 'The Positive Corner' # input("Input community: ")
    custom_author = "wanrong" # input("Input author's username: ")
    custom_title = 'Employee of the year' # input("Input post title: ")
    custom_content = 'This is so funny!!!' # input("Input post content: ")

    curator_upvote_rate = predict_curator_upvote_rate(
        custom_community,
        custom_author,
        custom_title,
        custom_content,
        custom_submission_id,
    )
    debug(curator_upvote_rate=curator_upvote_rate)

    upvote_ratio_thres = config["upvote_ratio_thres"] # 0.5
    passes_threshold = curator_upvote_rate > upvote_ratio_thres
    print("You can post immediately" if passes_threshold else "your post will stay in the background first.")