# curation_interface.py
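#
# Streamlit interface for model-based community curation: loads a trained
# upvote-prediction model, groups a subreddit's active users into curator
# groups, predicts each group's preferred posts, and displays the curated
# feed alongside per-user details and group-preference statistics.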
import os
import json
import pickle

import nbimporter
import streamlit as st
from curation import *  # project helpers: get_config, get_model_input, debug, ...

# Used directly below; they may also arrive via the star import above.
import numpy as np
import pandas as pd
import torch

# The Google client reads the service-account key from this environment variable,
# so it must be set before the client is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google-service-account-file.json"
from google.cloud import language_v1
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import grpc
import google

client = language_v1.LanguageServiceClient()
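
# `st.experimental_singleton` caches the return value across reruns, so the
# SQLite session is created once per Streamlit server process (newer Streamlit
# versions expose the same behavior as `st.cache_resource`).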
@st.experimental_singleton
def get_db():
    """Open a session on the SQLite cache of submission analyses."""
    submission_analysis_path = "data/reddit/submission_analysis.db"
    engine = create_engine(f"sqlite:///{submission_analysis_path}", connect_args={'timeout': 10})
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    return session
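
# One shared session, reused across reruns, for reading and writing cached analyses.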
session = get_db()
Base = declarative_base()
class Analysis(Base):
    __tablename__ = 'analysis'
    id = Column(String, primary_key=True)  # submission id; autoincrement does not apply to String keys
    sentiment_score = Column(Integer)
    content_classes = Column(String)
    entities = Column(String)
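
# Example (hypothetical usage, sketching what `post_analysis_batch` presumably
# reads/writes; `submission_id` is a placeholder):
#   row = session.query(Analysis).filter_by(id=submission_id).first()
#   if row is not None:
#       entities = json.loads(row.entities)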
CONFIG_PATH = "configs/subreddit_minority_no_peer.yml"
config = get_config(CONFIG_PATH, "_curation", print_config = False)
selected_subreddit = config["selected_subreddit"]
selected_subreddit = "r/politics"
user_grouping_method = config["user_grouping_method"] #
user_grouping_method = "interest_r/Conservative_r/Liberal random_user_as_group"
st.title(f"Community: {selected_subreddit}")
if 'cached_predictions' not in st.session_state:
    active_user_votes_thres = config["active_user_votes_thres"]
    batch_size = config["batch_size"]
    submission_sentiment_map = {}
    submission_class_map = {}
    submission_entity_map = {}
    # Load features, train/test splits, and the best upvote-prediction checkpoint.
    target, original_feature_map, categorical_features, string_features, train_data, test_data, test_data_info, train_submission_upvote_df, num_all_users = get_model_input(config)
    extra_input = (categorical_features, string_features, target)
    model, token_embedding = get_best_model(config, categorical_features, string_features, original_feature_map)
    model.eval()
    all_users = list(range(max(max(train_data["USERNAME"]), max(test_data["USERNAME"])) + 1))
    # subreddit_votes_counter, subreddit_active_users, and subreddit_train_submissions
    # are derived from train_data; subreddit_test_submissions is derived from test_data.
    subreddit_votes_counter, subreddit_active_users, subreddit_user_vote_count, subreddit_train_submissions, subreddit_test_submissions, all_submissions = get_subreddits_submissions(train_data, test_data, user_votes_thres=active_user_votes_thres, max_test_submissions_per_subreddit=config["max_test_submissions_per_subreddit"])
    existing_votes, existing_user_votes, existing_user_updown_votes, existing_submission_votes, existing_user_subreddits = record_existing_votes(train_data)
    selected_subreddit_active_users, subreddit_active_users, subreddit_votes_counter, subreddit_train_submissions, subreddit_test_submissions = get_selected_subreddit_info(config, selected_subreddit, subreddit_active_users, subreddit_votes_counter, subreddit_train_submissions, subreddit_test_submissions, original_feature_map, active_user_votes_thres)
    # Toggle to run Google Cloud NL analysis over the subreddit's test posts.
    analyze_post = False
    submission_text_map = test_data[["SUBMISSION_ID", "SUBMISSION_TEXT"]].set_index("SUBMISSION_ID").to_dict()["SUBMISSION_TEXT"]
    if analyze_post and config["submission_source"] == "test_data":
        subreddit_submissions_ids = list(subreddit_test_submissions[selected_subreddit].keys())
        subreddit_submissions_ids, subreddit_submissions_text = get_submissions_text(subreddit_submissions_ids, submission_text_map, pass_analyzed=True, submission_sentiment_map=submission_sentiment_map, submission_class_map=submission_class_map, submission_entity_map=submission_entity_map)
        post_analysis_batch({id_: text for id_, text in zip(subreddit_submissions_ids, subreddit_submissions_text)}, session, Analysis, submission_sentiment_map, submission_class_map, submission_entity_map, language_v1, client, google)
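
    # For reference, a single-document sentiment request with this client looks
    # roughly like the following (hypothetical sketch of what post_analysis_batch
    # presumably wraps; `text` is a placeholder):
    #   document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    #   sentiment = client.analyze_sentiment(request={"document": document}).document_sentiment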
    manual_user_groups = config["manual_user_groups"]
    # e.g. manual_user_groups = {"Conservative": {66, 39, 10, 44, 16, 60}, "Democratic": {0, 65, 64, 37, 49, 52, 20, 22, 23, 26, 29}}
    debug(user_grouping_method=user_grouping_method)
    debug(max_user=max(all_users), max_selected_subreddit_active_users=max(int(u) for u in selected_subreddit_active_users))
    # Each user owns a dedicated USERNAME_<i> token; its word embedding serves as the user representation.
    all_username_tokens = [f"USERNAME_{user_i}" for user_i in all_users]
    all_username_token_ids = torch.tensor(model.tokenizer.convert_tokens_to_ids(all_username_tokens))
    all_username_token_ids = all_username_token_ids.to(model.device)
    model = model.to(model.device)
    with torch.no_grad():
        user_embedding = model.lm_encoder.embeddings.word_embeddings(all_username_token_ids)
    debug(user_embedding=user_embedding.shape)
    selected_subreddit_active_users_reps, selected_subreddit_active_user_i_user_map = get_user_reps(selected_subreddit_active_users, all_user_embedding=user_embedding, train_data=train_data, selected_submissions=subreddit_train_submissions[selected_subreddit], user_grouping_method=user_grouping_method)
    debug(selected_subreddit_active_users_reps=selected_subreddit_active_users_reps)  # NOTE: not None only when user_grouping_method is "neural" or "votes"
    reliability_bias_df, media_url_re = get_url_reliability_bias()
    if isinstance(selected_subreddit_active_users_reps, torch.Tensor):
        selected_subreddit_active_users_reps = selected_subreddit_active_users_reps.cpu()
    users_in_groups, group_centers = get_user_groups(selected_subreddit, selected_subreddit_active_users, selected_subreddit_active_users_reps, selected_subreddit_active_user_i_user_map, user_grouping_method=user_grouping_method, existing_user_votes=existing_user_votes, manual_user_groups=manual_user_groups, train_data=train_data, original_feature_map=original_feature_map, selected_submissions=subreddit_train_submissions[selected_subreddit], model=model, subreddit_active_users=subreddit_active_users, selected_subreddit_active_users=selected_subreddit_active_users, subreddit_user_vote_count=subreddit_user_vote_count, reliability_bias_df=reliability_bias_df, media_url_re=media_url_re, extra_input=extra_input)
    debug(user_num_in_group={x: len(y) for x, y in users_in_groups.items()})
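
    # users_in_groups: group name -> set of member user ids; group_centers: a
    # per-group representation (presumably centroids in the user-embedding space).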
    submissions_before_curation: dict = subreddit_test_submissions[selected_subreddit]
    # TODO: only select a small set of submissions.
    pred_group_votes_info = {}
    model = model.to(model.device)
    model.eval()
    # Initial prediction pass with permissive thresholds (upvote ratio 0.5, confidence 0.0);
    # the sidebar sliders below allow re-running curation with stricter thresholds.
    groups_preferred_submissions, groups_preferred_submissions_text, groups_submission_upvote_count_matrix, groups_curator_upvote_rate_content = predict_groups_preferences(config, model, users_in_groups, submissions_before_curation, subreddit_test_submissions, selected_subreddit, group_centers=group_centers, user_grouping_method=user_grouping_method, existing_votes=existing_votes, existing_user_updown_votes=existing_user_updown_votes, pred_group_votes_info=pred_group_votes_info, upvote_ratio_thres=0.5, upvote_confidence_thres=0.0, selected_subreddit_active_user_i_user_map=selected_subreddit_active_user_i_user_map, extra_input=extra_input, display=False)
    subreddit_users = list(set(train_data[train_data["SUBREDDIT"] == selected_subreddit]["USERNAME"]))
    available_usernames = [original_feature_map["USERNAME"][username] for username in subreddit_users]
    inverse_username_map = {original_feature_map["USERNAME"][username]: username for username in subreddit_users}
    # Cache everything the widgets below need so reruns skip the heavy work above.
    st.session_state['cached_predictions'] = (config, model, train_data, test_data, users_in_groups, submissions_before_curation, subreddit_test_submissions, selected_subreddit, group_centers, user_grouping_method, existing_votes, existing_user_updown_votes, selected_subreddit_active_user_i_user_map, extra_input, submission_sentiment_map, submission_class_map, submission_entity_map, reliability_bias_df, media_url_re, pred_group_votes_info, original_feature_map, all_submissions, subreddit_users, available_usernames, inverse_username_map, existing_user_subreddits)
    debug("Stored to cache")
else:
    debug("Using cache")
    config, model, train_data, test_data, users_in_groups, submissions_before_curation, subreddit_test_submissions, selected_subreddit, group_centers, user_grouping_method, existing_votes, existing_user_updown_votes, selected_subreddit_active_user_i_user_map, extra_input, submission_sentiment_map, submission_class_map, submission_entity_map, reliability_bias_df, media_url_re, pred_group_votes_info, original_feature_map, all_submissions, subreddit_users, available_usernames, inverse_username_map, existing_user_subreddits = st.session_state['cached_predictions']
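
# Sidebar controls: both thresholds default to values from the config and feed the
# prediction pass triggered by the "Get curated posts" button below.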
batch_size = 1024
upvote_ratio_thres = st.sidebar.slider("Set the threshold for curator upvote rate", 0.0, 1.0, config["upvote_ratio_thres"], step=0.01)
if isinstance(config["upvote_confidence_thres"], list):
    config["upvote_confidence_thres"] = config["upvote_confidence_thres"][0]
config["upvote_confidence_thres"] = float(config["upvote_confidence_thres"])
upvote_confidence_thres = st.sidebar.slider("Set the threshold for upvote confidence", 0.0, 1.0, config["upvote_confidence_thres"], step=0.01)
group_names = list(users_in_groups.keys())
if "manual" not in group_names:
    group_names.append("manual")
group_x = st.selectbox("Select a group of curators from our recommendations", group_names)
if group_x == "manual":
    chosen_users = st.multiselect("Manually select curators", available_usernames)
    users_in_groups["manual"] = {inverse_username_map[x] for x in chosen_users}
left_col, right_col = st.columns(2)
explore_curators = left_col.button("View user information")
do_curate = right_col.button("Get curated posts")
if explore_curators:
    # Show profile details for every curator in the currently selected group.
    user_info_list = []
    for username in users_in_groups[group_x]:
        user_info, user_info_str = get_more_user_info(username, original_feature_map, existing_user_subreddits, train_data, selected_subreddit)
        user_info_list.append(user_info)
    user_info_df = pd.DataFrame.from_records(user_info_list).set_index("username")
    user_info_df["joined subreddits"] = user_info_df["joined subreddits"].map(str)
    user_info_df["upvoted posts"] = user_info_df["upvoted posts"].map(json.dumps)
    user_info_df["downvoted posts"] = user_info_df["downvoted posts"].map(json.dumps)
    st.write(user_info_df)
if do_curate:
    # Re-run group preference prediction with the thresholds chosen in the sidebar.
    groups_preferred_submissions, groups_preferred_submissions_text, groups_submission_upvote_count_matrix, groups_curator_upvote_rate_content = predict_groups_preferences(config, model, users_in_groups, submissions_before_curation, subreddit_test_submissions, selected_subreddit, group_centers=group_centers, user_grouping_method=user_grouping_method, existing_votes=existing_votes, existing_user_updown_votes=existing_user_updown_votes, pred_group_votes_info=pred_group_votes_info, upvote_ratio_thres=upvote_ratio_thres, upvote_confidence_thres=upvote_confidence_thres, selected_subreddit_active_user_i_user_map=selected_subreddit_active_user_i_user_map, extra_input=extra_input)
    max_show_posts = 30
    # Posts preferred by every group; removing them leaves each group's distinctive picks.
    all_preferred_submissions_text = set.intersection(*[set(groups_preferred_submissions_text[g]) for g in groups_preferred_submissions_text])
    top_preferred_submission_text = groups_preferred_submissions_text[group_x][:max_show_posts]
    top_preferred_submission_ids = groups_preferred_submissions[group_x][:max_show_posts]
    distinct_text_set = set(groups_preferred_submissions_text[group_x]) - all_preferred_submissions_text
    top_distinct_preferred_submission_text = [text for text in groups_preferred_submissions_text[group_x] if text in distinct_text_set][:max_show_posts]
    top_distinct_preferred_submission_ids = [groups_preferred_submissions[group_x][i] for i, text in enumerate(groups_preferred_submissions_text[group_x]) if text in distinct_text_set][:max_show_posts]
    # TODO: optionally switch to the *_distinct_* lists computed above.
    show_text = top_preferred_submission_text
    show_ids = top_preferred_submission_ids
    for i, submission_text in enumerate(show_text):
        user_bar, text_bar = st.columns([3, 10])
        submission_id = show_ids[i]
        author = all_submissions[submission_id]["USERNAME"]
        if "USERNAME" in original_feature_map:
            author = original_feature_map["USERNAME"][author]
        user_bar.write(f"##### {author}")
        user_bar.write(all_submissions[submission_id]["CREATED_TIME"])
        text_bar.warning(submission_text)
    print(f"Users in group {group_x} prefer {show_text}")
st.write("Pearson ranking items:", groups_submission_upvote_count_matrix.index.to_list())
groups_submission_upvote_count_matrix_nonzero = groups_submission_upvote_count_matrix[groups_submission_upvote_count_matrix.sum(axis = 1) != 0]
group_preference_pearson_corr = np.corrcoef(groups_submission_upvote_count_matrix_nonzero) # (697, 697)
st.write(group_preference_pearson_corr)
visualize_group_preferences(groups_preferred_submissions, test_data, user_grouping_method, submission_sentiment_map = submission_sentiment_map, submission_class_map=submission_class_map, submission_entity_map=submission_entity_map, reliability_bias_df=reliability_bias_df, media_url_re=media_url_re)