Skip to content

Commit

Permalink
combine responsible and unit score
Browse files Browse the repository at this point in the history
  • Loading branch information
gtani committed Sep 2, 2023
1 parent 2800341 commit 1bb5b7b
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 114 deletions.
14 changes: 10 additions & 4 deletions configuration/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ features:
use: true
sequence_jump:
use: true
string_length:
use: false
parameters:
contamination: 0.1
time_changed:
use: true
gps:
Expand All @@ -79,16 +79,20 @@ features:
use: false
number_answered:
use: true
parameters:
contamination: 0.11
total_duration:
use: true
parameters:
contamination: 0.11
total_elapse:
use: false
use: true
parameters:
contamination: 0.1
single_question:
use: false
multi_option_question:
use: false
use: true
days_from_start:
use: false
comment_length:
Expand All @@ -97,3 +101,5 @@ features:
use: false
comment_duration:
use: false
string_length:
use: false
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ scikit-learn>=1.3.0
scipy>=1.10.1
seaborn>=0.12.2
pyod>=1.1.0
pythresh>=0.3.3


16 changes: 11 additions & 5 deletions src/feature_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self, config):
print('Items Build')
self._df_unit = self.make_df_unit()
print('Unit Build')
self._df_resp = self.make_df_responsible()
# Define masks that get recurrently used
self.numeric_question_mask = (
(self._df_item['type'] == 'NumericQuestion') &
Expand All @@ -44,7 +45,7 @@ def df_item(self):
getattr(self, method_name)(feature_name)
# print(f"{feature_name} Processed")
except Exception as e:
print("ERROR ON FEATURE ITEM: {}, It won't be used in further calculation".format(feature_name))
print("WARNING: FEATURE ITEM: {} won't be used in further calculation".format(feature_name))
return self._df_item

@property
Expand All @@ -56,7 +57,7 @@ def df_unit(self):
print(f"Processing {feature_name} ...")
getattr(self, method_name)(feature_name)
except Exception as e:
print("ERROR ON FEATURE UNIT: {}, It won't be used in further calculation".format(feature_name))
print("WARNING: FEATURE UNIT: {}, It won't be used in further calculation".format(feature_name))
return self._df_unit

@property
Expand Down Expand Up @@ -198,7 +199,8 @@ def get_df_time(self):

# Get the min date from the min question sequence as there might be some time setting
# change later that would change the starting date if just looking at the min of timestamp_local
starting_timestamp = df_time[df_time['event'].isin(['AnswerSet'])].groupby('interview__id')['timestamp_local'].min()
starting_timestamp = df_time[df_time['event'].isin(['AnswerSet'])].groupby('interview__id')[
'timestamp_local'].min()
df_time['f__starting_timestamp'] = df_time['interview__id'].map(starting_timestamp)

min_date = df_time['f__starting_timestamp'].min()
Expand Down Expand Up @@ -264,6 +266,12 @@ def make_df_unit(self):
df_unit = self.add_unit_time_features(df_unit)
return df_unit

def make_df_responsible(self):
    """Return a one-column DataFrame of distinct, non-empty 'responsible' values.

    Takes the 'responsible' column from the active paradata, drops
    duplicates, and filters out rows whose value is null or an empty
    string, leaving one row per responsible observed.
    """
    responsibles = self.df_active_paradata[['responsible']].drop_duplicates()
    keep = responsibles['responsible'].notna() & (responsibles['responsible'] != '')
    return responsibles[keep]

def save_data(self, df, file_name):

target_dir = os.path.join(self.config.data.raw, self.config.surveys)
Expand Down Expand Up @@ -412,7 +420,6 @@ def count_elements_or_nan(val):
self._df_item.loc[multi_list_mask, feature_name] = self._df_item.loc[multi_list_mask, 'value'].apply(
count_elements_or_nan)
# f__share_selected, share between answers selected, and available answers (only for unlinked questions)
# TODO! confirm that it makes sense to use just put f__answer_share_selected in place of f__answer_selected
self._df_item[feature_name] = self._df_item[feature_name] / self._df_item['n_answers']

def make_feature_item__comment_length(self, feature_name):
Expand Down Expand Up @@ -527,7 +534,6 @@ def add_pause_features(self, df_unit):
df_unit = df_unit.merge(df_pause, how='left', on='interview__id')
return df_unit


def add_unit_time_features(self, df_unit):
# Define the list of features depending on time
time_features = ['f__total_duration', 'f__total_elapse', 'f__days_from_start', 'f__time_changed']
Expand Down
11 changes: 5 additions & 6 deletions src/item_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,11 @@ def filter_variable_name_by_frequency(df, feature_name, frequency=100, min_uniqu
return variables

@staticmethod
def filter_columns(data, index_col, threshold=0.2):
# !TODO TRHESHOLD SHOLUD PROBABLY BE IN ABSOUTE TERMS AS IT WOULD BE AFFECTED
def filter_columns(data, index_col, threshold=100):
drop_columns = []
keep_columns = []
total_interviews = data.interview__id.nunique()
for col in data.columns:
if (data[col].nunique() < 3 or data[col].count() / total_interviews < threshold) and col not in index_col:
if (data[col].nunique() < 3 or data[col].count() < threshold) and col not in index_col:
drop_columns.append(col)
else:
keep_columns.append(col)
Expand Down Expand Up @@ -152,9 +150,10 @@ def make_score__sequence_jump(self):
# Select only those variables that have at least three distinct values and more than one hundred records
valid_variables = self.filter_variable_name_by_frequency(df, feature_name, frequency=100, min_unique_values=3)
df[score_name] = 0
contamination = self.get_contamination_parameter(feature_name)
for var in valid_variables:
mask = (df['variable_name'] == var)
model = INNE()
model = INNE(contamination=contamination, random_state=42)
model.fit(df[mask][[feature_name]])
df.loc[mask, score_name] = model.predict(df[mask][[feature_name]])
return df
Expand All @@ -163,7 +162,7 @@ def make_score__first_decimal(self):

feature_name = 'f__first_decimal'
score_name = self.rename_feature(feature_name)
df = self.df_item[~pd.isnull(self.df_item[feature_name])]#.copy()
df = self.df_item[~pd.isnull(self.df_item[feature_name])].copy()
# Select only those variables that have at least three distinct values and more than one hundred records
valid_variables = self.filter_variable_name_by_frequency(df, feature_name, frequency=100, min_unique_values=3)
df[score_name] = 0
Expand Down
Loading

0 comments on commit 1bb5b7b

Please sign in to comment.