Skip to content

Commit

Permalink
combine responsible and unit score
Browse files Browse the repository at this point in the history
  • Loading branch information
gtani committed Sep 2, 2023
1 parent 2800341 commit 1bb5b7b
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 114 deletions.
14 changes: 10 additions & 4 deletions configuration/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ features:
use: true
sequence_jump:
use: true
string_length:
use: false
parameters:
contamination: 0.1
time_changed:
use: true
gps:
Expand All @@ -79,16 +79,20 @@ features:
use: false
number_answered:
use: true
parameters:
contamination: 0.11
total_duration:
use: true
parameters:
contamination: 0.11
total_elapse:
use: false
use: true
parameters:
contamination: 0.1
single_question:
use: false
multi_option_question:
use: false
use: true
days_from_start:
use: false
comment_length:
Expand All @@ -97,3 +101,5 @@ features:
use: false
comment_duration:
use: false
string_length:
use: false
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ scikit-learn>=1.3.0
scipy>=1.10.1
seaborn>=0.12.2
pyod>=1.1.0
pythresh>=0.3.3


16 changes: 11 additions & 5 deletions src/feature_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self, config):
print('Items Build')
self._df_unit = self.make_df_unit()
print('Unit Build')
self._df_resp = self.make_df_responsible()
# Define masks that get recurrently used
self.numeric_question_mask = (
(self._df_item['type'] == 'NumericQuestion') &
Expand All @@ -44,7 +45,7 @@ def df_item(self):
getattr(self, method_name)(feature_name)
# print(f"{feature_name} Processed")
except Exception as e:
print("ERROR ON FEATURE ITEM: {}, It won't be used in further calculation".format(feature_name))
print("WARNING: FEATURE ITEM: {} won't be used in further calculation".format(feature_name))
return self._df_item

@property
Expand All @@ -56,7 +57,7 @@ def df_unit(self):
print(f"Processing {feature_name} ...")
getattr(self, method_name)(feature_name)
except Exception as e:
print("ERROR ON FEATURE UNIT: {}, It won't be used in further calculation".format(feature_name))
print("WARNING: FEATURE UNIT: {}, It won't be used in further calculation".format(feature_name))
return self._df_unit

@property
Expand Down Expand Up @@ -198,7 +199,8 @@ def get_df_time(self):

# Get the min date from the min question sequence as there might be some time setting
# change later that would change the starting date if just looking at the min of timestamp_local
starting_timestamp = df_time[df_time['event'].isin(['AnswerSet'])].groupby('interview__id')['timestamp_local'].min()
starting_timestamp = df_time[df_time['event'].isin(['AnswerSet'])].groupby('interview__id')[
'timestamp_local'].min()
df_time['f__starting_timestamp'] = df_time['interview__id'].map(starting_timestamp)

min_date = df_time['f__starting_timestamp'].min()
Expand Down Expand Up @@ -264,6 +266,12 @@ def make_df_unit(self):
df_unit = self.add_unit_time_features(df_unit)
return df_unit

def make_df_responsible(self):
    """Return a one-column DataFrame of distinct, non-empty 'responsible' values.

    Takes the 'responsible' column from the active paradata, drops
    duplicates, and filters out rows whose value is null or an empty
    string, leaving one row per responsible observed.
    """
    responsibles = self.df_active_paradata[['responsible']].drop_duplicates()
    keep = responsibles['responsible'].notna() & (responsibles['responsible'] != '')
    return responsibles[keep]

def save_data(self, df, file_name):

target_dir = os.path.join(self.config.data.raw, self.config.surveys)
Expand Down Expand Up @@ -412,7 +420,6 @@ def count_elements_or_nan(val):
self._df_item.loc[multi_list_mask, feature_name] = self._df_item.loc[multi_list_mask, 'value'].apply(
count_elements_or_nan)
# f__share_selected, share between answers selected, and available answers (only for unlinked questions)
# TODO! confirm that it makes sense to use just put f__answer_share_selected in place of f__answer_selected
self._df_item[feature_name] = self._df_item[feature_name] / self._df_item['n_answers']

def make_feature_item__comment_length(self, feature_name):
Expand Down Expand Up @@ -527,7 +534,6 @@ def add_pause_features(self, df_unit):
df_unit = df_unit.merge(df_pause, how='left', on='interview__id')
return df_unit


def add_unit_time_features(self, df_unit):
# Define the list of features depending on time
time_features = ['f__total_duration', 'f__total_elapse', 'f__days_from_start', 'f__time_changed']
Expand Down
11 changes: 5 additions & 6 deletions src/item_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,11 @@ def filter_variable_name_by_frequency(df, feature_name, frequency=100, min_uniqu
return variables

@staticmethod
def filter_columns(data, index_col, threshold=0.2):
# !TODO TRHESHOLD SHOLUD PROBABLY BE IN ABSOUTE TERMS AS IT WOULD BE AFFECTED
def filter_columns(data, index_col, threshold=100):
drop_columns = []
keep_columns = []
total_interviews = data.interview__id.nunique()
for col in data.columns:
if (data[col].nunique() < 3 or data[col].count() / total_interviews < threshold) and col not in index_col:
if (data[col].nunique() < 3 or data[col].count() < threshold) and col not in index_col:
drop_columns.append(col)
else:
keep_columns.append(col)
Expand Down Expand Up @@ -152,9 +150,10 @@ def make_score__sequence_jump(self):
# Select only those variables that have at least three distinct values and more than one hundred records
valid_variables = self.filter_variable_name_by_frequency(df, feature_name, frequency=100, min_unique_values=3)
df[score_name] = 0
contamination = self.get_contamination_parameter(feature_name)
for var in valid_variables:
mask = (df['variable_name'] == var)
model = INNE()
model = INNE(contamination=contamination, random_state=42)
model.fit(df[mask][[feature_name]])
df.loc[mask, score_name] = model.predict(df[mask][[feature_name]])
return df
Expand All @@ -163,7 +162,7 @@ def make_score__first_decimal(self):

feature_name = 'f__first_decimal'
score_name = self.rename_feature(feature_name)
df = self.df_item[~pd.isnull(self.df_item[feature_name])]#.copy()
df = self.df_item[~pd.isnull(self.df_item[feature_name])].copy()
# Select only those variables that have at least three distinct values and more than one hundred records
valid_variables = self.filter_variable_name_by_frequency(df, feature_name, frequency=100, min_unique_values=3)
df[score_name] = 0
Expand Down
Loading

0 comments on commit 1bb5b7b

Please sign in to comment.