hygia-org · AnHoff · Jun 19, 2023 · Jul 3, 2023 · Jul 3, 2023 · Jul 14, 2023
@@ -13,17 +13,62 @@ def test_annotate_data(self):
             'feature_ks_count_sequence_squared_special_characters': [0.05, 0.15, 0.25, 0.35],
             'feature_ks_ratio_of_numeric_digits_squared': [0.05, 0.15, 0.25, 0.9],
             'feature_ks_average_of_char_count_squared': [0.05, 0.15, 0.25, 0.35],
+            'feature_re_target1': [True, False, True, False],
+            'feature_re_target2': [False, True, False, True],
         })
 
         key_smash_thresholds = {
             'count_sequence_squared_vowels': ['above', 1.00],
             'count_sequence_squared_consonants':['above',  1.999],
             'count_sequence_squared_special_characters': ['above', 2.2499],
-            # 'ratio_of_numeric_digits_squared': ['above', 2.9],
+            'ratio_of_numeric_digits_squared': ['above', 0.5],
             'average_of_char_count_squared': ['above', 2.78],
             'shannon_entropy' : ['below', 2.0]
         }
 
         result = self.annotate_data.annotate_data(df, concatened_column_name='concat_address', ks_thresholds=key_smash_thresholds)
 
-        assert 'target' in result.columns
+        assert 'target' in result.columns
+
+    def test_annotate_data_no_key_smash_columns(self):
+        """Annotate a frame that has regex feature columns but no ``feature_ks_*`` columns.
+
+        The only expectation checked is that the result exposes a ``target`` column.
+        """
+        # Thresholds are still supplied even though the frame below carries
+        # none of the ``feature_ks_*`` columns.
+        thresholds = {
+            'count_sequence_squared_vowels': ['above', 1.00],
+            'count_sequence_squared_consonants': ['above', 1.999],
+            'count_sequence_squared_special_characters': ['above', 2.2499],
+            'ratio_of_numeric_digits_squared': ['above', 0.5],
+            'average_of_char_count_squared': ['above', 2.78],
+            'shannon_entropy': ['below', 2.0],
+        }
+        frame = pd.DataFrame({
+            'concat_address': ['test', 'asdasd', 'DELL', 'PENDIENTE 123!@#'],
+            'feature_re_target1': [True, False, True, False],
+            'feature_re_target2': [False, True, False, True],
+        })
+
+        annotated = self.annotate_data.annotate_data(
+            frame,
+            concatened_column_name='concat_address',
+            ks_thresholds=thresholds,
+        )
+
+        assert 'target' in annotated.columns
+
+    def test_annotate_data_no_regex_columns(self):
+        """Annotate a frame that has ``feature_ks_*`` columns but no ``feature_re_*`` columns.
+
+        The only expectation checked is that the result exposes a ``target`` column.
+        """
+        thresholds = {
+            'count_sequence_squared_vowels': ['above', 1.00],
+            'count_sequence_squared_consonants': ['above', 1.999],
+            'count_sequence_squared_special_characters': ['above', 2.2499],
+            'ratio_of_numeric_digits_squared': ['above', 0.5],
+            'average_of_char_count_squared': ['above', 2.78],
+            # No 'feature_ks_shannon_entropy' column is present in the frame below.
+            'shannon_entropy': ['below', 2.0],
+        }
+        frame = pd.DataFrame({
+            'concat_address': ['test', 'asdasd', 'DELL', 'PENDIENTE 123!@#'],
+            'feature_ks_count_sequence_squared_vowels': [0.05, 0.15, 0.25, 0.35],
+            'feature_ks_count_sequence_squared_consonants': [0.05, 0.15, 0.9, 0.35],
+            'feature_ks_count_sequence_squared_special_characters': [0.05, 0.15, 0.25, 0.35],
+            'feature_ks_ratio_of_numeric_digits_squared': [0.05, 0.15, 0.25, 0.9],
+            'feature_ks_average_of_char_count_squared': [0.05, 0.15, 0.25, 0.35],
+        })
+
+        annotated = self.annotate_data.annotate_data(
+            frame,
+            concatened_column_name='concat_address',
+            ks_thresholds=thresholds,
+        )
+
+        assert 'target' in annotated.columns
@@ -12,6 +12,7 @@
     ('00000', False),
     ('00001', False)
 ])
+
 class TestAugmentData:
     def setup_method(self):
         self.augment_data = AugmentData(country="MEXICO")
@@ -22,10 +23,11 @@ def test_validate_zipcode(self, zipcode, expected):
     def test_validate_zipcodes(self, zipcode, expected):
         df = pd.DataFrame({'zipcode': [zipcode]})
         result = self.augment_data.validate_zipcodes(df, 'zipcode')
-        assert result.equals(pd.DataFrame({'zipcode_is_valid': [expected]}))
+        expected_result = pd.DataFrame({'zipcode_is_valid': [expected]})
+        assert result.equals(expected_result)
 
     def test_augment_data(self, zipcode, expected):
         df = pd.DataFrame({'zipcode': [zipcode]})
         result = self.augment_data.augment_data(df, 'zipcode')
         expected_result = pd.DataFrame({'zipcode': [zipcode], 'zipcode_is_valid': [expected]})
-        assert result.equals(expected_result)
+        assert result.equals(expected_result)
@@ -1,5 +1,6 @@
 import pandas as pd
 from sklearn.datasets import make_classification
+from sklearn.preprocessing import LabelEncoder
 from hygia import RandomForestModel
 
 class TestRandomForestModel:
@@ -18,3 +19,20 @@ def test_random_forest_model(self):
         assert scores['precision'] >= 0.0 and scores['precision'] <= 1
         assert scores['recall'] >= 0.0 and scores['recall'] <= 1
         assert scores['f1'] >= 0.0 and scores['f1'] <= 1
+
+    def test_predict(self):
+        """Train on a synthetic frame and check ``predict`` yields one entry per row."""
+        features, _ = make_classification(n_samples=100, n_features=20, random_state=42)
+        feature_names = [f'feature_{idx}' for idx in range(features.shape[1])]
+
+        frame = pd.DataFrame(features, columns=feature_names)
+        # Every row is labelled 'valid' except the first one, so two labels exist.
+        frame['target'] = ['valid'] * len(frame)
+        frame.loc[0, 'target'] = 'key_smash'
+
+        forest = RandomForestModel(normalize=False)
+        frame['target_encoded'] = LabelEncoder().fit_transform(frame['target'])
+
+        forest.train_and_get_scores(frame, 'target_encoded', feature_names)
+
+        # NOTE(review): the role of the second positional argument of predict is
+        # not visible from this test -- confirm against RandomForestModel.predict.
+        predictions = forest.predict(frame[feature_names], 'target_encoded')
+
+        assert len(predictions) == len(frame)
@@ -1,4 +1,4 @@
-
+import yaml
 from hygia.parser.YAML_parser import YAMLParser
 
 class TestYamlParser():
@@ -19,11 +19,27 @@ def test_yaml_has_description(self):
         assert 'description' in self.yaml
         assert self.yaml['description'] == 'DAG de teste'
 
-    def test_yaml_has_feature_settings(self):
+    def test_yaml_has_output_folder(self):
         assert 'output_folder' in self.yaml
         assert self.yaml['output_folder'] == 'output'
-        
+
     def test_yaml_has_feature_settings(self):
         assert 'feature_engineering' in self.yaml
         assert type(self.yaml['feature_engineering']) == list
         assert 'input' in self.yaml['feature_engineering'][0]
+
+    def test_yaml_has_nrows(self):
+        """The parsed YAML must contain an ``nrows`` entry holding an int."""
+        parsed = self.yaml
+        assert 'nrows' in parsed
+        nrows = parsed['nrows']
+        assert isinstance(nrows, int)
+
+    def test_yaml_has_engine(self):
+        """The parsed YAML must contain an ``engine`` entry holding a str."""
+        parsed = self.yaml
+        assert 'engine' in parsed
+        engine = parsed['engine']
+        assert isinstance(engine, str)
+
+    def test_yaml_has_encoding(self):
+        """The parsed YAML must contain an ``encoding`` entry holding a str."""
+        parsed = self.yaml
+        assert 'encoding' in parsed
+        encoding = parsed['encoding']
+        assert isinstance(encoding, str)
+
+    def test_yaml_has_separator(self):
+        """The parsed YAML must contain a ``separator`` entry holding a str."""
+        parsed = self.yaml
+        assert 'separator' in parsed
+        separator = parsed['separator']
+        assert isinstance(separator, str)
@@ -1,8 +1,7 @@
 import pytest
-
 from hygia.parser.annotate_data_parser import AnnotateDataParser
 
-class TestAnnotateParser():
+class TestAnnotateParser:
 
     def setup_method(self):
         self.model_data = [{'input': {'columns': ['street', 'phone'], 'thresholds': {'ksmash_sequence_vowels': 1.0, 'ksmash_sequence_consonants': 1.999, 'ksmash_sequence_special_characters': 2.2499, 'ksmash_numbers': 2.9, 'ksmash_char_frequence': 2.78}}}]
@@ -33,7 +32,7 @@ def test_parser_annotate_data(self):
         assert 'ksmash_sequence_vowels' in thresholds
         assert thresholds['ksmash_sequence_vowels'] == 1.0
 
-    def test_get_thresholds(self):
+    def test_get_thresholds_with_input(self):
         thresholds = self.parser.get_thresholds(self.model_data[0]['input'])
 
         assert 'ksmash_char_frequence' in thresholds
@@ -51,7 +50,7 @@ def test_get_thresholds(self):
         assert 'ksmash_sequence_vowels' in thresholds
         assert thresholds['ksmash_sequence_vowels'] == 1.0
 
-    def test_get_keyboard_smash_default_config(self):
+    def test_get_thresholds_with_input_missing_keys(self):
         thresholds = self.parser.get_thresholds({'thresholds': {'ksmash_sequence_vowels': 1.1}})
 
         assert 'ksmash_char_frequence' in thresholds
@@ -69,7 +68,7 @@ def test_get_keyboard_smash_default_config(self):
         assert 'ksmash_sequence_vowels' in thresholds
         assert thresholds['ksmash_sequence_vowels'] == 1.1
 
-    def test_get_keyboard_smash_default_config(self):
+    def test_get_thresholds_without_input(self):
         thresholds = self.parser.get_thresholds()
 
         assert 'ksmash_char_frequence' in thresholds

@@ -0,0 +1,39 @@
+import pytest
+from hygia.parser.pre_processing_parser import PreProcessingParser
+
+class TestPreProcessingParser:
+    """Tests for ``PreProcessingParser`` constructed from a two-entry column list."""
+
+    def setup_method(self):
+        # NOTE(review): the parser is handed this very list object. If it keeps
+        # the reference and mutates it in place, the equality assertions below
+        # would compare the list with itself -- confirm against the parser.
+        self.columns_name = ['col1', 'col2']
+        self.parser = PreProcessingParser(self.columns_name)
+
+    def test_parse_pre_processing_configs_with_empty_data(self):
+        """``parse`` given an empty list is expected to return ``None``."""
+        outcome = self.parser.parse([])
+
+        assert outcome is None
+
+    def test_get_dataframe(self):
+        """Alias keys are expected to show up in ``columns_name`` after the call."""
+        alias_specs = [{'col3': 'new_col3'}, {'col4': 'new_col4'}]
+        self.parser._get_dataframe(alias_specs)
+
+        for expected_name in ('col3', 'col4'):
+            assert expected_name in self.parser.columns_name
+
+    def test_get_dataframe_with_empty_aliases(self):
+        """An empty alias list is expected to leave ``columns_name`` equal to the input list."""
+        self.parser._get_dataframe([])
+
+        assert self.parser.columns_name == self.columns_name
+
+    def test_get_dataframe_with_existing_columns(self):
+        """Aliases whose values are the existing column names; ``columns_name`` is expected to stay equal."""
+        alias_specs = [{'new_col1': 'col1'}, {'new_col2': 'col2'}]
+        self.parser._get_dataframe(alias_specs)
+
+        assert self.parser.columns_name == self.columns_name
+
+    def test_get_dataframe_with_empty_aliases_and_existing_columns(self):
+        """Empty alias list with the default columns from ``setup_method``."""
+        # NOTE(review): this exercises the same inputs and assertion as
+        # test_get_dataframe_with_empty_aliases -- confirm whether a distinct
+        # scenario was intended.
+        self.parser._get_dataframe([])
+
+        assert self.parser.columns_name == self.columns_name