Commit 33c37d1
Merge pull request #72 from hygia-org/65/normalize-key-smash-features
65/normalize key smash features
RochaCarla authored Feb 9, 2023
2 parents b70ff9b + 99faaaa commit 33c37d1
Showing 15 changed files with 985 additions and 134 deletions.
5 changes: 4 additions & 1 deletion data/dicts/mexico_abbreviations.csv
@@ -3,6 +3,9 @@
AV,AVENUE
NO,NUMBER
NUM,NUMBER
PO,POST OFFICE
P.O,POST OFFICE
BLVD,BOULEVARD
LT,LOTE
MZ,MANZANA
MZ,MANZANA
CDMX,Ciudad de México
DF,Distrito Federal
Binary file not shown.
Binary file not shown.
2 changes: 2 additions & 0 deletions data/models/normalization_absolutes.csv
@@ -0,0 +1,2 @@
feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared
15.03125,30.0,30.0,30.0
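
The row above records training-time maxima for the four key-smash features; the fractional vowel value (15.03125) suggests they are per-string ratios rather than raw counts. A minimal sketch of how such absolutes could be applied, assuming they act as column-wise divisors (the actual scaling happens inside hygia's RandomForestModel via the normalization_absolutes_file argument passed to it later in this diff):

```python
# Sketch only: hygia applies these absolutes internally, so the element-wise
# division below is an assumption about the mechanism, not the library's API.
import pandas as pd

absolutes = pd.read_csv('../data/models/normalization_absolutes.csv').iloc[0]

def normalize_ks_features(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
    """Scale each key-smash feature by its recorded training-time maximum."""
    for feature_name, absolute in absolutes.items():
        column = f'{feature_name}_{suffix}'  # e.g. ..._squared_vowels_concat_...
        if column in df.columns:
            df[column] = df[column] / absolute
    return df
```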
81 changes: 52 additions & 29 deletions examples/MEXICO_predict_example.ipynb
@@ -10,17 +10,28 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mrunning feature engineering with configs below...\u001b[37m\n",
"\u001b[1mlanguage -> \u001b[22mes\n",
"\u001b[1mdimensions -> \u001b[22m25\n"
]
}
],
"source": [
"import pandas as pd\n",
"import hygia as hg\n",
"\n",
"pre_process_data = hg.PreProcessData()\n",
"pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n",
"augment_data = hg.AugmentData(country=\"MEXICO\")\n",
"feature_engineering = hg.FeatureEngineering()\n",
"rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl')"
"feature_engineering = hg.FeatureEngineering(country=\"MEXICO\")\n",
"rf_model = hg.RandomForestModel('../data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n",
" normalization_absolutes_file='../data/models/normalization_absolutes.csv')"
]
},
{
@@ -33,12 +44,12 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n",
"df = pd.read_csv(file_path, sep='¨', nrows=500_000, engine='python')"
"df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')"
]
},
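
With nrows=None the notebook now reads the full extract (about 2.5 million rows, judging by the prediction counts further down) instead of the first 500,000. If memory is a concern, the same read can be streamed in chunks; a sketch assuming the same file layout and separator:

```python
# Stream the extract in chunks instead of loading it in one shot.
# Same '¨' separator and python engine as the cell above; the chunk
# size is an arbitrary choice.
import pandas as pd

chunks = pd.read_csv('../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv',
                     sep='¨', engine='python', chunksize=100_000)
df = pd.concat(chunks, ignore_index=True)
```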
{
@@ -51,11 +62,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = df.augment_data.augment_data()"
"df = augment_data.augment_data(df, zipcode_column_name='ZIP_CODE_L')"
]
},
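
The corrected call passes the DataFrame explicitly and names the zip-code column to key enrichments on; the old df.augment_data.augment_data() would have raised an AttributeError. The enrichment internals are not part of this diff, so the following is only a plausible sketch of zip-code-keyed augmentation, with the reference table and enrich_state column invented for illustration:

```python
# Illustrative only: hygia's AugmentData internals are not shown in this diff.
import pandas as pd

# Hypothetical reference table mapping zip codes to enrichment columns.
zip_reference = pd.DataFrame({
    'ZIP_CODE_L': ['01000', '44100'],
    'enrich_state': ['Ciudad de México', 'Jalisco'],
})

def augment_with_zipcode(df: pd.DataFrame, zipcode_column_name: str) -> pd.DataFrame:
    """Left-join enrichment columns onto the data by zip code."""
    reference = zip_reference.rename(columns={'ZIP_CODE_L': zipcode_column_name})
    return df.merge(reference, how='left', on=zipcode_column_name)
```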
{
@@ -76,7 +87,17 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n",
"handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n",
"extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n"
]
}
],
"source": [
"concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n",
"df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n",
@@ -88,12 +109,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check new columns names"
"# Check features columns names"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -102,7 +123,6 @@
"['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_ks_ratio_of_numeric_digits_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
@@ -129,28 +149,30 @@
" 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n",
" 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']"
]
},
"execution_count": 8,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ks_we_and_re_colummns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n",
"ks_we_and_re_colummns"
"all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n",
"model_features_columns = [col for col in all_features_columns if 'ratio_of_numeric_digits_squared' not in col]\n",
"model_features_columns"
]
},
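
The filter above drops feature_ks_ratio_of_numeric_digits_squared, leaving exactly the four key-smash features whose maxima are recorded in normalization_absolutes.csv, plus the 25 word-embedding and the regex features. For intuition, a hypothetical re-implementation of one key-smash feature; the squared-run reading and the length scaling are inferred from the feature names and the fractional maxima, not taken from hygia's code:

```python
import re

def count_sequence_squared_vowels(text: str) -> float:
    """Sum of squared lengths of consecutive vowel runs, scaled by length.

    Long runs like 'aaaa' score disproportionately high, which is what makes
    key smashes ('asdkjaaaa') stand out from real street names.
    """
    runs = re.findall(r'[aeiouAEIOU]+', text)
    return sum(len(run) ** 2 for run in runs) / max(len(text), 1)

count_sequence_squared_vowels('AV INSURGENTES SUR')  # low: short vowel runs
count_sequence_squared_vowels('jaaaakdjwyeeea')      # high: long vowel runs
```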
{
@@ -163,33 +185,32 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
"/home/anapaula/dell/Playground/env/lib/python3.8/site-packages/sklearn/base.py:409: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
" warnings.warn(\n"
"\u001b[33mrunning model...\u001b[37m\n"
]
},
{
"data": {
"text/plain": [
"0.0 497769\n",
"1.0 2231\n",
"Name: prediction, dtype: int64"
"0.0 2512460\n",
"1.0 7836\n",
"Name: prediction_is_key_smash, dtype: int64"
]
},
"execution_count": 9,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['prediction'] = rf_model.predict(df[ks_we_and_re_colummns].values)\n",
"df['prediction'].value_counts()"
"df['prediction_is_key_smash'] = rf_model.predict(df[model_features_columns], concatened_column_name)\n",
"df['prediction_is_key_smash'].value_counts()"
]
},
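
The run flags 7,836 of roughly 2.52 million rows as key smashes. A quick check of the implied rate from the value_counts output above:

```python
# Key-smash rate implied by the value_counts above.
flagged, clean = 7_836, 2_512_460
print(f'{flagged / (flagged + clean):.2%}')  # ~0.31%
```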
{
@@ -202,11 +223,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df[['concat_STREET_ADDRESS_1_STREET_ADDRESS_2', 'prediction']].to_csv('data/tmp/prediction.csv')"
"df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n",
" .drop_duplicates(subset=[concatened_column_name]) \\\n",
" .to_csv(f'../data/tmp/prediction_rf_ks_regex_enrich_normal.csv')"
]
}
],
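
The final cell keeps only flagged rows and de-duplicates on the concatenated address before writing. A quick way to eyeball that export afterwards, assuming the same relative path:

```python
# Sanity-check the exported suspects: count and sample a few rows.
import pandas as pd

suspects = pd.read_csv('../data/tmp/prediction_rf_ks_regex_enrich_normal.csv')
print(len(suspects), 'unique flagged addresses')
print(suspects.head())
```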