Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

353 create imputation markers #14

Merged
merged 20 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9ddd6af
Change unit tests from dropping to selecting, ready for adding more c…
Jday7879 May 15, 2024
1fbbd83
Adding module to calculate imputation flag columns
Jday7879 May 16, 2024
70dfad4
Creating unit test and test data for imputation flag
Jday7879 May 16, 2024
9bd4c2a
Copying input data to fix pandas copy warnings
Jday7879 May 16, 2024
f334147
Adding docstrings
Jday7879 May 16, 2024
2bd4b04
Refactoring `matched_pair` column to include target column in name
Jday7879 May 16, 2024
122610b
Update impute flags to include impute from construction
Jday7879 May 16, 2024
f1372f0
Create function to convert impute flags into single column with strings
Jday7879 May 16, 2024
f1abca8
Fixing pandas copy on slice warning
Jday7879 May 17, 2024
77855a5
Updating docstring and handle case where needed columns are not included
Jday7879 May 17, 2024
0607562
Update error message
Jday7879 May 17, 2024
e24f451
Adding unit test for string flag column
Jday7879 May 17, 2024
052c376
Renaming imputation flag function to imputation_flag_marker
Jday7879 May 21, 2024
fc56bd0
Rename column in test data
Jday7879 May 21, 2024
1501e0e
Refactor to use dictionary to store imputation markers and conditions…
Jday7879 May 21, 2024
e8458ff
Refactor to define column names earlier in function
Jday7879 May 21, 2024
a88482f
Add f_predictive_auxiliary variable to test data
AntonZogk May 22, 2024
df6930b
refactor: Add predictive_auxiliary as function argument
AntonZogk May 22, 2024
c4a7256
Change period type to int
AntonZogk May 22, 2024
7ca00a7
Update expected columns in function and tests
AntonZogk May 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def flag_matched_pair_merge(
time_difference = -time_difference

# Creating new DF, shifting period for forward or backward
df_with_predictive_column = df[[reference, strata, target]]
df_with_predictive_column = df.copy()[[reference, strata, target]]
df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
months=time_difference
)
Expand All @@ -55,7 +55,7 @@ def flag_matched_pair_merge(
how="left",
)

matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1), False, True
Expand Down Expand Up @@ -107,7 +107,7 @@ def flag_matched_pair_shift(
df["validate_date"] = np.where(
df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
)
matched_col_name = forward_or_backward + "_matched_pair"
matched_col_name = forward_or_backward + "_matched_pair_" + target

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
Expand Down
137 changes: 137 additions & 0 deletions src/imputation_flags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import numpy as np
import pandas as pd


def create_impute_flags(
    df: pd.DataFrame,
    target: str,
    reference: str,
    strata: str,
    auxiliary: str,
    predictive_auxiliary: str,
) -> pd.DataFrame:
    """
    Create one logical column per imputation method.

    The output columns are needed to create the string flag column for
    imputation methods. The function requires the ``f_predictive_<target>``
    and ``b_predictive_<target>`` columns to be present already.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the forward and backward predictive columns for
        the target. It is modified in place: the flag columns are added and
        the ``predictive_auxiliary`` column is removed.
    target : str
        Column name containing target variable.
    reference : str
        Column name containing business reference id.
    strata : str
        Column name containing strata information.
    auxiliary : str
        Column name containing auxiliary data.
    predictive_auxiliary : str
        Column name containing predictive auxiliary data. This column is
        dropped from ``df`` before returning.

    Returns
    -------
    pd.DataFrame
        The same dataframe object with five additional logical columns:
        ``r_flag`` (target is a return), ``fir_flag`` (forward imputation is
        possible), ``bir_flag`` (backward imputation is possible), ``c_flag``
        (can be constructed from the auxiliary) and ``fic_flag`` (forward
        imputation from construction is possible).

    Raises
    ------
    KeyError
        If a required column is missing. All checks run before ``df`` is
        touched, so a failed call leaves the input unchanged.

    Notes
    -----
    The forward/backward fills follow the existing row order within each
    (reference, strata) group; nothing is sorted here, so the caller is
    presumably expected to pass rows ordered by period.
    """
    forward_target = "f_predictive_" + target
    backward_target = "b_predictive_" + target

    # Validate everything up front so a failure cannot leave df half-mutated.
    for predictive_column in (forward_target, backward_target):
        if predictive_column not in df.columns:
            raise KeyError(
                "Dataframe needs column '{}',".format(predictive_column)
                + " run flag_matched_pair function first"
            )

    # reference/strata are left for groupby to resolve (they may legitimately
    # be index level names); only columns read via df[...] are checked here.
    missing = [
        column
        for column in (target, auxiliary, predictive_auxiliary)
        if column not in df.columns
    ]
    if missing:
        raise KeyError("Dataframe is missing required column(s): {}".format(missing))

    groups = df.groupby([reference, strata])

    # Rolled values are kept as local Series instead of temporary columns, so
    # nothing has to be written into (and then dropped from) the caller's df.
    forward_target_roll = groups[forward_target].ffill()
    backward_target_roll = groups[backward_target].bfill()
    forward_aux_roll = groups[predictive_auxiliary].ffill()

    target_missing = df[target].isna()
    auxiliary_present = df[auxiliary].notna()

    df["r_flag"] = df[target].notna()
    df["fir_flag"] = forward_target_roll.notna() & target_missing
    df["bir_flag"] = backward_target_roll.notna() & target_missing
    df["c_flag"] = target_missing & auxiliary_present
    df["fic_flag"] = target_missing & forward_aux_roll.notna()

    df.drop(columns=[predictive_auxiliary], inplace=True)

    return df


def generate_imputation_marker(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a string column naming the imputation method to use for each row.

    The marker is chosen by a fixed precedence: ``r``, then ``fir``, then
    ``bir``, then ``fic``, then ``c``. Rows where none of the flags is set
    receive the value ``"error"``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the logical columns ``r_flag``, ``fir_flag``,
        ``bir_flag``, ``fic_flag`` and ``c_flag`` (as produced by
        `create_impute_flags`).

    Returns
    -------
    pd.DataFrame
        The same dataframe object with an additional ``imputation_marker``
        column.
    """
    is_return = df["r_flag"]
    has_fir = df["fir_flag"]
    has_bir = df["bir_flag"]
    has_fic = df["fic_flag"]
    has_c = df["c_flag"]

    # Each mask marks rows that every higher-priority method has passed over.
    after_r = ~is_return
    after_fir = after_r & ~has_fir
    after_bir = after_fir & ~has_bir
    after_fic = after_bir & ~has_fic

    markers = ["r", "fir", "bir", "fic", "c"]
    conditions = [
        is_return,
        after_r & has_fir,
        after_fir & has_bir,
        after_bir & has_fic,
        after_fic & has_c,
    ]

    df["imputation_marker"] = np.select(conditions, markers, default="error")

    return df
28 changes: 28 additions & 0 deletions tests/imputation_flag_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
reference,strata,period,target_variable,auxiliary,f_predictive_target_variable,b_predictive_target_variable,r_flag,fir_flag,bir_flag,c_flag,fic_flag,f_predictive_auxiliary,imputation_marker
1,100,202001,8444.0,51.0,,,True,False,False,False,False,,r
1,100,202002,,51.0,8444.0,2003.0,False,True,True,True,True,51.0,fir
1,100,202003,2003.0,51.0,,1003.0,True,False,False,False,False,51.0,r
1,100,202004,1003.0,51.0,2003.0,,True,False,False,False,False,51.0,r
2,100,202001,,72.0,,,False,False,True,True,False,,bir
2,100,202002,,,,,False,False,True,False,True,72.0,bir
2,100,202003,,72.0,,3251.0,False,False,True,True,True,,bir
2,100,202004,3251.0,72.0,,,True,False,False,False,False,72.0,r
3,100,202001,,7.0,,7511.0,False,False,True,True,False,,bir
3,100,202002,7511.0,7.0,,1234.0,True,False,False,False,False,7.0,r
3,100,202003,1234.0,7.0,7511.0,1214.0,True,False,False,False,False,7.0,r
3,100,202004,1214.0,7.0,1234.0,,True,False,False,False,False,7.0,r
4,100,202001,64.0,81.0,,,True,False,False,False,False,,r
4,100,202002,,81.0,64.0,,False,True,True,True,True,81.0,fir
4,100,202003,,81.0,,254.0,False,True,True,True,True,81.0,fir
4,100,202004,254.0,81.0,,,True,False,False,False,False,81.0,r
5,100,202001,65.0,81.0,,342.0,True,False,False,False,False,,r
5,100,202002,342.0,81.0,65.0,634.0,True,False,False,False,False,81.0,r
5,100,202003,634.0,81.0,342.0,254.0,True,False,False,False,False,81.0,r
5,100,202004,254.0,81.0,634.0,,True,False,False,False,False,81.0,r
6,100,202001,64.0,81.0,,,True,False,False,False,False,,r
6,100,202002,,81.0,64.0,654.0,False,True,True,True,True,81.0,fir
6,100,202003,654.0,81.0,,,True,False,False,False,False,81.0,r
6,100,202004,,81.0,654.0,,False,True,False,True,True,81.0,fir
7,100,202001,,40.0,,,False,False,False,True,False,,c
7,100,202002,,,,,False,False,False,False,True,40.0,fic
7,100,202003,,,,,False,False,False,False,True,,fic
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case1_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case2_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,1
1,101,202403,,False,1,False,0
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data_matched_pair/case3_expected_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
reference,strata,period,target_variable,f_matched_pair,f_matched_pair_count,b_matched_pair,b_matched_pair_count
reference,strata,period,target_variable,f_matched_pair_target_variable,f_matched_pair_count,b_matched_pair_target_variable,b_matched_pair_count
1,101,202401,237,False,0,True,2
1,101,202402,281,True,2,False,0
1,101,202403,,False,0,False,0
Expand Down
106 changes: 76 additions & 30 deletions tests/test_flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@
class TestMatchedPair:
def test_flag_matched_pair_merge_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -44,11 +48,15 @@ def test_flag_matched_pair_merge_forward(self, expected_output_file):

def test_flag_matched_pair_merge_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -60,33 +68,67 @@ def test_flag_matched_pair_merge_backward(self, expected_output_file):

def test_count_matched_pair_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["b_matched_pair", "b_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
"f_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "f_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "f_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "f_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_count_matches_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair", "f_matched_pair_count"], axis=1, inplace=True
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
"b_matched_pair_count",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable", "b_matched_pair"]
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_output = count_matches(df_input, "b_matched_pair", "period", "strata")
df_output = count_matches(
df_input, "b_matched_pair_target_variable", "period", "strata"
)
assert_frame_equal(df_output, df_expected_output)

def test_flag_matched_pair_shift_forward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "b_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"f_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand All @@ -98,11 +140,15 @@ def test_flag_matched_pair_shift_forward(self, expected_output_file):

def test_flag_matched_pair_shift_backward(self, expected_output_file):
df_expected_output = load_and_format(expected_output_file)
df_expected_output.drop(
["f_matched_pair_count", "f_matched_pair", "b_matched_pair_count"],
axis=1,
inplace=True,
)
df_expected_output = df_expected_output[
[
"reference",
"strata",
"period",
"target_variable",
"b_matched_pair_target_variable",
]
]
df_input = df_expected_output[
["reference", "strata", "period", "target_variable"]
]
Expand Down
Loading
Loading