Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/cfpb/hmda-platform
Browse files Browse the repository at this point in the history
  • Loading branch information
kgudel committed Sep 16, 2024
2 parents a9ea91d + ecb4fc3 commit 78740a8
Show file tree
Hide file tree
Showing 3 changed files with 84,564 additions and 84,534 deletions.
6 changes: 5 additions & 1 deletion common/src/main/pyhmda/parse_census_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def conv_scf(val: str) -> str:
cfconverters = {k: v[1] for k, v in census_file_columns.items()}
parsed_census_df = pd.read_csv(args.censusfile, sep=',', header=None, usecols=cfkeys,
converters=cfconverters)[cfkeys].rename(cfcolnames, axis=1)
parsed_census_df = apply_authorized_modifications(census_file_authorized_modifications,
parsed_census_df)
logging.info(f"Parsed {args.censusfile}")

root, ext = os.path.splitext(args.delineationfile)
Expand All @@ -54,7 +56,7 @@ def conv_scf(val: str) -> str:
dfcolnames = {k: v[0] for k, v in delineation_file_columns.items()}
dfconverters = {k: v[1] for k, v in delineation_file_columns.items()}
parsed_delin_df = pd.read_csv(prepared_file, sep=',', header=None, usecols=dfkeys,
converters=dfconverters).rename(dfcolnames, axis=1)
converters=dfconverters).rename(dfcolnames, axis=1)
logging.info(f"Parsed {prepared_file}")

parsed_delin_df["MSAOrMDTitle"] = parsed_delin_df.apply(lambda row:
Expand All @@ -66,6 +68,8 @@ def conv_scf(val: str) -> str:
else f"{os.path.splitext(args.censusfile)[0]}-parsed.txt"
output_df = parsed_census_df.merge(parsed_delin_df,
how="left", on=["FIPSStateCode", "FIPSCountyCode"])
output_df["MSAOrMDTitle"] = output_df.apply(lambda row:
"" if row.CBSACode == "99999" else row.MSAOrMDTitle, axis=1)
output_df.to_csv(output_file, sep='|', index=False)
logging.info(f"Wrote output file {output_file}")
os.remove(prepared_file)
25 changes: 25 additions & 0 deletions common/src/main/pyhmda/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@

from datetime import datetime
import logging
import pandas as pd
import re
Expand Down Expand Up @@ -60,6 +61,8 @@ def conv_optpct(val: str) -> Any:


def prepare_file(read_file: str, write_file: str, pattern: str, expected_match: float=0.95) -> None:
"""Performs preprocessing on source files to ensure clean reads by pandas.
"""
lc, mc = 0, 0
with open(read_file, 'r') as rf:
with open(write_file, 'w') as wf:
Expand All @@ -71,3 +74,25 @@ def prepare_file(read_file: str, write_file: str, pattern: str, expected_match:
if mc < expected_match * lc:
sys.exit(f"{read_file} pattern matched only {mc} of {lc} lines")
logging.info(f"Prepared file {write_file}")


def apply_authorized_modifications(modmap: dict, df: pd.DataFrame) -> pd.DataFrame:
    """Run each authorized modification over a data frame, in ascending key order.

    These are changes made to non-CFPB-owned source data files at the
    direction of outside agencies.

    Args:
        modmap: Maps a sortable key to a callable that takes a data frame
            and returns a data frame.
        df: The frame passed to the first modification.

    Returns:
        The value returned by the last modification, or ``df`` itself when
        ``modmap`` is empty.
    """
    result = df
    for key in sorted(modmap):
        modify = modmap[key]
        # Each step receives the output of the previous one.
        result = modify(result)
    return result


# Census Flat File Modifications

def replace_MedianAge_2002_values(df: pd.DataFrame) -> pd.DataFrame:
    """Overwrite every ``MedianAge`` entry equal to 2002 with 6.

    The frame is edited in place; the same object is returned so the function
    can be used as a step that takes and returns a data frame.

    Args:
        df: Frame containing a ``MedianAge`` column.

    Returns:
        The same ``df`` object, after the replacement.
    """
    is_2002 = df["MedianAge"] == 2002
    df.loc[is_2002, "MedianAge"] = 6
    return df


# Modifications to published Census Flat Files directed by the US Census Bureau.
# Each entry maps a date to the function that applies that modification to the
# parsed census data frame.
census_file_authorized_modifications = {
    datetime(2024, 9, 1): replace_MedianAge_2002_values,
}

Loading

0 comments on commit 78740a8

Please sign in to comment.