Merge branch 'master' of https://github.com/cfpb/hmda-platform

cfpb · Sep 16, 2024 · 78740a8 · 78740a8
2 parents a9ea91d + ecb4fc3
commit 78740a8
Show file tree

Hide file tree

Showing 3 changed files with 84,564 additions and 84,534 deletions.
diff --git a/common/src/main/pyhmda/parse_census_file.py b/common/src/main/pyhmda/parse_census_file.py
@@ -39,6 +39,8 @@ def conv_scf(val: str) -> str:
 cfconverters = {k: v[1] for k, v in census_file_columns.items()}
 parsed_census_df = pd.read_csv(args.censusfile, sep=',', header=None, usecols=cfkeys,
     converters=cfconverters)[cfkeys].rename(cfcolnames, axis=1)
+parsed_census_df = apply_authorized_modifications(census_file_authorized_modifications,
+                                                  parsed_census_df)
 logging.info(f"Parsed {args.censusfile}")
 
 root, ext = os.path.splitext(args.delineationfile)
@@ -54,7 +56,7 @@ def conv_scf(val: str) -> str:
 dfcolnames = {k: v[0] for k, v in delineation_file_columns.items()}
 dfconverters = {k: v[1] for k, v in delineation_file_columns.items()}
 parsed_delin_df = pd.read_csv(prepared_file, sep=',', header=None, usecols=dfkeys,
-                               converters=dfconverters).rename(dfcolnames, axis=1)
+                              converters=dfconverters).rename(dfcolnames, axis=1)
 logging.info(f"Parsed {prepared_file}")
 
 parsed_delin_df["MSAOrMDTitle"] = parsed_delin_df.apply(lambda row:
@@ -66,6 +68,8 @@ def conv_scf(val: str) -> str:
     else f"{os.path.splitext(args.censusfile)[0]}-parsed.txt"
 output_df = parsed_census_df.merge(parsed_delin_df,
     how="left", on=["FIPSStateCode", "FIPSCountyCode"])
+output_df["MSAOrMDTitle"] = output_df.apply(lambda row:
+    "" if row.CBSACode == "99999" else row.MSAOrMDTitle, axis=1)
 output_df.to_csv(output_file, sep='|', index=False)
 logging.info(f"Wrote output file {output_file}")
 os.remove(prepared_file)
diff --git a/common/src/main/pyhmda/utils.py b/common/src/main/pyhmda/utils.py
@@ -1,4 +1,5 @@
 
+from datetime import datetime
 import logging
 import pandas as pd
 import re
@@ -60,6 +61,8 @@ def conv_optpct(val: str) -> Any:
 
 
 def prepare_file(read_file: str, write_file: str, pattern: str, expected_match: float=0.95) -> None:
+    """Performs preprocessing on source files to ensure clean reads by pandas.
+    """
     lc, mc = 0, 0
     with open(read_file, 'r') as rf:
         with open(write_file, 'w') as wf:
@@ -71,3 +74,25 @@ def prepare_file(read_file: str, write_file: str, pattern: str, expected_match:
     if mc < expected_match * lc:
         sys.exit(f"{read_file} pattern matched only {mc} of {lc} lines")
     logging.info(f"Prepared file {write_file}")
+
+
+def apply_authorized_modifications(modmap: dict, df: pd.DataFrame) -> pd.DataFrame:
+    """Applies modifications to non-CFPB-owned source data files at the direction of outside agencies.
+    """
+    for mod_date in sorted(modmap.keys()):
+        df = modmap[mod_date](df)
+    return df
+
+
+# Census Flat File Modifications
+
+def replace_MedianAge_2002_values(df: pd.DataFrame) -> pd.DataFrame:
+    df.loc[df["MedianAge"] == 2002, "MedianAge"] = 6
+    return df
+
+
+# Modifications to published Census Flat Files directed by the US Census Bureau.
+census_file_authorized_modifications = {
+    datetime(2024, 9, 1) : replace_MedianAge_2002_values
+}
+