diff --git a/.github/workflows/extract-chicago-permits.yaml b/.github/workflows/extract-chicago-permits.yaml
index 6104cd0..e2415b9 100644
--- a/.github/workflows/extract-chicago-permits.yaml
+++ b/.github/workflows/extract-chicago-permits.yaml
@@ -61,28 +61,31 @@ jobs:
         shell: bash
         working-directory: ${{ env.WORKING_DIR }}
 
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
+          aws-region: us-east-1
+
       - name: Extract permits
         run: pipenv run python3 permit_cleaning.py
         shell: bash
         working-directory: ${{ env.WORKING_DIR }}
+        env:
+          AWS_REGION: us-east-1
+          AWS_ATHENA_S3_STAGING_DIR: s3://ccao-athena-results-us-east-1/
 
       - name: Compress permit directories into one file
         id: compress-permits
         run: |
           ZIP_FILENAME="chicago-permits-$(date +%Y%m%d%H%M%S).zip"
           mkdir chicago-permits
-          mv csvs_for_* chicago-permits/
+          mv files_for_* chicago-permits/
           zip -r "$ZIP_FILENAME" chicago-permits
           echo "filename=$ZIP_FILENAME" >> "$GITHUB_OUTPUT"
         shell: bash
         working-directory: ${{ env.WORKING_DIR }}
 
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
-          aws-region: us-east-1
-
       - name: Upload compressed permit file to S3
         id: s3-upload
         run: |
diff --git a/chicago/Pipfile b/chicago/Pipfile
index 7fdefcd..0b5fe4f 100644
--- a/chicago/Pipfile
+++ b/chicago/Pipfile
@@ -7,6 +7,8 @@ name = "pypi"
 requests = "2.31.*"
 pandas = "2.1.*"
 sodapy = "2.2.*"
+PyAthena = "3.0.*"
+XlsxWriter = "3.1.*"
 
 [dev-packages]
 
diff --git a/chicago/Pipfile.lock b/chicago/Pipfile.lock
index eb896d4..c579ef0 100644
--- a/chicago/Pipfile.lock
+++ b/chicago/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "9de45cd2a2f6e5ea7690542e76edfe029f926a2a58915b9706a8f191bc0cb8b0"
+            "sha256": "c59fc39217f202ca670f42d03663e716d7b6dadfc78b7f1fbec4936434538f83"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -16,13 +16,29 @@
         ]
     },
     "default": {
+        "boto3": {
+            "hashes": [
+                "sha256:620f1eb3e18e780be58383b4a4e10db003d2314131190514153996032c8d932d",
+                "sha256:8d54fa3a9290020f9a7f488f9cbe821029de0af05a677751b12973a5f726a5e2"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.33.11"
+        },
+        "botocore": {
+            "hashes": [
+                "sha256:b14b328f902d120de0a09eaa657a9a701c0ceeb711197c2f01ef0523f855086c",
+                "sha256:b46227eb3fa9cfdc8f5a83920ef347e67adea8095830ed265a3373b13b54421f"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.33.11"
+        },
         "certifi": {
             "hashes": [
-                "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082",
-                "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"
+                "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1",
+                "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"
             ],
             "markers": "python_version >= '3.6'",
-            "version": "==2023.7.22"
+            "version": "==2023.11.17"
         },
         "charset-normalizer": {
            "hashes": [
@@ -120,13 +136,29 @@
             "markers": "python_full_version >= '3.7.0'",
             "version": "==3.3.2"
         },
+        "fsspec": {
+            "hashes": [
+                "sha256:6271f1d3075a378bfe432f6f42bf7e1d2a6ba74f78dd9b512385474c579146a0",
+                "sha256:c4da01a35ac65c853f833e43f67802c25213f560820d54ddf248f92eddd5e990"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==2023.12.1"
+        },
         "idna": {
             "hashes": [
-                "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
-                "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
+                "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca",
+                "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"
             ],
             "markers": "python_version >= '3.5'",
-            "version": "==3.4"
+            "version": "==3.6"
+        },
+        "jmespath": {
+            "hashes": [
+                "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980",
+                "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.0.1"
         },
         "numpy": {
             "hashes": [
@@ -172,42 +204,50 @@
         },
         "pandas": {
             "hashes": [
-                "sha256:0296a66200dee556850d99b24c54c7dfa53a3264b1ca6f440e42bad424caea03",
-                "sha256:04d4c58e1f112a74689da707be31cf689db086949c71828ef5da86727cfe3f82",
-                "sha256:08637041279b8981a062899da0ef47828df52a1838204d2b3761fbd3e9fcb549",
-                "sha256:11a771450f36cebf2a4c9dbd3a19dfa8c46c4b905a3ea09dc8e556626060fe71",
-                "sha256:1329dbe93a880a3d7893149979caa82d6ba64a25e471682637f846d9dbc10dd2",
-                "sha256:1f539e113739a3e0cc15176bf1231a553db0239bfa47a2c870283fd93ba4f683",
-                "sha256:22929f84bca106921917eb73c1521317ddd0a4c71b395bcf767a106e3494209f",
-                "sha256:321ecdb117bf0f16c339cc6d5c9a06063854f12d4d9bc422a84bb2ed3207380a",
-                "sha256:35172bff95f598cc5866c047f43c7f4df2c893acd8e10e6653a4b792ed7f19bb",
-                "sha256:3cc4469ff0cf9aa3a005870cb49ab8969942b7156e0a46cc3f5abd6b11051dfb",
-                "sha256:4441ac94a2a2613e3982e502ccec3bdedefe871e8cea54b8775992485c5660ef",
-                "sha256:465571472267a2d6e00657900afadbe6097c8e1dc43746917db4dfc862e8863e",
-                "sha256:59dfe0e65a2f3988e940224e2a70932edc964df79f3356e5f2997c7d63e758b4",
-                "sha256:72c84ec1b1d8e5efcbff5312abe92bfb9d5b558f11e0cf077f5496c4f4a3c99e",
-                "sha256:7cf4cf26042476e39394f1f86868d25b265ff787c9b2f0d367280f11afbdee6d",
-                "sha256:7fa2ad4ff196768ae63a33f8062e6838efed3a319cf938fdf8b95e956c813042",
-                "sha256:a5d53c725832e5f1645e7674989f4c106e4b7249c1d57549023ed5462d73b140",
-                "sha256:acf08a73b5022b479c1be155d4988b72f3020f308f7a87c527702c5f8966d34f",
-                "sha256:b99c4e51ef2ed98f69099c72c75ec904dd610eb41a32847c4fcbc1a975f2d2b8",
-                "sha256:d5ded6ff28abbf0ea7689f251754d3789e1edb0c4d0d91028f0b980598418a58",
-                "sha256:de21e12bf1511190fc1e9ebc067f14ca09fccfb189a813b38d63211d54832f5f",
-                "sha256:f7ea8ae8004de0381a2376662c0505bb0a4f679f4c61fbfd122aa3d1b0e5f09d",
-                "sha256:fc77309da3b55732059e484a1efc0897f6149183c522390772d3561f9bf96c00",
-                "sha256:fca5680368a5139d4920ae3dc993eb5106d49f814ff24018b64d8850a52c6ed2",
-                "sha256:fcd76d67ca2d48f56e2db45833cf9d58f548f97f61eecd3fdc74268417632b8a"
+                "sha256:00028e6737c594feac3c2df15636d73ace46b8314d236100b57ed7e4b9ebe8d9",
+                "sha256:0aa6e92e639da0d6e2017d9ccff563222f4eb31e4b2c3cf32a2a392fc3103c0d",
+                "sha256:1ebfd771110b50055712b3b711b51bee5d50135429364d0498e1213a7adc2be8",
+                "sha256:294d96cfaf28d688f30c918a765ea2ae2e0e71d3536754f4b6de0ea4a496d034",
+                "sha256:3f06bda01a143020bad20f7a85dd5f4a1600112145f126bc9e3e42077c24ef34",
+                "sha256:426dc0f1b187523c4db06f96fb5c8d1a845e259c99bda74f7de97bd8a3bb3139",
+                "sha256:45d63d2a9b1b37fa6c84a68ba2422dc9ed018bdaa668c7f47566a01188ceeec1",
+                "sha256:482d5076e1791777e1571f2e2d789e940dedd927325cc3cb6d0800c6304082f6",
+                "sha256:6b728fb8deba8905b319f96447a27033969f3ea1fea09d07d296c9030ab2ed1d",
+                "sha256:8a706cfe7955c4ca59af8c7a0517370eafbd98593155b48f10f9811da440248b",
+                "sha256:8ea107e0be2aba1da619cc6ba3f999b2bfc9669a83554b1904ce3dd9507f0860",
+                "sha256:ab5796839eb1fd62a39eec2916d3e979ec3130509930fea17fe6f81e18108f6a",
+                "sha256:b0513a132a15977b4a5b89aabd304647919bc2169eac4c8536afb29c07c23540",
+                "sha256:b7d852d16c270e4331f6f59b3e9aa23f935f5c4b0ed2d0bc77637a8890a5d092",
+                "sha256:bd7d5f2f54f78164b3d7a40f33bf79a74cdee72c31affec86bfcabe7e0789821",
+                "sha256:bdec823dc6ec53f7a6339a0e34c68b144a7a1fd28d80c260534c39c62c5bf8c9",
+                "sha256:d2d3e7b00f703aea3945995ee63375c61b2e6aa5aa7871c5d622870e5e137623",
+                "sha256:d65148b14788b3758daf57bf42725caa536575da2b64df9964c563b015230984",
+                "sha256:d797591b6846b9db79e65dc2d0d48e61f7db8d10b2a9480b4e3faaddc421a171",
+                "sha256:dc9bf7ade01143cddc0074aa6995edd05323974e6e40d9dbde081021ded8510e",
+                "sha256:e9f17f2b6fc076b2a0078862547595d66244db0f41bf79fc5f64a5c4d635bead",
+                "sha256:edbaf9e8d3a63a9276d707b4d25930a262341bca9874fcb22eff5e3da5394732",
+                "sha256:f237e6ca6421265643608813ce9793610ad09b40154a3344a088159590469e46",
+                "sha256:f69b0c9bb174a2342818d3e2778584e18c740d56857fc5cdb944ec8bbe4082cf",
+                "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"
             ],
             "index": "pypi",
             "markers": "python_version >= '3.9'",
-            "version": "==2.1.3"
+            "version": "==2.1.4"
+        },
+        "pyathena": {
+            "hashes": [
+                "sha256:1ef983d478bc182c2ea31d75bf4fd5755425985e5da4f8e181235f2d2733e536",
+                "sha256:80e8f953abcfb29926167a57fe6de57ecc9d26f2f1a697cbc29870cde6fecdaa"
+            ],
+            "markers": "python_full_version >= '3.8.1'",
+            "version": "==3.0.10"
         },
         "python-dateutil": {
             "hashes": [
                 "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
                 "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.8.2"
         },
         "pytz": {
@@ -226,12 +266,20 @@
             "markers": "python_version >= '3.7'",
             "version": "==2.31.0"
         },
+        "s3transfer": {
+            "hashes": [
+                "sha256:368ac6876a9e9ed91f6bc86581e319be08188dc60d50e0d56308ed5765446283",
+                "sha256:c9e56cbe88b28d8e197cf841f1f0c130f246595e77ae5b5a05b69fe7cb83de76"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==0.8.2"
+        },
         "six": {
             "hashes": [
                 "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                 "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.16.0"
         },
         "sodapy": {
@@ -242,6 +290,14 @@
             "index": "pypi",
             "version": "==2.2.0"
         },
+        "tenacity": {
+            "hashes": [
+                "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a",
+                "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==8.2.3"
+        },
         "tzdata": {
             "hashes": [
                 "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a",
@@ -252,11 +308,19 @@
         },
         "urllib3": {
             "hashes": [
-                "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3",
-                "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54"
+                "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84",
+                "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e"
             ],
-            "markers": "python_version >= '3.8'",
-            "version": "==2.1.0"
+            "markers": "python_version >= '3.7'",
+            "version": "==2.0.7"
+        },
+        "xlsxwriter": {
+            "hashes": [
+                "sha256:b61c1a0c786f82644936c0936ec96ee96cd3afb9440094232f7faef9b38689f0",
+                "sha256:de810bf328c6a4550f4ffd6b0b34972aeb7ffcf40f3d285a0413734f9b63a929"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.1.9"
         }
     },
     "develop": {}
diff --git a/chicago/permit_cleaning.py b/chicago/permit_cleaning.py
index 34b770c..d3a6698 100644
--- a/chicago/permit_cleaning.py
+++ b/chicago/permit_cleaning.py
@@ -2,7 +2,7 @@
 Chicago Permit Ingest Process - Automation
 
 This script automates the current process for cleaning permit data from the Chicago Data Portal's Building Permits table
-and preparing it for upload to iasWorld via SmartFile. This involves fetching the data, cleaning up certain fields, 
+and preparing it for upload to iasWorld via SmartFile. This involves fetching the data, cleaning up certain fields,
 organizing columns to match the SmartFile template, and batching the data into Excel workbooks of 200 rows each.
 This process also splits off data that is ready for upload from data that still needs some manual review before upload,
 saving each in separate Excel workbooks in separate folders. Data that need review are split into two categories and corresponding folders/files:
@@ -14,7 +14,7 @@
 The following will also need to be updated:
 - At the beginning of each year: update year to current year in SQL_QUERY inside pull_existing_pins_from_athena()
   function
-- Update limit as desired in the url in the download_all_permits() function (currently set at 5000 rows for testing purposes) 
+- Update limit as desired in the url in the download_all_permits() function (currently set at 5000 rows for testing purposes)
 """
 
 import requests
@@ -37,15 +37,21 @@ def pull_existing_pins_from_athena():
         region_name=os.getenv("AWS_REGION"),
     )
 
-    SQL_QUERY = "SELECT pin, pin10 FROM default.vw_pin_universe WHERE triad_name='City' AND year='2023';"
-
+    SQL_QUERY = """
+        SELECT
+            CAST(pin AS varchar) AS pin,
+            CAST(pin10 AS varchar) AS pin10
+        FROM default.vw_pin_universe
+        WHERE triad_name = 'City' AND year = '2023'
+    """
+
     cursor = conn.cursor()
     cursor.execute(SQL_QUERY)
     chicago_pin_universe = as_pandas(cursor)
     chicago_pin_universe.to_csv("chicago_pin_universe.csv", index=False)
 
-    return chicago_pin_universe 
-    
+    return chicago_pin_universe
+
 
 def download_all_permits():
     # update limit in url below when ready to work with full dataset (as of Dec 7, 2023 dataset has 757,677 rows)
@@ -58,22 +64,22 @@
 
 
 def expand_multi_pin_permits(df):
-    """ 
+    """
     Data from the Chicago open data permits table (data this script works with)
     has rows uniquely identified by permit number. Permits can apply to multiple PINs,
     with additional PINs recorded in the PIN2 - PIN10 fields.
-    We want rows that are uniquely identified by PIN and permit number. 
+    We want rows that are uniquely identified by PIN and permit number.
     This function creates new rows for each additional PIN in multi-PIN permits and saves the relevant PIN in pin_solo.
-    """ 
+    """
     # the downloaded dataframe will not include any pin columns that are completely blank, so check for existing ones here
     all_pin_columns = ["pin1", "pin2", "pin3", "pin4", "pin5", "pin6", "pin7", "pin8", "pin9", "pin10"]
     pin_columns = [col for col in df.columns if col in all_pin_columns]
     non_pin_columns = [col for col in df.columns if col not in pin_columns]
     melted_df = pd.melt(df, id_vars=non_pin_columns, value_vars=pin_columns, var_name="pin_type", value_name="solo_pin")
-    
+
     # keep rows with NA for pin1, filter out rows with NA for other pins
     melted_df = melted_df[(melted_df["pin_type"] == "pin1") | ((melted_df["pin_type"] != "pin1") & melted_df["solo_pin"].notna())]
-    
+
     # order rows by permit number then pin type (so pins will be in order of their assigned numbering in permit table, not necessarily by pin number)
     melted_df = melted_df.sort_values(by=["permit_", "pin_type"]).reset_index(drop=True)
 
@@ -81,7 +87,7 @@ def expand_multi_pin_permits(df):
 
 
 # update pin to match formatting of iasWorld
-def format_pin(df): 
+def format_pin(df):
     # iasWorld format doesn't include dashes
     df["pin_final"] = df["solo_pin"].astype(str).str.replace("-", "")
     # add zeros to 10-digit PINs to transform into 14-digits PINs
@@ -109,28 +115,28 @@ def organize_columns(df):
         "contact_1_name": "Applicant* [USER21]",
         "work_description": "Notes [NOTE1]"
     }
-    
+
     data_relevant = df[[col for col in df.columns if col in column_renaming_dict]]
     data_renamed = data_relevant.rename(columns=column_renaming_dict)
-    
+
     column_order = ["Original PIN", # will keep original PIN column for rows flagged for invalid PINs
-                    "PIN* [PARID]", 
-                    "Local Permit No.* [USER28]", 
-                    "Issue Date* [PERMDT]", 
+                    "PIN* [PARID]",
+                    "Local Permit No.* [USER28]",
+                    "Issue Date* [PERMDT]",
                     "Desc 1* [DESC1]",
-                    "Desc 2 Code 1 [USER6]", 
-                    "Desc 2 Code 2 [USER7]", 
+                    "Desc 2 Code 1 [USER6]",
+                    "Desc 2 Code 2 [USER7]",
                     "Desc 2 Code 3 [USER8]",
-                    "Amount* [AMOUNT]", 
-                    "Assessable [IS_ASSESS]", 
-                    "Applicant Street Address* [ADDR1]", 
-                    "Applicant Address 2 [ADDR2]", 
-                    "Applicant City, State, Zip* [ADDR3]", 
-                    "Contact Phone* [PHONE]", 
-                    "Applicant* [USER21]", 
-                    "Notes [NOTE1]", 
-                    "Occupy Dt [UDATE1]", 
-                    "Submit Dt* [CERTDATE]", 
+                    "Amount* [AMOUNT]",
+                    "Assessable [IS_ASSESS]",
+                    "Applicant Street Address* [ADDR1]",
+                    "Applicant Address 2 [ADDR2]",
+                    "Applicant City, State, Zip* [ADDR3]",
+                    "Contact Phone* [PHONE]",
+                    "Applicant* [USER21]",
+                    "Notes [NOTE1]",
+                    "Occupy Dt [UDATE1]",
+                    "Submit Dt* [CERTDATE]",
                     "Est Comp Dt [UDATE2]"
                     ]
 
@@ -145,14 +151,13 @@
     df["FLAG COMMENTS"] = ""
 
     # invalid 14-digit PIN flag
-    valid_pins["pin"] = valid_pins["pin"].astype(str)
     df["FLAG, INVALID: PIN* [PARID]"] = np.where(df["PIN* [PARID]"] == "", 0, ~df["PIN* [PARID]"].isin(valid_pins["pin"]))
-    
+
     # also check if 10-digit PINs are valid to narrow down on problematic portion of invalid PINs
-    df["pin_10digit"] = df["PIN* [PARID]"].astype(str).str[:10] 
-    df["FLAG, INVALID: pin_10digit"] = np.where(df["pin_10digit"] == "", 0, df["pin_10digit"].isin(valid_pins["pin10"]))
-    
-    # create variable that is the numbers following the 10-digit PIN 
+    df["pin_10digit"] = df["PIN* [PARID]"].astype(str).str[:10]
+    df["FLAG, INVALID: pin_10digit"] = np.where(df["pin_10digit"] == "", 0, ~df["pin_10digit"].isin(valid_pins["pin10"]))
+
+    # create variable that is the numbers following the 10-digit PIN
     # (not pulling last 4 digits from the end in case there are PINs that are not 14-digits in Chicago permit data)
     df["pin_suffix"] = df["PIN* [PARID]"].astype(str).str[10:]
 
@@ -165,11 +170,11 @@
 
 
 def flag_fix_long_fields(df):
-    # will use these abbreviations to shorten applicant name field (Applicant* [USER21]) within 50 character field limit 
+    # will use these abbreviations to shorten applicant name field (Applicant* [USER21]) within 50 character field limit
     name_shortening_dict = {
-        "ASSOCIATION": "ASSN", 
+        "ASSOCIATION": "ASSN",
         "COMPANY": "CO",
-        "BUILDING": "BLDG", 
+        "BUILDING": "BLDG",
         "FOUNDATION": "FNDN",
         "ILLINOIS": "IL",
         "STREET": "ST",
@@ -183,9 +188,9 @@
         "LIMITED": "LTD",
         "PLAZA": "PLZ"
     }
-    
+
     df["Applicant* [USER21]"] = df["Applicant* [USER21]"].replace(name_shortening_dict, regex=True)
-    
+
     # these fields have the following character limits in Smartfile / iasWorld, flag if over limit
     long_fields_to_flag = [
         ("FLAG, LENGTH: Applicant Name", "Applicant* [USER21]", 50, "Applicant* [USER21] over 50 char limit by "),
@@ -193,16 +198,16 @@
         ("FLAG, LENGTH: Applicant Street Address", "Applicant Street Address* [ADDR1]", 40, "Applicant Street Address* [ADDR1] over 40 char limit by "),
         ("FLAG, LENGTH: Note1", "Notes [NOTE1]", 2000, "Notes [NOTE1] over 2000 char limit by ")
     ]
-    
+
     for flag_name, column, limit, comment in long_fields_to_flag:
         df[flag_name] = df[column].apply(lambda val: 0 if pd.isna(val) else (1 if len(str(val)) > limit else 0))
         df["FLAG COMMENTS"] += df[column].apply(lambda val: "" if pd.isna(val) else ("" if len(str(val)) < limit else comment + str(len(str(val)) - limit) + "; "))
-    
+
     # round Amount to closest dollar because smart file doesn't accept decimal amounts, then flag values above upper limit
     df["Amount* [AMOUNT]"] = pd.to_numeric(df["Amount* [AMOUNT]"], errors="coerce").round().astype("Int64")
     df["FLAG, VALUE: Amount"] = df["Amount* [AMOUNT]"].apply(lambda value: 0 if pd.isna(value) or value <= 2147483647 else 1)
     df["FLAG COMMENTS"] += df["Amount* [AMOUNT]"].apply(lambda value: "" if pd.isna(value) or value <= 2147483647 else "Amount* [AMOUNT] over value limit of 2147483647; ")
-    
+
     # also flag rows where fields are blank for manual review (for fields we're populating in smartfile template)
     empty_fields_to_flag = [
         ("FLAG, EMPTY: PIN", "PIN* [PARID]"),
@@ -223,10 +228,10 @@
     df["FLAGS, TOTAL - LENGTH/VALUE"] = df.filter(like="FLAG, LENGTH").values.sum(axis=1) + df.filter(like="FLAG, VALUE").values.sum(axis=1)
     df["FLAGS, TOTAL - EMPTY/INVALID"] = df.filter(like="FLAG, EMPTY").values.sum(axis=1) + df.filter(like="FLAG, INVALID").values.sum(axis=1)
 
-    # need a column that identifies rows with flags for field length/amount but no flags for emptiness/invalidness 
+    # need a column that identifies rows with flags for field length/amount but no flags for emptiness/invalidness
     # since these two categories will get split into separate excel workbooks
     df["MANUAL REVIEW"] = np.where((df["FLAGS, TOTAL - EMPTY/INVALID"] == 0) & (df["FLAGS, TOTAL - LENGTH/VALUE"] > 0), 1, 0)
-    
+
     # for ease of analysts viewing, edits flag columns to read "Yes" when row is flagged and blank otherwise (easier than columns of 0s and 1s)
     flag_columns = list(df.filter(like="FLAG, LENGTH").columns) + list(df.filter(like="FLAG, VALUE").columns) + list(df.filter(like="FLAG, EMPTY").columns) + list(df.filter(like="FLAG, INVALID").columns)
     df[flag_columns] = df[flag_columns].replace({0: "", 1: "Yes"})
@@ -247,14 +252,14 @@
     df_ready = df[(df["FLAGS, TOTAL - LENGTH/VALUE"] == 0) & (df["FLAGS, TOTAL - EMPTY/INVALID"] == 0)].reset_index()
     df_ready = df_ready.drop(columns=df_ready.filter(like="FLAG").columns).\
         drop(columns=["index", "Original PIN", "MANUAL REVIEW", "pin_10digit", "pin_suffix"])
-    
+
     df_review_length = df[df["MANUAL REVIEW"] == 1].reset_index()
     df_review_length = df_review_length.drop(columns=df_review_length.filter(like="FLAG, EMPTY")).\
         drop(columns=df_review_length.filter(like="FLAG, INVALID")).\
        drop(columns=["Original PIN", "FLAGS, TOTAL - EMPTY/INVALID", "index", "MANUAL REVIEW", "pin_10digit", "pin_suffix"])
-    
+
     df_review_empty_invalid = df[df["FLAGS, TOTAL - EMPTY/INVALID"] > 0].reset_index().\
-        drop(columns=["index", "MANUAL REVIEW", "pin_10digit", "pin_suffix"]) 
+        drop(columns=["index", "MANUAL REVIEW", "pin_10digit", "pin_suffix"])
 
     print("# rows ready for upload: ", len(df_ready))
     print("# rows flagged for length: ", len(df_review_length))
@@ -320,4 +325,4 @@
 
 
 file_base_name = gen_file_base_name()
-save_xlsx_files(permits_shortened, 200, file_base_name)
\ No newline at end of file
+save_xlsx_files(permits_shortened, 200, file_base_name)
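Note: with this reordering, the "Configure AWS credentials" step runs before "Extract permits", so the assumed role is already in place when permit_cleaning.py queries Athena, and the new env block passes the region and staging bucket to the script. A minimal sketch of how those variables reach PyAthena -- the region_name argument is the only part of the connect() call visible in this diff, so the s3_staging_dir keyword and the as_pandas import path (its PyAthena 3.x location) are assumptions:

    import os

    from pyathena import connect
    from pyathena.pandas.util import as_pandas  # assumed import path (PyAthena 3.x)

    conn = connect(
        # assumed kwarg, populated by the workflow's AWS_ATHENA_S3_STAGING_DIR
        s3_staging_dir=os.getenv("AWS_ATHENA_S3_STAGING_DIR"),
        region_name=os.getenv("AWS_REGION"),
    )
    cursor = conn.cursor()
    cursor.execute("SELECT CAST(pin AS varchar) AS pin FROM default.vw_pin_universe LIMIT 5")
    print(as_pandas(cursor))

The credentials themselves are not read from env vars by the script; boto3's default provider chain picks up the role assumed by aws-actions/configure-aws-credentials.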
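Note: expand_multi_pin_permits() turns one-row-per-permit data into one-row-per-(permit, PIN) data with pd.melt(). A toy demonstration of the same pattern, using made-up permits and only two of the ten PIN columns (column names mirror the script):

    import pandas as pd

    df = pd.DataFrame({
        "permit_": ["100200", "100201"],
        "pin1": ["14-21-100-001-0000", "14-21-100-003-0000"],
        "pin2": ["14-21-100-002-0000", None],
    })

    pin_columns = [col for col in df.columns if col in {"pin1", "pin2"}]
    non_pin_columns = [col for col in df.columns if col not in pin_columns]
    melted = pd.melt(df, id_vars=non_pin_columns, value_vars=pin_columns,
                     var_name="pin_type", value_name="solo_pin")
    # keep pin1 rows even when blank, drop blank pin2+ rows -- same filter as the script
    melted = melted[(melted["pin_type"] == "pin1") | melted["solo_pin"].notna()]
    # three rows: two for permit 100200, one for 100201
    print(melted.sort_values(by=["permit_", "pin_type"]))

The script spells the filter as (pin_type != "pin1") & notna() inside the OR; the two forms are equivalent because the pin1 branch short-circuits.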
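Note: XlsxWriter joins the Pipfile because the cleaned permits are written out as Excel workbooks of at most 200 rows apiece (the max_rows argument of save_xlsx_files()). The batching loop itself sits outside the hunks above, so this is only a hypothetical sketch of that split using pandas' xlsxwriter engine:

    import pandas as pd

    def write_batches(df: pd.DataFrame, max_rows: int, file_base_name: str) -> None:
        # one workbook per max_rows-sized slice, numbered from 1
        for start in range(0, len(df), max_rows):
            batch = df.iloc[start:start + max_rows]
            batch.to_excel(f"{file_base_name}_{start // max_rows + 1}.xlsx",
                           index=False, engine="xlsxwriter")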