From fc10f4cd8c06941c965f6509999c112f0fce87ee Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 4 Dec 2024 23:21:47 +0000 Subject: [PATCH 1/2] Pull class codes from Athena instead of `ccao` package in land_nbhd_rate script --- .../ccao/ccao-land-land_nbhd_rate.R | 293 +++++++++--------- 1 file changed, 149 insertions(+), 144 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R b/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R index 22a88d5ab..34f1080ca 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R @@ -1,144 +1,149 @@ -library(arrow) -library(aws.s3) -library(dplyr) -library(openxlsx) -library(purrr) -library(readr) -library(snakecase) -library(stringr) -library(tidyr) -source("utils.R") - -# This script retrieves and cleans land value spreadsheets provided by -# the Valuations department and formats them for use in Athena -AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") -input_bucket <- file.path(AWS_S3_RAW_BUCKET, "ccao", "land") -output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "ccao", "land") - -# Location of remote files -remote_file_raw_nbhd_rate_2022 <- file.path( - input_bucket, "nbhd_rate", "2022.xlsx" -) -remote_file_raw_nbhd_rate_2023 <- file.path( - input_bucket, "nbhd_rate", "2023.xlsx" -) -remote_file_raw_nbhd_rate_2024 <- file.path( - input_bucket, "nbhd_rate", "2024.xlsx" -) -remote_file_warehouse_nbhd_rate <- file.path( - output_bucket, "land_nbhd_rate" -) - - -# Temp file to download workbook -tmp_file_nbhd_rate_2022 <- tempfile(fileext = ".xlsx") -tmp_file_nbhd_rate_2023 <- tempfile(fileext = ".xlsx") -tmp_file_nbhd_rate_2024 <- tempfile(fileext = ".xlsx") - -# Grab the workbook from the raw S3 bucket -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2022, - file = tmp_file_nbhd_rate_2022 -) -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2023, - file = tmp_file_nbhd_rate_2023 -) -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2024, - file = tmp_file_nbhd_rate_2024 -) - -# List of regression classes -class <- ccao::class_dict %>% - filter(regression_class) %>% - pull(class_code) - -# Load the raw workbooks, rename and clean up columns -land_nbhd_rate_2022 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2022) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - select( - township_code = twp_number, - township_name = twp_name, - town_nbhd = twp_nbhd, - `2019` = `2019_rate`, - `2022` = `2022_rate` - ) %>% - pivot_longer( - c(`2019`, `2022`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate( - across(c(township_code:town_nbhd, year), as.character), - town_nbhd = str_remove_all(town_nbhd, "-"), - land_rate_per_sqft = parse_number(land_rate_per_sqft) - ) %>% - expand_grid(class) - -land_nbhd_rate_2023 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2023) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - select( - town_nbhd = neighborhood_id, - `2020` = `2020_2_00_class_unit_price`, - `2023` = `2023_2_00_class_unit_price` - ) %>% - mutate( - town_nbhd = gsub("\\D", "", town_nbhd), - township_code = substr(town_nbhd, 1, 2), - township_name = ccao::town_convert(township_code) - ) %>% - relocate(c(township_code, township_name)) %>% - pivot_longer( - c(`2020`, `2023`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate(across(c(township_code:town_nbhd, year), as.character)) %>% - expand_grid(class) - -land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - mutate( - town_nbhd = paste0( - township_code, str_pad(neighborhood, 3, side = "left", pad = "0") - ) - ) %>% - select( - town_nbhd, - classes, - `2021` = `2021_unit_price`, - `2024` = `2024_unit_price` - ) %>% - mutate( - town_nbhd = gsub("\\D", "", town_nbhd), - township_code = substr(town_nbhd, 1, 2), - township_name = ccao::town_convert(township_code) - ) %>% - relocate(c(township_code, township_name)) %>% - pivot_longer( - c(`2021`, `2024`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate(across(c(township_code:town_nbhd, year), as.character)) %>% - expand_grid(class) %>% - # 2024 contains bifurcated neighborhood land rates across class - filter( - !(classes == "all other regression classes" & class %in% c("210", "295")), - !(classes == "2-10s/2-95s" & !(class %in% c("210", "295"))) - ) %>% - select(-classes) - -# Write the rates to S3, partitioned by year -bind_rows( - land_nbhd_rate_2022, - land_nbhd_rate_2023, - land_nbhd_rate_2024 -) %>% - relocate(land_rate_per_sqft, .after = last_col()) %>% - group_by(year) %>% - arrow::write_dataset( - path = remote_file_warehouse_nbhd_rate, - format = "parquet", - hive_style = TRUE, - compression = "snappy" - ) +library(arrow) +library(aws.s3) +library(dplyr) +library(noctua) +library(openxlsx) +library(purrr) +library(readr) +library(snakecase) +library(stringr) +library(tidyr) +source("utils.R") + +# This script retrieves and cleans land value spreadsheets provided by +# the Valuations department and formats them for use in Athena +AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") +AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") +input_bucket <- file.path(AWS_S3_RAW_BUCKET, "ccao", "land") +output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "ccao", "land") + +AWS_ATHENA_CONN_NOCTUA <- dbConnect(noctua::athena(), rstudio_conn_tab = FALSE) + +# Location of remote files +remote_file_raw_nbhd_rate_2022 <- file.path( + input_bucket, "nbhd_rate", "2022.xlsx" +) +remote_file_raw_nbhd_rate_2023 <- file.path( + input_bucket, "nbhd_rate", "2023.xlsx" +) +remote_file_raw_nbhd_rate_2024 <- file.path( + input_bucket, "nbhd_rate", "2024.xlsx" +) +remote_file_warehouse_nbhd_rate <- file.path( + output_bucket, "land_nbhd_rate" +) + + +# Temp file to download workbook +tmp_file_nbhd_rate_2022 <- tempfile(fileext = ".xlsx") +tmp_file_nbhd_rate_2023 <- tempfile(fileext = ".xlsx") +tmp_file_nbhd_rate_2024 <- tempfile(fileext = ".xlsx") + +# Grab the workbook from the raw S3 bucket +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2022, + file = tmp_file_nbhd_rate_2022 +) +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2023, + file = tmp_file_nbhd_rate_2023 +) +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2024, + file = tmp_file_nbhd_rate_2024 +) + +# List of regression classes +class <- dbGetQuery( + AWS_ATHENA_CONN_NOCTUA, + "SELECT class_code FROM ccao.class_dict WHERE regression_class" +) %>% + pull(class_code) + +# Load the raw workbooks, rename and clean up columns +land_nbhd_rate_2022 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2022) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + select( + township_code = twp_number, + township_name = twp_name, + town_nbhd = twp_nbhd, + `2019` = `2019_rate`, + `2022` = `2022_rate` + ) %>% + pivot_longer( + c(`2019`, `2022`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate( + across(c(township_code:town_nbhd, year), as.character), + town_nbhd = str_remove_all(town_nbhd, "-"), + land_rate_per_sqft = parse_number(land_rate_per_sqft) + ) %>% + expand_grid(class) + +land_nbhd_rate_2023 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2023) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + select( + town_nbhd = neighborhood_id, + `2020` = `2020_2_00_class_unit_price`, + `2023` = `2023_2_00_class_unit_price` + ) %>% + mutate( + town_nbhd = gsub("\\D", "", town_nbhd), + township_code = substr(town_nbhd, 1, 2), + township_name = ccao::town_convert(township_code) + ) %>% + relocate(c(township_code, township_name)) %>% + pivot_longer( + c(`2020`, `2023`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate(across(c(township_code:town_nbhd, year), as.character)) %>% + expand_grid(class) + +land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + mutate( + town_nbhd = paste0( + township_code, str_pad(neighborhood, 3, side = "left", pad = "0") + ) + ) %>% + select( + town_nbhd, + classes, + `2021` = `2021_unit_price`, + `2024` = `2024_unit_price` + ) %>% + mutate( + town_nbhd = gsub("\\D", "", town_nbhd), + township_code = substr(town_nbhd, 1, 2), + township_name = ccao::town_convert(township_code) + ) %>% + relocate(c(township_code, township_name)) %>% + pivot_longer( + c(`2021`, `2024`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate(across(c(township_code:town_nbhd, year), as.character)) %>% + expand_grid(class) %>% + # 2024 contains bifurcated neighborhood land rates across class + filter( + !(classes == "all other regression classes" & class %in% c("210", "295")), + !(classes == "2-10s/2-95s" & !(class %in% c("210", "295"))) + ) %>% + select(-classes) + +# Write the rates to S3, partitioned by year +bind_rows( + land_nbhd_rate_2022, + land_nbhd_rate_2023, + land_nbhd_rate_2024 +) %>% + relocate(land_rate_per_sqft, .after = last_col()) %>% + group_by(year) %>% + arrow::write_dataset( + path = remote_file_warehouse_nbhd_rate, + format = "parquet", + hive_style = TRUE, + compression = "snappy" + ) From 303be387d1d8359f5d21ed9360132069227f8943 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Thu, 5 Dec 2024 21:51:17 +0000 Subject: [PATCH 2/2] Convert land_nbhd_rate back to DOS file format --- .../ccao/ccao-land-land_nbhd_rate.R | 298 +++++++++--------- 1 file changed, 149 insertions(+), 149 deletions(-) diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R b/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R index 34f1080ca..9aa97fa86 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/ccao/ccao-land-land_nbhd_rate.R @@ -1,149 +1,149 @@ -library(arrow) -library(aws.s3) -library(dplyr) -library(noctua) -library(openxlsx) -library(purrr) -library(readr) -library(snakecase) -library(stringr) -library(tidyr) -source("utils.R") - -# This script retrieves and cleans land value spreadsheets provided by -# the Valuations department and formats them for use in Athena -AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") -AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") -input_bucket <- file.path(AWS_S3_RAW_BUCKET, "ccao", "land") -output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "ccao", "land") - -AWS_ATHENA_CONN_NOCTUA <- dbConnect(noctua::athena(), rstudio_conn_tab = FALSE) - -# Location of remote files -remote_file_raw_nbhd_rate_2022 <- file.path( - input_bucket, "nbhd_rate", "2022.xlsx" -) -remote_file_raw_nbhd_rate_2023 <- file.path( - input_bucket, "nbhd_rate", "2023.xlsx" -) -remote_file_raw_nbhd_rate_2024 <- file.path( - input_bucket, "nbhd_rate", "2024.xlsx" -) -remote_file_warehouse_nbhd_rate <- file.path( - output_bucket, "land_nbhd_rate" -) - - -# Temp file to download workbook -tmp_file_nbhd_rate_2022 <- tempfile(fileext = ".xlsx") -tmp_file_nbhd_rate_2023 <- tempfile(fileext = ".xlsx") -tmp_file_nbhd_rate_2024 <- tempfile(fileext = ".xlsx") - -# Grab the workbook from the raw S3 bucket -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2022, - file = tmp_file_nbhd_rate_2022 -) -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2023, - file = tmp_file_nbhd_rate_2023 -) -aws.s3::save_object( - object = remote_file_raw_nbhd_rate_2024, - file = tmp_file_nbhd_rate_2024 -) - -# List of regression classes -class <- dbGetQuery( - AWS_ATHENA_CONN_NOCTUA, - "SELECT class_code FROM ccao.class_dict WHERE regression_class" -) %>% - pull(class_code) - -# Load the raw workbooks, rename and clean up columns -land_nbhd_rate_2022 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2022) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - select( - township_code = twp_number, - township_name = twp_name, - town_nbhd = twp_nbhd, - `2019` = `2019_rate`, - `2022` = `2022_rate` - ) %>% - pivot_longer( - c(`2019`, `2022`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate( - across(c(township_code:town_nbhd, year), as.character), - town_nbhd = str_remove_all(town_nbhd, "-"), - land_rate_per_sqft = parse_number(land_rate_per_sqft) - ) %>% - expand_grid(class) - -land_nbhd_rate_2023 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2023) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - select( - town_nbhd = neighborhood_id, - `2020` = `2020_2_00_class_unit_price`, - `2023` = `2023_2_00_class_unit_price` - ) %>% - mutate( - town_nbhd = gsub("\\D", "", town_nbhd), - township_code = substr(town_nbhd, 1, 2), - township_name = ccao::town_convert(township_code) - ) %>% - relocate(c(township_code, township_name)) %>% - pivot_longer( - c(`2020`, `2023`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate(across(c(township_code:town_nbhd, year), as.character)) %>% - expand_grid(class) - -land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>% - set_names(snakecase::to_snake_case(names(.))) %>% - mutate( - town_nbhd = paste0( - township_code, str_pad(neighborhood, 3, side = "left", pad = "0") - ) - ) %>% - select( - town_nbhd, - classes, - `2021` = `2021_unit_price`, - `2024` = `2024_unit_price` - ) %>% - mutate( - town_nbhd = gsub("\\D", "", town_nbhd), - township_code = substr(town_nbhd, 1, 2), - township_name = ccao::town_convert(township_code) - ) %>% - relocate(c(township_code, township_name)) %>% - pivot_longer( - c(`2021`, `2024`), - names_to = "year", values_to = "land_rate_per_sqft" - ) %>% - mutate(across(c(township_code:town_nbhd, year), as.character)) %>% - expand_grid(class) %>% - # 2024 contains bifurcated neighborhood land rates across class - filter( - !(classes == "all other regression classes" & class %in% c("210", "295")), - !(classes == "2-10s/2-95s" & !(class %in% c("210", "295"))) - ) %>% - select(-classes) - -# Write the rates to S3, partitioned by year -bind_rows( - land_nbhd_rate_2022, - land_nbhd_rate_2023, - land_nbhd_rate_2024 -) %>% - relocate(land_rate_per_sqft, .after = last_col()) %>% - group_by(year) %>% - arrow::write_dataset( - path = remote_file_warehouse_nbhd_rate, - format = "parquet", - hive_style = TRUE, - compression = "snappy" - ) +library(arrow) +library(aws.s3) +library(dplyr) +library(noctua) +library(openxlsx) +library(purrr) +library(readr) +library(snakecase) +library(stringr) +library(tidyr) +source("utils.R") + +# This script retrieves and cleans land value spreadsheets provided by +# the Valuations department and formats them for use in Athena +AWS_S3_RAW_BUCKET <- Sys.getenv("AWS_S3_RAW_BUCKET") +AWS_S3_WAREHOUSE_BUCKET <- Sys.getenv("AWS_S3_WAREHOUSE_BUCKET") +input_bucket <- file.path(AWS_S3_RAW_BUCKET, "ccao", "land") +output_bucket <- file.path(AWS_S3_WAREHOUSE_BUCKET, "ccao", "land") + +AWS_ATHENA_CONN_NOCTUA <- dbConnect(noctua::athena(), rstudio_conn_tab = FALSE) + +# Location of remote files +remote_file_raw_nbhd_rate_2022 <- file.path( + input_bucket, "nbhd_rate", "2022.xlsx" +) +remote_file_raw_nbhd_rate_2023 <- file.path( + input_bucket, "nbhd_rate", "2023.xlsx" +) +remote_file_raw_nbhd_rate_2024 <- file.path( + input_bucket, "nbhd_rate", "2024.xlsx" +) +remote_file_warehouse_nbhd_rate <- file.path( + output_bucket, "land_nbhd_rate" +) + + +# Temp file to download workbook +tmp_file_nbhd_rate_2022 <- tempfile(fileext = ".xlsx") +tmp_file_nbhd_rate_2023 <- tempfile(fileext = ".xlsx") +tmp_file_nbhd_rate_2024 <- tempfile(fileext = ".xlsx") + +# Grab the workbook from the raw S3 bucket +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2022, + file = tmp_file_nbhd_rate_2022 +) +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2023, + file = tmp_file_nbhd_rate_2023 +) +aws.s3::save_object( + object = remote_file_raw_nbhd_rate_2024, + file = tmp_file_nbhd_rate_2024 +) + +# List of regression classes +class <- dbGetQuery( + AWS_ATHENA_CONN_NOCTUA, + "SELECT class_code FROM ccao.class_dict WHERE regression_class" +) %>% + pull(class_code) + +# Load the raw workbooks, rename and clean up columns +land_nbhd_rate_2022 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2022) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + select( + township_code = twp_number, + township_name = twp_name, + town_nbhd = twp_nbhd, + `2019` = `2019_rate`, + `2022` = `2022_rate` + ) %>% + pivot_longer( + c(`2019`, `2022`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate( + across(c(township_code:town_nbhd, year), as.character), + town_nbhd = str_remove_all(town_nbhd, "-"), + land_rate_per_sqft = parse_number(land_rate_per_sqft) + ) %>% + expand_grid(class) + +land_nbhd_rate_2023 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2023) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + select( + town_nbhd = neighborhood_id, + `2020` = `2020_2_00_class_unit_price`, + `2023` = `2023_2_00_class_unit_price` + ) %>% + mutate( + town_nbhd = gsub("\\D", "", town_nbhd), + township_code = substr(town_nbhd, 1, 2), + township_name = ccao::town_convert(township_code) + ) %>% + relocate(c(township_code, township_name)) %>% + pivot_longer( + c(`2020`, `2023`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate(across(c(township_code:town_nbhd, year), as.character)) %>% + expand_grid(class) + +land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>% + set_names(snakecase::to_snake_case(names(.))) %>% + mutate( + town_nbhd = paste0( + township_code, str_pad(neighborhood, 3, side = "left", pad = "0") + ) + ) %>% + select( + town_nbhd, + classes, + `2021` = `2021_unit_price`, + `2024` = `2024_unit_price` + ) %>% + mutate( + town_nbhd = gsub("\\D", "", town_nbhd), + township_code = substr(town_nbhd, 1, 2), + township_name = ccao::town_convert(township_code) + ) %>% + relocate(c(township_code, township_name)) %>% + pivot_longer( + c(`2021`, `2024`), + names_to = "year", values_to = "land_rate_per_sqft" + ) %>% + mutate(across(c(township_code:town_nbhd, year), as.character)) %>% + expand_grid(class) %>% + # 2024 contains bifurcated neighborhood land rates across class + filter( + !(classes == "all other regression classes" & class %in% c("210", "295")), + !(classes == "2-10s/2-95s" & !(class %in% c("210", "295"))) + ) %>% + select(-classes) + +# Write the rates to S3, partitioned by year +bind_rows( + land_nbhd_rate_2022, + land_nbhd_rate_2023, + land_nbhd_rate_2024 +) %>% + relocate(land_rate_per_sqft, .after = last_col()) %>% + group_by(year) %>% + arrow::write_dataset( + path = remote_file_warehouse_nbhd_rate, + format = "parquet", + hive_style = TRUE, + compression = "snappy" + )