-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
13 add gold mcs epc data #17
base: dev
Are you sure you want to change the base?
Changes from all commits
cfe2149
aec3c3e
a7be30d
60b59bc
a56f765
cf57335
a37451a
43fd319
cd6d1eb
ba2dc7a
ecebb72
3c5d597
8768231
465e997
31cf6d1
bfe9ea3
3a765f0
fd5d6e6
7518c22
1f29a92
97899cd
b2d9361
f79bd33
84694fa
71af7e1
71b1ebb
d935d63
5c30dfd
7264776
1636960
2792329
b80d5da
4bff00c
ae42bc6
5273ab7
0855652
cee0675
be738e3
95493ad
37bd4d3
53ecce2
9b1b6e4
4b7ce75
0cfe535
6e8ed9b
430d848
6437ad1
a2db691
4352af1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,9 +34,11 @@ | |
|
||
if __name__ == "__main__": | ||
# ====================================================== | ||
# MCS installations, by off-gas status | ||
|
||
total_cumulative_installations = process_data.get_total_cumsums() | ||
# Total MCS installations | ||
enhanced_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs") | ||
total_cumulative_installations = process_data.get_total_cumsums( | ||
data=enhanced_mcs, installation_date_col="commission_date" | ||
) | ||
|
||
total_cumulative_installations_chart = time_series_comparison( | ||
data=total_cumulative_installations, | ||
|
@@ -52,7 +54,10 @@ | |
# MCS installations, by off-gas status | ||
|
||
installations_by_gas_status = process_data.cumsums_by_variable( | ||
"off_gas", "Gas status" | ||
"off_gas", | ||
"Gas status", | ||
data=enhanced_mcs, | ||
installation_date_col="commission_date", | ||
) | ||
|
||
installations_by_gas_status_chart = time_series_comparison( | ||
|
@@ -72,7 +77,10 @@ | |
# MCS installations, by rurality | ||
|
||
installations_by_rurality = process_data.cumsums_by_variable( | ||
"rurality_2_label", "Rurality" | ||
"rurality_2_label", | ||
"Rurality", | ||
data=enhanced_mcs, | ||
installation_date_col="commission_date", | ||
) | ||
|
||
installations_by_rurality_chart = time_series_comparison( | ||
|
@@ -89,6 +97,70 @@ | |
output_dir=output_folder, | ||
) | ||
|
||
# ====================================================== | ||
# Total MCS and EPC installations | ||
enhanced_combined = process_data.get_enhanced_combined(mcs_or_gold="gold") | ||
gold_total_cumulative_installations = process_data.get_total_cumsums( | ||
data=enhanced_combined, installation_date_col="HP_INSTALL_DATE" | ||
) | ||
|
||
gold_total_cumulative_installations_chart = time_series_comparison( | ||
data=gold_total_cumulative_installations, | ||
title="Cumulative heat pump installations over time", | ||
y_var="cumsum:Q", | ||
y_title="Number of heat pump installations", | ||
color_var="colour:N", | ||
filename="gold_total_cumulative_installations", | ||
output_dir=output_folder, | ||
) | ||
|
||
# ====================================================== | ||
# MCS and EPC installations, by off-gas status | ||
|
||
gold_installations_by_gas_status = process_data.cumsums_by_variable( | ||
"off_gas", | ||
"Gas status", | ||
data=enhanced_combined, | ||
installation_date_col="HP_INSTALL_DATE", | ||
) | ||
|
||
gold_installations_by_gas_status_chart = time_series_comparison( | ||
data=gold_installations_by_gas_status, | ||
title=[ | ||
"Cumulative number of heat pump installations in Welsh homes", | ||
"located in off- and on-gas postcodes", | ||
], | ||
y_var="Number of heat pumps:Q", | ||
y_title="Number of heat pump installations", | ||
color_var="Gas status:N", | ||
filename="gold_installations_by_gas_status", | ||
output_dir=output_folder, | ||
) | ||
|
||
# ====================================================== | ||
# MCS and EPC installations, by rurality | ||
|
||
gold_installations_by_rurality = process_data.cumsums_by_variable( | ||
"rurality_2_label", | ||
"Rurality", | ||
data=enhanced_combined, | ||
installation_date_col="HP_INSTALL_DATE", | ||
) | ||
|
||
gold_installations_by_rurality_chart = time_series_comparison( | ||
data=gold_installations_by_rurality, | ||
title=[ | ||
"Cumulative number of heat pump installations", | ||
"in Welsh homes located in rural vs urban postcodes", | ||
], | ||
y_var="Number of heat pumps:Q", | ||
y_title="Number of heat pump installations", | ||
color_var="Rurality:N", | ||
domain_max=installations_by_rurality.date.max(), | ||
filename="gold_installations_by_rurality", | ||
output_dir=output_folder, | ||
) | ||
|
||
# ====================================================== | ||
# Proportions of new builds that have heat pumps | ||
|
||
|
@@ -148,7 +220,10 @@ | |
|
||
mcs_retrofits = process_data.get_mcs_retrofits() | ||
mcs_retrofit_cumsums = process_data.cumsums_by_variable( | ||
"country", "wales_col", data=mcs_retrofits | ||
"country", | ||
"wales_col", | ||
data=mcs_retrofits, | ||
installation_date_col="commission_date", | ||
) | ||
# this function works without separating by category - 'wales_col' is a whole column of "Wales" (not used) | ||
|
||
|
@@ -206,19 +281,24 @@ | |
|
||
wales_df = load_wales_df(from_csv=False) | ||
wales_hp = load_wales_hp(wales_df) | ||
wales_mcs = process_data.get_enhanced_mcs() | ||
wales_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs") | ||
|
||
# English plots | ||
|
||
# Key statistics | ||
intro = "Summary statistics for heat pumps in Wales\n\n" | ||
total_hp = f"Number of heat pumps: {len(wales_hp)}\n" | ||
total_epc = f"Number of properties in EPC: {len(wales_df)}\n" | ||
hp_perc = "Estimated percentage of properties with a heat pump: \ | ||
total_epc_hp = f"Number of heat pumps in EPC: {len(wales_hp)}\n" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Question: is this epc or epc + mcs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: EPC only |
||
total_epc_properties = f"Number of properties in EPC: {len(wales_df)}\n" | ||
hp_perc = "Estimated percentage of EPC properties with a heat pump: \ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Question: again just epc or epc + mcs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: Just EPC |
||
{:.2%}\n\n".format( | ||
len(wales_hp) / len(wales_df) | ||
) | ||
|
||
total_hp = f"Number of heat pumps in MCS and EPC: {len(enhanced_combined)}\n" | ||
total_mcs_installations = ( | ||
f"Number of MCS-certified heat pump installations: {len(enhanced_mcs)}\n" | ||
) | ||
|
||
tenure_value_counts = wales_hp.TENURE.value_counts(normalize=True).to_string() | ||
|
||
epc_c_or_above_and_good_walls = wales_df.loc[ | ||
|
@@ -262,9 +342,11 @@ | |
stats_txt.writelines( | ||
[ | ||
intro, | ||
total_hp, | ||
total_epc, | ||
total_epc_hp, | ||
total_epc_properties, | ||
hp_perc, | ||
total_hp, | ||
total_mcs_installations, | ||
tenure_value_counts, | ||
epc_c_wall, | ||
epc_c_wall_proportion, | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -13,11 +13,16 @@ | |||||||||||
|
||||||||||||
from asf_core_data.getters.epc.data_batches import get_batch_path | ||||||||||||
from asf_core_data.config import base_config | ||||||||||||
from asf_core_data.getters.data_getters import download_core_data, logger | ||||||||||||
from asf_core_data.getters.data_getters import ( | ||||||||||||
download_core_data, | ||||||||||||
logger, | ||||||||||||
download_from_s3, | ||||||||||||
) | ||||||||||||
|
||||||||||||
import pandas as pd | ||||||||||||
import numpy as np | ||||||||||||
import os | ||||||||||||
import dask.dataframe as dd | ||||||||||||
crispy-wonton marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
|
||||||||||||
from argparse import ArgumentParser | ||||||||||||
|
||||||||||||
|
@@ -71,6 +76,20 @@ def create_argparser(): | |||||||||||
type=bool, | ||||||||||||
) | ||||||||||||
|
||||||||||||
parser.add_argument( | ||||||||||||
"--gold_mcs_epc_batch", | ||||||||||||
help="Specifies which gold merged EPC-MCS_installation-MCS_installer data batch to use. Only date required in YYMMDD format.", | ||||||||||||
type=str, | ||||||||||||
) | ||||||||||||
|
||||||||||||
parser.add_argument( | ||||||||||||
"--download_gold_data_from_s3", | ||||||||||||
help="If set to True, downloads specified batch of gold merged EPC-MCS_installation-MCS_installer data from S3 locally. " | ||||||||||||
"Note that this download can take 30 minutes and not recommended if `hp_installed_gold_[YYMMDD]` already in supplementary data folder in `inputs`.", | ||||||||||||
default=False, | ||||||||||||
type=str, | ||||||||||||
) | ||||||||||||
|
||||||||||||
crispy-wonton marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
return parser | ||||||||||||
|
||||||||||||
|
||||||||||||
|
@@ -554,3 +573,73 @@ def load_wales_hp(wales_epc): | |||||||||||
wales_hp = wales_epc.loc[wales_epc.HP_INSTALLED].reset_index(drop=True) | ||||||||||||
|
||||||||||||
return wales_hp | ||||||||||||
|
||||||||||||
|
||||||||||||
def _flag_is_set(value):
    """Interpret a command-line flag value (possibly a string) as a boolean."""
    if isinstance(value, str):
        # Any non-empty string is truthy in Python, so "False" must be
        # parsed explicitly rather than passed to bool().
        return value.strip().lower() in {"true", "t", "yes", "y", "1"}
    return bool(value)


def load_mcs_epc_combined():
    """
    Get combined gold MCS-EPC dataset filtered for rows with heat pump installations.

    Uses the locally cached, preprocessed CSV (`hp_installed_gold_[batch].csv` in
    `input_data_path`) unless the `--download_gold_data_from_s3` argument is set, in
    which case the full gold batch is downloaded from S3, filtered, reduced to the
    columns needed, saved to the local cache and returned.

    Both branches return `HP_INSTALL_DATE` parsed as datetimes.

    Returns:
        pd.DataFrame: Gold MCS-EPC dataset for domestic dwellings with heat pumps.

    Raises:
        ValueError: If no batch was given via `--gold_mcs_epc_batch`.
    """
    args = get_args()
    batch = args.gold_mcs_epc_batch
    # The flag is declared with `type=str`, so it may arrive as the string
    # "False"; parse it instead of relying on truthiness.
    download_data = _flag_is_set(args.download_gold_data_from_s3)

    if not batch:
        raise ValueError(
            "No gold MCS-EPC batch specified; pass --gold_mcs_epc_batch in YYMMDD format."
        )

    cached_path = os.path.join(input_data_path, f"hp_installed_gold_{batch}.csv")

    if not download_data:
        # Parse dates so the cached result matches what the download branch returns.
        return pd.read_csv(cached_path, parse_dates=["HP_INSTALL_DATE"])

    raw_filename = f"merged_epc_mcs_installations_installers_{batch}.csv"
    path = f"outputs/gold/{raw_filename}"

    logger.info(f"Loading {path} from S3. This will take a while.")

    download_from_s3(path_to_file=path, output_path=input_data_path)

    # NOTE(review): a reviewer asked why these columns are forced to object.
    # Presumably they hold mixed/missing values that break dask's sampled
    # dtype inference - confirm and document the reason.
    ddf = dd.read_csv(
        os.path.join(input_data_path, raw_filename),
        dtype={
            "HP_INSTALL_DATE": "object",
            "UPRN": "object",
            "installation_type": "object",
        },
    )

    # Get rows with HP installed only, data already filtered for domestic only
    hp_installed = ddf[ddf["HP_INSTALLED"] == True]  # noqa: E712
    hp_installed = hp_installed[
        [
            "POSTCODE",
            "INSPECTION_DATE",
            "COUNTRY",
            "UPRN",
            "HP_INSTALLED",
            "HP_TYPE",
            "HP_INSTALL_DATE",
            "MCS_AVAILABLE",
            "EPC_AVAILABLE",
        ]
    ]

    hp_installed = hp_installed.rename(columns={"POSTCODE": "postcode"})

    # Convert to pandas df
    df = hp_installed.compute()

    df["HP_INSTALL_DATE"] = pd.to_datetime(df["HP_INSTALL_DATE"])

    # Batch 231009 contains data from MCS up to 30 June 2023 and data from EPC up to 31 July 2023
    # Must remove additional month of EPC data for consistency
    # NOTE(review): a reviewer suggested deriving the cut-off as the minimum of
    # the latest EPC and latest MCS dates to avoid hard-coding; the author
    # replied that no separate 'commission_date' column exists in this dataset
    # (all dates appear to be combined into HP_INSTALL_DATE), so the cut-off
    # stays batch-specific for now.
    if batch == "231009":
        df = df[df["HP_INSTALL_DATE"] < "2023-07-01"]

    df.to_csv(cached_path, index=False)

    return df
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From here are the new graphs that use MCS -EPC gold