diff --git a/README.md b/README.md index 355a9a8..2e3bd40 100644 --- a/README.md +++ b/README.md @@ -19,24 +19,31 @@ The remainder of the charts in the response can be produced from code in the rep - Activate conda environment: `conda activate asf_welsh_energy_consultation` - Run `make inputs-pull` to pull the zipped supplementary data from S3 and put it in `/inputs/data`. There will be one folder per historical analysis containing the supplementary data files as listed in the `Historical analysis` section below. + +## Run the script + - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py --local_data_dir `. You need to specify the path to the local directory where your local copy of the EPC data is/will be saved by replacing `` with the path to your "ASF_data" directory or equivalent. If you don't have a local directory for ASF core data, you can create a folder called "ASF_data" in your home directory. - You can specify which batch of EPC data to download and MCS data to load from S3 by passing the `--epc_batch` and `--mcs_batch` arguments, both default to downloading/loading the newest data from S3, respectively. - You can specify which supplementary data folder to use by passing the `--supp_data` argument. It defaults to using the latest supplementary data folder. + - You can specify which batch of gold MCS-EPC merged data to use with the `--gold_mcs_epc_batch` argument. Pass the batch in YYMMDD format. + - If you wish to download and process a new gold MCS-EPC batch (i.e. a different batch from the preprocessed `hp_installed_gold_[YYMMDD].csv` file in the supplementary data folder + in `inputs/data`), you can do so by setting the `--download_gold_data_from_s3` argument to `True`. Note that this download can take ~30 minutes. + - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py -h` for more info. 
- To recreate the full October 2023 analysis, set the `--calculate_average_installations` argument to `True`. This will calculate some additional numbers on MCS installations per year included in the October 2023 response. For other historical analyses, this argument is not required and defaults to `False`. - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py -h` for more info. -The script should generate the following seven plots which will be saved in your local repo in `outputs/figures`: +## The script should generate the following ten plots which will be saved in your local repo in `outputs/figures`: - `cumulative_retrofits.html` - `electric_tenure.html` -- `installations_by_gas_status.html` -- `installations_by_rurality.html` +- `[gold_]installations_by_gas_status.html` +- `[gold_]installations_by_rurality.html` - `new_build_hp_cumulative.html` - `new_build_hp_proportion.html` -- `total_cumulative_installations.html` +- `[gold_]total_cumulative_installations.html` It should generate a further 10 plots, five in English and five in Welsh, saved in `outputs/figures/english` and `outputs/figures/welsh`, respectively: @@ -79,6 +86,7 @@ Versions/batches of data used for previous analysis are listed below. 
October 2023 analysis (`/inputs/data/data_202310`): - EPC: 2023_Q2_complete (preprocessed, and preprocessed and deduplicated) +- EPC & MCS gold merged: batch 231009 - mcs_installations_231009.csv - mcs_installations_epc_full_231009.csv - dwellings_2021.xlsx - [Number of dwellings by housing characteristics in England and Wales 2021 (released 30 March 2023)](https://www.ons.gov.uk/peoplepopulationandcommunity/housing/datasets/numberofdwellingsbyhousingcharacteristicsinenglandandwales) diff --git a/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py b/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py index 7057df9..734e17f 100644 --- a/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py +++ b/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py @@ -34,9 +34,11 @@ if __name__ == "__main__": # ====================================================== - # MCS installations, by off-gas status - - total_cumulative_installations = process_data.get_total_cumsums() + # Total MCS installations + enhanced_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs") + total_cumulative_installations = process_data.get_total_cumsums( + data=enhanced_mcs, installation_date_col="commission_date" + ) total_cumulative_installations_chart = time_series_comparison( data=total_cumulative_installations, @@ -52,7 +54,10 @@ # MCS installations, by off-gas status installations_by_gas_status = process_data.cumsums_by_variable( - "off_gas", "Gas status" + "off_gas", + "Gas status", + data=enhanced_mcs, + installation_date_col="commission_date", ) installations_by_gas_status_chart = time_series_comparison( @@ -72,7 +77,10 @@ # MCS installations, by rurality installations_by_rurality = process_data.cumsums_by_variable( - "rurality_2_label", "Rurality" + "rurality_2_label", + "Rurality", + data=enhanced_mcs, + installation_date_col="commission_date", ) installations_by_rurality_chart = time_series_comparison( @@ -89,6 +97,70 @@ 
output_dir=output_folder, ) + # ====================================================== + # Total MCS and EPC installations + enhanced_combined = process_data.get_enhanced_combined(mcs_or_gold="gold") + gold_total_cumulative_installations = process_data.get_total_cumsums( + data=enhanced_combined, installation_date_col="HP_INSTALL_DATE" + ) + + gold_total_cumulative_installations_chart = time_series_comparison( + data=gold_total_cumulative_installations, + title="Cumulative heat pump installations over time", + y_var="cumsum:Q", + y_title="Number of heat pump installations", + color_var="colour:N", + filename="gold_total_cumulative_installations", + output_dir=output_folder, + ) + + # ====================================================== + # MCS and EPC installations, by off-gas status + + gold_installations_by_gas_status = process_data.cumsums_by_variable( + "off_gas", + "Gas status", + data=enhanced_combined, + installation_date_col="HP_INSTALL_DATE", + ) + + gold_installations_by_gas_status_chart = time_series_comparison( + data=gold_installations_by_gas_status, + title=[ + "Cumulative number of heat pump installations in Welsh homes", + "located in off- and on-gas postcodes", + ], + y_var="Number of heat pumps:Q", + y_title="Number of heat pump installations", + color_var="Gas status:N", + filename="gold_installations_by_gas_status", + output_dir=output_folder, + ) + + # ====================================================== + # MCS and EPC installations, by rurality + + gold_installations_by_rurality = process_data.cumsums_by_variable( + "rurality_2_label", + "Rurality", + data=enhanced_combined, + installation_date_col="HP_INSTALL_DATE", + ) + + gold_installations_by_rurality_chart = time_series_comparison( + data=gold_installations_by_rurality, + title=[ + "Cumulative number of heat pump installations", + "in Welsh homes located in rural vs urban postcodes", + ], + y_var="Number of heat pumps:Q", + y_title="Number of heat pump installations", + 
color_var="Rurality:N", + domain_max=installations_by_rurality.date.max(), + filename="gold_installations_by_rurality", + output_dir=output_folder, + ) + # ====================================================== # Proportions of new builds that have heat pumps @@ -148,7 +220,10 @@ mcs_retrofits = process_data.get_mcs_retrofits() mcs_retrofit_cumsums = process_data.cumsums_by_variable( - "country", "wales_col", data=mcs_retrofits + "country", + "wales_col", + data=mcs_retrofits, + installation_date_col="commission_date", ) # this function works without separating by category - 'wales_col' is a whole column of "Wales" (not used) @@ -206,19 +281,24 @@ wales_df = load_wales_df(from_csv=False) wales_hp = load_wales_hp(wales_df) - wales_mcs = process_data.get_enhanced_mcs() + wales_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs") # English plots # Key statistics intro = "Summary statistics for heat pumps in Wales\n\n" - total_hp = f"Number of heat pumps: {len(wales_hp)}\n" - total_epc = f"Number of properties in EPC: {len(wales_df)}\n" - hp_perc = "Estimated percentage of properties with a heat pump: \ + total_epc_hp = f"Number of heat pumps in EPC: {len(wales_hp)}\n" + total_epc_properties = f"Number of properties in EPC: {len(wales_df)}\n" + hp_perc = "Estimated percentage of EPC properties with a heat pump: \ {:.2%}\n\n".format( len(wales_hp) / len(wales_df) ) + total_hp = f"Number of heat pumps in MCS and EPC: {len(enhanced_combined)}\n" + total_mcs_installations = ( + f"Number of MCS-certified heat pump installations: {len(enhanced_mcs)}\n" + ) + tenure_value_counts = wales_hp.TENURE.value_counts(normalize=True).to_string() epc_c_or_above_and_good_walls = wales_df.loc[ @@ -262,9 +342,11 @@ stats_txt.writelines( [ intro, - total_hp, - total_epc, + total_epc_hp, + total_epc_properties, hp_perc, + total_hp, + total_mcs_installations, tenure_value_counts, epc_c_wall, epc_c_wall_proportion, diff --git a/asf_welsh_energy_consultation/getters/get_data.py 
b/asf_welsh_energy_consultation/getters/get_data.py index 5f294f0..2f37474 100644 --- a/asf_welsh_energy_consultation/getters/get_data.py +++ b/asf_welsh_energy_consultation/getters/get_data.py @@ -13,11 +13,16 @@ from asf_core_data.getters.epc.data_batches import get_batch_path from asf_core_data.config import base_config -from asf_core_data.getters.data_getters import download_core_data, logger +from asf_core_data.getters.data_getters import ( + download_core_data, + logger, + download_from_s3, +) import pandas as pd import numpy as np import os +import dask.dataframe as dd from argparse import ArgumentParser @@ -71,6 +76,20 @@ def create_argparser(): type=bool, ) + parser.add_argument( + "--gold_mcs_epc_batch", + help="Specifies which gold merged EPC-MCS_installation-MCS_installer data batch to use. Only date required in YYMMDD format.", + type=str, + ) + + parser.add_argument( + "--download_gold_data_from_s3", + help="If set to True, downloads specified batch of gold merged EPC-MCS_installation-MCS_installer data from S3 locally. " + "Note that this download can take 30 minutes and not recommended if `hp_installed_gold_[YYMMDD]` already in supplementary data folder in `inputs`.", + default=False, + type=str, + ) + return parser @@ -554,3 +573,73 @@ def load_wales_hp(wales_epc): wales_hp = wales_epc.loc[wales_epc.HP_INSTALLED].reset_index(drop=True) return wales_hp + + +def load_mcs_epc_combined(): + """ + Get combined gold MCS-EPC dataset filtered for rows with heat pump installations in domestic dwellings. Use local preprocessed dataset unless specified + to download data from S3. Downloaded data goes through pre-processing to produce desired pd.DataFrame. + + Returns: + pd.DataFrame: Gold MCS-EPC dataset for domestic dwellings with heat pumps. 
+ """ + args = get_args() + batch = args.gold_mcs_epc_batch + download_data = args.download_gold_data_from_s3 + + if not download_data: + path = os.path.join(input_data_path, f"hp_installed_gold_{batch}.csv") + return pd.read_csv(path) + + else: + path = f"outputs/gold/merged_epc_mcs_installations_installers_{batch}.csv" + + logger.info(f"Loading {path} from S3. This will take a while.") + + download_from_s3(path_to_file=path, output_path=input_data_path) + + ddf = dd.read_csv( + os.path.join( + input_data_path, + f"merged_epc_mcs_installations_installers_{batch}.csv", + ), + dtype={ + "HP_INSTALL_DATE": "object", + "UPRN": "object", + "installation_type": "object", + }, + ) + + # Get rows with HP installed only, data already filtered for domestic only + hp_installed = ddf[ddf["HP_INSTALLED"] == True] + hp_installed = hp_installed[ + [ + "POSTCODE", + "INSPECTION_DATE", + "COUNTRY", + "UPRN", + "HP_INSTALLED", + "HP_TYPE", + "HP_INSTALL_DATE", + "MCS_AVAILABLE", + "EPC_AVAILABLE", + ] + ] + + hp_installed = hp_installed.rename(columns={"POSTCODE": "postcode"}) + + # Convert to pandas df + df = hp_installed.compute() + + df["HP_INSTALL_DATE"] = pd.to_datetime(df["HP_INSTALL_DATE"]) + + # Batch 231009 contains data from MCS up to 30 June 2023 and data from EPC up to 31 July 2023 + # Must remove additional month of EPC data for consistency + if batch == "231009": + df = df[df["HP_INSTALL_DATE"] < "2023-07-01"] + + df.to_csv( + os.path.join(input_data_path, f"hp_installed_gold_{batch}.csv"), index=False + ) + + return df diff --git a/asf_welsh_energy_consultation/pipeline/process_data.py b/asf_welsh_energy_consultation/pipeline/process_data.py index c7aa243..783454a 100644 --- a/asf_welsh_energy_consultation/pipeline/process_data.py +++ b/asf_welsh_energy_consultation/pipeline/process_data.py @@ -13,42 +13,48 @@ # PROCESSING MCS -def get_enhanced_mcs(): - """Get dataset of domestic MCS installations with attached off-gas, country and rurality fields. 
- +def get_enhanced_combined(mcs_or_gold="mcs"): + """Get dataset of either MCS installations or gold merged EPC and MCS installations data with attached off-gas, country and rurality fields for Wales only. + Args: + mcs_or_gold, str: Specifies use of MCS installations data or MCS-EPC gold merged data for creating combined dataset. Defaults to "mcs". Returns: pd.DataFrame: Dataset as described above. """ - mcs = get_data.get_mcs_domestic() + if mcs_or_gold == "gold": + df = get_data.load_mcs_epc_combined() + df_name = "gold merged EPC-MCS installation" + else: + df = get_data.get_mcs_domestic() + df_name = "MCS installation" og = get_data.get_offgas() countries = get_data.get_countries() rural = get_data.get_rurality_by_oa() # join with off-gas data - mcs = mcs.merge(og, on="postcode", how="left") - mcs["off_gas"] = mcs["off_gas"].fillna("On gas").replace({True: "Off gas"}) + df = df.merge(og, on="postcode", how="left") + df["off_gas"] = df["off_gas"].fillna("On gas").replace({True: "Off gas"}) # join with regions in order to filter to Wales - mcs = mcs.merge(countries, on="postcode", how="left") - if mcs.country.isna().sum() > 0: + df = df.merge(countries, on="postcode", how="left") + if df.country.isna().sum() > 0: logger.warning( - f"{mcs.country.isna().sum()} MCS installation records have no country match. " + f"{df.country.isna().sum()} {df_name} records have no country match. " f"Potential loss of data when filtering for Wales." 
) - mcs = mcs.loc[mcs["country"] == "Wales"].reset_index(drop=True) + df = df.loc[df["country"] == "Wales"].reset_index(drop=True) # There will be records with no match # Some will be new postcodes (new build developments) # and some may be expired postcodes # join with rurality data - mcs = mcs.merge(rural, on="postcode", how="left") - if mcs.rurality_10_code.isna().sum() > 0: + df = df.merge(rural, on="postcode", how="left") + if df.rurality_10_code.isna().sum() > 0: logger.warning( - f"Loss of data when using rurality variable: {mcs.rurality_10_code.isna().sum()} Welsh MCS installation records have no rurality code match." + f"Loss of data when using rurality variable: {df.rurality_10_code.isna().sum()} Welsh {df_name} records have no rurality code match." ) # add custom rurality column (rurality "type 7": all different types of urban mapped to Urban) - mcs["rurality_7"] = mcs["rurality_10_label"].replace( + df["rurality_7"] = df["rurality_10_label"].replace( { "Urban city and town": "Urban", "Urban major conurbation": "Urban", @@ -57,40 +63,41 @@ def get_enhanced_mcs(): } ) - return mcs - - -# load enhanced MCS as part of this script, so only needs to be done once -enhanced_mcs = get_enhanced_mcs() + return df -def get_total_cumsums(): +def get_total_cumsums(data, installation_date_col): """ - Gets cumulative number of MCS-certified HP installations for Wales. + Gets cumulative number of HP installations for Wales. + + Args: + data pd.Dataframe: Dataframe of HP installations in Wales. + installation_date_col str: Name of column containing HP installation date. Returns: - pd.Dataframe containing cumulative MCS installations for Wales over time. + pd.Dataframe containing cumulative number of HP installations for Wales over time. 
""" - mcs = get_enhanced_mcs() - mcs["n"] = 1 - cumulative_total = mcs.groupby("commission_date")["n"].sum().reset_index() + data["n"] = 1 + cumulative_total = data.groupby(installation_date_col)["n"].sum().reset_index() # Sort by date ascending - cumulative_total = cumulative_total.sort_values("commission_date") + cumulative_total = cumulative_total.sort_values(installation_date_col) # Get cumulative total cumulative_total["cumsum"] = cumulative_total.n.cumsum() cumulative_total = cumulative_total.loc[ - cumulative_total.commission_date >= "2015-01-01" + cumulative_total[installation_date_col] >= "2015-01-01" ].reset_index(drop=True) - cumulative_total = cumulative_total.rename(columns={"commission_date": "date"}) + cumulative_total = cumulative_total.rename(columns={installation_date_col: "date"}) cumulative_total["colour"] = 1 # add single colour category for plotting return cumulative_total -def cumsums_by_variable(variable, new_var_name, data=enhanced_mcs): +def cumsums_by_variable( + variable, new_var_name, data, installation_date_col="HP_INSTALL_DATE" +): """Process data into a form giving the cumulative total of installations on each date for each category of a variable. 
@@ -104,15 +111,15 @@ def cumsums_by_variable(variable, new_var_name, data=enhanced_mcs): """ # calculate total number of installations for each date/category pair - totals = data.groupby(["commission_date", variable]).size() + totals = data.groupby([installation_date_col, variable]).size() totals = totals.reset_index().rename(columns={0: "sum"}) idx = pd.date_range( - totals["commission_date"].min(), totals["commission_date"].max() + totals[installation_date_col].min(), totals[installation_date_col].max() ) - totals = totals.pivot(index="commission_date", columns=variable).fillna(0) + totals = totals.pivot(index=installation_date_col, columns=variable).fillna(0) totals.index = pd.DatetimeIndex(totals.index) @@ -373,7 +380,7 @@ def get_mcs_retrofits(): # this makes sense because if they had been built with a HP we would expect them to appear in EPC # due to new build EPC requirements - enhanced_mcs = get_enhanced_mcs() + enhanced_mcs = get_enhanced_combined(mcs_or_gold="mcs") enhanced_mcs = add_unique_mcs_id(enhanced_mcs) mcs_retrofits = enhanced_mcs.loc[ ~enhanced_mcs["unique_id"].isin(hp_when_built_indices) @@ -448,7 +455,7 @@ def get_installations_per_year(): pandas.DataFrame of MCS installations per year in Wales. """ - mcs = get_enhanced_mcs() + mcs = get_enhanced_combined(mcs_or_gold="mcs") mcs["n"] = 1 mcs["year"] = pd.to_datetime(mcs["commission_date"]).dt.year installations_by_year = mcs.groupby("year")["n"].sum().reset_index() diff --git a/requirements.txt b/requirements.txt index 28a0ff2..c681af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ altair==4.2.2 -numpy==1.23.4 -pandas==1.5.1 altair_viewer==0.4.0 altair_saver==0.5.0 -matplotlib -odfpy -selenium==4.2.0 argparse==1.4.0 +dask==2023.5.0 +matplotlib==3.7.3 +numpy==1.23.4 +odfpy +pandas==1.5.1 s3fs>=2023.3.0 +selenium==4.2.0 asf_core_data@ git+ssh://git@github.com/nestauk/asf_core_data.git nesta_ds_utils@ git+ssh://git@github.com/nestauk/nesta_ds_utils.git