nestauk · crispy-wonton · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023
diff --git a/README.md b/README.md
@@ -19,24 +19,31 @@ The remainder of the charts in the response can be produced from code in the rep
 - Activate conda environment: `conda activate asf_welsh_energy_consultation`
 - Run `make inputs-pull` to pull the zipped supplementary data from S3 and put it in `/inputs/data`. There will be one folder per historical analysis
   containing the supplementary data files as listed in the `Historical analysis` section below.
+
+## Run the script
+
 - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py --local_data_dir <YOUR_LOCAL_DIR>`. You need to specify the path to the local
   directory where your local copy of the EPC data is/will be saved by replacing `<YOUR_LOCAL_DIR>` with the path to your "ASF_data" directory or equivalent.
   If you don't have a local directory for ASF core data, you can create a folder called "ASF_data" in your home directory.
   - You can specify which batch of EPC data to download and MCS data to load from S3 by passing the `--epc_batch` and `--mcs_batch` arguments, both
     default to downloading/loading the newest data from S3, respectively.
   - You can specify which supplementary data folder to use by passing the `--supp_data` argument. It defaults to using the latest supplementary data folder.
+  - You can specify which batch of gold MCS-EPC merged data to use with the `--gold_mcs_epc_batch` argument, passing the batch date in YYMMDD format.
+  - If you wish to use a different gold MCS-EPC batch from the preprocessed `hp_installed_gold_[YYMMDD].csv` file in the supplementary data folder
+    in `inputs/data`, you can download and process a new gold MCS-EPC merged dataset by setting the `--download_gold_data_from_s3` argument to `True`. Note that this download can take ~30 minutes.
+  - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py -h` for more info.
   - To recreate the full October 2023 analysis, set the `--calculate_average_installations` argument to `True`. This will calculate some additional numbers on MCS installations per year included in the October 2023 response. For other historical analyses, this argument is not required and defaults to `False`.
   - Run `python asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py -h` for more info.
 
-The script should generate the following seven plots which will be saved in your local repo in `outputs/figures`:
+## The script should generate the following ten plots which will be saved in your local repo in `outputs/figures`:
 
 - `cumulative_retrofits.html`
 - `electric_tenure.html`
-- `installations_by_gas_status.html`
-- `installations_by_rurality.html`
+- `[gold_]installations_by_gas_status.html`
+- `[gold_]installations_by_rurality.html`
 - `new_build_hp_cumulative.html`
 - `new_build_hp_proportion.html`
-- `total_cumulative_installations.html`
+- `[gold_]total_cumulative_installations.html`
 
 It should generate a further 10 plots, five in English and five in Welsh, saved in `outputs/figures/english` and `outputs/figures/welsh`, respectively:
 
@@ -79,6 +86,7 @@ Versions/batches of data used for previous analysis are listed below.
 October 2023 analysis (`/inputs/data/data_202310`):
 
 - EPC: 2023_Q2_complete (preprocessed, and preprocessed and deduplicated)
+- EPC & MCS gold merged: batch 231009
 - mcs_installations_231009.csv
 - mcs_installations_epc_full_231009.csv
 - dwellings_2021.xlsx - [Number of dwellings by housing characteristics in England and Wales 2021 (released 30 March 2023)](https://www.ons.gov.uk/peoplepopulationandcommunity/housing/datasets/numberofdwellingsbyhousingcharacteristicsinenglandandwales)

diff --git a/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py b/asf_welsh_energy_consultation/analysis/produce_plots_and_stats.py
@@ -34,9 +34,11 @@
 
 if __name__ == "__main__":
     # ======================================================
-    # MCS installations, by off-gas status
-
-    total_cumulative_installations = process_data.get_total_cumsums()
+    # Total MCS installations
+    enhanced_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs")
+    total_cumulative_installations = process_data.get_total_cumsums(
+        data=enhanced_mcs, installation_date_col="commission_date"
+    )
 
     total_cumulative_installations_chart = time_series_comparison(
         data=total_cumulative_installations,
@@ -52,7 +54,10 @@
     # MCS installations, by off-gas status
 
     installations_by_gas_status = process_data.cumsums_by_variable(
-        "off_gas", "Gas status"
+        "off_gas",
+        "Gas status",
+        data=enhanced_mcs,
+        installation_date_col="commission_date",
     )
 
     installations_by_gas_status_chart = time_series_comparison(
@@ -72,7 +77,10 @@
     # MCS installations, by rurality
 
     installations_by_rurality = process_data.cumsums_by_variable(
-        "rurality_2_label", "Rurality"
+        "rurality_2_label",
+        "Rurality",
+        data=enhanced_mcs,
+        installation_date_col="commission_date",
     )
 
     installations_by_rurality_chart = time_series_comparison(
@@ -89,6 +97,70 @@
         output_dir=output_folder,
     )
 
+    # ======================================================
+    # Total MCS and EPC installations
+    enhanced_combined = process_data.get_enhanced_combined(mcs_or_gold="gold")
+    gold_total_cumulative_installations = process_data.get_total_cumsums(
+        data=enhanced_combined, installation_date_col="HP_INSTALL_DATE"
+    )
+
+    gold_total_cumulative_installations_chart = time_series_comparison(
+        data=gold_total_cumulative_installations,
+        title="Cumulative heat pump installations over time",
+        y_var="cumsum:Q",
+        y_title="Number of heat pump installations",
+        color_var="colour:N",
+        filename="gold_total_cumulative_installations",
+        output_dir=output_folder,
+    )
+
+    # ======================================================
+    # MCS and EPC installations, by off-gas status
+
+    gold_installations_by_gas_status = process_data.cumsums_by_variable(
+        "off_gas",
+        "Gas status",
+        data=enhanced_combined,
+        installation_date_col="HP_INSTALL_DATE",
+    )
+
+    gold_installations_by_gas_status_chart = time_series_comparison(
+        data=gold_installations_by_gas_status,
+        title=[
+            "Cumulative number of heat pump installations in Welsh homes",
+            "located in off- and on-gas postcodes",
+        ],
+        y_var="Number of heat pumps:Q",
+        y_title="Number of heat pump installations",
+        color_var="Gas status:N",
+        filename="gold_installations_by_gas_status",
+        output_dir=output_folder,
+    )
+
+    # ======================================================
+    # MCS and EPC installations, by rurality
+
+    gold_installations_by_rurality = process_data.cumsums_by_variable(
+        "rurality_2_label",
+        "Rurality",
+        data=enhanced_combined,
+        installation_date_col="HP_INSTALL_DATE",
+    )
+
+    gold_installations_by_rurality_chart = time_series_comparison(
+        data=gold_installations_by_rurality,
+        title=[
+            "Cumulative number of heat pump installations",
+            "in Welsh homes located in rural vs urban postcodes",
+        ],
+        y_var="Number of heat pumps:Q",
+        y_title="Number of heat pump installations",
+        color_var="Rurality:N",
+        domain_max=installations_by_rurality.date.max(),
+        filename="gold_installations_by_rurality",
+        output_dir=output_folder,
+    )
+
     # ======================================================
     # Proportions of new builds that have heat pumps
 
@@ -148,7 +220,10 @@
 
     mcs_retrofits = process_data.get_mcs_retrofits()
     mcs_retrofit_cumsums = process_data.cumsums_by_variable(
-        "country", "wales_col", data=mcs_retrofits
+        "country",
+        "wales_col",
+        data=mcs_retrofits,
+        installation_date_col="commission_date",
     )
     # this function works without separating by category - 'wales_col' is a whole column of "Wales" (not used)
 
@@ -206,19 +281,24 @@
 
     wales_df = load_wales_df(from_csv=False)
     wales_hp = load_wales_hp(wales_df)
-    wales_mcs = process_data.get_enhanced_mcs()
+    wales_mcs = process_data.get_enhanced_combined(mcs_or_gold="mcs")
 
     # English plots
 
     # Key statistics
     intro = "Summary statistics for heat pumps in Wales\n\n"
-    total_hp = f"Number of heat pumps: {len(wales_hp)}\n"
-    total_epc = f"Number of properties in EPC: {len(wales_df)}\n"
-    hp_perc = "Estimated percentage of properties with a heat pump: \
+    total_epc_hp = f"Number of heat pumps in EPC: {len(wales_hp)}\n"
+    total_epc_properties = f"Number of properties in EPC: {len(wales_df)}\n"
+    hp_perc = "Estimated percentage of EPC properties with a heat pump: \
         {:.2%}\n\n".format(
         len(wales_hp) / len(wales_df)
     )
 
+    total_hp = f"Number of heat pumps in MCS and EPC: {len(enhanced_combined)}\n"
+    total_mcs_installations = (
+        f"Number of MCS-certified heat pump installations: {len(enhanced_mcs)}\n"
+    )
+
     tenure_value_counts = wales_hp.TENURE.value_counts(normalize=True).to_string()
 
     epc_c_or_above_and_good_walls = wales_df.loc[
@@ -262,9 +342,11 @@
         stats_txt.writelines(
             [
                 intro,
-                total_hp,
-                total_epc,
+                total_epc_hp,
+                total_epc_properties,
                 hp_perc,
+                total_hp,
+                total_mcs_installations,
                 tenure_value_counts,
                 epc_c_wall,
                 epc_c_wall_proportion,

diff --git a/asf_welsh_energy_consultation/getters/get_data.py b/asf_welsh_energy_consultation/getters/get_data.py
@@ -13,11 +13,16 @@
 
 from asf_core_data.getters.epc.data_batches import get_batch_path
 from asf_core_data.config import base_config
-from asf_core_data.getters.data_getters import download_core_data, logger
+from asf_core_data.getters.data_getters import (
+    download_core_data,
+    logger,
+    download_from_s3,
+)
 
 import pandas as pd
 import numpy as np
 import os
+import dask.dataframe as dd
 
 from argparse import ArgumentParser
 
@@ -71,6 +76,20 @@ def create_argparser():
         type=bool,
     )
 
+    parser.add_argument(
+        "--gold_mcs_epc_batch",
+        help="Specifies which gold merged EPC-MCS_installation-MCS_installer data batch to use. Only date required in YYMMDD format.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--download_gold_data_from_s3",
+        help="If set to True, downloads specified batch of gold merged EPC-MCS_installation-MCS_installer data from S3 locally. "
+        "Note that this download can take 30 minutes and not recommended if `hp_installed_gold_[YYMMDD]` already in supplementary data folder in `inputs`.",
+        default=False,
+        type=str,
+    )
+
     return parser
 
 
@@ -554,3 +573,80 @@ def load_wales_hp(wales_epc):
     wales_hp = wales_epc.loc[wales_epc.HP_INSTALLED].reset_index(drop=True)
 
     return wales_hp
+
+
+def load_mcs_epc_combined():
+    """
+    Get combined gold MCS-EPC dataset filtered for rows with heat pump installations in domestic dwellings.
+
+    The batch and the download flag are read from the command line arguments (`--gold_mcs_epc_batch` and
+    `--download_gold_data_from_s3`). Unless a download is requested, the preprocessed local file
+    `hp_installed_gold_[YYMMDD].csv` is read from `input_data_path`. Otherwise the full gold dataset is
+    downloaded from S3, reduced to heat pump rows and the columns needed here, saved as
+    `hp_installed_gold_[YYMMDD].csv` in `input_data_path` and returned.
+
+    Returns:
+        pd.DataFrame: Gold MCS-EPC dataset for domestic dwellings with heat pumps.
+    """
+    args = get_args()
+    batch = args.gold_mcs_epc_batch
+    # The flag is declared with `type=str`, so a literal "False" would be truthy if tested directly.
+    # Only explicit affirmative values trigger the (slow) download.
+    download_data = str(args.download_gold_data_from_s3).strip().lower() in {
+        "true",
+        "1",
+        "yes",
+        "y",
+    }
+
+    processed_path = os.path.join(input_data_path, f"hp_installed_gold_{batch}.csv")
+
+    if not download_data:
+        return pd.read_csv(processed_path)
+
+    s3_path = f"outputs/gold/merged_epc_mcs_installations_installers_{batch}.csv"
+
+    logger.info(f"Loading {s3_path} from S3. This will take a while.")
+
+    download_from_s3(path_to_file=s3_path, output_path=input_data_path)
+
+    ddf = dd.read_csv(
+        os.path.join(input_data_path, os.path.basename(s3_path)),
+        dtype={
+            "HP_INSTALL_DATE": "object",
+            "UPRN": "object",
+            "installation_type": "object",
+        },
+    )
+
+    # Get rows with HP installed only, data already filtered for domestic only
+    hp_installed = ddf[ddf["HP_INSTALLED"] == True]
+    hp_installed = hp_installed[
+        [
+            "POSTCODE",
+            "INSPECTION_DATE",
+            "COUNTRY",
+            "UPRN",
+            "HP_INSTALLED",
+            "HP_TYPE",
+            "HP_INSTALL_DATE",
+            "MCS_AVAILABLE",
+            "EPC_AVAILABLE",
+        ]
+    ]
+
+    hp_installed = hp_installed.rename(columns={"POSTCODE": "postcode"})
+
+    # Convert to pandas df
+    df = hp_installed.compute()
+
+    df["HP_INSTALL_DATE"] = pd.to_datetime(df["HP_INSTALL_DATE"])
+
+    # Batch 231009 contains data from MCS up to 30 June 2023 and data from EPC up to 31 July 2023
+    # Must remove additional month of EPC data for consistency
+    if batch == "231009":
+        df = df[df["HP_INSTALL_DATE"] < "2023-07-01"]
+
+    df.to_csv(processed_path, index=False)
+
+    return df