eia capacity

OSeMOSYS · Oct 14, 2024 · 6061119 · 6061119
1 parent 3145012
commit 6061119
Show file tree

Hide file tree

Showing 5 changed files with 160 additions and 169 deletions.
diff --git a/workflow/scripts/osemosys_global/validation/eia.py b/workflow/scripts/osemosys_global/validation/eia.py
@@ -43,6 +43,38 @@
     "Other gases fossil fuel": "OTH",
 }
 
+OG_GEN_NAME_MAPPER = {
+    "BIO": "BIO",
+    "CCG": "GAS",
+    "COA": "COA",
+    "CSP": "SPV",
+    "HYD": "HYD",
+    "OCG": "GAS",
+    "OIL": "OIL",
+    "SPV": "SPV",
+    "TRN": None,
+    "URN": "URN",
+    "WON": "WND",
+    "WOF": "WND",
+    "WAV": "WAV",
+}
+
+OG_CAP_NAME_MAPPER = {
+    "BIO": "BIO",
+    "CCG": "FFS",
+    "COA": "FFS",
+    "CSP": "SPV",
+    "HYD": "HYD",
+    "OCG": "FFS",
+    "OIL": "FFS",
+    "SPV": "SPV",
+    "TRN": None,
+    "URN": "URN",
+    "WON": "WND",
+    "WOF": "WND",
+    "WAV": "WAV",
+}
+
 ###
 # public functions
 ###
@@ -58,40 +90,6 @@ def get_eia_generation(json_file: str, **kwargs) -> pd.DataFrame:
     return _format_eia_generation_data(df)
 
 
-def format_og_generation(prod_tech_annual: pd.DataFrame) -> pd.DataFrame:
-    """Formats ProductionByTechnologyAnnual data for eia comparison"""
-
-    name_mapper = {
-        "BIO": "BIO",
-        "CCG": "GAS",
-        "COA": "COA",
-        "CSP": "SPV",
-        "HYD": "HYD",
-        "OCG": "GAS",
-        "OIL": "OIL",
-        "SPV": "SPV",
-        "TRN": None,
-        "URN": "URN",
-        "WON": "WND",
-        "WOF": "WND",
-        "WAV": "WAV",
-    }
-
-    df = prod_tech_annual.copy()
-
-    if len(df.columns) == 1:
-        df = df.reset_index()
-
-    df = df[(df.TECHNOLOGY.str.startswith("PWR")) & (df.YEAR < 2023)]
-    df["COUNTRY"] = df.TECHNOLOGY.str[6:9]
-    df["CODE"] = df.TECHNOLOGY.str[3:6]
-    df["CODE"] = df.CODE.map(name_mapper)
-    df = df.dropna(subset="CODE")
-    df["TECHNOLOGY"] = df.CODE + df.COUNTRY
-    df = df.drop(columns=["FUEL", "COUNTRY", "CODE"])
-    return df.groupby(["REGION", "TECHNOLOGY", "YEAR"]).sum()
-
-
 ###
 # private functions
 ###
@@ -105,7 +103,12 @@ def _read_eia_data(json_file: str) -> pd.DataFrame:
     df = pd.read_json(json_file)
     df["name"] = df.name.map(lambda x: x.split(", ")[0])
     df = df.explode(column="data")
-    df["year"] = df.data.map(lambda x: datetime.fromtimestamp(x["date"] / 1000).year)
+    # Not sure why, but the 'datetime.fromtimestamp(x["date"] / 1000).year' call is off by
+    # one year, i.e. if I select the year 2020, the 2021 values are returned. That's why the
+    # extra '+1' is at the end of the lambda. (fromtimestamp is local-time; TODO confirm)
+    df["year"] = df.data.map(
+        lambda x: datetime.fromtimestamp(x["date"] / 1000).year + 1
+    )
     df["VALUE"] = df.data.map(lambda x: x["value"])
     df["VALUE"] = df.VALUE.fillna(0)
     return df.drop(
@@ -114,19 +117,29 @@ def _read_eia_data(json_file: str) -> pd.DataFrame:
 
 
 def _format_eia_capacity_data(eia: pd.DataFrame) -> pd.DataFrame:
-    """Formats data into otoole compatiable data structure"""
+    """Formats data into an otoole-compatible data structure
+
+    Note: no unit conversion is applied, as capacity is already given in GW.
+    """
 
     df = eia.copy()
 
     df["name"] = df.name.map(
         lambda x: x.split(" electricity installed capacity")[0]
     ).map(CAPACITY_MAPPER)
     df["name"] = df.name + df.iso
-    df = df.drop(columns=["iso"])
-    df = df.groupby(["name", "year"], as_index=False).sum()
+    df["VALUE"] = (
+        df.VALUE.replace("NA", 0)
+        .replace("--", 0)
+        .replace("ie", 0)
+        .replace("(s)", 0)
+        .fillna(0)
+        .astype(float)
+    )
     df = df.rename(columns={"name": "TECHNOLOGY", "year": "YEAR"})
     df["REGION"] = "GLOBAL"
-    return df.set_index(["REGION", "TECHNOLOGY", "YEAR"])
+    df = df[["REGION", "TECHNOLOGY", "YEAR", "VALUE"]]
+    return df.groupby(["REGION", "TECHNOLOGY", "YEAR"]).sum()
 
 
 def _format_eia_generation_data(eia: pd.DataFrame) -> pd.DataFrame:
@@ -147,10 +160,10 @@ def _format_eia_generation_data(eia: pd.DataFrame) -> pd.DataFrame:
         .replace("NA", 0)
         .astype(float)
     )
-    df = df.groupby(["name", "year"], as_index=False).sum()
     df = df.rename(columns={"name": "TECHNOLOGY", "year": "YEAR"})
     df["REGION"] = "GLOBAL"
     # billion kWh -> PJ
     # 1B kWh = 1 TWh * (1PWh / 1000TWh) * (3600sec / hr) = 1 PWs = 1 PJ
     df["VALUE"] = df.VALUE.mul(3.6)
-    return df.set_index(["REGION", "TECHNOLOGY", "YEAR"])
+    df = df[["REGION", "TECHNOLOGY", "YEAR", "VALUE"]]
+    return df.groupby(["REGION", "TECHNOLOGY", "YEAR"]).sum()
diff --git a/workflow/scripts/osemosys_global/validation/ember.py b/workflow/scripts/osemosys_global/validation/ember.py
@@ -52,29 +52,6 @@ def get_ember_generation(csv_file: str, **kwargs) -> pd.DataFrame:
     return _format_ember_generation_data(df)
 
 
-def format_og_data(og: pd.DataFrame) -> pd.DataFrame:
-    """Formats OG results for ember comparison
-
-    Works on:
-    - ProductionByTechnologyAnnual
-    - TotalCapacityAnnual
-    """
-
-    df = og.copy()
-
-    if len(df.columns) == 1:
-        df = df.reset_index()
-
-    df = df[(df.TECHNOLOGY.str.startswith("PWR")) & (df.YEAR < 2023)]
-    df["COUNTRY"] = df.TECHNOLOGY.str[6:9]
-    df["CODE"] = df.TECHNOLOGY.str[3:6]
-    df["CODE"] = df.CODE.map(OG_NAME_MAPPER)
-    df = df.dropna(subset="CODE")
-    df["TECHNOLOGY"] = df.CODE + df.COUNTRY
-    df = df[["REGION", "TECHNOLOGY", "YEAR", "VALUE"]]
-    return df.groupby(["REGION", "TECHNOLOGY", "YEAR"]).sum()
-
-
 ###
 # private functions
 ###

diff --git a/workflow/scripts/osemosys_global/validation/irena.py b/workflow/scripts/osemosys_global/validation/irena.py
@@ -60,29 +60,6 @@ def get_irena_generation(csv_file: str, iso_codes: str, **kwargs) -> pd.DataFram
     return _format_irena_generation_data(df)
 
 
-def format_og_data(prod_tech_annual: pd.DataFrame) -> pd.DataFrame:
-    """Formats OG results for irena comparison
-
-    Works on:
-    - ProductionByTechnologyAnnual
-    - TotalCapacityAnnual
-    """
-
-    df = prod_tech_annual.copy()
-
-    if len(df.columns) == 1:
-        df = df.reset_index()
-
-    df = df[(df.TECHNOLOGY.str.startswith("PWR")) & (df.YEAR < 2023)]
-    df["COUNTRY"] = df.TECHNOLOGY.str[6:9]
-    df["CODE"] = df.TECHNOLOGY.str[3:6]
-    df["CODE"] = df.CODE.map(OG_NAME_MAPPER)
-    df = df.dropna(subset="CODE").copy()
-    df["TECHNOLOGY"] = df.CODE + df.COUNTRY
-    df = df[["REGION", "TECHNOLOGY", "YEAR", "VALUE"]]
-    return df.groupby(["REGION", "TECHNOLOGY", "YEAR"]).sum()
-
-
 ###
 # private functions
 ###

diff --git a/workflow/scripts/osemosys_global/validation/main.py b/workflow/scripts/osemosys_global/validation/main.py
@@ -5,9 +5,9 @@
 """
 
 import pandas as pd
-import matplotlib.pyplot as plt
-from typing import Optional
 from pathlib import Path
+from utils import plot_gen_cap, format_og_data
+from functools import partial
 import eia
 import irena
 import ember
@@ -16,76 +16,6 @@
 
 logger = logging.getLogger(__name__)
 
-###
-# plotters
-###
-
-
-def plot_gen_cap(
-    modelled: pd.DataFrame,
-    actual: pd.DataFrame,
-    variable: str,
-    dataset_name: Optional[str] = None,
-) -> dict[str, tuple[plt.figure, plt.axes]]:
-
-    def _join_data(
-        modelled: pd.DataFrame, actual: pd.DataFrame, dataset_name: Optional[str] = None
-    ) -> pd.DataFrame:
-
-        if not dataset_name:
-            dataset_name = "ACTUAL"
-
-        modelled = modelled.rename(columns={"VALUE": "OSeMOSYS"})
-        actual = actual.rename(columns={"VALUE": dataset_name})
-        df = modelled.join(actual)
-
-        assert len(df.index.get_level_values("REGION").unique()) == 1
-
-        return df.droplevel("REGION")
-
-    assert modelled.index.names == actual.index.names
-
-    if variable == "generation":
-        units = "PJ"
-    elif variable == "capacity":
-        units = "GW"
-    else:
-        raise ValueError(
-            f"Variable must be one of ['generation', 'capacity']. Recieved {variable}"
-        )
-
-    df = _join_data(modelled, actual, dataset_name).reset_index()
-    df["TECH"] = df["TECHNOLOGY"].str[0:3]
-    df["COUNTRY"] = df["TECHNOLOGY"].str[3:]
-
-    data = {}
-
-    countries = df.COUNTRY.unique()
-    for country in countries:
-        df_country = df[df.COUNTRY == country]
-        years = df_country.YEAR.unique()
-        n_rows = len(years)
-        fig, axs = plt.subplots(n_rows, 1, figsize=(10, n_rows * 4))
-        for i, year in enumerate(years):
-            df_year = (
-                df_country[df_country.YEAR == year]
-                .drop(columns=["TECHNOLOGY", "YEAR", "COUNTRY"])
-                .set_index("TECH")
-            )
-            title = f"{country} {variable.capitalize()} in {year}"
-            if n_rows > 1:
-                ax = axs[i]
-            else:
-                ax = axs
-            df_year.plot(
-                kind="bar", ax=ax, rot=45, title=title, xlabel="", ylabel=units
-            )
-
-        data[country] = (fig, axs)
-
-    return data
-
-
 ###
 # getters
 ###
@@ -96,19 +26,19 @@ def get_generation_funcs(datasource: str) -> dict[str, callable]:
         case "eia" | "EIA" | "Eia":
             return {
                 "getter": eia.get_eia_generation,
-                "formatter": eia.format_og_generation,
+                "formatter": partial(format_og_data, mapper=eia.OG_GEN_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case "irena" | "IRENA" | "Irena":
             return {
                 "getter": irena.get_irena_generation,
-                "formatter": irena.format_og_data,
+                "formatter": partial(format_og_data, mapper=irena.OG_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case "ember" | "EMBER" | "Ember":
             return {
                 "getter": ember.get_ember_generation,
-                "formatter": ember.format_og_data,
+                "formatter": partial(format_og_data, mapper=ember.OG_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case _:
@@ -120,19 +50,19 @@ def get_capacity_funcs(datasource: str) -> dict[str, callable]:
         case "eia" | "EIA" | "Eia":
             return {
                 "getter": eia.get_eia_capacity,
-                "formatter": eia.format_og_capacity,
+                "formatter": partial(format_og_data, mapper=eia.OG_CAP_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case "irena" | "IRENA" | "Irena":
             return {
                 "getter": irena.get_irena_capacity,
-                "formatter": irena.format_og_data,
+                "formatter": partial(format_og_data, mapper=irena.OG_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case "ember" | "EMBER" | "Ember":
             return {
                 "getter": ember.get_ember_capacity,
-                "formatter": ember.format_og_data,
+                "formatter": partial(format_og_data, mapper=ember.OG_NAME_MAPPER),
                 "plotter": plot_gen_cap,
             }
         case _:
@@ -147,12 +77,12 @@ def get_capacity_funcs(datasource: str) -> dict[str, callable]:
     if "snakemake" in globals():
         raise NotImplementedError
     else:
-        datasource = "irena"
+        datasource = "eia"
         variable = "capacity"
         result_dir = "results/India/results"
-        data_file = "resources/data/validation/irena_capacity.csv"
+        data_file = "resources/data/validation/eia_capacity.json"
         options = {}
-        options = {"iso_codes": "resources/data/validation/iso.csv"}
+        # options = {"iso_codes": "resources/data/validation/iso.csv"}
 
     csv_results = Path(result_dir)
     validation_results = Path(csv_results, "..", "validation")
@@ -180,7 +110,7 @@ def get_capacity_funcs(datasource: str) -> dict[str, callable]:
         logger.error(f"No validation for {variable} from {datasource}: \n{e}")
 
     if isinstance(actual, pd.DataFrame) and isinstance(modelled, pd.DataFrame):
-        gen = funcs["plotter"](modelled, actual, datasource)
+        gen = funcs["plotter"](modelled, actual, variable, datasource)
         for country, (fig, _) in gen.items():
             p = Path(validation_results, country, variable)
             if not p.exists():