ccao-data · jeancochrane · Dec 4, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
@@ -1 +1 @@
-from ccao.vars_funs import vars_dict, vars_rename
+from ccao.vars_funs import vars_dict, vars_recode, vars_rename
@@ -7,7 +7,7 @@
 
 # Load the default variable dictionary
 _data_path = importlib.resources.files(ccao.data)
-vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv"))
+vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv"), dtype=str)
 vars_dict <- readr::read_csv( 
   file = "data-raw/vars_dict.csv", 
   col_types = readr::cols(var_code = readr::col_character()) 
 ) 
 vars_dict <- readr::read_csv( 
   file = "data-raw/vars_dict.csv", 
   col_types = readr::cols(var_code = readr::col_character()) 
 ) 
 
 # Prefix we use to identify variable name columns in the variable dictionary
 VAR_NAME_PREFIX = "var_name"
@@ -126,3 +126,155 @@ def vars_rename(
         # If the input data is a list, it's not possible to update it inplace,
         # so ignore that argument
         return [mapping.get(col, col) for col in data]
+
+
+def vars_recode(
+    data: pd.DataFrame,
+    cols: list[str] | None = None,
+    code_type: str = "long",
+    as_factor: bool = True,
+    dictionary: pd.DataFrame | None = None,
+) -> pd.DataFrame:
 vars_recode <- function(data, 
                         cols = dplyr::everything(), 
                         type = "long", 
                         as_factor = TRUE, 
                         dict = ccao::vars_dict) { 
 vars_recode <- function(data, 
                         cols = dplyr::everything(), 
                         type = "long", 
                         as_factor = TRUE, 
                         dict = ccao::vars_dict) { 
+    """
+    Replace numerically coded variables with human-readable values.
+
+    The system of record stores characteristic values in a numerically encoded
+    format. This function can be used to translate those values into a
+    human-readable format. For example, EXT_WALL = 2 will become
+    EXT_WALL = "Frame + Masonry". Note that the values and their translations
+    must be specified via a user-defined dictionary. The default dictionary is
+    :data:`vars_dict`.
+
+    Options for ``code_type`` are:
+
+    - ``"long"``, which transforms EXT_WALL = 1 to EXT_WALL = Frame
+    - ``"short"``, which transforms EXT_WALL = 1 to EXT_WALL = FRME
+    - ``"code"``, which keeps the original values (useful for removing
+      improperly coded values, see the note below)
+
+    :param data:
+        A pandas DataFrame with columns to have values replaced.
+    :type data: pandas.DataFrame
+
+    :param cols:
+        A list of column names to be transformed, or ``None`` to select all columns.
+    :type cols: list[str]
+
+    :param code_type:
+        The recoding type. See description above for options.
+    :type code_type: str
+
+    :param as_factor:
+        If True, re-encoded values will be returned as categorical variables
+        (pandas Categorical).
+        If False, re-encoded values will be returned as plain strings.
+    :type as_factor: bool
+
+    :param dictionary:
+        A pandas DataFrame representing the dictionary used to translate
+        encodings.
+    :type dictionary: pandas.DataFrame
+
+    :raises ValueError:
+        If the dictionary is missing required columns or if invalid input is
+        provided.
+
+    :return:
+        The input DataFrame with re-encoded values for the specified columns.
+    :rtype: pandas.DataFrame
+
+    .. note::
+        Values which are in the data but are NOT in the dictionary will be
+        converted to NaN.
+
+    :example:
+
+    .. code-block:: python
+
+        import ccao
+
+        sample_data = ccao.sample_athena
+
+        # Defaults to `long` code type
+        ccao.vars_recode(data=sample_data)
+
+        # Recode to `short` code type
+        ccao.vars_recode(data=sample_data, code_type="short")
+
+        # Recode only specified columns
+        ccao.vars_recode(data=sample_data, cols="GAR1_SIZE")
+    """
+    # Validate the dictionary schema
+    dictionary = dictionary if dictionary is not None else vars_dict
+    if dictionary.empty:
+        raise ValueError("dictionary must be a non-empty pandas DataFrame")
+
+    required_columns = {"var_code", "var_value", "var_value_short"}
+    if not required_columns.issubset(dictionary.columns):
+        raise ValueError(
+            "Input dictionary must contain the following columns: "
+            f"{', '.join(required_columns)}"
+        )
+
+    # Validate code type and convert it to the enum
+    if code_type not in ["short", "long", "code"]:
+        raise ValueError("code_type must be one of 'short', 'long', or 'code'")
+
+    # Filter the dictionary for categoricals only and and pivot it longer for
+    # easier lookup
+    dict_long = dictionary[
+        (dictionary["var_type"] == "char")
+        & (dictionary["var_data_type"] == "categorical")
+    ]
+    dict_long = dict_long.melt(
+        id_vars=["var_code", "var_value", "var_value_short"],
+        value_vars=[
+            col for col in dictionary.columns if col.startswith("var_name_")
+        ],
+        value_name="var_name",
+        var_name="var_type",
+    )
+    dict_long_pkey = ["var_code", "var_value", "var_value_short", "var_name"]
+    dict_long = dict_long[dict_long_pkey]
+    dict_long = dict_long.drop_duplicates(subset=dict_long_pkey)
+
+    # Map the code type to its internal representation in the dictionary
+    values_to = {
+        "code": "var_code",
+        "long": "var_value",
+        "short": "var_value_short",
+    }[code_type]
+
+    # Function to apply to each column to remap column values based on the
+    # vars dict
+    def transform_column(
+        col: pd.Series, var_name: str, values_to: str, as_factor: bool
+    ) -> pd.Series | pd.Categorical:
+        if var_name in dict_long["var_name"].values:
+            var_rows = dict_long[dict_long["var_name"] == var_name]
+            # Get a dictionary mapping the possible codes to their values.
+            # Use `var_code` as the index (keys) for the dictionary, unless
+            # we're selecting `var_code`, in which case we can't set it as the
+            # index and use it for values
+            var_dict = (
+                {code: code for code in var_rows["var_code"].tolist()}
+                if values_to == "var_code"
+                else var_rows.copy().set_index("var_code")[values_to].to_dict()
+            )
+            if as_factor:
+                return pd.Categorical(
+                    col.map(var_dict), categories=list(var_dict.values())
+                )
+            else:
+                return col.map(var_dict)
+        return col
+
+    # Recode specified columns, or all columns if none were specified
+    cols = cols or data.columns
+    for var_name in cols:
+        if var_name in data.columns:
+            data[var_name] = transform_column(
+                data[var_name], var_name, values_to, as_factor
+            )
+
+    return data
@@ -9,6 +9,19 @@ Manage characteristics
 ^^^^^^^^^^^^^^^^^^^^^^
 
 Recode/rename characteristic columns, merge HIE data, and fix characteristic
-errors.
+errors
 
-:doc:`vars_rename() <vars_rename>`
+:doc:`vars_rename() <vars_rename>` |nbsp|
+:doc:`vars_recode() <vars_recode>`
+
+Data
+----
+
+Dictionaries
+^^^^^^^^^^^^
+
+Lookups for numeric codes used in the assessment system
+
+:doc:`vars_dict <vars_dict>`
+
+.. |nbsp| unicode:: 0xA0
@@ -0,0 +1,28 @@
+================================================
+Data dictionary for CCAO data sets and variables
+================================================
+
+A crosswalk of CCAO variable names used in iasWorld, AWS, modeling,
+and open data. Also includes a translation of numeric character codes
+to their human-readable value (ROOF_CNST = 1
+becomes ROOF_CNST = Shingle/Asphalt).
+
+Format
+------
+
+A pandas DataFrame with the following columns:
+
+- **var_name_hie**: Column name of variable when stored in the legacy ADDCHARS SQL table.
+- **var_name_iasworld**: Column name for variable as stored in the system of record (iasWorld).
+- **var_name_athena**: Column name used for views and tables in AWS Athena.
+- **var_name_model**: Column name used while data is flowing through modeling pipelines.
+- **var_name_publish**: Human-readable column name used for public data sets.
+- **var_name_pretty**: Human-readable column name used for publication and reporting.
+- **var_type**: Variable type/prefix indicating the variable's function. For example,
+  ``ind_`` variables are always indicators (booleans), while ``char_`` variables are
+  always property characteristics.
+- **var_data_type**: The R data type that the variable's values should be stored as.
+- **var_code**: Factor value for categorical variables. These are the values stored
+  in the system of record.
+- **var_value**: Human-readable translation of factor value.
+- **var_value_short**: Human-readable translation of factor value, but as short as possible.
@@ -0,0 +1,5 @@
+==============================================================
+Replace numerically coded variables with human-readable values
+==============================================================
+
+.. autofunction:: ccao.vars_recode
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from ccao.vars_funs import vars_dict, vars_rename
		from ccao.vars_funs import vars_dict, vars_recode, vars_rename