ccao-data · jeancochrane · Dec 4, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
@@ -26,6 +26,7 @@
 ^man-roxygen$
 ^pkgdown$
 ^public$
+^python
 ^renv$
 ^renv\.lock$
 ^vignettes$

@@ -9,7 +9,6 @@ name: docs
 
 env:
   PYTHONUNBUFFERED: "1"
-  UV_SYSTEM_PYTHON: 1
 
 jobs:
   build:
@@ -33,16 +32,12 @@ jobs:
           needs: website
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          enable-cache: true
-          cache-dependency-glob: python/pyproject.toml
-          cache-suffix: docs
+        uses: astral-sh/setup-uv@v4
 
       - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version-file: python/pyproject.toml
+        shell: bash
+        working-directory: python
+        run: uv venv
 
       - name: Install Python dependencies
         working-directory: python
@@ -54,7 +49,9 @@ jobs:
         shell: Rscript {0}
 
       - name: Build Python docs
-        run: sphinx-build python/docs/source docs/python
+        working-directory: python
+        run: uv run sphinx-build docs/source ../docs/python
+        shell: bash
 
       - name: Configure pages
         uses: actions/configure-pages@v5

@@ -0,0 +1,44 @@
+on:
+  pull_request:
+  push:
+    branches: [main, master]
+
+name: python-build-and-test
+
+env:
+  PYTHONUNBUFFERED: "1"
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v4
+      with:
+        enable-cache: true
+        cache-dependency-glob: python/pyproject.toml
+        cache-suffix: ${{ matrix.python-version }}-test
+
+    - name: Install Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install tox
+      shell: bash
+      run: |
+        uv tool install tox --with tox-uv,tox-gh-actions
+        tox --version
+
+    - name: Build and test with tox
+      shell: bash
+      working-directory: python
+      run: tox r
@@ -200,7 +200,7 @@ vars_rename <- function(data,
 #' @description The system of record stores characteristic values in a
 #' numerically encoded format. This function can be used to translate those
 #' values into a human-readable format. For example, EXT_WALL = 2 will become
-#' EXT_WALL = "Frame + Masonry". Note that the values and their translations are
+#' EXT_WALL = "Masonry". Note that the values and their translations are
 #' must be specified via a user-defined dictionary. The default dictionary is
 #' \code{\link{vars_dict}}.
 #'

@@ -1 +1 @@
-from ccao.vars_funs import vars_dict, vars_rename
+from ccao.vars_funs import vars_dict, vars_recode, vars_rename
@@ -1,25 +1,26 @@
 # Functions for translating variables between different data sources
 import importlib.resources
+import typing
 
 import pandas as pd
 
 import ccao.data
 
 # Load the default variable dictionary
 _data_path = importlib.resources.files(ccao.data)
-vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv"))
+vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv"), dtype=str)
 vars_dict <- readr::read_csv( 
   file = "data-raw/vars_dict.csv", 
   col_types = readr::cols(var_code = readr::col_character()) 
 ) 
 vars_dict <- readr::read_csv( 
   file = "data-raw/vars_dict.csv", 
   col_types = readr::cols(var_code = readr::col_character()) 
 ) 
 
 # Prefix we use to identify variable name columns in the variable dictionary
 VAR_NAME_PREFIX = "var_name"
 
 
 def vars_rename(
-    data: list[str] | pd.DataFrame,
+    data: typing.Union[typing.List[str], pd.DataFrame],
     names_from: str,
     names_to: str,
     output_type: str = "inplace",
-    dictionary: pd.DataFrame | None = None,
-) -> list[str] | pd.DataFrame:
+    dictionary: typing.Optional[pd.DataFrame] = None,
+) -> typing.Union[typing.List[str], pd.DataFrame]:
     """
     Rename variables from one naming convention to another.
 
@@ -126,3 +127,165 @@ def vars_rename(
         # If the input data is a list, it's not possible to update it inplace,
         # so ignore that argument
         return [mapping.get(col, col) for col in data]
+
+
+def vars_recode(
+    data: pd.DataFrame,
+    cols: typing.Optional[typing.List[str]] = None,
+    code_type: str = "long",
+    as_factor: bool = True,
+    dictionary: typing.Optional[pd.DataFrame] = None,
+) -> pd.DataFrame:
+    """
+    Replace numerically coded variables with human-readable values.
+
+    The system of record stores characteristic values in a numerically encoded
+    format. This function can be used to translate those values into a
+    human-readable format. For example, EXT_WALL = 2 will become
+    EXT_WALL = "Masonry". Note that the values and their translations
+    must be specified via a user-defined dictionary. The default dictionary is
+    :data:`vars_dict`.
+
+    Options for ``code_type`` are:
+
+    - ``"long"``, which transforms EXT_WALL = 1 to EXT_WALL = Frame
+    - ``"short"``, which transforms EXT_WALL = 1 to EXT_WALL = FRME
+    - ``"code"``, which keeps the original values (useful for removing
+      improperly coded values, see the note below)
+
+    :param data:
+        A pandas DataFrame with columns to have values replaced.
+    :type data: pandas.DataFrame
+
+    :param cols:
+        A list of column names to be transformed, or ``None`` to select all columns.
+    :type cols: list[str]
+
+    :param code_type:
+        The recoding type. See description above for options.
+    :type code_type: str
+
+    :param as_factor:
+        If True, re-encoded values will be returned as categorical variables
+        (pandas Categorical).
+        If False, re-encoded values will be returned as plain strings.
+    :type as_factor: bool
+
+    :param dictionary:
+        A pandas DataFrame representing the dictionary used to translate
+        encodings.
+    :type dictionary: pandas.DataFrame
+
+    :raises ValueError:
+        If the dictionary is missing required columns or if invalid input is
+        provided.
+
+    :return:
+        The input DataFrame with re-encoded values for the specified columns.
+    :rtype: pandas.DataFrame
+
+    .. note::
+        Values which are in the data but are NOT in the dictionary will be
+        converted to NaN.
+
+    :example:
+
+    .. code-block:: python
+
+        import ccao
+
+        sample_data = ccao.sample_athena
+
+        # Defaults to `long` code type
+        ccao.vars_recode(data=sample_data)
+
+        # Recode to `short` code type
+        ccao.vars_recode(data=sample_data, code_type="short")
+
+        # Recode only specified columns
+        ccao.vars_recode(data=sample_data, cols="GAR1_SIZE")
+    """
+    # Validate the dictionary schema
+    dictionary = dictionary if dictionary is not None else vars_dict
+    if dictionary.empty:
+        raise ValueError("dictionary must be a non-empty pandas DataFrame")
+
+    required_columns = {
+        "var_code",
+        "var_value",
+        "var_value_short",
+        "var_type",
+        "var_data_type",
+    }
+    if not required_columns.issubset(dictionary.columns):
+        raise ValueError(
+            "Input dictionary must contain the following columns: "
+            f"{', '.join(required_columns)}"
+        )
+
+    if not any(col.startswith("var_name_") for col in dictionary.columns):
+        raise ValueError(
+            "Input dictionary must contain at least one var_name_ column"
+        )
+
+    if code_type not in ["short", "long", "code"]:
+        raise ValueError("code_type must be one of 'short', 'long', or 'code'")
+
+    # Filter the dictionary for categoricals only and and pivot it longer for
+    # easier lookup
+    dict_long = dictionary[
+        (dictionary["var_type"] == "char")
+        & (dictionary["var_data_type"] == "categorical")
+    ]
+    dict_long = dict_long.melt(
+        id_vars=["var_code", "var_value", "var_value_short"],
+        value_vars=[
+            col for col in dictionary.columns if col.startswith("var_name_")
+        ],
+        value_name="var_name",
+        var_name="var_type",
+    )
+    dict_long_pkey = ["var_code", "var_value", "var_value_short", "var_name"]
+    dict_long = dict_long[dict_long_pkey]
+    dict_long = dict_long.drop_duplicates(subset=dict_long_pkey)
+
+    # Map the code type to its internal representation in the dictionary
+    values_to = {
+        "code": "var_code",
+        "long": "var_value",
+        "short": "var_value_short",
+    }[code_type]
+
+    # Function to apply to each column to remap column values based on the
+    # vars dict
+    def transform_column(
+        col: pd.Series, var_name: str, values_to: str, as_factor: bool
+    ) -> typing.Union[pd.Series, pd.Categorical]:
+        if var_name in dict_long["var_name"].values:
+            var_rows = dict_long[dict_long["var_name"] == var_name]
+            # Get a dictionary mapping the possible codes to their values.
+            # Use `var_code` as the index (keys) for the dictionary, unless
+            # we're selecting `var_code`, in which case we can't set it as the
+            # index and use it for values
+            var_dict = (
+                {code: code for code in var_rows["var_code"].tolist()}
+                if values_to == "var_code"
+                else var_rows.copy().set_index("var_code")[values_to].to_dict()
+            )
+            if as_factor:
+                return pd.Categorical(
+                    col.map(var_dict), categories=list(var_dict.values())
+                )
+            else:
+                return col.map(var_dict)
+        return col
+
+    # Recode specified columns, or all columns if none were specified
+    cols = cols or data.columns
+    for var_name in cols:
+        if var_name in data.columns:
+            data[var_name] = transform_column(
+                data[var_name], var_name, values_to, as_factor
+            )
+
+    return data
@@ -9,6 +9,19 @@ Manage characteristics
 ^^^^^^^^^^^^^^^^^^^^^^
 
 Recode/rename characteristic columns, merge HIE data, and fix characteristic
-errors.
+errors
 
-:doc:`vars_rename() <vars_rename>`
+:doc:`vars_rename() <vars_rename>` |nbsp|
+:doc:`vars_recode() <vars_recode>`
+
+Data
+----
+
+Dictionaries
+^^^^^^^^^^^^
+
+Lookups for numeric codes and variable names used in the assessment system
+
+:doc:`vars_dict <vars_dict>`
+
+.. |nbsp| unicode:: 0xA0
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from ccao.vars_funs import vars_dict, vars_rename
		from ccao.vars_funs import vars_dict, vars_recode, vars_rename