diff --git a/.Rbuildignore b/.Rbuildignore index be1d40b..82b45ad 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -26,6 +26,7 @@ ^man-roxygen$ ^pkgdown$ ^public$ +^python ^renv$ ^renv\.lock$ ^vignettes$ diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index d1b8bfd..c676a47 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -9,7 +9,6 @@ name: docs env: PYTHONUNBUFFERED: "1" - UV_SYSTEM_PYTHON: 1 jobs: build: @@ -33,16 +32,12 @@ jobs: needs: website - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - enable-cache: true - cache-dependency-glob: python/pyproject.toml - cache-suffix: docs + uses: astral-sh/setup-uv@v4 - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version-file: python/pyproject.toml + shell: bash + working-directory: python + run: uv venv - name: Install Python dependencies working-directory: python @@ -54,7 +49,9 @@ jobs: shell: Rscript {0} - name: Build Python docs - run: sphinx-build python/docs/source docs/python + working-directory: python + run: uv run sphinx-build docs/source ../docs/python + shell: bash - name: Configure pages uses: actions/configure-pages@v5 diff --git a/.github/workflows/pytest-coverage.yaml b/.github/workflows/pytest-coverage.yaml deleted file mode 100644 index e758284..0000000 --- a/.github/workflows/pytest-coverage.yaml +++ /dev/null @@ -1,52 +0,0 @@ -on: - pull_request: - push: - branches: [main, master] - -name: pytest-coverage - -env: - PYTHONUNBUFFERED: "1" - UV_SYSTEM_PYTHON: 1 - -jobs: - pytest-coverage: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - enable-cache: true - cache-dependency-glob: python/pyproject.toml - cache-suffix: test - - - name: Install Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: 
${{ matrix.python-version }} - - - name: Install Python dependencies - working-directory: python - shell: bash - run: uv pip install .[dev,docs] - - - name: Run pytest - working-directory: python - shell: bash - run: | - pytest -v --doctest-modules \ - --junitxml=junit/test-results-${{ matrix.python-version }}.xmlpytest - - - name: Upload artifacts - if: failure() - uses: actions/upload-artifact@v4 - with: - name: python/pytest-results-${{ matrix.python-version }} - path: python/junit/test-results-${{ matrix.python-version }}.xml diff --git a/.github/workflows/python-build-and-test.yaml b/.github/workflows/python-build-and-test.yaml new file mode 100644 index 0000000..fd338ec --- /dev/null +++ b/.github/workflows/python-build-and-test.yaml @@ -0,0 +1,44 @@ +on: + pull_request: + push: + branches: [main, master] + +name: python-build-and-test + +env: + PYTHONUNBUFFERED: "1" + +jobs: + build-and-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + enable-cache: true + cache-dependency-glob: python/pyproject.toml + cache-suffix: ${{ matrix.python-version }}-test + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install tox + shell: bash + run: | + uv tool install tox --with tox-uv,tox-gh-actions + tox --version + + - name: Build and test with tox + shell: bash + working-directory: python + run: tox r diff --git a/R/vars_funs.R b/R/vars_funs.R index d8862be..62506a0 100644 --- a/R/vars_funs.R +++ b/R/vars_funs.R @@ -200,7 +200,7 @@ vars_rename <- function(data, #' @description The system of record stores characteristic values in a #' numerically encoded format. This function can be used to translate those #' values into a human-readable format. 
For example, EXT_WALL = 2 will become -#' EXT_WALL = "Frame + Masonry". Note that the values and their translations are +#' EXT_WALL = "Masonry". Note that the values and their translations #' must be specified via a user-defined dictionary. The default dictionary is #' \code{\link{vars_dict}}. #' diff --git a/python/ccao/__init__.py b/python/ccao/__init__.py index f203ad4..8527c2d 100644 --- a/python/ccao/__init__.py +++ b/python/ccao/__init__.py @@ -1 +1 @@ -from ccao.vars_funs import vars_dict, vars_rename +from ccao.vars_funs import vars_dict, vars_recode, vars_rename diff --git a/python/ccao/vars_funs.py b/python/ccao/vars_funs.py index 2a4ddec..64e4a32 100644 --- a/python/ccao/vars_funs.py +++ b/python/ccao/vars_funs.py @@ -1,5 +1,6 @@ # Functions for translating variables between different data sources import importlib.resources +import typing import pandas as pd @@ -7,19 +8,19 @@ # Load the default variable dictionary _data_path = importlib.resources.files(ccao.data) -vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv")) +vars_dict = pd.read_csv(str(_data_path / "vars_dict.csv"), dtype=str) # Prefix we use to identify variable name columns in the variable dictionary VAR_NAME_PREFIX = "var_name" def vars_rename( - data: list[str] | pd.DataFrame, + data: typing.Union[typing.List[str], pd.DataFrame], names_from: str, names_to: str, output_type: str = "inplace", - dictionary: pd.DataFrame | None = None, -) -> list[str] | pd.DataFrame: + dictionary: typing.Optional[pd.DataFrame] = None, +) -> typing.Union[typing.List[str], pd.DataFrame]: """ Rename variables from one naming convention to another. 
def vars_recode(
    data: pd.DataFrame,
    cols: typing.Union[str, typing.Sequence[str], None] = None,
    code_type: str = "long",
    as_factor: bool = True,
    dictionary: typing.Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Replace numerically coded variables with human-readable values.

    The system of record stores characteristic values in a numerically encoded
    format. This function can be used to translate those values into a
    human-readable format. For example, EXT_WALL = 2 will become
    EXT_WALL = "Masonry". Note that the values and their translations
    must be specified via a user-defined dictionary. The default dictionary is
    :data:`vars_dict`.

    Options for ``code_type`` are:

    - ``"long"``, which transforms EXT_WALL = 1 to EXT_WALL = Frame
    - ``"short"``, which transforms EXT_WALL = 1 to EXT_WALL = FRME
    - ``"code"``, which keeps the original values (useful for removing
      improperly coded values, see the note below)

    Only dictionary rows with ``var_type == "char"`` and
    ``var_data_type == "categorical"`` are used. A column of ``data`` is
    recoded when its name appears in any ``var_name_*`` column of those rows;
    every other column is left untouched.

    :param data:
        A pandas DataFrame with columns to have values replaced. The
        DataFrame is modified in place and is also returned.
    :type data: pandas.DataFrame

    :param cols:
        A column name, or a list of column names, to be transformed. ``None``
        (or an empty sequence) selects all columns. Names that are not
        columns of ``data`` are ignored.
    :type cols: str | list[str]

    :param code_type:
        The recoding type. See description above for options.
    :type code_type: str

    :param as_factor:
        If True, re-encoded values will be returned as categorical variables
        (pandas Categorical) whose categories are the distinct, non-missing
        dictionary values for that variable, in dictionary order.
        If False, re-encoded values will be returned as plain strings.
    :type as_factor: bool

    :param dictionary:
        A pandas DataFrame representing the dictionary used to translate
        encodings. Codes are matched against the data as-is (no type
        coercion), so the dtype of ``var_code`` must match the data.
    :type dictionary: pandas.DataFrame

    :raises ValueError:
        If the dictionary is not a non-empty DataFrame, is missing required
        columns, or if ``code_type`` is invalid.

    :return:
        The input DataFrame with re-encoded values for the specified columns.
    :rtype: pandas.DataFrame

    .. note::
        Values which are in the data but are NOT in the dictionary will be
        converted to NaN.

    :example:

    .. code-block:: python

        import ccao

        sample_data = ccao.sample_athena

        # Defaults to `long` code type
        ccao.vars_recode(data=sample_data)

        # Recode to `short` code type
        ccao.vars_recode(data=sample_data, code_type="short")

        # Recode only specified columns
        ccao.vars_recode(data=sample_data, cols="GAR1_SIZE")
        ccao.vars_recode(data=sample_data, cols=["GAR1_SIZE", "EXT_WALL"])
    """
    # Validate the dictionary schema
    dictionary = dictionary if dictionary is not None else vars_dict
    if not isinstance(dictionary, pd.DataFrame) or dictionary.empty:
        raise ValueError("dictionary must be a non-empty pandas DataFrame")

    # A tuple (rather than a set) keeps the error message deterministic
    required_columns = (
        "var_code",
        "var_value",
        "var_value_short",
        "var_type",
        "var_data_type",
    )
    if not set(required_columns).issubset(dictionary.columns):
        raise ValueError(
            "Input dictionary must contain the following columns: "
            f"{', '.join(required_columns)}"
        )

    var_name_cols = [
        col
        for col in dictionary.columns
        if isinstance(col, str) and col.startswith("var_name_")
    ]
    if not var_name_cols:
        raise ValueError(
            "Input dictionary must contain at least one var_name_ column"
        )

    if code_type not in ["short", "long", "code"]:
        raise ValueError("code_type must be one of 'short', 'long', or 'code'")

    # Filter the dictionary for categoricals only and pivot it longer for
    # easier lookup: one row per (code, value, short value, variable name)
    dict_long = dictionary[
        (dictionary["var_type"] == "char")
        & (dictionary["var_data_type"] == "categorical")
    ]
    dict_long = dict_long.melt(
        id_vars=["var_code", "var_value", "var_value_short"],
        value_vars=var_name_cols,
        value_name="var_name",
        # The melted "which var_name_* column" label is not needed and is
        # dropped by the column selection below
        var_name="var_type",
    )
    dict_long_pkey = ["var_code", "var_value", "var_value_short", "var_name"]
    dict_long = dict_long[dict_long_pkey]
    dict_long = dict_long.drop_duplicates(subset=dict_long_pkey)

    # Map the code type to its internal representation in the dictionary
    values_to = {
        "code": "var_code",
        "long": "var_value",
        "short": "var_value_short",
    }[code_type]

    def build_mapping(var_name: typing.Any) -> typing.Optional[dict]:
        """Return the {code: recoded value} lookup for one variable, or None
        if the variable is not a categorical in the dictionary."""
        var_rows = dict_long[dict_long["var_name"] == var_name]
        if var_rows.empty:
            return None
        # `var_code` can't be both the index and the selected value column,
        # so the identity mapping is built directly for code_type="code"
        if values_to == "var_code":
            return {code: code for code in var_rows["var_code"].tolist()}
        return var_rows.set_index("var_code")[values_to].to_dict()

    # Normalize `cols`: a bare string is one column name (iterating it would
    # yield single characters), and None/empty means every column. Avoid
    # `cols or ...`, which raises for pandas Index / numpy array inputs.
    if cols is None or len(cols) == 0:
        target_cols = list(data.columns)
    elif isinstance(cols, str):
        target_cols = [cols]
    else:
        target_cols = list(cols)

    for var_name in target_cols:
        if var_name not in data.columns:
            continue
        mapping = build_mapping(var_name)
        if mapping is None:
            continue
        recoded = data[var_name].map(mapping)
        if as_factor:
            # Categorical categories must be unique and non-null; several
            # codes may legitimately share one label, so de-duplicate while
            # preserving dictionary order
            categories = [
                value
                for value in dict.fromkeys(mapping.values())
                if pd.notna(value)
            ]
            recoded = pd.Categorical(recoded, categories=categories)
        data[var_name] = recoded

    return data
|nbsp| unicode:: 0xA0 diff --git a/python/docs/source/vars_dict.rst b/python/docs/source/vars_dict.rst new file mode 100644 index 0000000..c085f8c --- /dev/null +++ b/python/docs/source/vars_dict.rst @@ -0,0 +1,28 @@ +================================================ +Data dictionary for CCAO data sets and variables +================================================ + +A crosswalk of CCAO variable names used in iasWorld, AWS, modeling, +and open data. Also includes a translation of numeric character codes +to their human-readable value (ROOF_CNST = 1 +becomes ROOF_CNST = Shingle/Asphalt). + +Format +------ + +A pandas DataFrame with the following columns: + +- **var_name_hie**: Column name of variable when stored in the legacy ADDCHARS SQL table. +- **var_name_iasworld**: Column name for variable as stored in the system of record (iasWorld). +- **var_name_athena**: Column name used for views and tables in AWS Athena. +- **var_name_model**: Column name used while data is flowing through modeling pipelines. +- **var_name_publish**: Human-readable column name used for public data sets. +- **var_name_pretty**: Human-readable column name used for publication and reporting. +- **var_type**: Variable type/prefix indicating the variable's function. For example, + ``ind_`` variables are always indicators (booleans), while ``char_`` variables are + always property characteristics. +- **var_data_type**: R data type variable values should be stored as. +- **var_code**: Factor value for categorical variables. These are the values stored + in the system of record. +- **var_value**: Human-readable translation of factor value. +- **var_value_short**: Human-readable translation of factor value, but as short as possible. 
diff --git a/python/docs/source/vars_recode.rst b/python/docs/source/vars_recode.rst new file mode 100644 index 0000000..6bdc9d3 --- /dev/null +++ b/python/docs/source/vars_recode.rst @@ -0,0 +1,5 @@ +============================================================== +Replace numerically coded variables with human-readable values +============================================================== + +.. autofunction:: ccao.vars_recode diff --git a/python/pyproject.toml b/python/pyproject.toml index 733619b..2f3a81f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,26 +3,28 @@ name = "ccao" version = "1.3.0" description = "Convenience Functions and Datasets for the Cook County Assessor's Office" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.9" authors = [ {name = "Jean Cochrane", email="jean.cochrane@cookcountyil.gov"}, {name = "Dan Snow", email="daniel.snow@cookcountyil.gov"}, ] dependencies = [ - "pandas>=2.2.3", + "pandas>=1.4.3", + "numpy>=1.23.1" ] [project.optional-dependencies] dev = [ - "mypy>=1.13.0", - "pytest>=8.3.3", - "ruff>=0.7.4", + "mypy>=1.0.0", + "pytest>=7.0.0", + "ruff>=0.8.0", ] docs = [ - "Sphinx>=8.1.3", - "myst-parser>=4.0.0", + "Sphinx>=7.0", + "myst-parser>=1.0.0", "pydata-sphinx-theme>=0.16.0", - "sphinx-pyproject>=0.3.0" + "sphinx-pyproject>=0.3.0", + "sphinx-autobuild>=2024.10.3" ] [tool.setuptools] @@ -55,3 +57,38 @@ highlight_language = "none" html_theme = "pydata_sphinx_theme" html_logo = "../images/logo.png" html_show_copyright = false + +[tool.pytest.ini_options] +minversion = "7.0" +addopts = "-v --cache-clear -rf --doctest-modules" +console_output_style = "count" + +[tool.tox] +legacy_tox_ini = """ +[tox] +min_version = 4.0 +envlist = + py{39, 310, 311}-lowest + py{39, 310, 311, 312, 313} + +[gh-actions] +python = + 3.9: py39 + 3.10: py310 + 3.11: py311 + 3.12: py312 + 3.13: py313 + +[testenv] +extras = dev,docs +commands = pytest +passenv = + UV_CACHE_DIR + PYTHONUNBUFFERED + +[testenv:py{39, 
class TestVarsRecode:
    """Tests for ``ccao.vars_recode``.

    All data-driven tests are built from the ``raw_columns`` fixture, which
    describes each input column and its expected output per code type. The
    ``input_data`` fixture turns that metadata into a DataFrame once per
    input naming scheme ("athena" and "iasworld").
    """

    # NOTE(review): these fixtures are plain methods, so pytest presumably
    # passes the test-class instance as the first argument even though it is
    # named `cls` -- confirm and consider renaming to `self`.
    @pytest.fixture(scope="class")
    def raw_columns(cls) -> list[dict]:
        """Metadata describing the columns that we use as fixtures for all
        vars_recode tests. Each element of the list is a dict representing a
        column, with an "input" entry keyed by input naming scheme and an
        "expected" entry keyed by test type ("short", "long", "code",
        "factor", "col")"""
        return [
            {
                # Structure of the input column. We parameterize input data in
                # the `input_data` fixture with one parameter per element of
                # this dict
                "input": {
                    "athena": {"name": "pin", "value": ["12345"] * 4},
                    "iasworld": {"name": "pin", "value": ["12345"] * 4},
                },
                # Structure of the output column. We select the proper column
                # based on key for different types of tests
                "expected": {
                    "short": {"name": "pin", "value": ["12345"] * 4},
                    "long": {"name": "pin", "value": ["12345"] * 4},
                    "code": {"name": "pin", "value": ["12345"] * 4},
                    "factor": {"name": "pin", "value": ["12345"] * 4},
                    # If `value` is True, expect the column value to be
                    # recoded to the "long" format; otherwise, expect the
                    # column value to stay the same as the input value
                    # NOTE(review): test_vars_recode_cols below actually
                    # recodes with code_type="short", not "long" -- confirm
                    # which one this comment should say.
                    "col": {"name": "pin", "value": False},
                },
            },
            {
                "input": {
                    "athena": {
                        "name": "char_ext_wall",
                        "value": ["1", "2", "0", None],
                    },
                    "iasworld": {
                        "name": "extwall",
                        "value": ["1", "2", "0", None],
                    },
                },
                "expected": {
                    "short": {
                        "name": "char_ext_wall",
                        "value": ["FRAM", "MASR", np.nan, np.nan],
                    },
                    "long": {
                        "name": "char_ext_wall",
                        "value": ["Frame", "Masonry", np.nan, np.nan],
                    },
                    "code": {
                        "name": "char_ext_wall",
                        "value": ["1", "2", np.nan, np.nan],
                    },
                    "factor": {
                        "name": "char_ext_wall",
                        "value": pd.Categorical(
                            ["1", "2", np.nan, np.nan],
                            categories=["1", "2", "3", "4"],
                        ),
                    },
                    "col": {"name": "char_ext_wall", "value": True},
                },
            },
            {
                "input": {
                    "athena": {
                        "name": "char_bsmt",
                        "value": ["1", "3", "4", "5"],
                    },
                    "iasworld": {
                        "name": "bsmt",
                        "value": ["1", "3", "4", "5"],
                    },
                },
                "expected": {
                    "short": {
                        "name": "char_bsmt",
                        "value": ["FL", "PT", "CR", np.nan],
                    },
                    "long": {
                        "name": "char_bsmt",
                        "value": ["Full", "Partial", "Crawl", np.nan],
                    },
                    "code": {
                        "name": "char_bsmt",
                        "value": ["1", "3", "4", np.nan],
                    },
                    "factor": {
                        "name": "char_bsmt",
                        "value": pd.Categorical(
                            ["1", "3", "4", np.nan],
                            categories=["1", "2", "3", "4"],
                        ),
                    },
                    "col": {"name": "char_bsmt", "value": True},
                },
            },
            {
                # Non-string column whose expected output equals its input
                # for every code type
                "input": {
                    "athena": {"name": "value", "value": range(1000, 1004)},
                    "iasworld": {"name": "value", "value": range(1000, 1004)},
                },
                "expected": {
                    "short": {"name": "value", "value": range(1000, 1004)},
                    "long": {"name": "value", "value": range(1000, 1004)},
                    "code": {"name": "value", "value": range(1000, 1004)},
                    "factor": {"name": "value", "value": range(1000, 1004)},
                    "col": {"name": "value", "value": False},
                },
            },
            {
                "input": {
                    "athena": {
                        "name": "char_roof_cnst",
                        "value": ["1", "2", "3", "0"],
                    },
                    "iasworld": {
                        "name": "user13",
                        "value": ["1", "2", "3", "0"],
                    },
                },
                "expected": {
                    "short": {
                        "name": "char_roof_cnst",
                        "value": ["SHAS", "TRGR", "SLTE", np.nan],
                    },
                    "long": {
                        "name": "char_roof_cnst",
                        "value": [
                            "Shingle + Asphalt",
                            "Tar + Gravel",
                            "Slate",
                            np.nan,
                        ],
                    },
                    "code": {
                        "name": "char_roof_cnst",
                        "value": ["1", "2", "3", np.nan],
                    },
                    "factor": {
                        "name": "char_roof_cnst",
                        "value": pd.Categorical(
                            ["1", "2", "3", np.nan],
                            categories=["1", "2", "3", "4", "5", "6"],
                        ),
                    },
                    # A recodable column that is deliberately left out of
                    # `cols` in test_vars_recode_cols, so it must stay as-is
                    "col": {"name": "char_roof_cnst", "value": False},
                },
            },
        ]

    @pytest.fixture(params=["athena", "iasworld"])
    def input_data(cls, request, raw_columns):
        """Return ``(input_type, DataFrame)`` built from the "input" entries
        of ``raw_columns``, once for each input naming scheme."""
        input_type = request.param
        return (
            input_type,
            pd.DataFrame(
                {
                    col["input"][input_type]["name"]: col["input"][input_type][
                        "value"
                    ]
                    for col in raw_columns
                }
            ),
        )

    @pytest.mark.parametrize("code_type", ["short", "long", "code"])
    def test_vars_recode_code_type(self, input_data, raw_columns, code_type):
        """Recoding every column as plain strings matches the expected
        values for each code type."""
        input_format, input_data = input_data
        expected_output = pd.DataFrame(
            {
                col["expected"][code_type]["name"]: col["expected"][code_type][
                    "value"
                ]
                for col in raw_columns
            }
        )
        # Rename the expected output data so it's consistent with whatever input
        # data we're looking at
        expected_renamed = ccao.vars_rename(
            expected_output, names_from="model", names_to=input_format
        )
        recoded = ccao.vars_recode(
            input_data, code_type=code_type, as_factor=False
        )
        assert recoded.equals(expected_renamed)

    def test_vars_recode_cols(self, input_data, raw_columns):
        """Only the columns passed via ``cols`` are recoded; all other
        columns keep their input values."""
        input_format, input_data = input_data
        cols = [
            col["expected"]["col"]["name"]
            for col in raw_columns
            if col["expected"]["col"]["value"] is True
        ]
        # Rename the cols so they match the input data schema
        cols = ccao.vars_rename(
            cols, names_from="model", names_to=input_format
        )
        code_type = "short"
        expected_output = pd.DataFrame(
            {
                col["expected"]["col"]["name"]: (
                    col["expected"][code_type]["value"]
                    if col["expected"]["col"]["value"] is True
                    # Input values are the same for both naming schemes, so
                    # the "athena" values serve as the unchanged baseline
                    else col["input"]["athena"]["value"]
                )
                for col in raw_columns
            }
        )
        expected_renamed = ccao.vars_rename(
            expected_output, names_from="model", names_to=input_format
        )
        recoded = ccao.vars_recode(
            input_data, cols=cols, code_type=code_type, as_factor=False
        )
        assert recoded.equals(expected_renamed)

    def test_vars_recode_as_factor(self, input_data, raw_columns):
        """With ``as_factor=True`` recoded columns come back as pandas
        Categoricals matching the expected "factor" values."""
        input_format, input_data = input_data
        expected_output = pd.DataFrame(
            {
                col["expected"]["factor"]["name"]: col["expected"]["factor"][
                    "value"
                ]
                for col in raw_columns
            }
        )
        expected_renamed = ccao.vars_rename(
            expected_output, names_from="model", names_to=input_format
        )
        recoded = ccao.vars_recode(
            input_data, code_type="code", as_factor=True
        )
        assert recoded.equals(expected_renamed)

    def test_vars_recode_raises_on_empty_dictionary(self):
        """An empty dictionary DataFrame is rejected with a ValueError."""
        with pytest.raises(ValueError) as exc:
            ccao.vars_recode(pd.DataFrame(), dictionary=pd.DataFrame())
        assert "non-empty" in str(exc.value)

    def test_vars_recode_raises_on_missing_dictionary_columns(self):
        """A dictionary lacking a required column is rejected."""
        dictionary = ccao.vars_dict.drop(columns=["var_code"])
        with pytest.raises(ValueError) as exc:
            ccao.vars_recode(pd.DataFrame(), dictionary=dictionary)
        assert "dictionary must contain the following column" in str(exc.value)

    def test_vars_recode_raises_on_missing_var_name_columns(self):
        """A dictionary with no ``var_name_`` columns is rejected."""
        dictionary = ccao.vars_dict.drop(
            columns=list(ccao.vars_dict.filter(regex="var_name_"))
        )
        with pytest.raises(ValueError) as exc:
            ccao.vars_recode(pd.DataFrame(), dictionary=dictionary)
        assert "dictionary must contain at least one" in str(exc.value)

    def test_vars_recode_raises_on_invalid_code_type(self):
        """An unknown ``code_type`` is rejected with a ValueError."""
        with pytest.raises(ValueError) as exc:
            ccao.vars_recode(pd.DataFrame(), code_type="foo")
        assert "code_type must be one of" in str(exc.value)