Merge pull request #1419 from emlys/task/1379
Use `read_csv_to_dataframe` in validation
phargogh authored Nov 9, 2023
2 parents 329da4f + c2e13ea commit 6769776
Showing 33 changed files with 964 additions and 1,088 deletions.
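
Every file changed below follows the same pattern: table reads move from `utils.read_csv_to_dataframe`, which took the arg spec as a single positional dict, to `validation.get_validated_dataframe`, which takes the spec's entries unpacked as keyword arguments. A minimal before/after sketch of the calling convention, using one of the specs touched in this commit:

    # Before: the whole spec dict is one positional argument.
    carbon_pool_df = utils.read_csv_to_dataframe(
        args['carbon_pools_path'], MODEL_SPEC['args']['carbon_pools_path'])

    # After: the spec is unpacked into keyword arguments (e.g. index_col,
    # columns), so parsing and validation happen in a single call.
    carbon_pool_df = validation.get_validated_dataframe(
        args['carbon_pools_path'], **MODEL_SPEC['args']['carbon_pools_path'])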
3 changes: 3 additions & 0 deletions HISTORY.rst
@@ -49,6 +49,9 @@ Unreleased Changes
   (`#1374 <https://github.com/natcap/invest/issues/1374>`_)
 * Datastack archives will now be correctly extracted
   (`#1308 <https://github.com/natcap/invest/issues/1308>`_)
+* Validation of tables has been improved and standardized, which should
+  result in more readable validation errors.
+  (`#1379 <https://github.com/natcap/invest/issues/1379>`_)
 * Updated to ``pygeoprocessing`` 2.4.2. This includes an update to
   ``pygeoprocessing.zonal_statistics``, which is now more correct on certain
   edge cases. Aggregated model results may change slightly.
2 changes: 1 addition & 1 deletion Makefile
@@ -10,7 +10,7 @@ GIT_TEST_DATA_REPO_REV := da013683e80ea094fbb2309197e2488c02794da8

 GIT_UG_REPO := https://github.com/natcap/invest.users-guide
 GIT_UG_REPO_PATH := doc/users-guide
-GIT_UG_REPO_REV := 1db6aa847e07b774700ad1432172c791c4729dde
+GIT_UG_REPO_REV := 6d40e3c8e56cfb09e579c58312d653086e69d6c4

 ENV = "./env"
 ifeq ($(OS),Windows_NT)
13 changes: 7 additions & 6 deletions src/natcap/invest/annual_water_yield.py
@@ -526,8 +526,9 @@ def execute(args):
         'Checking that watersheds have entries for every `ws_id` in the '
         'valuation table.')
     # Open/read in valuation parameters from CSV file
-    valuation_df = utils.read_csv_to_dataframe(
-        args['valuation_table_path'], MODEL_SPEC['args']['valuation_table_path'])
+    valuation_df = validation.get_validated_dataframe(
+        args['valuation_table_path'],
+        **MODEL_SPEC['args']['valuation_table_path'])
     watershed_vector = gdal.OpenEx(
         args['watersheds_path'], gdal.OF_VECTOR)
     watershed_layer = watershed_vector.GetLayer()
@@ -645,15 +646,15 @@ def execute(args):
         'lulc': pygeoprocessing.get_raster_info(clipped_lulc_path)['nodata'][0]}

     # Open/read in the csv file into a dictionary and add to arguments
-    bio_df = utils.read_csv_to_dataframe(args['biophysical_table_path'],
-                                         MODEL_SPEC['args']['biophysical_table_path'])
+    bio_df = validation.get_validated_dataframe(args['biophysical_table_path'],
+                                                **MODEL_SPEC['args']['biophysical_table_path'])
     bio_lucodes = set(bio_df.index.values)
     bio_lucodes.add(nodata_dict['lulc'])
     LOGGER.debug(f'bio_lucodes: {bio_lucodes}')

     if 'demand_table_path' in args and args['demand_table_path'] != '':
-        demand_df = utils.read_csv_to_dataframe(
-            args['demand_table_path'], MODEL_SPEC['args']['demand_table_path'])
+        demand_df = validation.get_validated_dataframe(
+            args['demand_table_path'], **MODEL_SPEC['args']['demand_table_path'])
         demand_reclassify_dict = dict(
             [(lucode, row['demand']) for lucode, row in demand_df.iterrows()])
         demand_lucodes = set(demand_df.index.values)
4 changes: 2 additions & 2 deletions src/natcap/invest/carbon.py
@@ -364,8 +364,8 @@ def execute(args):
         (_INTERMEDIATE_BASE_FILES, intermediate_output_dir),
         (_TMP_BASE_FILES, output_dir)], file_suffix)

-    carbon_pool_df = utils.read_csv_to_dataframe(
-        args['carbon_pools_path'], MODEL_SPEC['args']['carbon_pools_path'])
+    carbon_pool_df = validation.get_validated_dataframe(
+        args['carbon_pools_path'], **MODEL_SPEC['args']['carbon_pools_path'])

     try:
         n_workers = int(args['n_workers'])
24 changes: 12 additions & 12 deletions src/natcap/invest/coastal_blue_carbon/coastal_blue_carbon.py
@@ -570,9 +570,9 @@ def execute(args):
     task_graph, n_workers, intermediate_dir, output_dir, suffix = (
         _set_up_workspace(args))

-    snapshots = utils.read_csv_to_dataframe(
+    snapshots = validation.get_validated_dataframe(
         args['landcover_snapshot_csv'],
-        MODEL_SPEC['args']['landcover_snapshot_csv']
+        **MODEL_SPEC['args']['landcover_snapshot_csv']
     )['raster_path'].to_dict()

     # Phase 1: alignment and preparation of inputs
@@ -593,9 +593,9 @@ def execute(args):

     # We're assuming that the LULC initial variables and the carbon pool
     # transient table are combined into a single lookup table.
-    biophysical_df = utils.read_csv_to_dataframe(
+    biophysical_df = validation.get_validated_dataframe(
         args['biophysical_table_path'],
-        MODEL_SPEC['args']['biophysical_table_path'])
+        **MODEL_SPEC['args']['biophysical_table_path'])

     # LULC Classnames are critical to the transition mapping, so they must be
     # unique. This check is here in ``execute`` because it's possible that
@@ -963,9 +963,9 @@ def execute(args):
     prices = None
     if args.get('do_economic_analysis', False):  # Do if truthy
         if args.get('use_price_table', False):
-            prices = utils.read_csv_to_dataframe(
+            prices = validation.get_validated_dataframe(
                 args['price_table_path'],
-                MODEL_SPEC['args']['price_table_path']
+                **MODEL_SPEC['args']['price_table_path']
             )['price'].to_dict()
         else:
             inflation_rate = float(args['inflation_rate']) * 0.01
@@ -1948,8 +1948,8 @@ def _read_transition_matrix(transition_csv_path, biophysical_df):
        landcover transition, and the second contains accumulation rates for
        the pool for the landcover transition.
    """
-    table = utils.read_csv_to_dataframe(
-        transition_csv_path, MODEL_SPEC['args']['landcover_transitions_table']
+    table = validation.get_validated_dataframe(
+        transition_csv_path, **MODEL_SPEC['args']['landcover_transitions_table']
     ).reset_index()

     lulc_class_to_lucode = {}
@@ -2172,9 +2172,9 @@ def validate(args, limit_to=None):

     if ("landcover_snapshot_csv" not in invalid_keys and
             "landcover_snapshot_csv" in sufficient_keys):
-        snapshots = utils.read_csv_to_dataframe(
+        snapshots = validation.get_validated_dataframe(
             args['landcover_snapshot_csv'],
-            MODEL_SPEC['args']['landcover_snapshot_csv']
+            **MODEL_SPEC['args']['landcover_snapshot_csv']
         )['raster_path'].to_dict()

         for snapshot_year, snapshot_raster_path in snapshots.items():
@@ -2204,8 +2204,8 @@ def validate(args, limit_to=None):
         transitions_spec['columns']['[LULC CODE]']['options'].keys())
     # lowercase options since utils call will lowercase table values
     transition_options = [x.lower() for x in transition_options]
-    transitions_df = utils.read_csv_to_dataframe(
-        args['landcover_transitions_table'], transitions_spec)
+    transitions_df = validation.get_validated_dataframe(
+        args['landcover_transitions_table'], **transitions_spec)
     transitions_mask = ~transitions_df.isin(transition_options) & ~transitions_df.isna()
     if transitions_mask.any(axis=None):
         transition_numpy_mask = transitions_mask.values
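
The `validate` hunk above keeps the existing pandas idiom for flagging bad transition values: `~transitions_df.isin(transition_options) & ~transitions_df.isna()` marks every cell that is neither an allowed option nor empty. A self-contained illustration of that mask (the option list and table contents are invented for the example):

    import pandas as pd

    transition_options = ['accum', 'disturb', 'ncc']  # allowed values, lowercased
    transitions_df = pd.DataFrame({
        'mangrove': ['accum', 'typo', None],
        'marsh': ['ncc', 'disturb', 'accum']})

    # True wherever a cell is neither a known option nor NA.
    transitions_mask = (~transitions_df.isin(transition_options)
                        & ~transitions_df.isna())
    print(transitions_mask.any(axis=None))  # True, because of 'typo'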
8 changes: 4 additions & 4 deletions src/natcap/invest/coastal_blue_carbon/preprocessor.py
@@ -180,9 +180,9 @@ def execute(args):
         os.path.join(args['workspace_dir'], 'taskgraph_cache'),
         n_workers, reporting_interval=5.0)

-    snapshots_dict = utils.read_csv_to_dataframe(
+    snapshots_dict = validation.get_validated_dataframe(
         args['landcover_snapshot_csv'],
-        MODEL_SPEC['args']['landcover_snapshot_csv']
+        **MODEL_SPEC['args']['landcover_snapshot_csv']
     )['raster_path'].to_dict()

     # Align the raster stack for analyzing the various transitions.
@@ -213,9 +213,9 @@ def execute(args):
         target_path_list=aligned_snapshot_paths,
         task_name='Align input landcover rasters')

-    landcover_df = utils.read_csv_to_dataframe(
+    landcover_df = validation.get_validated_dataframe(
         args['lulc_lookup_table_path'],
-        MODEL_SPEC['args']['lulc_lookup_table_path'])
+        **MODEL_SPEC['args']['lulc_lookup_table_path'])

     target_transition_table = os.path.join(
         output_dir, TRANSITION_TABLE.format(suffix=suffix))
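
Both coastal blue carbon modules read the snapshot CSV the same way: the validated dataframe's `raster_path` column is converted to a dict keyed by the table's index. Assuming the spec's `index_col` is the snapshot year, the pandas behavior being relied on looks like this (file names invented):

    import pandas as pd

    snapshots_df = pd.DataFrame(
        {'raster_path': ['lulc_2000.tif', 'lulc_2010.tif']},
        index=pd.Index([2000, 2010], name='snapshot_year'))

    # Series.to_dict() maps index -> value, giving {year: raster path}.
    snapshots = snapshots_df['raster_path'].to_dict()
    # {2000: 'lulc_2000.tif', 2010: 'lulc_2010.tif'}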
28 changes: 15 additions & 13 deletions src/natcap/invest/coastal_vulnerability.py
@@ -461,10 +461,19 @@
                     "Shore points with associated habitat data"),
                 "index_col": "shore_id",
                 "columns": {
+                    # shore_id and R_hab come first so that they get
+                    # matched before [HABITAT], which matches everything
                     "shore_id": {
                         "type": "integer",
                         "about": "Shore point ID"
                     },
+                    "R_hab": {
+                        "about": (
+                            "Overall habitat exposure rank, the "
+                            "result of equation (15)"),
+                        "type": "number",
+                        "units": u.none
+                    },
                     "[HABITAT]": {
                         "about": (
                             "Habitat exposure rank for the given "
@@ -477,13 +486,6 @@
                             "rank defined in the Habitats Table input."),
                         "type": "number",
                         "units": u.none
-                    },
-                    "R_hab": {
-                        "about": (
-                            "Overall habitat exposure rank, the "
-                            "result of equation (15)"),
-                        "type": "number",
-                        "units": u.none
                     }
                 }
             }
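
The comment added in this hunk carries the reasoning for the reorder: column specs are evidently matched against CSV headers in declaration order, and a bracketed name like `[HABITAT]` acts as a wildcard that matches any header, so `shore_id` and `R_hab` must be declared ahead of it. A rough first-match-wins sketch of that idea (a hypothetical helper, not the library's actual matching code):

    import re

    def match_column_spec(header, column_specs):
        # Hypothetical: return the first spec claiming this header, in
        # declaration order (dicts preserve insertion order).
        for name, spec in column_specs.items():
            if re.fullmatch(r'\[.+\]', name):
                return name, spec  # bracketed patterns match any header
            if name.lower() == header.lower():
                return name, spec
        raise KeyError(f'no spec matches column: {header}')

Declared after `shore_id` and `R_hab`, the `[HABITAT]` catch-all only ever sees the habitat columns that remain.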
@@ -2302,8 +2304,8 @@ def _schedule_habitat_tasks(
         list of pickle file path strings
     """
-    habitat_dataframe = utils.read_csv_to_dataframe(
-        habitat_table_path, MODEL_SPEC['args']['habitat_table_path']
+    habitat_dataframe = validation.get_validated_dataframe(
+        habitat_table_path, **MODEL_SPEC['args']['habitat_table_path']
     ).rename(columns={'protection distance (m)': 'distance'})

     habitat_task_list = []
@@ -2831,8 +2833,8 @@ def assemble_results_and_calculate_exposure(
         with open(pickle_path, 'rb') as file:
             final_values_dict[var_name] = pickle.load(file)

-    habitat_df = utils.read_csv_to_dataframe(
-        habitat_protection_path, MODEL_SPEC['outputs']['intermediate'][
+    habitat_df = validation.get_validated_dataframe(
+        habitat_protection_path, **MODEL_SPEC['outputs']['intermediate'][
             'contents']['habitats']['contents']['habitat_protection.csv']
     ).rename(columns={'r_hab': 'R_hab'})
     output_layer.StartTransaction()
@@ -3459,8 +3461,8 @@ def _validate_habitat_table_paths(habitat_table_path):
     Raises:
         ValueError if any vector in the ``path`` column cannot be opened.
     """
-    habitat_dataframe = utils.read_csv_to_dataframe(
-        habitat_table_path, MODEL_SPEC['args']['habitat_table_path'])
+    habitat_dataframe = validation.get_validated_dataframe(
+        habitat_table_path, **MODEL_SPEC['args']['habitat_table_path'])
     bad_paths = []
     for habitat_row in habitat_dataframe.itertuples():
         try:
12 changes: 6 additions & 6 deletions src/natcap/invest/crop_production_percentile.py
@@ -468,9 +468,9 @@ def execute(args):
         None.
     """
-    crop_to_landcover_df = utils.read_csv_to_dataframe(
+    crop_to_landcover_df = validation.get_validated_dataframe(
         args['landcover_to_crop_table_path'],
-        MODEL_SPEC['args']['landcover_to_crop_table_path'])
+        **MODEL_SPEC['args']['landcover_to_crop_table_path'])
     bad_crop_name_list = []
     for crop_name in crop_to_landcover_df.index:
         crop_climate_bin_raster_path = os.path.join(
@@ -549,9 +549,9 @@ def execute(args):
         climate_percentile_yield_table_path = os.path.join(
             args['model_data_path'],
             _CLIMATE_PERCENTILE_TABLE_PATTERN % crop_name)
-        crop_climate_percentile_df = utils.read_csv_to_dataframe(
+        crop_climate_percentile_df = validation.get_validated_dataframe(
             climate_percentile_yield_table_path,
-            MODEL_SPEC['args']['model_data_path']['contents'][
+            **MODEL_SPEC['args']['model_data_path']['contents'][
                 'climate_percentile_yield_tables']['contents'][
                 '[CROP]_percentile_yield_table.csv'])
         yield_percentile_headers = [
@@ -707,9 +707,9 @@ def execute(args):

     # both 'crop_nutrient.csv' and 'crop' are known data/header values for
     # this model data.
-    nutrient_df = utils.read_csv_to_dataframe(
+    nutrient_df = validation.get_validated_dataframe(
         os.path.join(args['model_data_path'], 'crop_nutrient.csv'),
-        MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
+        **MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
     result_table_path = os.path.join(
         output_dir, 'result_table%s.csv' % file_suffix)
16 changes: 8 additions & 8 deletions src/natcap/invest/crop_production_regression.py
@@ -495,13 +495,13 @@ def execute(args):

     LOGGER.info(
         "Checking if the landcover raster is missing lucodes")
-    crop_to_landcover_df = utils.read_csv_to_dataframe(
+    crop_to_landcover_df = validation.get_validated_dataframe(
         args['landcover_to_crop_table_path'],
-        MODEL_SPEC['args']['landcover_to_crop_table_path'])
+        **MODEL_SPEC['args']['landcover_to_crop_table_path'])

-    crop_to_fertilization_rate_df = utils.read_csv_to_dataframe(
+    crop_to_fertilization_rate_df = validation.get_validated_dataframe(
         args['fertilization_rate_table_path'],
-        MODEL_SPEC['args']['fertilization_rate_table_path'])
+        **MODEL_SPEC['args']['fertilization_rate_table_path'])

     crop_lucodes = list(crop_to_landcover_df[_EXPECTED_LUCODE_TABLE_HEADER])
@@ -576,10 +576,10 @@ def execute(args):
         task_name='crop_climate_bin')
     dependent_task_list.append(crop_climate_bin_task)

-    crop_regression_df = utils.read_csv_to_dataframe(
+    crop_regression_df = validation.get_validated_dataframe(
         os.path.join(args['model_data_path'],
                      _REGRESSION_TABLE_PATTERN % crop_name),
-        MODEL_SPEC['args']['model_data_path']['contents'][
+        **MODEL_SPEC['args']['model_data_path']['contents'][
             'climate_regression_yield_tables']['contents'][
             '[CROP]_regression_yield_table.csv'])
     for _, row in crop_regression_df.iterrows():
@@ -803,9 +803,9 @@ def execute(args):

     # both 'crop_nutrient.csv' and 'crop' are known data/header values for
     # this model data.
-    nutrient_df = utils.read_csv_to_dataframe(
+    nutrient_df = validation.get_validated_dataframe(
         os.path.join(args['model_data_path'], 'crop_nutrient.csv'),
-        MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])
+        **MODEL_SPEC['args']['model_data_path']['contents']['crop_nutrient.csv'])

     LOGGER.info("Generating report table")
     crop_names = list(crop_to_landcover_df.index)
5 changes: 3 additions & 2 deletions src/natcap/invest/datastack.py
@@ -35,6 +35,7 @@
 from osgeo import gdal

 from . import utils
+from . import validation

 try:
     from . import __version__
@@ -333,8 +334,8 @@ def build_datastack_archive(args, model_name, datastack_path):
         contained_files_dir = os.path.join(
             data_dir, f'{key}_csv_data')

-        dataframe = utils.read_csv_to_dataframe(
-            source_path, args_spec[key])
+        dataframe = validation.get_validated_dataframe(
+            source_path, **args_spec[key])
         csv_source_dir = os.path.abspath(os.path.dirname(source_path))
         for spatial_column_name in spatial_columns:
             # Iterate through the spatial columns, identify the set of
12 changes: 6 additions & 6 deletions src/natcap/invest/forest_carbon_edge_effect.py
@@ -413,9 +413,9 @@ def execute(args):
     # Map non-forest landcover codes to carbon biomasses
     LOGGER.info('Calculating direct mapped carbon stocks')
     carbon_maps = []
-    biophysical_df = utils.read_csv_to_dataframe(
+    biophysical_df = validation.get_validated_dataframe(
         args['biophysical_table_path'],
-        MODEL_SPEC['args']['biophysical_table_path'])
+        **MODEL_SPEC['args']['biophysical_table_path'])
     pool_list = [('c_above', True)]
     if args['pools_to_calculate'] == 'all':
         pool_list.extend([
@@ -624,8 +624,8 @@ def _calculate_lulc_carbon_map(
     """
     # classify forest pixels from lulc
-    biophysical_df = utils.read_csv_to_dataframe(
-        biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path'])
+    biophysical_df = validation.get_validated_dataframe(
+        biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path'])

     lucode_to_per_cell_carbon = {}
     cell_size = pygeoprocessing.get_raster_info(
@@ -688,8 +688,8 @@ def _map_distance_from_tropical_forest_edge(
     """
     # Build a list of forest lucodes
-    biophysical_df = utils.read_csv_to_dataframe(
-        biophysical_table_path, MODEL_SPEC['args']['biophysical_table_path'])
+    biophysical_df = validation.get_validated_dataframe(
+        biophysical_table_path, **MODEL_SPEC['args']['biophysical_table_path'])
     forest_codes = biophysical_df[biophysical_df['is_tropical_forest']].index.values

     # Make a raster where 1 is non-forest landcover types and 0 is forest
16 changes: 8 additions & 8 deletions src/natcap/invest/habitat_quality.py
@@ -372,12 +372,12 @@ def execute(args):

     LOGGER.info("Checking Threat and Sensitivity tables for compliance")
     # Get CSVs as dictionaries and ensure the key is a string for threats.
-    threat_df = utils.read_csv_to_dataframe(
-        args['threats_table_path'], MODEL_SPEC['args']['threats_table_path']
+    threat_df = validation.get_validated_dataframe(
+        args['threats_table_path'], **MODEL_SPEC['args']['threats_table_path']
     ).fillna('')
-    sensitivity_df = utils.read_csv_to_dataframe(
+    sensitivity_df = validation.get_validated_dataframe(
         args['sensitivity_table_path'],
-        MODEL_SPEC['args']['sensitivity_table_path'])
+        **MODEL_SPEC['args']['sensitivity_table_path'])

     half_saturation_constant = float(args['half_saturation_constant'])
@@ -1086,12 +1086,12 @@ def validate(args, limit_to=None):
             "sensitivity_table_path" not in invalid_keys and
             "threat_raster_folder" not in invalid_keys):
         # Get CSVs as dictionaries and ensure the key is a string for threats.
-        threat_df = utils.read_csv_to_dataframe(
+        threat_df = validation.get_validated_dataframe(
             args['threats_table_path'],
-            MODEL_SPEC['args']['threats_table_path']).fillna('')
-        sensitivity_df = utils.read_csv_to_dataframe(
+            **MODEL_SPEC['args']['threats_table_path']).fillna('')
+        sensitivity_df = validation.get_validated_dataframe(
             args['sensitivity_table_path'],
-            MODEL_SPEC['args']['sensitivity_table_path'])
+            **MODEL_SPEC['args']['sensitivity_table_path'])

         # check that the threat names in the threats table match with the
         # threats columns in the sensitivity table.