Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

U/jrbogart/provenance v2 #111

Merged
merged 8 commits into from
Jul 11, 2024
81 changes: 43 additions & 38 deletions skycatalogs/catalog_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .utils.config_utils import create_config, assemble_SED_models
from .utils.config_utils import assemble_MW_extinction, assemble_cosmology
from .utils.config_utils import assemble_object_types, assemble_provenance
from .utils.config_utils import assemble_file_metadata
from .utils.config_utils import write_yaml
from .utils.star_parquet_input import _star_parquet_reader
from .utils.parquet_schema_utils import make_galaxy_schema
Expand Down Expand Up @@ -234,10 +235,11 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
knots=True, logname='skyCatalogs.creator',
pkg_root=None, skip_done=False, no_main=False,
no_flux=False, flux_parallel=16, galaxy_nside=32,
galaxy_stride=1000000, provenance=None,
galaxy_stride=1000000,
dc2=False, sn_object_type='sncosmo', galaxy_type='cosmodc2',
include_roman_flux=False, star_input_fmt='sqlite',
sso_truth=None, sso_sed=None, sso_partition='healpixel'):
sso_truth=None, sso_sed=None, sso_partition='healpixel',
run_options=None):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring entry for run_options please?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, certainly. Just an oversight.

"""
Store context for catalog creation

Expand Down Expand Up @@ -277,7 +279,6 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
flux_parallel Number of processes to divide work of computing fluxes
galaxy_nside Healpix configuration value "nside" for galaxy output
galaxy_stride Max number of rows per galaxy row group
provenance Whether to write per-output-file git repo provenance
dc2 Whether to adjust values to provide input comparable
to that for the DC2 run
sn_object_type Which object type to use for SNe.
Expand All @@ -287,6 +288,8 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
sso_truth Directory containing Sorcha output
sso_sed Path to sed file to be used for all SSOs
sso_partition Whether to partition by time or by healpixels
run_options The options the outer script (create_sc.py) was
called with

Might want to add a way to specify template for output file name
and template for input sedLookup file name.
Expand Down Expand Up @@ -365,7 +368,6 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
self._no_flux = no_flux
self._flux_parallel = flux_parallel
self._galaxy_nside = galaxy_nside
self._provenance = provenance
self._dc2 = dc2
self._include_roman_flux = include_roman_flux
self._obs_sed_factory = None
Expand All @@ -374,6 +376,8 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
self._sso_truth = self._sso_creator.sso_truth
self._sso_sed = self._sso_creator.sso_sed
self._sso_partition = sso_partition
self._run_options = run_options
self._tophat_sed_bins = None

def _make_tophat_columns(self, dat, names, cmp):
'''
Expand Down Expand Up @@ -450,10 +454,16 @@ def create_galaxy_catalog(self):
# Save cosmology in case we need to write parameters out later
self._cosmology = gal_cat.cosmology

inputs = {'galaxy_truth': self._galaxy_truth}
file_metadata = assemble_file_metadata(self._pkg_root,
inputs=inputs,
run_options=self._run_options)

arrow_schema = make_galaxy_schema(self._logname,
sed_subdir=self._sed_subdir,
knots=self._knots,
galaxy_type=self._galaxy_type)
galaxy_type=self._galaxy_type,
metadata_input=file_metadata)

for p in self._parts:
self._logger.info(f'Starting on pixel {p}')
Expand Down Expand Up @@ -577,7 +587,7 @@ def create_galaxy_pixel(self, pixel, gal_cat, arrow_schema):
# Find sed bin definition and all the tophat quantities needed
all_q = gal_cat.list_all_quantities()
sed_bins, sed_bulge_names, sed_disk_names = _get_tophat_info(all_q)
self._sed_bins = sed_bins
self._tophat_sed_bins = sed_bins

th_fact = TophatSedFactory(sed_bins,
assemble_cosmology(self._cosmology))
Expand Down Expand Up @@ -687,9 +697,6 @@ def create_galaxy_pixel(self, pixel, gal_cat, arrow_schema):
arrow_schema=arrow_schema,
stride=stride, to_rename=to_rename)

if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def create_galaxy_flux_catalog(self, config_file=None):
'''
Create a second file per healpixel containing just galaxy id and
Expand All @@ -708,9 +715,13 @@ def create_galaxy_flux_catalog(self, config_file=None):
from .skyCatalogs import open_catalog
self._sed_gen = None

file_metadata = assemble_file_metadata(self._pkg_root,
run_options=self._run_options,
flux_file=True)
self._gal_flux_schema =\
make_galaxy_flux_schema(self._logname, self._galaxy_type,
include_roman_flux=self._include_roman_flux)
include_roman_flux=self._include_roman_flux,
metadata_input=file_metadata)
self._gal_flux_needed = [field.name for field in self._gal_flux_schema]

if not config_file:
Expand Down Expand Up @@ -867,8 +878,6 @@ def _create_galaxy_flux_pixel(self, pixel):

writer.close()
self._logger.debug(f'# row groups written to flux file: {rg_written}')
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def create_pointsource_catalog(self):

Expand All @@ -884,9 +893,13 @@ def create_pointsource_catalog(self):
-------
None
"""
arrow_schema = make_star_schema()
# Need a way to indicate which object types to include; deal with that
# later. For now, default is stars + sn
inputs = {'star_truth': self._star_truth}
file_metadata = assemble_file_metadata(self._pkg_root,
inputs=inputs,
run_options=self._run_options)

arrow_schema = make_star_schema(metadata_input=file_metadata)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the comment below,

    #  Need a way to indicate which object types to include; deal with that
    #  later.  For now, default is stars + sn

still relevant? I would think not, since sncosmo support has been removed. As it stands, it's hard to see how it relates to the code around it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, no longer relevant. Will delete.


for p in self._parts:
self._logger.debug(f'Point sources. Starting on pixel {p}')
self.create_pointsource_pixel(p, arrow_schema,
Expand Down Expand Up @@ -960,9 +973,6 @@ def create_pointsource_pixel(self, pixel, arrow_schema, star_cat=None):
u_bnd = min(l_bnd + stride, last_row_ix + 1)

writer.close()
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

return

def create_pointsource_flux_catalog(self, config_file=None):
Expand All @@ -982,7 +992,12 @@ def create_pointsource_flux_catalog(self, config_file=None):

from .skyCatalogs import open_catalog

self._ps_flux_schema = make_star_flux_schema(self._logname)
file_metadata = assemble_file_metadata(self._pkg_root,
run_options=self._run_options,
flux_file=True)

self._ps_flux_schema = make_star_flux_schema(self._logname,
metadata_input=file_metadata)
if not config_file:
config_file = self.write_config(path_only=True)

Expand Down Expand Up @@ -1100,8 +1115,6 @@ def _create_pointsource_flux_pixel(self, pixel):

writer.close()
self._logger.debug(f'# row groups written to flux file: {rg_written}')
if self._provenance == 'yaml':
self.write_provenance_file(output_path)

def write_config(self, overwrite=False, path_only=False):
'''
Expand Down Expand Up @@ -1131,39 +1144,31 @@ def write_config(self, overwrite=False, path_only=False):
config = create_config(self._catalog_name, self._logname)
if self._global_partition is not None:
config.add_key('area_partition', self._area_partition)
config.add_key('skycatalog_root', self._skycatalog_root)

# Even though the following keys are also in the run options
# section they need to be here so that the flux creation code
# can find them
config.add_key('catalog_dir', self._catalog_dir)
config.add_key('skycatalog_root', self._skycatalog_root)

if self._galaxy_type == 'cosmodc2':
config.add_key('SED_models',
assemble_SED_models(self._sed_bins))
assemble_SED_models(self._tophat_sed_bins))
config.add_key('MW_extinction_values', assemble_MW_extinction())
config.add_key('Cosmology', assemble_cosmology(self._cosmology))
config.add_key('object_types',
assemble_object_types(self._pkg_root,
galaxy_nside=self._galaxy_nside))

config.add_key('galaxy_magnitude_cut', self._mag_cut)
config.add_key('knots_magnitude_cut', self._knots_mag_cut)

inputs = {'galaxy_truth': self._galaxy_truth}
if self._star_truth:
inputs['star_truth'] = self._star_truth
if self._sso_truth:
inputs['sso_truth'] = self._sso_truth
inputs['sso_sed'] = self._sso_sed
config.add_key('provenance', assemble_provenance(self._pkg_root,
inputs=inputs))
config.add_key('provenance',
assemble_provenance(self._pkg_root, inputs=inputs,
run_options=self._run_options))

self._written_config = config.write_config(self._config_path,
overwrite=overwrite)

def write_provenance_file(self, datafile_path):
'''
Write git provenance to a yaml file with name derived from a
just-written datafile name
'''
outpath = datafile_path.rsplit('.', 1)[0] + '_provenance.yaml'

prov = assemble_provenance(self._pkg_root, inputs=None)
write_yaml(prov, outpath)
16 changes: 6 additions & 10 deletions skycatalogs/scripts/create_sc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import yaml
from skycatalogs.catalog_creator import CatalogCreator
from skycatalogs.utils.common_utils import print_date, log_callinfo
from skycatalogs.utils.common_utils import callinfo_to_dict

parser = argparse.ArgumentParser(description='''
Create Sky Catalogs. By default create a galaxy catalog for a
Expand Down Expand Up @@ -64,10 +65,6 @@
help='If supplied do not create flux files.')
parser.add_argument('--flux-parallel', default=16, type=int,
help='Number of processes to run in parallel when computing fluxes')
parser.add_argument('--provenance', '--prov', choices=['yaml'], help='''
Persist git provenance information for each file
written. Only supported format currently is as a
small yaml file, written to the data directory.''')
parser.add_argument('--options-file', default=None, help='''
path to yaml file associating option names with values.
Values for any options included will take precedence.''')
Expand Down Expand Up @@ -111,6 +108,7 @@
args.__setattr__(k, opt_dict[k])
else:
raise ValueError(f'Unknown attribute "{k}" in options file {args.options_file}')

logname = 'skyCatalogs.creator'
logger = logging.getLogger(logname)
logger.setLevel(args.log_level)
Expand All @@ -129,10 +127,8 @@
skycatalog_root = os.getenv('SKYCATALOG_ROOT')

parts = args.pixels
if args.provenance:
provenance = args.provenance
else:
provenance = None

opt_dict = callinfo_to_dict(args)

creator = CatalogCreator(parts, area_partition=None,
skycatalog_root=skycatalog_root,
Expand All @@ -148,12 +144,12 @@
flux_parallel=args.flux_parallel,
galaxy_nside=args.galaxy_nside,
galaxy_stride=args.galaxy_stride,
provenance=provenance,
dc2=args.dc2, galaxy_type=args.galaxy_type,
galaxy_truth=args.galaxy_truth,
include_roman_flux=args.include_roman_flux,
star_input_fmt=args.star_input_fmt,
sso_truth=args.sso_truth, sso_sed=args.sso_sed)
sso_truth=args.sso_truth, sso_sed=args.sso_sed,
run_options=opt_dict)
if len(parts) > 0:
logger.info(f'Starting with healpix pixel {parts[0]}')
elif args.sso:
Expand Down
16 changes: 11 additions & 5 deletions skycatalogs/skyCatalogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,9 @@ def __init__(self, config, mp=False, skycatalog_root=None, verbose=False,
'sso_sed.db')

self._sso_sed_factory = SsoSedFactory(self._sso_sed_path)
if not self._sso_sed_factory:
self._logger.warning('SSO appear in the list of available object types but supporting files do not exist')
self._logger.warning('SSOs will not be simulated')
self._extinguisher = MilkyWayExtinction()

# Make our properties accessible to BaseObject, etc.
Expand All @@ -359,9 +362,10 @@ def __init__(self, config, mp=False, skycatalog_root=None, verbose=False,
self.cat_cxt.register_source_type('diffsky_galaxy',
object_class=DiffskyObject)
if 'sso' in config['object_types']:
self.cat_cxt.register_source_type('sso',
object_class=SsoObject,
collection_class=SsoCollection)
if self._sso_sed_factory:
self.cat_cxt.register_source_type('sso',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my comment below in sed_tools.py. Silently omitting SSOs here seems wrong.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll come up with a suitable place to log a warning message.

object_class=SsoObject,
collection_class=SsoCollection)

@property
def observed_sed_factory(self):
Expand Down Expand Up @@ -639,8 +643,10 @@ def get_object_type_by_hp(self, hp, object_type, region=None, mjd=None,
exposure=EXPOSURE_DEFAULT):
object_list = ObjectList()

# Do we need to check more specifically by object type?
# if hp not in self._hp_info:
if not self.cat_cxt.lookup_collection_type(object_type):
msg = f'object type {object_type} not available for this catalog'
self._logger.warning(msg)
return object_list
if hp not in self.hps_by_type(object_type):
msg = f'In SkyCatalog.get_object_type_by_hp, healpix {hp} '
msg += f'intersects region but has no data file for {object_type}'
Expand Down
33 changes: 11 additions & 22 deletions skycatalogs/utils/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,11 @@
import sys
import logging

__all__ = ['print_callinfo', 'log_callinfo', 'print_date', 'print_dated_msg', 'TIME_TO_SECOND_FMT']
__all__ = ['log_callinfo', 'callinfo_to_dict', 'print_date',
'print_dated_msg', 'TIME_TO_SECOND_FMT']

TIME_TO_SECOND_FMT = '%Y-%m-%d %H:%M:%S'

def print_callinfo(prog, args):
"""
Print information about how a script using argparse was called

Parameters
----------
prog program name, typically sys.argv[0]
args object returned by ArgumentParser.parse_args()
"""

print('{} {} invoked with arguments'.format(dt.now().strftime(TIME_TO_SECOND_FMT), prog))
for e in dir(args):
if not e.startswith('_'):
nm = 'args.' + e
print('{}: {}'.format(e, eval(nm)))

sys.stdout.flush()

def log_callinfo(prog, args, logname):
"""
Expand All @@ -40,12 +24,17 @@ def log_callinfo(prog, args, logname):

logger = logging.getLogger(logname)
log_out = '{} invoked with arguments\n'.format(prog)
for e in dir(args):
if not e.startswith('_'):
nm = 'args.' + e
log_out += ' {}: {}\n'.format(e, eval(nm))
for k,v in dict(sorted(args._get_kwargs())).items():
log_out += ' {}: {}\n'.format(k, v)
logger.info(log_out)

def callinfo_to_dict(args):
"""
Make a dict out of program arguments. Each option value is
either a simple atomic type or a list
"""
return dict(args._get_kwargs())

def print_date(to_second=True, file=None):
"""
Print current time (by default only to nearest second) and flush output
Expand Down
Loading
Loading