From b7851adee1b492daac507f61a218bf1e08cfd5c4 Mon Sep 17 00:00:00 2001 From: Olivia Lynn Date: Tue, 3 Oct 2023 13:26:24 -0400 Subject: [PATCH 1/4] add fsps (have issue midway thru, checking remote) --- .../Use_FSPS_to_Generate_SEDs.ipynb | 518 ++++++++++++++++++ 1 file changed, 518 insertions(+) create mode 100644 examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb diff --git a/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb new file mode 100644 index 0000000..cbc4fdc --- /dev/null +++ b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb @@ -0,0 +1,518 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use FSPS to Generate Galaxy Rest-Frame Spectral Energy Distributions and Compute Apparent Magnitudes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Author:** Luca Tortorelli, Josue de Santiago, Eric Charles\n", + "\n", + "**Last Run Successfully:** August 2, 2022\n", + " \n", + "This notebook demonstrates how to use rail_fsps to generate galaxy rest-frame spectral energy distributions (SEDs) with FSPS and how to compute apparent magnitudes from them.\n", + "\n", + "In order to run this notebook you need to have FSPS and Python-FSPS installed. The easiest way to do this is the following (first line applies only in case you already installed it via pip):\n", + "\n", + " pip uninstall fsps\n", + " git clone --recursive https://github.com/dfm/python-fsps.git\n", + " cd python-fsps\n", + " python -m pip install .\n", + " export SPS_HOME=$(pwd)/src/fsps/libfsps" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from rail.core.data import TableHandle\n", + "from rail.creation.engines.fsps_sed_modeler import *\n", + "import rail.fsps\n", + "import h5py\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll start by setting up the Rail data store. RAIL uses ceci, which is designed for pipelines rather than interactive notebooks, the data store will work around that and enable us to use data interactively. See the rail/examples/goldenspike/goldenspike.ipynb example notebook for more details on the Data Store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.core.stage import RailStage\n", + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True\n", + "RAIL_FSPS_DIR = os.path.abspath(os.path.join(os.path.dirname(rail.fsps.__file__), '..', '..'))\n", + "default_rail_fsps_files_folder = os.path.join(RAIL_FSPS_DIR, 'rail', 'examples_data', 'creation_data', 'data',\n", + " 'fsps_default_data')\n", + "input_file = os.path.join(default_rail_fsps_files_folder, 'input_galaxy_properties_fsps.hdf5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We generate some mock input data for the sed modeler class that we store in an hdf5 file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_galaxies = 10\n", + "\n", + "redshifts = np.linspace(0.1,1,num=n_galaxies)\n", + "zmet = np.full(n_galaxies, 1, dtype=int)\n", + "stellar_metallicity = np.full(n_galaxies, 0.0) # log10(Z/Zsun)\n", + "pmetals = np.full(n_galaxies, 2.0)\n", + "stellar_velocity_dispersion = np.full(n_galaxies, 100.)\n", + "gas_ionization = np.full(n_galaxies, -1)\n", + "gas_metallicity = np.full(n_galaxies, 0.0)\n", + "tau = np.full(n_galaxies, 1.0)\n", + "const = np.full(n_galaxies, 0.0)\n", + "sf_start = np.full(n_galaxies, 0.0)\n", + "sf_trunc = np.full(n_galaxies, 0.0)\n", + "stellar_age = np.full(n_galaxies, 2.)\n", + "fburst = np.full(n_galaxies, 0.0)\n", + "tburst = np.full(n_galaxies, 11.0)\n", + "sf_slope = np.full(n_galaxies, 0.0)\n", + "dust1_birth_cloud = np.full(n_galaxies, 0.1)\n", + "dust2_diffuse = np.full(n_galaxies, 0.1)\n", + "dust_index = np.full(n_galaxies, -0.7)\n", + "dust_calzetti_modifier = np.full(n_galaxies, -1.)\n", + "mwr = np.full(n_galaxies, 3.1)\n", + "uvb = np.full(n_galaxies, 1.0)\n", + "wgp1 = np.full(n_galaxies, 1)\n", + "dust_gamma = np.full(n_galaxies, 0.01)\n", + "dust_umin = np.full(n_galaxies, 1.0)\n", + "dust_qpah = np.full(n_galaxies, 3.5)\n", + "f_agn = np.full(n_galaxies, 0.01)\n", + "tau_agn = np.full(n_galaxies, 10.0)\n", + "\n", + "gal_t_table = np.linspace(0.05, 13.8, 100) # age of the universe in Gyr\n", + "gal_sfr_table = np.random.uniform(0, 10, gal_t_table.size) # SFR in Msun/yr\n", + "tabulated_sfh = np.full((n_galaxies, 2, len(gal_sfr_table)), [gal_t_table,gal_sfr_table])\n", + "\n", + "wave_lsf = np.linspace(3000, 10000, 2000)\n", + "sigma_lsf = np.full_like(wave_lsf, 0.5)\n", + "tabulated_lsf = np.full((n_galaxies, 2, len(wave_lsf)), [wave_lsf, sigma_lsf])\n", + "\n", + "with h5py.File(input_file, 'w') as h5table:\n", + " data = h5table.create_group('model')\n", + " data.create_dataset(name='redshifts', data=redshifts)\n", + " data.create_dataset(name='zmet', data=zmet)\n", + " data.create_dataset(name='stellar_metallicity', data=stellar_metallicity)\n", + " data.create_dataset(name='pmetals', data=pmetals)\n", + " data.create_dataset(name='stellar_velocity_dispersion', data=stellar_velocity_dispersion)\n", + " data.create_dataset(name='gas_ionization', data=gas_ionization)\n", + " data.create_dataset(name='gas_metallicity', data=gas_metallicity)\n", + " data.create_dataset(name='tau', data=tau)\n", + " data.create_dataset(name='const', data=const)\n", + " data.create_dataset(name='sf_start', data=sf_start)\n", + " data.create_dataset(name='sf_trunc', data=sf_trunc)\n", + " data.create_dataset(name='stellar_age', data=stellar_age)\n", + " data.create_dataset(name='fburst', data=fburst)\n", + " data.create_dataset(name='tburst', data=tburst)\n", + " data.create_dataset(name='sf_slope', data=sf_slope)\n", + " data.create_dataset(name='dust1_birth_cloud', data=dust1_birth_cloud)\n", + " data.create_dataset(name='dust2_diffuse', data=dust2_diffuse)\n", + " data.create_dataset(name='dust_index', data=dust_index)\n", + " data.create_dataset(name='dust_calzetti_modifier', data=dust_calzetti_modifier)\n", + " data.create_dataset(name='mwr', data=mwr)\n", + " data.create_dataset(name='uvb', data=uvb)\n", + " data.create_dataset(name='wgp1', data=wgp1)\n", + " data.create_dataset(name='dust_gamma', data=dust_gamma)\n", + " data.create_dataset(name='dust_umin', data=dust_umin)\n", + " data.create_dataset(name='dust_qpah', data=dust_qpah)\n", + " data.create_dataset(name='f_agn', data=f_agn)\n", + " data.create_dataset(name='tau_agn', data=tau_agn)\n", + " data.create_dataset(name='tabulated_lsf', data=tabulated_lsf)\n", + " data.create_dataset(name='tabulated_sfh', data=tabulated_sfh)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we load the data to the data store to use it later, and we also set the redshifts in which the output will be generated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainFile = os.path.join(input_file)\n", + "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create an FSPSSedModeler class object. The latter has a number of configuration parameters to set. All the parameters have default values. Therefore, if the user is not sure about a particular value for a certain parameter, the latter can be left at default. A short description of each parameter can be found in src/rail/creation/engines/fsps_sed_modeler.py. \n", + "\n", + "We run the FSPSSedModeler in sequential mode. Note that each galaxy spectrum takes a few seconds to generate, so it is advisable to proceed in this way only for a limited sample of objects or for testing the code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspssedmodeler = FSPSSedModeler.make_stage(chunk_size=10, hdf5_groupname='model', name='FSPSSedModeler',\n", + " compute_vega_mags=False,vactoair_flag=False,\n", + " zcontinuous=1, add_agb_dust_model=True,\n", + " add_dust_emission=True, add_igm_absorption=True,\n", + " add_neb_emission=True, add_neb_continuum=True,\n", + " add_stellar_remnants=True, compute_light_ages=False,\n", + " nebemlineinspec=True, smooth_velocity=True,\n", + " smooth_lsf=False, cloudy_dust=False,\n", + " agb_dust=1.0, tpagb_norm_type=2, dell=0.0,\n", + " delt=0.0, redgb=1.0, agb=1.0, fcstar=1.0, sbss=0.0,\n", + " fbhb=0.0, pagb=1.0, redshifts_key='redshifts',\n", + " zmet_key='zmet', stellar_metallicities_key='stellar_metallicity',\n", + " pmetals_key='pmetals', imf_type=1, imf_upper_limit=120.,\n", + " imf_lower_limit=0.08, imf1=1.3, imf2=2.3,imf3=2.3,vdmc=0.08,\n", + " mdave=0.5,evtype=-1,use_wr_spectra=1,logt_wmb_hot=0.0,masscut=150.0,\n", + " velocity_dispersions_key='stellar_velocity_dispersion',min_wavelength=3000,\n", + " max_wavelength=10000,gas_ionizations_key='gas_ionization',\n", + " gas_metallicities_key='gas_metallicity',igm_factor=1.0,sfh_type=3,\n", + " tau_key='tau',const_key='const',sf_start_key='sf_start',\n", + " sf_trunc_key='sf_trunc',stellar_ages_key='stellar_age',\n", + " fburst_key='fburst',tburst_key='tburst',sf_slope_key='sf_slope',\n", + " dust_type=2,dust_tesc=7.0,dust_birth_cloud_key='dust1_birth_cloud',\n", + " dust_diffuse_key='dust2_diffuse',dust_clumps=-99,frac_nodust= 0.0,\n", + " frac_obrun=0.0,dust_index_key='dust_index',\n", + " dust_powerlaw_modifier_key='dust_calzetti_modifier',mwr_key='mwr',\n", + " uvb_key='uvb',wgp1_key='wgp1',wgp2=1,wgp3=1,\n", + " dust_emission_gamma_key='dust_gamma',dust_emission_umin_key='dust_umin',\n", + " dust_emission_qpah_key='dust_qpah',fraction_agn_bol_lum_key='f_agn',\n", + " agn_torus_opt_depth_key='tau_agn',tabulated_sfh_key='tabulated_sfh',\n", + " tabulated_lsf_key='tabulated_lsf',physical_units=False,\n", + " restframe_wave_key='restframe_wavelengths',\n", + " restframe_sed_key='restframe_seds')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we show the example where we provide tabulated star-formation histories to generate the final SED. In this case, FSPS outputs the emission per total stellar mass. We call the fit_model() function to generate the model SEDs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspssedmodel = fspssedmodeler.fit_model(training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspssedmodel.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot the first spectrum to check that the SED generation worked correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with h5py.File('model_FSPSSedModeler.hdf5','r') as h5table:\n", + " for key in h5table.keys():\n", + " print(key)\n", + " redshifts = h5table['redshifts'][()]\n", + " restframe_seds = h5table['restframe_seds'][()]\n", + " restframe_wavelengths = h5table['restframe_wavelengths'][()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.clf()\n", + "plt.plot(restframe_wavelengths[0], restframe_seds[0], lw=2, color='black')\n", + "plt.xlim(3000, 10000)\n", + "plt.xlabel(r'wavelength [$\\AA$]')\n", + "plt.ylabel(r'luminosity density [$\\mathrm{Lsun \\ Hz^{-1}}$]')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running it with ceci" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import ceci\n", + "pipe = ceci.Pipeline.interactive()\n", + "stages = [fspssedmodeler]\n", + "for stage in stages:\n", + " pipe.add_stage(stage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "help(pipe.initialize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.initialize(dict(input=trainFile), dict(output_dir='./temp_output_rail_fsps', log_dir='./logs_rail_fsps',\n", + " resume=False, nprocess=2), None)\n", + "pipe.save('./temp_output_rail_fsps/pipe_saved.yml')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ceci\n", + "pr = ceci.Pipeline.read('./temp_output_rail_fsps/pipe_saved.yml')\n", + "pr.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tables_io\n", + "rest_frame_sed_models = tables_io.read('temp_output_rail_fsps/model_FSPSSedModeler.hdf5')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.clf()\n", + "plt.plot(rest_frame_sed_models['restframe_wavelengths'][0], rest_frame_sed_models['restframe_seds'][1],\n", + " lw=2, color='black')\n", + "plt.xlim(3000, 10000)\n", + "plt.xlabel(r'wavelength [$\\AA$]')\n", + "plt.ylabel(r'luminosity density [$\\mathrm{Lsun \\ Hz^{-1}}$]')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we use these rest-frame SEDs to generate LSST photometry at user-provided redshifts.\n", + "For this we need the FSPSPhotometryCreator class. The class parameters are:\n", + "- redshifts_key: redshift keyword for the Hdf5 input table.\n", + "- restframe_sed_key: rest-frame sed keyword for the Hdf5 input table.\n", + "- restframe_wave_key: wavelength keyword for the Hdf5 input table.\n", + "- apparent_mags_key: apparent magnitudes keyword for the Hdf5 output table.\n", + "- filter_folder: path to the folder where filter bands are stored.\n", + "- instrument_name: name of the instrument for which we want to compute the magnitudes. The syntax of filenames should be of type instrument_band_transmission.h5. The wavelengths should be in units of Angstrom.\n", + "- wavebands: comma-separated list of filter bands.\n", + "- filter_wave_key: wavelength keyword in the hdf5 table of filter bands.\n", + "- filter_transm_key: transmission keyword in the hdf5 table of filter bands.\n", + "- Om0, Ode0, w0, wa, h: cosmological parameters for a w0waCDM cosmology\n", + "- use_planck_cosmology: True to overwrite the cosmological parameters with Planck15 cosmology model from Astropy\n", + "- physical_units: same meaning as above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from rail.core.stage import RailStage\n", + "import os\n", + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True\n", + "from rail.core.data import TableHandle\n", + "import rail.fsps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "trainFile = os.path.join('model_FSPSSedModeler.hdf5')\n", + "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.creation.engines.fsps_photometry_creator import *\n", + "\n", + "RAIL_FSPS_DIR = os.path.abspath(os.path.join(os.path.dirname(rail.fsps.__file__), '..', '..'))\n", + "default_rail_fsps_filter_folder = os.path.join(RAIL_FSPS_DIR, 'rail', 'examples_data', 'creation_data', 'data',\n", + " 'fsps_default_data', 'filters')\n", + "\n", + "fspsphotometrycreator = FSPSPhotometryCreator.make_stage(redshifts_key='redshifts',restframe_sed_key='restframe_seds',\n", + " restframe_wave_key='restframe_wavelengths',\n", + " apparent_mags_key='apparent_mags',\n", + " filter_folder=default_rail_fsps_filter_folder,\n", + " instrument_name='lsst', wavebands='u,g,r,i,z,y',\n", + " filter_wave_key='wave', filter_transm_key='transmission',\n", + " Om0=0.3, Ode0=0.7, w0=-1, wa=0.0, h=0.7,\n", + " use_planck_cosmology=True, physical_units=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We call the sample method to generate LSST photometry once the class has been initialised. The output is a table in Hdf5 format, with three columns: sequential ID, redshifts, apparent AB magnitudes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspsphotometry = fspsphotometrycreator.sample(input_data=training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspsphotometry.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The RAIL stages can be chained together conveniently using Ceci. The following is an example of how the FSPSSedGenerator and FSPSPhotometryCreator can be run as part of the pipeline Ceci stages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe = ceci.Pipeline.interactive()\n", + "stages = [fspssedmodeler, fspsphotometrycreator]\n", + "for stage in stages:\n", + " pipe.add_stage(stage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.initialize(dict(input='input_galaxy_properties_fsps.h5'), dict(output_dir='./temp_output', log_dir='./logs', resume=False, nprocess=2),'./temp_output/pipe_sed_fsps_config.yml')\n", + "pipe.save('./temp_output/pipe_saved.yml')\n", + "pr = ceci.Pipeline.read('./temp_output/pipe_saved.yml')\n", + "pr.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The creation of galaxy SEDs is a computationally intensive process. To speed things up, it is convenient to parallelize the process using MPI. To do that, one needs to set the parameters into the configuration file pipe_saved_config.yml. An example command to be run in the command line with n_cores is:\n", + "\n", + "mpiexec -n n_cores --mpi python3 -m rail SedGenerator --input=input_galaxy_properties_fsps.h5 --name=sed_generator_test --config=pipe_saved_config.yml --output=output_sed_generator_test.h5\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 99eba5ce306e84c4e26507bfc68beb3fa7b7ab6d Mon Sep 17 00:00:00 2001 From: Olivia Lynn Date: Tue, 10 Oct 2023 13:11:48 -0400 Subject: [PATCH 2/4] Update fsps metadata, clean old version of gpz nb --- .../Use_FSPS_to_Generate_SEDs.ipynb | 2 +- .../GPz_estimation_example.ipynb | 254 ------------------ 2 files changed, 1 insertion(+), 255 deletions(-) delete mode 100644 examples/estimation_examples/GPz_estimation_example.ipynb diff --git a/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb index cbc4fdc..66bc11c 100644 --- a/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb +++ b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/examples/estimation_examples/GPz_estimation_example.ipynb b/examples/estimation_examples/GPz_estimation_example.ipynb deleted file mode 100644 index d5fe65d..0000000 --- a/examples/estimation_examples/GPz_estimation_example.ipynb +++ /dev/null @@ -1,254 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "69a40421-e7b3-4a7d-9a97-b70bf6cb8f28", - "metadata": {}, - "source": [ - "# GPz Estimation Example\n", - "\n", - "**Author:** Sam Schmidt\n", - "\n", - "**Last Run Successfully:** September 26, 2023\n", - "\n", - "A quick demo of running GPz on the typical test data. You should have installed rail_gpz_v1 (we highly recommend that you do this from within a custom conda environment so that all dependencies for package versions are met), either by cloning and installing from github, or with:\n", - "```\n", - "pip install pz-rail-gpz-v1\n", - "```\n", - "\n", - "As RAIL is a namespace package, installing rail_gpz_v1 will make `GPzInformer` and `GPzEstimator` available, and they can be imported via:
\n", - "```\n", - "from rail.estimation.algos.gpz import GPzInformer, GPzEstimator\n", - "```\n", - "\n", - "Let's start with all of our necessary imports:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "697e5adf-9e8f-496b-808d-83225c34a31e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import rail\n", - "import qp\n", - "from rail.core.data import TableHandle\n", - "from rail.core.stage import RailStage\n", - "from rail.estimation.algos.gpz import GPzInformer, GPzEstimator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abf96656-7cdc-4d45-ba13-355e3386e94d", - "metadata": {}, - "outputs": [], - "source": [ - "# set up the DataStore to keep track of data\n", - "DS = RailStage.data_store\n", - "DS.__class__.allow_overwrite = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "631b2b33-c93b-41b8-b3be-a526418f0f57", - "metadata": {}, - "outputs": [], - "source": [ - "# RAILDIR is a convenience variable set within RAIL that stores the path to the package as installed on your machine. We have several example data files that are copied with RAIL that we can use for our example run, let's grab those files, one for training/validation, and the other for testing:\n", - "from rail.core.utils import RAILDIR\n", - "trainFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_training_9816.hdf5')\n", - "testFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_validation_9816.hdf5')\n", - "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)\n", - "test_data = DS.read_file(\"test_data\", TableHandle, testFile)" - ] - }, - { - "cell_type": "markdown", - "id": "89cfd420-a6cd-4f1d-a54b-536d95aac703", - "metadata": {}, - "source": [ - "Now, we need to set up the stage that will run GPz. We begin by defining a dictionary with the config options for the algorithm. There are sensible defaults set, we will override several of these as an example of how to do this. Config parameters not set in the dictionary will automatically be set to their default values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44825971-2974-4dde-a072-c774655b22bd", - "metadata": {}, - "outputs": [], - "source": [ - "gpz_train_dict = dict(n_basis=60, trainfrac=0.8, csl_method=\"normal\", max_iter=150, hdf5_groupname=\"photometry\") " - ] - }, - { - "cell_type": "markdown", - "id": "2afd5df8-7449-4a2f-a4f8-7ec18eae92d4", - "metadata": {}, - "source": [ - "Let's set up the training stage. We need to provide a name for the stage for ceci, as well as a name for the model file that will be written by the stage. We also include the arguments in the dictionary we wrote above as additional arguments:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dee70194-46d5-4b3a-a282-b7ca123d008a", - "metadata": {}, - "outputs": [], - "source": [ - "# set up the stage to run our GPZ_training\n", - "pz_train = GPzInformer.make_stage(name=\"GPz_Train\", model=\"GPz_model.pkl\", **gpz_train_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "2cceb899-0acb-448d-8a58-7a61227547a9", - "metadata": {}, - "source": [ - "We are now ready to run the stage to create the model. We will use the training data from `test_dc2_training_9816.hdf5`, which contains 10,225 galaxies drawn from healpix 9816 from the cosmoDC2_v1.1.4 dataset, to train the model. Note that we read this data in called `train_data` in the DataStore. Note that we set `trainfrac` to 0.8, so 80% of the data will be used in the \"main\" training, but 20% will be reserved by `GPzInformer` to determine a SIGMA parameter. We set `max_iter` to 150, so we will see 150 steps where the stage tries to maximize the likelihood. We run the stage as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d925f67e-d3a5-49ae-9a1e-6ac043dbdd0a", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pz_train.inform(training_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c5c7a409-1919-49c6-a258-4af05fe30e00", - "metadata": {}, - "source": [ - "This should have taken about 30 seconds on a typical desktop computer, and you should now see a file called `GPz_model.pkl` in the directory. This model file is used by the `GPzEstimator` stage to determine our redshift PDFs for the test set of galaxies. Let's set up that stage, again defining a dictionary of variables for the config params:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc2a9632-adfd-42cc-96c7-c0bb34390b15", - "metadata": {}, - "outputs": [], - "source": [ - "gpz_test_dict = dict(hdf5_groupname=\"photometry\", model=\"GPz_model.pkl\")\n", - "\n", - "gpz_run = GPzEstimator.make_stage(name=\"gpz_run\", **gpz_test_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "8bfe0f11-5c38-4856-b845-d5fb830e707a", - "metadata": {}, - "source": [ - "Let's run the stage and compute photo-z's for our test set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4c739a8-1155-4265-ba2b-45709399f291", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "results = gpz_run.estimate(test_data)" - ] - }, - { - "cell_type": "markdown", - "id": "2c0d2a56-029b-45d2-b5e1-10eee75d24bb", - "metadata": {}, - "source": [ - "This should be very fast, under a second for our 20,449 galaxies in the test set. Now, let's plot a scatter plot of the point estimates, as well as a few example PDFs. We can get access to the `qp` ensemble that was written via the DataStore via `results()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d8dc61f-c073-44b1-9d32-6e5d715e699e", - "metadata": {}, - "outputs": [], - "source": [ - "ens = results()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baa61f3d-426e-41d0-9d3f-65eb05a5b7ec", - "metadata": {}, - "outputs": [], - "source": [ - "expdfids = [2, 180, 13517, 18032]\n", - "fig, axs = plt.subplots(4, 1, figsize=(12,10))\n", - "for i, xx in enumerate(expdfids):\n", - " axs[i].set_xlim(0,3)\n", - " ens[xx].plot_native(axes=axs[i])\n", - "axs[3].set_xlabel(\"redshift\", fontsize=15)" - ] - }, - { - "cell_type": "markdown", - "id": "093dd6f9-f935-4aa5-9898-1c52b3bef6d1", - "metadata": {}, - "source": [ - "GPzEstimator parameterizes each PDF as a single Gaussian, here we see a few examples of Gaussians of different widths. Now let's grab the mode of each PDF (stored as ancil data in the ensemble) and compare to the true redshifts from the test_data file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e33ac16a-1c2b-4be8-a947-362bcc039431", - "metadata": {}, - "outputs": [], - "source": [ - "truez = test_data.data['photometry']['redshift']\n", - "zmode = ens.ancil['zmode'].flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2f63f2e-7931-41c5-8cec-32a60230d4cc", - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(12,12))\n", - "plt.scatter(truez, zmode, s=3)\n", - "plt.plot([0,3],[0,3], 'k--')\n", - "plt.xlabel(\"redshift\", fontsize=12)\n", - "plt.ylabel(\"z mode\", fontsize=12)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From cc6ca508e287b1450118980cded8bd792038910a Mon Sep 17 00:00:00 2001 From: Olivia Lynn Date: Tue, 17 Oct 2023 09:44:22 -0400 Subject: [PATCH 3/4] Add back CMNN --- examples/estimation_examples/CMNN_Demo.ipynb | 387 +++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 examples/estimation_examples/CMNN_Demo.ipynb diff --git a/examples/estimation_examples/CMNN_Demo.ipynb b/examples/estimation_examples/CMNN_Demo.ipynb new file mode 100644 index 0000000..628be35 --- /dev/null +++ b/examples/estimation_examples/CMNN_Demo.ipynb @@ -0,0 +1,387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CMNN Demo\n", + "\n", + "**Author:** Sam Schmidt\n", + "\n", + "**Last Successfully Run:** September 26, 2023\n", + "\n", + "This is a notebook demonstrating some of the features of the LSSTDESC `RAIL` version of the CMNN estimator, see **[Graham et al. (2018)](https://ui.adsabs.harvard.edu/abs/2018AJ....155....1G/abstract)** for more details on the algorithm.
\n", + "\n", + "CMNN stands for color-matched nearest-neighbor, and as this name implies, the method works by finding the Mahalanobis distance between each test galaxy and the training galaxies, and selecting one of those \"nearby\" in color space as the redshift estimate. The algorithm also estimates the \"width\" of the resulting PDF based on the standard deviation of this nearby set and returns a single Gaussian with a mean and width defined as such.
\n", + "\n", + "The current version of the code consists of a training stage, `Inform_CMNNPDF`, that computes colors for a set of training data and an estimation stage `CMNNPDF` that calculates the Mahalanobis distance to each training galaxy for each test galaxy and returns a single Guassian PDF for each galaxy. The mean value of this Gaussian PDF can be estimated in one of three ways (see selection mode below), and the width is determined by the standard deviation of training galaxy redshifts within the threshold Mahalanobis distance. Future implementation improvements may change the output format to include multiple Gaussians.\n", + "\n", + "For the color calculation, there is an option for how to treat the \"non-detections\" in a band: the default choice is to ignore any colors that contain a non-detect magnitude and adjust the number of degrees of freedom in the Mahalanobis distance accordingly (this is how the CMNN algorithm was originally implemented). Or, if the configuration parameter `nondetect_replace` is set to `True` in `Inform_CMNNPDF`, the non-detected magnitudes will be replaced with the 1-sigma limiting magnitude in each band as supplied by the user via the `mag_limits` configuration parameter (or by the default 1-sigma limits if the user does not supply specific numbers). We have not done any exploration of the relative performance of these two choices, but note that there is not a significant performance difference in terms of runtime between the two methods.
\n", + "\n", + "In addition to the Gaussian PDF for each test galaxy, two ancillary quantities are stored: `zmode`: the mode of the redshift PDF and `Ncm`, the integer number of \"nearby\" galaxies considered as neighbors for each galaxy.
\n", + "\n", + "`Inform_CMNNPDF` takes in a training data set and returns a model file that simply consists of the computed colors and color errors (magnitude errors added in quadrature) for that dataset, the model to be used in the `CMNNPDF` stage. A modification of the original CMNN algorithm, \"nondetections\" are now replaced by the 1-sigma limiting magnitudes and the non-detect magnitude errors replaced with a value of 1.0. The config parameters that can be set by the user for `Inform_CMNNPDF` are:\n", + "\n", + "- bands: list of the band names that should be present in the input data.
\n", + "- err_bands: list of the magnitude error column names that should be present in the input data.
\n", + "- redshift_col: a string giving the name for the redshift column present in the input data.
\n", + "- mag_limits: a dictionary with keys that match those in bands and a float with the 1 sigma limiting magnitude for each band.
\n", + "- nondetect_val: float or np.nan, the value indicating a non-detection, which will be replaced by the values in mag_limits.
\n", + "- nondetect_replace: bool, if set to `False` (the default) this option ignores colors with non-detected values in the Mahalanobis distance calculation, with a corresponding drop in the degrees of freedom value. If set to `True`, the method will replace non-detections with the 1-sigma limiting magnitudes specified via `mag_limits` (or default 1-sigma limits if not supplied), and will use all colors in the Mahalanobis distance calculation.
\n", + "\n", + "\n", + "The parameters that can be set via the config_params in `CMNNPDF` are described in brief below:\n", + "\n", + "- bands, err_bands, redshift_col, mag_limits are all the same as described above for Inform_CMNNPDF.\n", + "- ppf_value: float, usually 0.68 or 0.95, which sets the value of the PPF used in the Mahalanobis distance calculation.\n", + "- selection_mode: int, selects how the central value of the Gaussian PDF is calculated in the algorithm, if set to **0** randomly chooses from set within the Mahalanobis distance, if set to **1** chooses the nearest neighbor point, if set to **2** adds a distance weight to the random choice.\n", + "- min_n: int, the minimum number of training galaxies to use.\n", + "- min_thresh: float, the minimum threshold cutoff. Values smaller than this threshold value will be ignored.\n", + "- min_dist: float, the minimum Mahalanobis distance. Values smaller than this will be ignored.\n", + "- bad_redshift_val: float, in the unlikely case that there are not enough training galaxies, this central redshift will be assigned to galaxies.\n", + "- bad_redshift_err: float, in the unlikely case that there are not enough training galaxies, this Gaussian width will be assigned to galaxies.\n", + "\n", + "\n", + "Let's grab some example data, train the model by running the `Inform_CMNNPDF` `inform` method, then calculate a set of photo-z's using `CMNNPDF` `estimate`. Much of the following is copied from the `RAIL_estiation_demo.ipynb` in the RAIL repo, so look at that notebook for general questions on setting up the RAIL infrastructure for estimators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "%matplotlib inline " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import rail\n", + "import qp\n", + "from rail.core.data import TableHandle\n", + "from rail.core.stage import RailStage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting the list of available Estimators\n", + "\n", + "RailStage knows about all of the sub-types of stages. The are stored in the `RailStage.pipeline_stages` dict. By looping through the values in that dict we can and asking if each one is a sub-class of `rail.estimation.estimator.CatEstimator` we can identify the available estimators that operator on catalog-like inputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The code-specific parameters\n", + "As mentioned above, CMNN has particular configuration options that can be set when setting up an instance of our `Inform_CMNNPDF` stage, we'll define those in a dictionary. Any parameters not specifically assigned will take on default values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cmnn_dict = dict(zmin=0.0, zmax=3.0, nzbins=301, hdf5_groupname='photometry')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will begin by training the algorithm, to to this we instantiate a rail object with a call to the base class.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.estimation.algos.cmnn import Inform_CMNNPDF, CMNNPDF\n", + "pz_train = Inform_CMNNPDF.make_stage(name='inform_CMNN', model='demo_cmnn_model.pkl', **cmnn_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's load our training data, which is stored in hdf5 format. We'll load it into the Data Store so that the ceci stages are able to access it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.core.utils import RAILDIR\n", + "trainFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_training_9816.hdf5')\n", + "testFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_validation_9816.hdf5')\n", + "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)\n", + "test_data = DS.read_file(\"test_data\", TableHandle, testFile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The inform stage for CMNN should not take long to run, it essentially just converts the magnitudes to colors for the training data and stores those as a model dictionary which is stored in a pickle file specfied by the `model` keyword above, in this case \"demo_cmnn_model.pkl\". This file should appear in the directory after we run the inform stage in the cell below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pz_train.inform(training_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now set up the main photo-z stage and run our algorithm on the data to produce simple photo-z estimates. Note that we are loading the trained model that we computed from the inform stage: with the `model=pz_train.get_handle('model')` statement. We will set `nondetect_replace` to `True` to replace our non-detection magnitudes with their 1-sigma limits and use all colors.
\n", + "\n", + "Let's also set the minumum number of neighbors to 24, and the `selection_mode` to \"1\", which will choose the nearest neighbor for each galaxy as the redshift estimate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pz = CMNNPDF.make_stage(name='CMNN', hdf5_groupname='photometry',\n", + " model=pz_train.get_handle('model'),\n", + " min_n=20,\n", + " selection_mode=1,\n", + " nondetect_replace=True,\n", + " aliases={\"output\":\"pz_near\"})\n", + "results = pz.estimate(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned above, in addition to the PDF, `estimate` calculates and stores both the mode of the PDF (`zmode`), and the number of neighbors (`Ncm`) for each galaxy, which can be accessed from the ancillary data. We will plot the modes vs the true redshift to see how well CMNN did in estimating redshifts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zmode = results().ancil['zmode']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the redshift mode against the true redshifts to see how they look:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'],zmode,s=1,c='k',label='simple NN mode')\n", + "plt.plot([0,3],[0,3],'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"CMNN photo-z mode\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Very nice! Not many outliers and a fairly small scatter without much biase!\n", + "\n", + "Now, let's plot the histogram of how many neighbors were used. We set a minimum number of 20, so we should see a large peak at that value: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ncm =results().ancil['Ncm']\n", + "plt.hist(ncm, bins=np.linspace(0,200,20));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned previously, we can change the method for how we select the mean redshift, let's re-run the estimator but use `selection_mode` = \"0\", which will select a random galaxy from amongst the neighbors. This should still look decent, but perhaps not as nice as the nearest neighbor estimator:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pz_rand = CMNNPDF.make_stage(name='CMNN_rand', hdf5_groupname='photometry',\n", + " model=pz_train.get_handle('model'),\n", + " min_n=20,\n", + " selection_mode=0,\n", + " nondetect_replace=True,\n", + " aliaes={\"output\": \"pz_rand\"})\n", + "results_rand = pz_rand.estimate(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zmode_rand = results_rand().ancil['zmode']\n", + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'],zmode_rand,s=1,c='k',label='simple NN mode')\n", + "plt.plot([0,3],[0,3],'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"CMNN photo-z mode\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Slightly worse, but not dramatically so, a few more outliers are visible visually. Finally, we can try the weighted random selection by setting `selection_mode` to \"2\":" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pz_weight = CMNNPDF.make_stage(name='CMNN_weight', hdf5_groupname='photometry',\n", + " model=pz_train.get_handle('model'),\n", + " min_n=20,\n", + " selection_mode=2,\n", + " nondetect_replace=True,\n", + " aliaes={\"output\": \"pz_weight\"})\n", + "results_weight = pz_weight.estimate(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zmode_weight = results_weight().ancil['zmode']\n", + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'],zmode_weight,s=1,c='k',label='simple NN mode')\n", + "plt.plot([0,3],[0,3],'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"CMNN photo-z mode\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, not a dramatic difference, but it can make a difference if there is sparse coverage of areas of the color-space by the training data, where choosing \"nearest\" might choose the same single data point for many test points, whereas setting to random or weighted random could slightly \"smooth\" that choice by forcing choices of other nearby points for the redshift estimate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's plot a few PDFs, again, they are a single Gaussian:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results().plot_native(key=9, xlim=(0,3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results().plot_native(key=1554, xlim=(0,3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results().plot_native(key=19554, xlim=(0,3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see a wide variety of widths, as expected for a single Gaussian parameterization that must encompass a wide variety of PDF shapes." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From dd3c81c58269b422f48844143d01e0869c8c17d6 Mon Sep 17 00:00:00 2001 From: Olivia Lynn Date: Tue, 17 Oct 2023 09:51:53 -0400 Subject: [PATCH 4/4] Revert "Add back CMNN" This reverts commit cc6ca508e287b1450118980cded8bd792038910a. --- examples/estimation_examples/CMNN_Demo.ipynb | 387 ------------------- 1 file changed, 387 deletions(-) delete mode 100644 examples/estimation_examples/CMNN_Demo.ipynb diff --git a/examples/estimation_examples/CMNN_Demo.ipynb b/examples/estimation_examples/CMNN_Demo.ipynb deleted file mode 100644 index 628be35..0000000 --- a/examples/estimation_examples/CMNN_Demo.ipynb +++ /dev/null @@ -1,387 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# CMNN Demo\n", - "\n", - "**Author:** Sam Schmidt\n", - "\n", - "**Last Successfully Run:** September 26, 2023\n", - "\n", - "This is a notebook demonstrating some of the features of the LSSTDESC `RAIL` version of the CMNN estimator, see **[Graham et al. (2018)](https://ui.adsabs.harvard.edu/abs/2018AJ....155....1G/abstract)** for more details on the algorithm.
\n", - "\n", - "CMNN stands for color-matched nearest-neighbor, and as this name implies, the method works by finding the Mahalanobis distance between each test galaxy and the training galaxies, and selecting one of those \"nearby\" in color space as the redshift estimate. The algorithm also estimates the \"width\" of the resulting PDF based on the standard deviation of this nearby set and returns a single Gaussian with a mean and width defined as such.
\n", - "\n", - "The current version of the code consists of a training stage, `Inform_CMNNPDF`, that computes colors for a set of training data and an estimation stage `CMNNPDF` that calculates the Mahalanobis distance to each training galaxy for each test galaxy and returns a single Guassian PDF for each galaxy. The mean value of this Gaussian PDF can be estimated in one of three ways (see selection mode below), and the width is determined by the standard deviation of training galaxy redshifts within the threshold Mahalanobis distance. Future implementation improvements may change the output format to include multiple Gaussians.\n", - "\n", - "For the color calculation, there is an option for how to treat the \"non-detections\" in a band: the default choice is to ignore any colors that contain a non-detect magnitude and adjust the number of degrees of freedom in the Mahalanobis distance accordingly (this is how the CMNN algorithm was originally implemented). Or, if the configuration parameter `nondetect_replace` is set to `True` in `Inform_CMNNPDF`, the non-detected magnitudes will be replaced with the 1-sigma limiting magnitude in each band as supplied by the user via the `mag_limits` configuration parameter (or by the default 1-sigma limits if the user does not supply specific numbers). We have not done any exploration of the relative performance of these two choices, but note that there is not a significant performance difference in terms of runtime between the two methods.
\n", - "\n", - "In addition to the Gaussian PDF for each test galaxy, two ancillary quantities are stored: `zmode`: the mode of the redshift PDF and `Ncm`, the integer number of \"nearby\" galaxies considered as neighbors for each galaxy.
\n", - "\n", - "`Inform_CMNNPDF` takes in a training data set and returns a model file that simply consists of the computed colors and color errors (magnitude errors added in quadrature) for that dataset, the model to be used in the `CMNNPDF` stage. A modification of the original CMNN algorithm, \"nondetections\" are now replaced by the 1-sigma limiting magnitudes and the non-detect magnitude errors replaced with a value of 1.0. The config parameters that can be set by the user for `Inform_CMNNPDF` are:\n", - "\n", - "- bands: list of the band names that should be present in the input data.
\n", - "- err_bands: list of the magnitude error column names that should be present in the input data.
\n", - "- redshift_col: a string giving the name for the redshift column present in the input data.
\n", - "- mag_limits: a dictionary with keys that match those in bands and a float with the 1 sigma limiting magnitude for each band.
\n", - "- nondetect_val: float or np.nan, the value indicating a non-detection, which will be replaced by the values in mag_limits.
\n", - "- nondetect_replace: bool, if set to `False` (the default) this option ignores colors with non-detected values in the Mahalanobis distance calculation, with a corresponding drop in the degrees of freedom value. If set to `True`, the method will replace non-detections with the 1-sigma limiting magnitudes specified via `mag_limits` (or default 1-sigma limits if not supplied), and will use all colors in the Mahalanobis distance calculation.
\n", - "\n", - "\n", - "The parameters that can be set via the config_params in `CMNNPDF` are described in brief below:\n", - "\n", - "- bands, err_bands, redshift_col, mag_limits are all the same as described above for Inform_CMNNPDF.\n", - "- ppf_value: float, usually 0.68 or 0.95, which sets the value of the PPF used in the Mahalanobis distance calculation.\n", - "- selection_mode: int, selects how the central value of the Gaussian PDF is calculated in the algorithm, if set to **0** randomly chooses from set within the Mahalanobis distance, if set to **1** chooses the nearest neighbor point, if set to **2** adds a distance weight to the random choice.\n", - "- min_n: int, the minimum number of training galaxies to use.\n", - "- min_thresh: float, the minimum threshold cutoff. Values smaller than this threshold value will be ignored.\n", - "- min_dist: float, the minimum Mahalanobis distance. Values smaller than this will be ignored.\n", - "- bad_redshift_val: float, in the unlikely case that there are not enough training galaxies, this central redshift will be assigned to galaxies.\n", - "- bad_redshift_err: float, in the unlikely case that there are not enough training galaxies, this Gaussian width will be assigned to galaxies.\n", - "\n", - "\n", - "Let's grab some example data, train the model by running the `Inform_CMNNPDF` `inform` method, then calculate a set of photo-z's using `CMNNPDF` `estimate`. Much of the following is copied from the `RAIL_estiation_demo.ipynb` in the RAIL repo, so look at that notebook for general questions on setting up the RAIL infrastructure for estimators." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "%matplotlib inline " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import rail\n", - "import qp\n", - "from rail.core.data import TableHandle\n", - "from rail.core.stage import RailStage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DS = RailStage.data_store\n", - "DS.__class__.allow_overwrite = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Getting the list of available Estimators\n", - "\n", - "RailStage knows about all of the sub-types of stages. The are stored in the `RailStage.pipeline_stages` dict. By looping through the values in that dict we can and asking if each one is a sub-class of `rail.estimation.estimator.CatEstimator` we can identify the available estimators that operator on catalog-like inputs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The code-specific parameters\n", - "As mentioned above, CMNN has particular configuration options that can be set when setting up an instance of our `Inform_CMNNPDF` stage, we'll define those in a dictionary. Any parameters not specifically assigned will take on default values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cmnn_dict = dict(zmin=0.0, zmax=3.0, nzbins=301, hdf5_groupname='photometry')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will begin by training the algorithm, to to this we instantiate a rail object with a call to the base class.
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from rail.estimation.algos.cmnn import Inform_CMNNPDF, CMNNPDF\n", - "pz_train = Inform_CMNNPDF.make_stage(name='inform_CMNN', model='demo_cmnn_model.pkl', **cmnn_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's load our training data, which is stored in hdf5 format. We'll load it into the Data Store so that the ceci stages are able to access it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from rail.core.utils import RAILDIR\n", - "trainFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_training_9816.hdf5')\n", - "testFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_validation_9816.hdf5')\n", - "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)\n", - "test_data = DS.read_file(\"test_data\", TableHandle, testFile)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The inform stage for CMNN should not take long to run, it essentially just converts the magnitudes to colors for the training data and stores those as a model dictionary which is stored in a pickle file specfied by the `model` keyword above, in this case \"demo_cmnn_model.pkl\". This file should appear in the directory after we run the inform stage in the cell below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pz_train.inform(training_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now set up the main photo-z stage and run our algorithm on the data to produce simple photo-z estimates. Note that we are loading the trained model that we computed from the inform stage: with the `model=pz_train.get_handle('model')` statement. We will set `nondetect_replace` to `True` to replace our non-detection magnitudes with their 1-sigma limits and use all colors.
\n", - "\n", - "Let's also set the minumum number of neighbors to 24, and the `selection_mode` to \"1\", which will choose the nearest neighbor for each galaxy as the redshift estimate:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pz = CMNNPDF.make_stage(name='CMNN', hdf5_groupname='photometry',\n", - " model=pz_train.get_handle('model'),\n", - " min_n=20,\n", - " selection_mode=1,\n", - " nondetect_replace=True,\n", - " aliases={\"output\":\"pz_near\"})\n", - "results = pz.estimate(test_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned above, in addition to the PDF, `estimate` calculates and stores both the mode of the PDF (`zmode`), and the number of neighbors (`Ncm`) for each galaxy, which can be accessed from the ancillary data. We will plot the modes vs the true redshift to see how well CMNN did in estimating redshifts:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "zmode = results().ancil['zmode']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's plot the redshift mode against the true redshifts to see how they look:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(8,8))\n", - "plt.scatter(test_data()['photometry']['redshift'],zmode,s=1,c='k',label='simple NN mode')\n", - "plt.plot([0,3],[0,3],'r--');\n", - "plt.xlabel(\"true redshift\")\n", - "plt.ylabel(\"CMNN photo-z mode\")\n", - "plt.ylim(0,3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Very nice! Not many outliers and a fairly small scatter without much biase!\n", - "\n", - "Now, let's plot the histogram of how many neighbors were used. We set a minimum number of 20, so we should see a large peak at that value: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ncm =results().ancil['Ncm']\n", - "plt.hist(ncm, bins=np.linspace(0,200,20));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned previously, we can change the method for how we select the mean redshift, let's re-run the estimator but use `selection_mode` = \"0\", which will select a random galaxy from amongst the neighbors. This should still look decent, but perhaps not as nice as the nearest neighbor estimator:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pz_rand = CMNNPDF.make_stage(name='CMNN_rand', hdf5_groupname='photometry',\n", - " model=pz_train.get_handle('model'),\n", - " min_n=20,\n", - " selection_mode=0,\n", - " nondetect_replace=True,\n", - " aliaes={\"output\": \"pz_rand\"})\n", - "results_rand = pz_rand.estimate(test_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "zmode_rand = results_rand().ancil['zmode']\n", - "plt.figure(figsize=(8,8))\n", - "plt.scatter(test_data()['photometry']['redshift'],zmode_rand,s=1,c='k',label='simple NN mode')\n", - "plt.plot([0,3],[0,3],'r--');\n", - "plt.xlabel(\"true redshift\")\n", - "plt.ylabel(\"CMNN photo-z mode\")\n", - "plt.ylim(0,3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Slightly worse, but not dramatically so, a few more outliers are visible visually. Finally, we can try the weighted random selection by setting `selection_mode` to \"2\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pz_weight = CMNNPDF.make_stage(name='CMNN_weight', hdf5_groupname='photometry',\n", - " model=pz_train.get_handle('model'),\n", - " min_n=20,\n", - " selection_mode=2,\n", - " nondetect_replace=True,\n", - " aliaes={\"output\": \"pz_weight\"})\n", - "results_weight = pz_weight.estimate(test_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "zmode_weight = results_weight().ancil['zmode']\n", - "plt.figure(figsize=(8,8))\n", - "plt.scatter(test_data()['photometry']['redshift'],zmode_weight,s=1,c='k',label='simple NN mode')\n", - "plt.plot([0,3],[0,3],'r--');\n", - "plt.xlabel(\"true redshift\")\n", - "plt.ylabel(\"CMNN photo-z mode\")\n", - "plt.ylim(0,3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, not a dramatic difference, but it can make a difference if there is sparse coverage of areas of the color-space by the training data, where choosing \"nearest\" might choose the same single data point for many test points, whereas setting to random or weighted random could slightly \"smooth\" that choice by forcing choices of other nearby points for the redshift estimate." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, let's plot a few PDFs, again, they are a single Gaussian:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results().plot_native(key=9, xlim=(0,3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results().plot_native(key=1554, xlim=(0,3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results().plot_native(key=19554, xlim=(0,3))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We see a wide variety of widths, as expected for a single Gaussian parameterization that must encompass a wide variety of PDF shapes." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}