diff --git a/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb
new file mode 100644
index 0000000..66bc11c
--- /dev/null
+++ b/examples/creation_examples/Use_FSPS_to_Generate_SEDs.ipynb
@@ -0,0 +1,518 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Use FSPS to Generate Galaxy Rest-Frame Spectral Energy Distributions and Compute Apparent Magnitudes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Authors:** Luca Tortorelli, Josue de Santiago, Eric Charles\n",
+    "\n",
+    "**Last Run Successfully:** August 2, 2022\n",
+    "\n",
+    "This notebook demonstrates how to use rail_fsps to generate galaxy rest-frame spectral energy distributions (SEDs) with FSPS and how to compute apparent magnitudes from them.\n",
+    "\n",
+    "To run this notebook you need FSPS and Python-FSPS installed. The easiest way to do this is the following (the first line is needed only if you have already installed Python-FSPS via pip):\n",
+    "\n",
+    "    pip uninstall fsps\n",
+    "    git clone --recursive https://github.com/dfm/python-fsps.git\n",
+    "    cd python-fsps\n",
+    "    python -m pip install .\n",
+    "    export SPS_HOME=$(pwd)/src/fsps/libfsps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from rail.core.data import TableHandle\n",
+    "from rail.creation.engines.fsps_sed_modeler import *\n",
+    "import rail.fsps\n",
+    "import h5py\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll start by setting up the RAIL Data Store. RAIL uses ceci, which is designed for pipelines rather than interactive notebooks; the Data Store works around that and lets us use data interactively. See the rail/examples/goldenspike/goldenspike.ipynb example notebook for more details on the Data Store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rail.core.stage import RailStage\n",
+    "DS = RailStage.data_store\n",
+    "DS.__class__.allow_overwrite = True\n",
+    "RAIL_FSPS_DIR = os.path.abspath(os.path.join(os.path.dirname(rail.fsps.__file__), '..', '..'))\n",
+    "default_rail_fsps_files_folder = os.path.join(RAIL_FSPS_DIR, 'rail', 'examples_data', 'creation_data', 'data', 'fsps_default_data')\n",
+    "input_file = os.path.join(default_rail_fsps_files_folder, 'input_galaxy_properties_fsps.hdf5')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We generate some mock input data for the SED modeler class and store it in an HDF5 file.\n",
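+    "\n",
+    "Each physical parameter is a 1D array with one value per galaxy, as set up in the next cell, while the tabulated star-formation history and line-spread function are arrays of shape (n_galaxies, 2, n_points) whose two rows hold the x-grid (age of the Universe in Gyr for the SFH, wavelength in Angstrom for the LSF) and the corresponding values (SFR in Msun/yr, LSF sigma)."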
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_galaxies = 10\n", + "\n", + "redshifts = np.linspace(0.1,1,num=n_galaxies)\n", + "zmet = np.full(n_galaxies, 1, dtype=int)\n", + "stellar_metallicity = np.full(n_galaxies, 0.0) # log10(Z/Zsun)\n", + "pmetals = np.full(n_galaxies, 2.0)\n", + "stellar_velocity_dispersion = np.full(n_galaxies, 100.)\n", + "gas_ionization = np.full(n_galaxies, -1)\n", + "gas_metallicity = np.full(n_galaxies, 0.0)\n", + "tau = np.full(n_galaxies, 1.0)\n", + "const = np.full(n_galaxies, 0.0)\n", + "sf_start = np.full(n_galaxies, 0.0)\n", + "sf_trunc = np.full(n_galaxies, 0.0)\n", + "stellar_age = np.full(n_galaxies, 2.)\n", + "fburst = np.full(n_galaxies, 0.0)\n", + "tburst = np.full(n_galaxies, 11.0)\n", + "sf_slope = np.full(n_galaxies, 0.0)\n", + "dust1_birth_cloud = np.full(n_galaxies, 0.1)\n", + "dust2_diffuse = np.full(n_galaxies, 0.1)\n", + "dust_index = np.full(n_galaxies, -0.7)\n", + "dust_calzetti_modifier = np.full(n_galaxies, -1.)\n", + "mwr = np.full(n_galaxies, 3.1)\n", + "uvb = np.full(n_galaxies, 1.0)\n", + "wgp1 = np.full(n_galaxies, 1)\n", + "dust_gamma = np.full(n_galaxies, 0.01)\n", + "dust_umin = np.full(n_galaxies, 1.0)\n", + "dust_qpah = np.full(n_galaxies, 3.5)\n", + "f_agn = np.full(n_galaxies, 0.01)\n", + "tau_agn = np.full(n_galaxies, 10.0)\n", + "\n", + "gal_t_table = np.linspace(0.05, 13.8, 100) # age of the universe in Gyr\n", + "gal_sfr_table = np.random.uniform(0, 10, gal_t_table.size) # SFR in Msun/yr\n", + "tabulated_sfh = np.full((n_galaxies, 2, len(gal_sfr_table)), [gal_t_table,gal_sfr_table])\n", + "\n", + "wave_lsf = np.linspace(3000, 10000, 2000)\n", + "sigma_lsf = np.full_like(wave_lsf, 0.5)\n", + "tabulated_lsf = np.full((n_galaxies, 2, len(wave_lsf)), [wave_lsf, sigma_lsf])\n", + "\n", + "with h5py.File(input_file, 'w') as h5table:\n", + " data = h5table.create_group('model')\n", + " data.create_dataset(name='redshifts', data=redshifts)\n", + " data.create_dataset(name='zmet', data=zmet)\n", + " data.create_dataset(name='stellar_metallicity', data=stellar_metallicity)\n", + " data.create_dataset(name='pmetals', data=pmetals)\n", + " data.create_dataset(name='stellar_velocity_dispersion', data=stellar_velocity_dispersion)\n", + " data.create_dataset(name='gas_ionization', data=gas_ionization)\n", + " data.create_dataset(name='gas_metallicity', data=gas_metallicity)\n", + " data.create_dataset(name='tau', data=tau)\n", + " data.create_dataset(name='const', data=const)\n", + " data.create_dataset(name='sf_start', data=sf_start)\n", + " data.create_dataset(name='sf_trunc', data=sf_trunc)\n", + " data.create_dataset(name='stellar_age', data=stellar_age)\n", + " data.create_dataset(name='fburst', data=fburst)\n", + " data.create_dataset(name='tburst', data=tburst)\n", + " data.create_dataset(name='sf_slope', data=sf_slope)\n", + " data.create_dataset(name='dust1_birth_cloud', data=dust1_birth_cloud)\n", + " data.create_dataset(name='dust2_diffuse', data=dust2_diffuse)\n", + " data.create_dataset(name='dust_index', data=dust_index)\n", + " data.create_dataset(name='dust_calzetti_modifier', data=dust_calzetti_modifier)\n", + " data.create_dataset(name='mwr', data=mwr)\n", + " data.create_dataset(name='uvb', data=uvb)\n", + " data.create_dataset(name='wgp1', data=wgp1)\n", + " data.create_dataset(name='dust_gamma', data=dust_gamma)\n", + " data.create_dataset(name='dust_umin', data=dust_umin)\n", + " data.create_dataset(name='dust_qpah', data=dust_qpah)\n", + " 
data.create_dataset(name='f_agn', data=f_agn)\n",
+    "    data.create_dataset(name='tau_agn', data=tau_agn)\n",
+    "    data.create_dataset(name='tabulated_lsf', data=tabulated_lsf)\n",
+    "    data.create_dataset(name='tabulated_sfh', data=tabulated_sfh)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we load the input data into the Data Store so that it can be used later by the stage; the redshifts at which the outputs will be generated are those stored in the 'redshifts' column of this table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainFile = input_file\n",
+    "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's create an FSPSSedModeler object. It has a number of configuration parameters, all of which have default values, so any parameter the user is unsure about can simply be left at its default. A short description of each parameter can be found in src/rail/creation/engines/fsps_sed_modeler.py.\n",
+    "\n",
+    "We run the FSPSSedModeler in sequential mode. Note that each galaxy spectrum takes a few seconds to generate, so it is advisable to proceed in this way only for a limited sample of objects or for testing the code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fspssedmodeler = FSPSSedModeler.make_stage(\n",
+    "    chunk_size=10, hdf5_groupname='model', name='FSPSSedModeler',\n",
+    "    compute_vega_mags=False, vactoair_flag=False,\n",
+    "    zcontinuous=1, add_agb_dust_model=True,\n",
+    "    add_dust_emission=True, add_igm_absorption=True,\n",
+    "    add_neb_emission=True, add_neb_continuum=True,\n",
+    "    add_stellar_remnants=True, compute_light_ages=False,\n",
+    "    nebemlineinspec=True, smooth_velocity=True,\n",
+    "    smooth_lsf=False, cloudy_dust=False,\n",
+    "    agb_dust=1.0, tpagb_norm_type=2, dell=0.0,\n",
+    "    delt=0.0, redgb=1.0, agb=1.0, fcstar=1.0, sbss=0.0,\n",
+    "    fbhb=0.0, pagb=1.0, redshifts_key='redshifts',\n",
+    "    zmet_key='zmet', stellar_metallicities_key='stellar_metallicity',\n",
+    "    pmetals_key='pmetals', imf_type=1, imf_upper_limit=120.,\n",
+    "    imf_lower_limit=0.08, imf1=1.3, imf2=2.3, imf3=2.3, vdmc=0.08,\n",
+    "    mdave=0.5, evtype=-1, use_wr_spectra=1, logt_wmb_hot=0.0, masscut=150.0,\n",
+    "    velocity_dispersions_key='stellar_velocity_dispersion', min_wavelength=3000,\n",
+    "    max_wavelength=10000, gas_ionizations_key='gas_ionization',\n",
+    "    gas_metallicities_key='gas_metallicity', igm_factor=1.0, sfh_type=3,\n",
+    "    tau_key='tau', const_key='const', sf_start_key='sf_start',\n",
+    "    sf_trunc_key='sf_trunc', stellar_ages_key='stellar_age',\n",
+    "    fburst_key='fburst', tburst_key='tburst', sf_slope_key='sf_slope',\n",
+    "    dust_type=2, dust_tesc=7.0, dust_birth_cloud_key='dust1_birth_cloud',\n",
+    "    dust_diffuse_key='dust2_diffuse', dust_clumps=-99, frac_nodust=0.0,\n",
+    "    frac_obrun=0.0, dust_index_key='dust_index',\n",
+    "    dust_powerlaw_modifier_key='dust_calzetti_modifier', mwr_key='mwr',\n",
+    "    uvb_key='uvb', wgp1_key='wgp1', wgp2=1, wgp3=1,\n",
+    "    dust_emission_gamma_key='dust_gamma', dust_emission_umin_key='dust_umin',\n",
+    "    dust_emission_qpah_key='dust_qpah', fraction_agn_bol_lum_key='f_agn',\n",
+    "    agn_torus_opt_depth_key='tau_agn', tabulated_sfh_key='tabulated_sfh',\n",
+    "    tabulated_lsf_key='tabulated_lsf', physical_units=False,\n",
+    "    restframe_wave_key='restframe_wavelengths',\n",
+    "    restframe_sed_key='restframe_seds')"
+   ]
+  
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we show the example where we provide tabulated star-formation histories to generate the final SED. In this case, FSPS outputs the emission per total stellar mass. We call the fit_model() function to generate the model SEDs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspssedmodel = fspssedmodeler.fit_model(training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspssedmodel.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot the first spectrum to check that the SED generation worked correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with h5py.File('model_FSPSSedModeler.hdf5','r') as h5table:\n", + " for key in h5table.keys():\n", + " print(key)\n", + " redshifts = h5table['redshifts'][()]\n", + " restframe_seds = h5table['restframe_seds'][()]\n", + " restframe_wavelengths = h5table['restframe_wavelengths'][()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.clf()\n", + "plt.plot(restframe_wavelengths[0], restframe_seds[0], lw=2, color='black')\n", + "plt.xlim(3000, 10000)\n", + "plt.xlabel(r'wavelength [$\\AA$]')\n", + "plt.ylabel(r'luminosity density [$\\mathrm{Lsun \\ Hz^{-1}}$]')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running it with ceci" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import ceci\n", + "pipe = ceci.Pipeline.interactive()\n", + "stages = [fspssedmodeler]\n", + "for stage in stages:\n", + " pipe.add_stage(stage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "help(pipe.initialize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe.initialize(dict(input=trainFile), dict(output_dir='./temp_output_rail_fsps', log_dir='./logs_rail_fsps',\n", + " resume=False, nprocess=2), None)\n", + "pipe.save('./temp_output_rail_fsps/pipe_saved.yml')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ceci\n", + "pr = ceci.Pipeline.read('./temp_output_rail_fsps/pipe_saved.yml')\n", + "pr.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tables_io\n", + "rest_frame_sed_models = tables_io.read('temp_output_rail_fsps/model_FSPSSedModeler.hdf5')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.clf()\n", + "plt.plot(rest_frame_sed_models['restframe_wavelengths'][0], rest_frame_sed_models['restframe_seds'][1],\n", + " lw=2, color='black')\n", + "plt.xlim(3000, 10000)\n", + "plt.xlabel(r'wavelength [$\\AA$]')\n", + "plt.ylabel(r'luminosity density [$\\mathrm{Lsun \\ Hz^{-1}}$]')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we use these rest-frame SEDs to generate LSST photometry at user-provided redshifts.\n", + 
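"To compute observed-frame magnitudes, the stage conceptually has to redshift each rest-frame SED, dim it by the luminosity distance of the chosen cosmology, and integrate the resulting flux density through the filter transmission curves to obtain AB magnitudes, defined for a flux density $f_\\nu$ (in $\\mathrm{erg\\,s^{-1}\\,cm^{-2}\\,Hz^{-1}}$) and filter transmission $T(\\nu)$ as\n",
+    "\n",
+    "$$m_\\mathrm{AB} = -2.5\\,\\log_{10}\\left(\\frac{\\int f_\\nu(\\nu)\\,T(\\nu)\\,\\mathrm{d}\\nu/\\nu}{\\int T(\\nu)\\,\\mathrm{d}\\nu/\\nu}\\right) - 48.6.$$\n",
+    "\n",
+    "(This is just the standard AB convention, quoted here as a reminder of the units involved; see the FSPSPhotometryCreator implementation for the exact steps it performs.)\n",
+    "\n",
+    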
"For this we need the FSPSPhotometryCreator class. The class parameters are:\n", + "- redshifts_key: redshift keyword for the Hdf5 input table.\n", + "- restframe_sed_key: rest-frame sed keyword for the Hdf5 input table.\n", + "- restframe_wave_key: wavelength keyword for the Hdf5 input table.\n", + "- apparent_mags_key: apparent magnitudes keyword for the Hdf5 output table.\n", + "- filter_folder: path to the folder where filter bands are stored.\n", + "- instrument_name: name of the instrument for which we want to compute the magnitudes. The syntax of filenames should be of type instrument_band_transmission.h5. The wavelengths should be in units of Angstrom.\n", + "- wavebands: comma-separated list of filter bands.\n", + "- filter_wave_key: wavelength keyword in the hdf5 table of filter bands.\n", + "- filter_transm_key: transmission keyword in the hdf5 table of filter bands.\n", + "- Om0, Ode0, w0, wa, h: cosmological parameters for a w0waCDM cosmology\n", + "- use_planck_cosmology: True to overwrite the cosmological parameters with Planck15 cosmology model from Astropy\n", + "- physical_units: same meaning as above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from rail.core.stage import RailStage\n", + "import os\n", + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True\n", + "from rail.core.data import TableHandle\n", + "import rail.fsps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "trainFile = os.path.join('model_FSPSSedModeler.hdf5')\n", + "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.creation.engines.fsps_photometry_creator import *\n", + "\n", + "RAIL_FSPS_DIR = os.path.abspath(os.path.join(os.path.dirname(rail.fsps.__file__), '..', '..'))\n", + "default_rail_fsps_filter_folder = os.path.join(RAIL_FSPS_DIR, 'rail', 'examples_data', 'creation_data', 'data',\n", + " 'fsps_default_data', 'filters')\n", + "\n", + "fspsphotometrycreator = FSPSPhotometryCreator.make_stage(redshifts_key='redshifts',restframe_sed_key='restframe_seds',\n", + " restframe_wave_key='restframe_wavelengths',\n", + " apparent_mags_key='apparent_mags',\n", + " filter_folder=default_rail_fsps_filter_folder,\n", + " instrument_name='lsst', wavebands='u,g,r,i,z,y',\n", + " filter_wave_key='wave', filter_transm_key='transmission',\n", + " Om0=0.3, Ode0=0.7, w0=-1, wa=0.0, h=0.7,\n", + " use_planck_cosmology=True, physical_units=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We call the sample method to generate LSST photometry once the class has been initialised. The output is a table in Hdf5 format, with three columns: sequential ID, redshifts, apparent AB magnitudes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspsphotometry = fspsphotometrycreator.sample(input_data=training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fspsphotometry.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The RAIL stages can be chained together conveniently using Ceci. 
The following is an example of how the FSPSSedModeler and FSPSPhotometryCreator stages can be run as part of a Ceci pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = ceci.Pipeline.interactive()\n",
+    "stages = [fspssedmodeler, fspsphotometrycreator]\n",
+    "for stage in stages:\n",
+    "    pipe.add_stage(stage)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe.initialize(dict(input='input_galaxy_properties_fsps.h5'), dict(output_dir='./temp_output', log_dir='./logs', resume=False, nprocess=2),'./temp_output/pipe_sed_fsps_config.yml')\n",
+    "pipe.save('./temp_output/pipe_saved.yml')\n",
+    "pr = ceci.Pipeline.read('./temp_output/pipe_saved.yml')\n",
+    "pr.run()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The creation of galaxy SEDs is a computationally intensive process. To speed things up, it is convenient to parallelize it with MPI. To do that, one needs to set the corresponding parameters in the configuration file pipe_saved_config.yml. An example command to run from the command line with n_cores processes is:\n",
+    "\n",
+    "    mpiexec -n n_cores --mpi python3 -m rail SedGenerator --input=input_galaxy_properties_fsps.h5 --name=sed_generator_test --config=pipe_saved_config.yml --output=output_sed_generator_test.h5\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/examples/estimation_examples/GPz_estimation_example.ipynb b/examples/estimation_examples/GPz_estimation_example.ipynb
deleted file mode 100644
index d5fe65d..0000000
--- a/examples/estimation_examples/GPz_estimation_example.ipynb
+++ /dev/null
@@ -1,254 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "69a40421-e7b3-4a7d-9a97-b70bf6cb8f28",
-   "metadata": {},
-   "source": [
-    "# GPz Estimation Example\n",
-    "\n",
-    "**Author:** Sam Schmidt\n",
-    "\n",
-    "**Last Run Successfully:** September 26, 2023\n",
-    "\n",
-    "A quick demo of running GPz on the typical test data. You should have installed rail_gpz_v1 (we highly recommend that you do this from within a custom conda environment so that all dependencies for package versions are met), either by cloning and installing from github, or with:\n",
-    "```\n",
-    "pip install pz-rail-gpz-v1\n",
-    "```\n",
-    "\n",
-    "As RAIL is a namespace package, installing rail_gpz_v1 will make `GPzInformer` and `GPzEstimator` available, and they can be imported via:
\n", - "```\n", - "from rail.estimation.algos.gpz import GPzInformer, GPzEstimator\n", - "```\n", - "\n", - "Let's start with all of our necessary imports:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "697e5adf-9e8f-496b-808d-83225c34a31e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import rail\n", - "import qp\n", - "from rail.core.data import TableHandle\n", - "from rail.core.stage import RailStage\n", - "from rail.estimation.algos.gpz import GPzInformer, GPzEstimator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abf96656-7cdc-4d45-ba13-355e3386e94d", - "metadata": {}, - "outputs": [], - "source": [ - "# set up the DataStore to keep track of data\n", - "DS = RailStage.data_store\n", - "DS.__class__.allow_overwrite = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "631b2b33-c93b-41b8-b3be-a526418f0f57", - "metadata": {}, - "outputs": [], - "source": [ - "# RAILDIR is a convenience variable set within RAIL that stores the path to the package as installed on your machine. We have several example data files that are copied with RAIL that we can use for our example run, let's grab those files, one for training/validation, and the other for testing:\n", - "from rail.core.utils import RAILDIR\n", - "trainFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_training_9816.hdf5')\n", - "testFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_validation_9816.hdf5')\n", - "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)\n", - "test_data = DS.read_file(\"test_data\", TableHandle, testFile)" - ] - }, - { - "cell_type": "markdown", - "id": "89cfd420-a6cd-4f1d-a54b-536d95aac703", - "metadata": {}, - "source": [ - "Now, we need to set up the stage that will run GPz. We begin by defining a dictionary with the config options for the algorithm. There are sensible defaults set, we will override several of these as an example of how to do this. Config parameters not set in the dictionary will automatically be set to their default values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44825971-2974-4dde-a072-c774655b22bd", - "metadata": {}, - "outputs": [], - "source": [ - "gpz_train_dict = dict(n_basis=60, trainfrac=0.8, csl_method=\"normal\", max_iter=150, hdf5_groupname=\"photometry\") " - ] - }, - { - "cell_type": "markdown", - "id": "2afd5df8-7449-4a2f-a4f8-7ec18eae92d4", - "metadata": {}, - "source": [ - "Let's set up the training stage. We need to provide a name for the stage for ceci, as well as a name for the model file that will be written by the stage. We also include the arguments in the dictionary we wrote above as additional arguments:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dee70194-46d5-4b3a-a282-b7ca123d008a", - "metadata": {}, - "outputs": [], - "source": [ - "# set up the stage to run our GPZ_training\n", - "pz_train = GPzInformer.make_stage(name=\"GPz_Train\", model=\"GPz_model.pkl\", **gpz_train_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "2cceb899-0acb-448d-8a58-7a61227547a9", - "metadata": {}, - "source": [ - "We are now ready to run the stage to create the model. We will use the training data from `test_dc2_training_9816.hdf5`, which contains 10,225 galaxies drawn from healpix 9816 from the cosmoDC2_v1.1.4 dataset, to train the model. 
Note that we read this data in called `train_data` in the DataStore. Note that we set `trainfrac` to 0.8, so 80% of the data will be used in the \"main\" training, but 20% will be reserved by `GPzInformer` to determine a SIGMA parameter. We set `max_iter` to 150, so we will see 150 steps where the stage tries to maximize the likelihood. We run the stage as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d925f67e-d3a5-49ae-9a1e-6ac043dbdd0a", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pz_train.inform(training_data)" - ] - }, - { - "cell_type": "markdown", - "id": "c5c7a409-1919-49c6-a258-4af05fe30e00", - "metadata": {}, - "source": [ - "This should have taken about 30 seconds on a typical desktop computer, and you should now see a file called `GPz_model.pkl` in the directory. This model file is used by the `GPzEstimator` stage to determine our redshift PDFs for the test set of galaxies. Let's set up that stage, again defining a dictionary of variables for the config params:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc2a9632-adfd-42cc-96c7-c0bb34390b15", - "metadata": {}, - "outputs": [], - "source": [ - "gpz_test_dict = dict(hdf5_groupname=\"photometry\", model=\"GPz_model.pkl\")\n", - "\n", - "gpz_run = GPzEstimator.make_stage(name=\"gpz_run\", **gpz_test_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "8bfe0f11-5c38-4856-b845-d5fb830e707a", - "metadata": {}, - "source": [ - "Let's run the stage and compute photo-z's for our test set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4c739a8-1155-4265-ba2b-45709399f291", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "results = gpz_run.estimate(test_data)" - ] - }, - { - "cell_type": "markdown", - "id": "2c0d2a56-029b-45d2-b5e1-10eee75d24bb", - "metadata": {}, - "source": [ - "This should be very fast, under a second for our 20,449 galaxies in the test set. Now, let's plot a scatter plot of the point estimates, as well as a few example PDFs. We can get access to the `qp` ensemble that was written via the DataStore via `results()`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d8dc61f-c073-44b1-9d32-6e5d715e699e", - "metadata": {}, - "outputs": [], - "source": [ - "ens = results()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baa61f3d-426e-41d0-9d3f-65eb05a5b7ec", - "metadata": {}, - "outputs": [], - "source": [ - "expdfids = [2, 180, 13517, 18032]\n", - "fig, axs = plt.subplots(4, 1, figsize=(12,10))\n", - "for i, xx in enumerate(expdfids):\n", - " axs[i].set_xlim(0,3)\n", - " ens[xx].plot_native(axes=axs[i])\n", - "axs[3].set_xlabel(\"redshift\", fontsize=15)" - ] - }, - { - "cell_type": "markdown", - "id": "093dd6f9-f935-4aa5-9898-1c52b3bef6d1", - "metadata": {}, - "source": [ - "GPzEstimator parameterizes each PDF as a single Gaussian, here we see a few examples of Gaussians of different widths. 
Now let's grab the mode of each PDF (stored as ancil data in the ensemble) and compare to the true redshifts from the test_data file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e33ac16a-1c2b-4be8-a947-362bcc039431", - "metadata": {}, - "outputs": [], - "source": [ - "truez = test_data.data['photometry']['redshift']\n", - "zmode = ens.ancil['zmode'].flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2f63f2e-7931-41c5-8cec-32a60230d4cc", - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(12,12))\n", - "plt.scatter(truez, zmode, s=3)\n", - "plt.plot([0,3],[0,3], 'k--')\n", - "plt.xlabel(\"redshift\", fontsize=12)\n", - "plt.ylabel(\"z mode\", fontsize=12)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}