From cd05564c2337ae54f7841a7fe307089416c2af23 Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Tue, 7 Nov 2023 09:18:10 -0800 Subject: [PATCH 1/8] WIP - Scaffolding for pdf-vs-pdf evaluator class. --- src/rail/evaluation/prob_prob_evaluator.py | 61 +++++++++++ src/rail/evaluation/testing.ipynb | 114 +++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 src/rail/evaluation/prob_prob_evaluator.py create mode 100644 src/rail/evaluation/testing.ipynb diff --git a/src/rail/evaluation/prob_prob_evaluator.py b/src/rail/evaluation/prob_prob_evaluator.py new file mode 100644 index 00000000..0bb9ccc5 --- /dev/null +++ b/src/rail/evaluation/prob_prob_evaluator.py @@ -0,0 +1,61 @@ +import numpy as np + +from ceci.config import StageParameter as Param +from qp.metrics import calculate_goodness_of_fit + +from rail.core.data import Hdf5Handle, QPHandle +from rail.core.stage import RailStage +from rail.evaluation.evaluator import Evaluator + +class ProbProbEvaluator(Evaluator): + """Evaluate the performance of a photo-z estimator against reference PDFs""" + + name = 'ProbProbEvaluator' + config_options = RailStage.config_options.copy() + config_options.update( + metrics=Param(list, [], msg="The metrics you want to evaluate."), + chunk_size=Param(int, 1000, msg="The default number of PDFs to evaluate per loop.") + ) + inputs = [('input', QPHandle), + ('truth', QPHandle)] + outputs = [('output', Hdf5Handle)] + + def __init__(self, args, comm=None): + Evaluator.__init__(self, args, comm=comm) + + def run(self): + out_table = {} + print(f"Requested metrics: {self.config.metrics}") + + estimate_iterator = self.input_iterator('input') + + #! Correct the following line to be 'truth' !!! + # comparison_iterator = self.input_iterator('input') + + #! Need to implement something to ensure that the iterators are aligned. + + first = True + for s, e, data in estimate_iterator: + print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") + self._process_chunk(s, e, data, first) + first = False + + + out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} + self.add_data('output', data=out_table_to_write) + + def _process_chunk(self, start, end, data, first): + out_table = {} + for metric in self.config.metrics: + try: + #! Fix the next line since it's just data vs. data !!! + out_table[metric] = calculate_goodness_of_fit(data, data, fit_metric=metric) + except KeyError: + print(f"User requested unrecognized metric: {metric} - Skipping.") + + if first: + # Need to determine how to correctly initialize this + pass + else: + # Remove the "else", but figure out the correct "non-first" logic. 
+ pass diff --git a/src/rail/evaluation/testing.ipynb b/src/rail/evaluation/testing.ipynb new file mode 100644 index 00000000..f3fc6f1a --- /dev/null +++ b/src/rail/evaluation/testing.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.evaluation.prob_prob_evaluator import ProbProbEvaluator\n", + "from rail.core.stage import RailStage\n", + "from rail.core.data import QPHandle, TableHandle\n", + "\n", + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 'cvm' takes about 3.5 minutes to run\n", + "# 'ad' takes about ~4 minutes to run\n", + "# 'ks' takes about 2.75 minutes to run \n", + "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', metrics=['cvm', 'ks', 'omega'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from rail.core.utils import find_rail_file\n", + "possible_local_file = './examples_data/evaluation_data/data/output_fzboost.hdf5'\n", + "if os.path.exists(possible_local_file):\n", + " pdfs_file = os.path.abspath(possible_local_file)\n", + "else:\n", + " pdfs_file = 'examples_data/evaluation_data/data/output_fzboost.hdf5'\n", + " try:\n", + " os.makedirs(os.path.dirname(pdfs_file))\n", + " except FileExistsError:\n", + " pass\n", + " curl_com = f\"curl -o {pdfs_file} https://portal.nersc.gov/cfs/lsst/PZ/output_fzboost.hdf5\"\n", + " os.system(curl_com)\n", + "\n", + "ztrue_file = find_rail_file('examples_data/testdata/test_dc2_validation_9816.hdf5')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ensemble = DS.read_file(key='pdfs_data', handle_class=QPHandle, path=pdfs_file)\n", + "ztrue_data = DS.read_file('ztrue_data', TableHandle, ztrue_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle)\n", + "# ensemble = DS.add_data('ensemble', fzdata(), QPHandle, path=pdfs_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "squish_results = squish_fish.evaluate(ensemble, truth)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tables_io\n", + "results_df= tables_io.convertObj(squish_results(), tables_io.types.PD_DATAFRAME)\n", + "results_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rail", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ac36d537c0d322e60550a3e8669bc1a41a320fc6 Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Tue, 7 Nov 2023 10:29:35 -0800 Subject: [PATCH 2/8] Writing output for evaluation of iterated input. 
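The change below moves the evaluator from accumulating a single in-memory output table to streaming results out chunk by chunk: the output handle is created and pre-sized on the first chunk, every chunk is written into its own slice, and the write is finalized after the loop. A rough sketch of that control flow, using the handle methods that appear in the diff (add_handle, initialize_write, set_data, write_chunk, finalize_write); the compute_metrics callable is a hypothetical stand-in for the per-chunk metric calculation, not part of the real class:

    import numpy as np

    def run_chunked(stage, compute_metrics):
        # compute_metrics maps a chunk of PDFs to {metric_name: per-object values}.
        output_handle = None
        for start, end, chunk in stage.input_iterator('input'):
            out = {name: np.asarray(vals, dtype=float)
                   for name, vals in compute_metrics(chunk).items()}
            if output_handle is None:
                # First chunk: create the handle and pre-allocate the full output
                # length so later chunks can be written into their own slices.
                output_handle = stage.add_handle('output', data=out)
                output_handle.initialize_write(stage._input_length,
                                               communicator=stage.comm)
            output_handle.set_data(out, partial=True)
            output_handle.write_chunk(start, end)
        output_handle.finalize_write()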
--- src/rail/evaluation/prob_prob_evaluator.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/rail/evaluation/prob_prob_evaluator.py b/src/rail/evaluation/prob_prob_evaluator.py index 0bb9ccc5..76511880 100644 --- a/src/rail/evaluation/prob_prob_evaluator.py +++ b/src/rail/evaluation/prob_prob_evaluator.py @@ -22,15 +22,15 @@ class ProbProbEvaluator(Evaluator): def __init__(self, args, comm=None): Evaluator.__init__(self, args, comm=comm) + self._output_handle = None def run(self): - out_table = {} print(f"Requested metrics: {self.config.metrics}") estimate_iterator = self.input_iterator('input') - + #! Correct the following line to be 'truth' !!! - # comparison_iterator = self.input_iterator('input') + # comparison_iterator = self.input_iterator('input') #! Need to implement something to ensure that the iterators are aligned. @@ -40,9 +40,7 @@ def run(self): self._process_chunk(s, e, data, first) first = False - - out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} - self.add_data('output', data=out_table_to_write) + self._output_handle.finalize_write() def _process_chunk(self, start, end, data, first): out_table = {} @@ -53,9 +51,10 @@ def _process_chunk(self, start, end, data, first): except KeyError: print(f"User requested unrecognized metric: {metric} - Skipping.") + out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} + if first: - # Need to determine how to correctly initialize this - pass - else: - # Remove the "else", but figure out the correct "non-first" logic. - pass + self._output_handle = self.add_handle('output', data=out_table_to_write) + self._output_handle.initialize_write(self._input_length, communicator=self.comm) + self._output_handle.set_data(out_table_to_write, partial=True) + self._output_handle.write_chunk(start, end) From aaaa0efab81bdbb933418d2473510a2a83c86a93 Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Tue, 7 Nov 2023 15:29:29 -0800 Subject: [PATCH 3/8] Adding temporary support for kld metric. 
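The goodness-of-fit dispatcher raises a KeyError for 'kld', so the diff below temporarily routes that case through qp's calculate_kld, controlled by two new options, limits and dx, that define a regular evaluation grid. As a self-contained NumPy illustration of what such a grid-based KL divergence computes (a sketch of the idea, not qp's implementation):

    import numpy as np

    def kld_on_grid(p_vals, q_vals, dx):
        # Discrete approximation of KL(p || q) = integral p(z) log(p(z)/q(z)) dz,
        # with both PDFs already evaluated on the same regular grid of spacing dx.
        p = np.clip(p_vals, 1e-12, None)
        q = np.clip(q_vals, 1e-12, None)
        return np.sum(p * np.log(p / q)) * dx

    limits, dx = (0.0, 3.0), 0.01            # mirrors the new config defaults
    zgrid = np.arange(limits[0], limits[1], dx)
    p = np.exp(-0.5 * ((zgrid - 1.0) / 0.2) ** 2) / (0.2 * np.sqrt(2 * np.pi))
    q = np.exp(-0.5 * ((zgrid - 1.1) / 0.2) ** 2) / (0.2 * np.sqrt(2 * np.pi))
    print(kld_on_grid(p, q, dx))             # ~0.125 for these two Gaussians

In the stage itself the grid never appears explicitly; calculate_kld(data, data, self.config.limits, self.config.dx) takes the ensembles plus the grid parameters directly.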
--- src/rail/evaluation/prob_prob_evaluator.py | 8 ++++++-- src/rail/evaluation/testing.ipynb | 5 +++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/rail/evaluation/prob_prob_evaluator.py b/src/rail/evaluation/prob_prob_evaluator.py index 76511880..bd9598ac 100644 --- a/src/rail/evaluation/prob_prob_evaluator.py +++ b/src/rail/evaluation/prob_prob_evaluator.py @@ -1,7 +1,7 @@ import numpy as np from ceci.config import StageParameter as Param -from qp.metrics import calculate_goodness_of_fit +from qp.metrics import calculate_goodness_of_fit, calculate_kld from rail.core.data import Hdf5Handle, QPHandle from rail.core.stage import RailStage @@ -14,7 +14,9 @@ class ProbProbEvaluator(Evaluator): config_options = RailStage.config_options.copy() config_options.update( metrics=Param(list, [], msg="The metrics you want to evaluate."), - chunk_size=Param(int, 1000, msg="The default number of PDFs to evaluate per loop.") + chunk_size=Param(int, 1000, msg="The default number of PDFs to evaluate per loop."), + limits=Param(tuple, (0.0, 3.0), msg="The default end points for calculating metrics on a grid."), + dx=Param(float, 0.01, msg="The default step size when calculating metrics on a grid."), ) inputs = [('input', QPHandle), ('truth', QPHandle)] @@ -50,6 +52,8 @@ def _process_chunk(self, start, end, data, first): out_table[metric] = calculate_goodness_of_fit(data, data, fit_metric=metric) except KeyError: print(f"User requested unrecognized metric: {metric} - Skipping.") + if metric == 'kld': + out_table[metric] = calculate_kld(data, data, self.config.limits, self.config.dx) out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} diff --git a/src/rail/evaluation/testing.ipynb b/src/rail/evaluation/testing.ipynb index f3fc6f1a..8dc7e10c 100644 --- a/src/rail/evaluation/testing.ipynb +++ b/src/rail/evaluation/testing.ipynb @@ -22,8 +22,9 @@ "source": [ "# 'cvm' takes about 3.5 minutes to run\n", "# 'ad' takes about ~4 minutes to run\n", - "# 'ks' takes about 2.75 minutes to run \n", - "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', metrics=['cvm', 'ks', 'omega'])\n" + "# 'ks' takes about 2.75 minutes to run\n", + "# 'kld' takes about X minutes to run\n", + "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', metrics=['cvm', 'ks', 'omega', 'kld'])\n" ] }, { From a17d8496143cb9212636d0d79351e61d74104912 Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Tue, 14 Nov 2023 16:42:34 -0800 Subject: [PATCH 4/8] Add dynamic dictionary creation for DistToDist metrics. Clean up prob_prob_evaluator. 
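The hand-written metric dispatch is replaced by a registry built from DistToDistMetric.__subclasses__(), keyed on each class's metric_name; the requested metric is then instantiated with the stage configuration (**self.config.to_dict()) and its evaluate method is called on the chunk. A generic sketch of that registry pattern with toy classes (plain Python, not the real qp metric classes):

    # Toy base class and one concrete metric, standing in for qp's hierarchy.
    class DistToDistMetric:
        metric_name = "base"
        def __init__(self, **config):
            self.config = config          # stage options arrive as keyword args
        def evaluate(self, estimate, reference):
            raise NotImplementedError

    class KSMetric(DistToDistMetric):
        metric_name = "ks"
        def evaluate(self, estimate, reference):
            return 0.0                    # placeholder result

    # Any subclass becomes available simply by existing and declaring a name.
    METRIC_DICT = {cls.metric_name: cls for cls in DistToDistMetric.__subclasses__()}

    metric = METRIC_DICT["ks"](limits=(0.0, 3.0), dx=0.01)
    result = metric.evaluate("estimate_ensemble", "reference_ensemble")

The benefit is that new metrics added to qp become available here without touching the evaluator; the cost is that unrecognized names are silently skipped for now, which the diff flags as something to turn into a logged error.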
--- src/rail/evaluation/prob_prob_evaluator.py | 39 +++++++++++++++------- src/rail/evaluation/testing.ipynb | 6 +++- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/rail/evaluation/prob_prob_evaluator.py b/src/rail/evaluation/prob_prob_evaluator.py index bd9598ac..c15cbe5c 100644 --- a/src/rail/evaluation/prob_prob_evaluator.py +++ b/src/rail/evaluation/prob_prob_evaluator.py @@ -1,22 +1,35 @@ import numpy as np from ceci.config import StageParameter as Param -from qp.metrics import calculate_goodness_of_fit, calculate_kld +from qp.metrics.concrete_metric_classes import DistToDistMetric from rail.core.data import Hdf5Handle, QPHandle from rail.core.stage import RailStage from rail.evaluation.evaluator import Evaluator +# dynamically build a dictionary of all available metrics of the appropriate type +METRIC_DICT = {} +for subcls in DistToDistMetric.__subclasses__(): + METRIC_DICT[subcls.metric_name] = subcls + class ProbProbEvaluator(Evaluator): """Evaluate the performance of a photo-z estimator against reference PDFs""" name = 'ProbProbEvaluator' config_options = RailStage.config_options.copy() config_options.update( - metrics=Param(list, [], msg="The metrics you want to evaluate."), - chunk_size=Param(int, 1000, msg="The default number of PDFs to evaluate per loop."), - limits=Param(tuple, (0.0, 3.0), msg="The default end points for calculating metrics on a grid."), - dx=Param(float, 0.01, msg="The default step size when calculating metrics on a grid."), + metrics=Param(list, [], required=False, + msg="The metrics you want to evaluate."), + chunk_size=Param(int, 1000, required=False, + msg="The default number of PDFs to evaluate per loop."), + limits=Param(tuple, (0.0, 3.0), required=False, + msg="The default end points for calculating metrics on a grid."), + dx=Param(float, 0.01, required=False, + msg="The default step size when calculating metrics on a grid."), + num_samples=Param(int, 100, required=False, + msg="The number of random samples to select for certain metrics."), + _random_state=Param(float, default=None, required=False, + msg="Random seed value to use for reproducible results."), ) inputs = [('input', QPHandle), ('truth', QPHandle)] @@ -47,13 +60,15 @@ def run(self): def _process_chunk(self, start, end, data, first): out_table = {} for metric in self.config.metrics: - try: - #! Fix the next line since it's just data vs. data !!! - out_table[metric] = calculate_goodness_of_fit(data, data, fit_metric=metric) - except KeyError: - print(f"User requested unrecognized metric: {metric} - Skipping.") - if metric == 'kld': - out_table[metric] = calculate_kld(data, data, self.config.limits, self.config.dx) + if metric not in METRIC_DICT: + #! Make the following a logged error instead of bailing out of the stage. + # raise ValueError( + # f"Unsupported metric requested: '{metric}'. 
+ # Available metrics are: {METRIC_DICT.keys()}") + continue + + this_metric = METRIC_DICT[metric](**self.config.to_dict()) + out_table[metric] = this_metric.evaluate(data, data) out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} diff --git a/src/rail/evaluation/testing.ipynb b/src/rail/evaluation/testing.ipynb index 8dc7e10c..ef52c930 100644 --- a/src/rail/evaluation/testing.ipynb +++ b/src/rail/evaluation/testing.ipynb @@ -24,7 +24,11 @@ "# 'ad' takes about ~4 minutes to run\n", "# 'ks' takes about 2.75 minutes to run\n", "# 'kld' takes about X minutes to run\n", - "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', metrics=['cvm', 'ks', 'omega', 'kld'])\n" + "stage_dict = dict(\n", + " metrics=['cvm', 'ks', 'omega', 'kld'],\n", + " _random_state=None,\n", + ")\n", + "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', **stage_dict)\n" ] }, { From f7e962df8b962896e9719b3b7c4b53e636ace94a Mon Sep 17 00:00:00 2001 From: drewoldag <47493171+drewoldag@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:42:43 -0800 Subject: [PATCH 5/8] Adding two other flavors of evaluator classes. Working on getting the iterators to work correctly. --- ...evaluator.py => dist_to_dist_evaluator.py} | 29 ++++--- .../evaluation/dist_to_point_evaluator.py | 78 +++++++++++++++++++ .../evaluation/point_to_point_evaluator.py | 70 +++++++++++++++++ src/rail/evaluation/testing.ipynb | 8 +- 4 files changed, 165 insertions(+), 20 deletions(-) rename src/rail/evaluation/{prob_prob_evaluator.py => dist_to_dist_evaluator.py} (75%) create mode 100644 src/rail/evaluation/dist_to_point_evaluator.py create mode 100644 src/rail/evaluation/point_to_point_evaluator.py diff --git a/src/rail/evaluation/prob_prob_evaluator.py b/src/rail/evaluation/dist_to_dist_evaluator.py similarity index 75% rename from src/rail/evaluation/prob_prob_evaluator.py rename to src/rail/evaluation/dist_to_dist_evaluator.py index c15cbe5c..97c48185 100644 --- a/src/rail/evaluation/prob_prob_evaluator.py +++ b/src/rail/evaluation/dist_to_dist_evaluator.py @@ -1,7 +1,7 @@ import numpy as np from ceci.config import StageParameter as Param -from qp.metrics.concrete_metric_classes import DistToDistMetric +from qp.metrics.base_metric_classes import DistToDistMetric from rail.core.data import Hdf5Handle, QPHandle from rail.core.stage import RailStage @@ -12,10 +12,10 @@ for subcls in DistToDistMetric.__subclasses__(): METRIC_DICT[subcls.metric_name] = subcls -class ProbProbEvaluator(Evaluator): +class DistToDistEvaluator(Evaluator): """Evaluate the performance of a photo-z estimator against reference PDFs""" - name = 'ProbProbEvaluator' + name = 'DistToDistEvaluator' config_options = RailStage.config_options.copy() config_options.update( metrics=Param(list, [], required=False, @@ -38,37 +38,34 @@ class ProbProbEvaluator(Evaluator): def __init__(self, args, comm=None): Evaluator.__init__(self, args, comm=comm) self._output_handle = None + self._metric_dict = METRIC_DICT def run(self): print(f"Requested metrics: {self.config.metrics}") - estimate_iterator = self.input_iterator('input') - - #! Correct the following line to be 'truth' !!! - # comparison_iterator = self.input_iterator('input') - - #! Need to implement something to ensure that the iterators are aligned. 
+ estimate_iterator = self.get_handle('input').iterator() + reference_iterator = self.get_handle('truth').iterator() first = True - for s, e, data in estimate_iterator: + for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") - self._process_chunk(s, e, data, first) + self._process_chunk(s, e, estimate_data, reference_data, first) first = False self._output_handle.finalize_write() - def _process_chunk(self, start, end, data, first): + def _process_chunk(self, start, end, estimate_data, reference_data, first): out_table = {} for metric in self.config.metrics: - if metric not in METRIC_DICT: + if metric not in self._metric_dict: #! Make the following a logged error instead of bailing out of the stage. # raise ValueError( # f"Unsupported metric requested: '{metric}'. - # Available metrics are: {METRIC_DICT.keys()}") + # Available metrics are: {self._metric_dict.keys()}") continue - this_metric = METRIC_DICT[metric](**self.config.to_dict()) - out_table[metric] = this_metric.evaluate(data, data) + this_metric = self._metric_dict[metric](**self.config.to_dict()) + out_table[metric] = this_metric.evaluate(estimate_data, reference_data) out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} diff --git a/src/rail/evaluation/dist_to_point_evaluator.py b/src/rail/evaluation/dist_to_point_evaluator.py new file mode 100644 index 00000000..3bd725ec --- /dev/null +++ b/src/rail/evaluation/dist_to_point_evaluator.py @@ -0,0 +1,78 @@ +import numpy as np + +from ceci.config import StageParameter as Param +from qp.metrics.base_metric_classes import DistToPointMetric + +from rail.core.data import Hdf5Handle, QPHandle, TableHandle +from rail.core.stage import RailStage +from rail.evaluation.evaluator import Evaluator + +# dynamically build a dictionary of all available metrics of the appropriate type +METRIC_DICT = {} +for subcls in DistToPointMetric.__subclasses__(): + METRIC_DICT[subcls.metric_name] = subcls + +class DistToPointEvaluator(Evaluator): + """Evaluate the performance of a photo-z estimator against reference point estimate""" + + name = 'DistToPointEvaluator' + config_options = RailStage.config_options.copy() + config_options.update( + metrics=Param(list, [], required=False, + msg="The metrics you want to evaluate."), + chunk_size=Param(int, 1000, required=False, + msg="The default number of PDFs to evaluate per loop."), + limits=Param(tuple, (0.0, 3.0), required=False, + msg="The default end points for calculating metrics on a grid."), + dx=Param(float, 0.01, required=False, + msg="The default step size when calculating metrics on a grid."), + quantile_grid=Param(list, np.linspace(0,1,100), required=False, + msg="The quantile value grid on which to evaluate the CDF values. 
(0, 1)"), + x_grid=Param(list, np.linspace(0,2.5, 301), required=False, + msg="The x-value grid at which to evaluate the pdf values."), + _random_state=Param(float, default=None, required=False, + msg="Random seed value to use for reproducible results."), + ) + inputs = [('input', QPHandle), + ('truth', TableHandle)] + outputs = [('output', Hdf5Handle)] + + def __init__(self, args, comm=None): + Evaluator.__init__(self, args, comm=comm) + self._output_handle = None + self._metric_dict = METRIC_DICT + + def run(self): + print(f"Requested metrics: {self.config.metrics}") + + estimate_iterator = self.get_handle('input').iterator() + reference_iterator = self.get_handle('truth').iterator() + + first = True + for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): + print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") + self._process_chunk(s, e, estimate_data, reference_data, first) + first = False + + self._output_handle.finalize_write() + + def _process_chunk(self, start, end, estimate_data, reference_data, first): + out_table = {} + for metric in self.config.metrics: + if metric not in self._metric_dict: + #! Make the following a logged error instead of bailing out of the stage. + # raise ValueError( + # f"Unsupported metric requested: '{metric}'. + # Available metrics are: {self._metric_dict.keys()}") + continue + + this_metric = self._metric_dict[metric](**self.config.to_dict()) + out_table[metric] = this_metric.evaluate(estimate_data, reference_data) + + out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} + + if first: + self._output_handle = self.add_handle('output', data=out_table_to_write) + self._output_handle.initialize_write(self._input_length, communicator=self.comm) + self._output_handle.set_data(out_table_to_write, partial=True) + self._output_handle.write_chunk(start, end) diff --git a/src/rail/evaluation/point_to_point_evaluator.py b/src/rail/evaluation/point_to_point_evaluator.py new file mode 100644 index 00000000..06510f6f --- /dev/null +++ b/src/rail/evaluation/point_to_point_evaluator.py @@ -0,0 +1,70 @@ +import numpy as np + +from ceci.config import StageParameter as Param +from qp.metrics.base_metric_classes import PointToPointMetric + +from rail.core.data import Hdf5Handle, TableHandle +from rail.core.stage import RailStage +from rail.evaluation.evaluator import Evaluator + +# dynamically build a dictionary of all available metrics of the appropriate type +METRIC_DICT = {} +for subcls in PointToPointMetric.__subclasses__(): + METRIC_DICT[subcls.metric_name] = subcls + +class PointToPointEvaluator(Evaluator): + """Evaluate the performance of a photo-z estimator against reference point estimate""" + + name = 'PointToPointEvaluator' + config_options = RailStage.config_options.copy() + config_options.update( + metrics=Param(list, [], required=False, + msg="The metrics you want to evaluate."), + chunk_size=Param(int, 1000, required=False, + msg="The default number of PDFs to evaluate per loop."), + _random_state=Param(float, default=None, required=False, + msg="Random seed value to use for reproducible results."), + ) + inputs = [('input', TableHandle), + ('truth', TableHandle)] + outputs = [('output', Hdf5Handle)] + + def __init__(self, args, comm=None): + Evaluator.__init__(self, args, comm=comm) + self._output_handle = None + self._metric_dict = METRIC_DICT + + def run(self): + print(f"Requested metrics: {self.config.metrics}") + + estimate_iterator = 
self.get_handle('input').iterator() + reference_iterator = self.get_handle('truth').iterator() + + first = True + for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): + print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") + self._process_chunk(s, e, estimate_data, reference_data, first) + first = False + + self._output_handle.finalize_write() + + def _process_chunk(self, start, end, estimate_data, reference_data, first): + out_table = {} + for metric in self.config.metrics: + if metric not in self._metric_dict: + #! Make the following a logged error instead of bailing out of the stage. + # raise ValueError( + # f"Unsupported metric requested: '{metric}'. + # Available metrics are: {self._metric_dict.keys()}") + continue + + this_metric = self._metric_dict[metric](**self.config.to_dict()) + out_table[metric] = this_metric.evaluate(estimate_data, reference_data) + + out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} + + if first: + self._output_handle = self.add_handle('output', data=out_table_to_write) + self._output_handle.initialize_write(self._input_length, communicator=self.comm) + self._output_handle.set_data(out_table_to_write, partial=True) + self._output_handle.write_chunk(start, end) diff --git a/src/rail/evaluation/testing.ipynb b/src/rail/evaluation/testing.ipynb index ef52c930..5e88877d 100644 --- a/src/rail/evaluation/testing.ipynb +++ b/src/rail/evaluation/testing.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "from rail.evaluation.prob_prob_evaluator import ProbProbEvaluator\n", + "from rail.evaluation.dist_to_point_evaluator import DistToPointEvaluator\n", "from rail.core.stage import RailStage\n", "from rail.core.data import QPHandle, TableHandle\n", "\n", @@ -28,7 +28,7 @@ " metrics=['cvm', 'ks', 'omega', 'kld'],\n", " _random_state=None,\n", ")\n", - "squish_fish = ProbProbEvaluator.make_stage(name='SillyPoopfish', **stage_dict)\n" + "squish_fish = DistToPointEvaluator.make_stage(name='SillyPoopfish', **stage_dict)\n" ] }, { @@ -70,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle)\n", + "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle, path=ztrue_file)\n", "# ensemble = DS.add_data('ensemble', fzdata(), QPHandle, path=pdfs_file)" ] }, @@ -111,7 +111,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.4" } }, "nbformat": 4, From 60a0ed606f09cc27b0148031f64f29d370d58d32 Mon Sep 17 00:00:00 2001 From: drewoldag <47493171+drewoldag@users.noreply.github.com> Date: Tue, 28 Nov 2023 16:04:16 -0800 Subject: [PATCH 6/8] More experimentation to get the iterators to work correctly. --- src/rail/core/stage.py | 4 ++-- src/rail/evaluation/dist_to_point_evaluator.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/rail/core/stage.py b/src/rail/core/stage.py index 5e6d5653..d7fee3a7 100644 --- a/src/rail/core/stage.py +++ b/src/rail/core/stage.py @@ -355,9 +355,9 @@ def input_iterator(self, tag, **kwargs): # in a single chunk. 
else: #pragma: no cover if self.config.hdf5_groupname: - test_data = self.get_data('input')[self.config.hdf5_groupname] + test_data = self.get_data(tag)[self.config.hdf5_groupname] else: - test_data = self.get_data('input') + test_data = self.get_data(tag) max_l = 0 for k, v in test_data.items(): max_l = max(max_l, len(v)) diff --git a/src/rail/evaluation/dist_to_point_evaluator.py b/src/rail/evaluation/dist_to_point_evaluator.py index 3bd725ec..39f74744 100644 --- a/src/rail/evaluation/dist_to_point_evaluator.py +++ b/src/rail/evaluation/dist_to_point_evaluator.py @@ -32,6 +32,8 @@ class DistToPointEvaluator(Evaluator): msg="The x-value grid at which to evaluate the pdf values."), _random_state=Param(float, default=None, required=False, msg="Random seed value to use for reproducible results."), + hdf5_groupname=Param(str, "photometry", required=False, + msg="HDF5 Groupname for truth table."), ) inputs = [('input', QPHandle), ('truth', TableHandle)] @@ -45,8 +47,8 @@ def __init__(self, args, comm=None): def run(self): print(f"Requested metrics: {self.config.metrics}") - estimate_iterator = self.get_handle('input').iterator() - reference_iterator = self.get_handle('truth').iterator() + estimate_iterator = self.input_iterator('input') + reference_iterator = self.input_iterator('truth') first = True for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): From eaaf54dcd809d46d14fd036fbaedaa4dc66e1e4d Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Wed, 29 Nov 2023 16:57:32 -0800 Subject: [PATCH 7/8] Iterators working for DistToPoint and DistToDist. Not working for PointToPoint. Need to address cumulative metrics for DistToPoint (i.e. metrics that require _all_ the data to be evaluated at once). --- src/rail/evaluation/dist_to_dist_evaluator.py | 15 ++- .../evaluation/dist_to_point_evaluator.py | 15 ++- .../evaluation/point_to_point_evaluator.py | 13 +- src/rail/evaluation/testing.ipynb | 116 +++++++++++++++--- 4 files changed, 124 insertions(+), 35 deletions(-) diff --git a/src/rail/evaluation/dist_to_dist_evaluator.py b/src/rail/evaluation/dist_to_dist_evaluator.py index 97c48185..2c743a79 100644 --- a/src/rail/evaluation/dist_to_dist_evaluator.py +++ b/src/rail/evaluation/dist_to_dist_evaluator.py @@ -1,7 +1,7 @@ import numpy as np from ceci.config import StageParameter as Param -from qp.metrics.base_metric_classes import DistToDistMetric +from qp.metrics.concrete_metric_classes import DistToDistMetric from rail.core.data import Hdf5Handle, QPHandle from rail.core.stage import RailStage @@ -43,13 +43,16 @@ def __init__(self, args, comm=None): def run(self): print(f"Requested metrics: {self.config.metrics}") - estimate_iterator = self.get_handle('input').iterator() - reference_iterator = self.get_handle('truth').iterator() + estimate_iterator = self.input_iterator('input') + reference_iterator = self.input_iterator('truth') first = True - for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): - print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") - self._process_chunk(s, e, estimate_data, reference_data, first) + for estimate_data_chunk, reference_data_chunk in zip(estimate_iterator, reference_iterator): + chunk_start, chunk_end, estimate_data = estimate_data_chunk + _, _, reference_data = reference_data_chunk + + print(f"Processing {self.rank} running evaluator on chunk {chunk_start} - {chunk_end}.") + self._process_chunk(chunk_start, chunk_end, estimate_data, reference_data, first) first = False 
self._output_handle.finalize_write() diff --git a/src/rail/evaluation/dist_to_point_evaluator.py b/src/rail/evaluation/dist_to_point_evaluator.py index 39f74744..331389e0 100644 --- a/src/rail/evaluation/dist_to_point_evaluator.py +++ b/src/rail/evaluation/dist_to_point_evaluator.py @@ -1,7 +1,7 @@ import numpy as np from ceci.config import StageParameter as Param -from qp.metrics.base_metric_classes import DistToPointMetric +from qp.metrics.concrete_metric_classes import DistToPointMetric from rail.core.data import Hdf5Handle, QPHandle, TableHandle from rail.core.stage import RailStage @@ -34,6 +34,8 @@ class DistToPointEvaluator(Evaluator): msg="Random seed value to use for reproducible results."), hdf5_groupname=Param(str, "photometry", required=False, msg="HDF5 Groupname for truth table."), + reference_dictionary_key=Param(str, "redshift", required=False, + msg="The key in the `truth` dictionary where the redshift data is stored."), ) inputs = [('input', QPHandle), ('truth', TableHandle)] @@ -51,9 +53,12 @@ def run(self): reference_iterator = self.input_iterator('truth') first = True - for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): - print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") - self._process_chunk(s, e, estimate_data, reference_data, first) + for estimate_data_chunk, reference_data_chunk in zip(estimate_iterator, reference_iterator): + chunk_start, chunk_end, estimate_data = estimate_data_chunk + _, _, reference_data = reference_data_chunk + + print(f"Processing {self.rank} running evaluator on chunk {chunk_start} - {chunk_end}.") + self._process_chunk(chunk_start, chunk_end, estimate_data, reference_data, first) first = False self._output_handle.finalize_write() @@ -69,7 +74,7 @@ def _process_chunk(self, start, end, estimate_data, reference_data, first): continue this_metric = self._metric_dict[metric](**self.config.to_dict()) - out_table[metric] = this_metric.evaluate(estimate_data, reference_data) + out_table[metric] = this_metric.evaluate(estimate_data, reference_data[self.config.reference_dictionary_key]) out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} diff --git a/src/rail/evaluation/point_to_point_evaluator.py b/src/rail/evaluation/point_to_point_evaluator.py index 06510f6f..08b7fa43 100644 --- a/src/rail/evaluation/point_to_point_evaluator.py +++ b/src/rail/evaluation/point_to_point_evaluator.py @@ -37,13 +37,16 @@ def __init__(self, args, comm=None): def run(self): print(f"Requested metrics: {self.config.metrics}") - estimate_iterator = self.get_handle('input').iterator() - reference_iterator = self.get_handle('truth').iterator() + estimate_iterator = self.input_iterator('input') + reference_iterator = self.input_iterator('truth') first = True - for s, e, estimate_data, _, _, reference_data in zip(estimate_iterator, reference_iterator): - print(f"Processing {self.rank} running evaluator on chunk {s} - {e}.") - self._process_chunk(s, e, estimate_data, reference_data, first) + for estimate_data_chunk, reference_data_chunk in zip(estimate_iterator, reference_iterator): + chunk_start, chunk_end, estimate_data = estimate_data_chunk + _, _, reference_data = reference_data_chunk + + print(f"Processing {self.rank} running evaluator on chunk {chunk_start} - {chunk_end}.") + self._process_chunk(chunk_start, chunk_end, estimate_data, reference_data, first) first = False self._output_handle.finalize_write() diff --git a/src/rail/evaluation/testing.ipynb 
b/src/rail/evaluation/testing.ipynb index 5e88877d..1299b16e 100644 --- a/src/rail/evaluation/testing.ipynb +++ b/src/rail/evaluation/testing.ipynb @@ -6,7 +6,11 @@ "metadata": {}, "outputs": [], "source": [ + "import tables_io\n", + "\n", + "from rail.evaluation.dist_to_dist_evaluator import DistToDistEvaluator\n", "from rail.evaluation.dist_to_point_evaluator import DistToPointEvaluator\n", + "from rail.evaluation.point_to_point_evaluator import PointToPointEvaluator\n", "from rail.core.stage import RailStage\n", "from rail.core.data import QPHandle, TableHandle\n", "\n", @@ -15,20 +19,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# 'cvm' takes about 3.5 minutes to run\n", - "# 'ad' takes about ~4 minutes to run\n", - "# 'ks' takes about 2.75 minutes to run\n", - "# 'kld' takes about X minutes to run\n", - "stage_dict = dict(\n", - " metrics=['cvm', 'ks', 'omega', 'kld'],\n", - " _random_state=None,\n", - ")\n", - "squish_fish = DistToPointEvaluator.make_stage(name='SillyPoopfish', **stage_dict)\n" + "# Load example Data" ] }, { @@ -61,7 +55,15 @@ "outputs": [], "source": [ "ensemble = DS.read_file(key='pdfs_data', handle_class=QPHandle, path=pdfs_file)\n", - "ztrue_data = DS.read_file('ztrue_data', TableHandle, ztrue_file)" + "ztrue_data = DS.read_file('ztrue_data', TableHandle, ztrue_file)\n", + "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle, path=ztrue_file)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dist to Dist Evaluation" ] }, { @@ -70,8 +72,17 @@ "metadata": {}, "outputs": [], "source": [ - "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle, path=ztrue_file)\n", - "# ensemble = DS.add_data('ensemble', fzdata(), QPHandle, path=pdfs_file)" + "# 'cvm' takes about 3.5 minutes to run\n", + "# 'ad' takes about ~4 minutes to run\n", + "# 'ks' takes about 2.75 minutes to run\n", + "# 'kld' takes about X minutes to run\n", + "\n", + "stage_dict = dict(\n", + " metrics=['cvm', 'ks', 'omega', 'kld'],\n", + " _random_state=None,\n", + ")\n", + "\n", + "dtd_stage = DistToDistEvaluator.make_stage(name='SillyPoopfish', **stage_dict)" ] }, { @@ -80,7 +91,7 @@ "metadata": {}, "outputs": [], "source": [ - "squish_results = squish_fish.evaluate(ensemble, truth)" + "dtd_results = dtd_stage.evaluate(ensemble, ensemble)" ] }, { @@ -89,10 +100,77 @@ "metadata": {}, "outputs": [], "source": [ - "import tables_io\n", - "results_df= tables_io.convertObj(squish_results(), tables_io.types.PD_DATAFRAME)\n", + "results_df = tables_io.convertObj(dtd_results(), tables_io.types.PD_DATAFRAME)\n", + "results_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dist to Point Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stage_dict = dict(\n", + " metrics=['cdeloss'],\n", + " _random_state=None,\n", + ")\n", + "dtp_stage = DistToPointEvaluator.make_stage(name='SillyPoopfish', **stage_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dtp_results = dtp_stage.evaluate(ensemble, truth)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results_df = tables_io.convertObj(dtp_results(), tables_io.types.PD_DATAFRAME)\n", "results_df" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Point to 
Point Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stage_dict = dict(\n", + " metrics=['point_stats_ez'],\n", + " _random_state=None,\n", + ")\n", + "ptp_stage = PointToPointEvaluator.make_stage(name='SillyPoopfish', **stage_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ptp_results = ptp_stage.evaluate(truth, truth)" + ] } ], "metadata": { @@ -111,7 +189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, From df3bf80385bee071071f55aeb4bfc5c0b46bfd1a Mon Sep 17 00:00:00 2001 From: Drew Oldag Date: Wed, 17 Jan 2024 14:32:11 -0800 Subject: [PATCH 8/8] WIP commit. --- src/rail/evaluation/dist_to_point_evaluator.py | 17 ++++++++++++++++- src/rail/evaluation/testing.ipynb | 11 ++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/rail/evaluation/dist_to_point_evaluator.py b/src/rail/evaluation/dist_to_point_evaluator.py index 331389e0..f67b35e9 100644 --- a/src/rail/evaluation/dist_to_point_evaluator.py +++ b/src/rail/evaluation/dist_to_point_evaluator.py @@ -1,6 +1,7 @@ import numpy as np from ceci.config import StageParameter as Param +from qp.metrics.base_metric_classes import MetricOutputType from qp.metrics.concrete_metric_classes import DistToPointMetric from rail.core.data import Hdf5Handle, QPHandle, TableHandle @@ -63,9 +64,11 @@ def run(self): self._output_handle.finalize_write() + #pylint: disable=too-many-arguments def _process_chunk(self, start, end, estimate_data, reference_data, first): out_table = {} for metric in self.config.metrics: + if metric not in self._metric_dict: #! Make the following a logged error instead of bailing out of the stage. 
# raise ValueError( @@ -74,7 +77,19 @@ def _process_chunk(self, start, end, estimate_data, reference_data, first): continue this_metric = self._metric_dict[metric](**self.config.to_dict()) - out_table[metric] = this_metric.evaluate(estimate_data, reference_data[self.config.reference_dictionary_key]) + + if this_metric.metric_output_type == MetricOutputType.single_value: + print(f"We can't calculate a value for {this_metric.metric_name} right now.") + continue + + if this_metric.metric_output_type == MetricOutputType.single_distribution: + print(f"We can't calculate a value for {this_metric.metric_name} right now.") + continue + + out_table[metric] = this_metric.evaluate( + estimate_data, + reference_data[self.config.reference_dictionary_key] + ) out_table_to_write = {key: np.array(val).astype(float) for key, val in out_table.items()} diff --git a/src/rail/evaluation/testing.ipynb b/src/rail/evaluation/testing.ipynb index 1299b16e..b7b66c50 100644 --- a/src/rail/evaluation/testing.ipynb +++ b/src/rail/evaluation/testing.ipynb @@ -56,7 +56,8 @@ "source": [ "ensemble = DS.read_file(key='pdfs_data', handle_class=QPHandle, path=pdfs_file)\n", "ztrue_data = DS.read_file('ztrue_data', TableHandle, ztrue_file)\n", - "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle, path=ztrue_file)\n" + "truth = DS.add_data('truth', ztrue_data()['photometry'], TableHandle, path=ztrue_file)\n", + "truth_points = DS.add_data('truth_points', ztrue_data()['photometry']['redshift'], TableHandle, path=ztrue_file)" ] }, { @@ -159,6 +160,7 @@ "stage_dict = dict(\n", " metrics=['point_stats_ez'],\n", " _random_state=None,\n", + " hdf5_groupname='redshift',\n", ")\n", "ptp_stage = PointToPointEvaluator.make_stage(name='SillyPoopfish', **stage_dict)" ] @@ -171,6 +173,13 @@ "source": [ "ptp_results = ptp_stage.evaluate(truth, truth)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {