Commit
Parallelizing PointOutlierRate and updating the notebook with an example.
drewoldag committed Feb 15, 2024
1 parent be1c73d commit 6fe662c
Showing 2 changed files with 94 additions and 2 deletions.
65 changes: 63 additions & 2 deletions src/qp/metrics/parallel_metrics.ipynb
@@ -12,6 +12,7 @@
" PointSigmaIQR,\n",
" PointBias,\n",
" PointSigmaMAD,\n",
" PointOutlierRate,\n",
")"
]
},
@@ -142,7 +143,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"An example running the PointBias metric in directly and in parallel"
"### An example running the PointBias metric in directly and in parallel"
]
},
{
@@ -167,7 +168,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"An example running PointSigmaMAD directly and in parallel"
"### An example running PointSigmaMAD directly and in parallel"
]
},
{
@@ -221,6 +222,66 @@
"source": [
"run_parallel_metric(PointSigmaMAD(), point_sigma_mad_data_chunks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### An example running PointOutlierRate metric directly and in parallel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# An example with PointOutlierRate\n",
"point_outlier_estimator = PointOutlierRate()\n",
"point_outlier_estimator_list = [point_outlier_estimator]*n_chunk\n",
"point_outlier_data_chunks = [chunk for chunk in zip(point_outlier_estimator_list, chunker(estimate, chunk_size), chunker(reference, chunk_size))]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PointOutlierRate().evaluate(estimate, reference)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"por = PointOutlierRate()\n",
"centroids = por.accumulate(estimate, reference)\n",
"\n",
"por.finalize(centroids=[centroids])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The estimation of the metric trends closer to the analytic if the value of `compression` is increased.\n",
"\n",
"The default value for compression is 1000. If set to 10_000, the estimate becomes 0.13645.\n",
"\n",
"Note that, of course, setting compression = 10_000 increases runtime and memory usage."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_parallel_metric(PointOutlierRate(), point_outlier_data_chunks)"
]
}
],
"metadata": {
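The notebook cells in this diff rely on `chunker` and `run_parallel_metric` helpers defined earlier in the notebook (not shown here). A minimal serial sketch of the accumulate/finalize pattern they exercise, using a hypothetical toy metric (the mean of the scaled residual `ez`) in place of the real qp metric classes:

```python
import numpy as np

def chunker(values, chunk_size):
    # Yield successive chunk_size-long slices of values.
    for start in range(0, len(values), chunk_size):
        yield values[start:start + chunk_size]

class ToyMeanMetric:
    # Toy stand-in for a qp point metric: accumulate returns a per-chunk
    # partial result, finalize combines the partials into the global value.
    def accumulate(self, estimate, reference):
        ez = (estimate - reference) / (1.0 + reference)
        return ez.sum(), ez.size

    def finalize(self, partials):
        total = sum(p[0] for p in partials)
        count = sum(p[1] for p in partials)
        return total / count

def run_parallel_metric(metric, data_chunks):
    # Serial stand-in: in the notebook each accumulate call would run in
    # a separate worker process, and only the partials are gathered.
    partials = [m.accumulate(est, ref) for m, est, ref in data_chunks]
    return metric.finalize(partials)

estimate = np.arange(10.0)
reference = np.zeros(10)
chunks = list(zip([ToyMeanMetric()] * 4,
                  chunker(estimate, 3), chunker(reference, 3)))
result = run_parallel_metric(ToyMeanMetric(), chunks)
```

Because the mean combines exactly across chunks, `result` matches the direct computation; the real PointOutlierRate metric needs t-digest centroids to combine its quantile-based cut across chunks.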
31 changes: 31 additions & 0 deletions src/qp/metrics/point_estimate_metric_classes.py
@@ -178,6 +178,37 @@ def evaluate(self, estimate, reference):
        outlier = np.sum(mask)
        return float(outlier) / float(num)

    def accumulate(self, estimate, reference):
        ez = (estimate - reference) / (1.0 + reference)
        digest = TDigest.compute(ez, compression=1000)
        centroids = digest.get_centroids()
        return centroids

    def finalize(self, centroids=None):
        digests = (
            TDigest.of_centroids(np.array(centroid), compression=1000)
            for centroid in centroids
        )
        digest = reduce(add, digests)

        # this replaces the call to PointSigmaIQR().evaluate()
        x75, x25 = digest.inverse_cdf([0.75, 0.25])
        iqr = x75 - x25
        sigma_iqr = iqr / 1.349

        three_sig = 3.0 * sigma_iqr
        cut_criterion = np.maximum(0.06, three_sig)

        # here we use the number of points in the centroids as an
        # approximation of ez.
        centroids = digest.get_centroids()
        mask = np.fabs(centroids[:, 0]) > cut_criterion
        outlier = np.sum(centroids[mask, 1])

        # Since we use equal weights for all the values in the digest,
        # digest.weight is the total number of values, and is stored as a float.
        return float(outlier) / digest.weight

class PointSigmaMAD(PointToPointMetric):
"""Function to calculate median absolute deviation and sigma
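The new `finalize` method approximates, from merged t-digest centroids, the quantity that `evaluate` computes exactly. For reference, a NumPy-only sketch of that exact serial computation (no t-digest or qp imports; the function name is illustrative):

```python
import numpy as np

def point_outlier_rate(estimate, reference):
    # Scaled residual, as in the accumulate method above.
    ez = (estimate - reference) / (1.0 + reference)
    # Robust sigma estimate from the interquartile range (IQR / 1.349).
    x75, x25 = np.percentile(ez, [75.0, 25.0])
    sigma_iqr = (x75 - x25) / 1.349
    # Points beyond max(0.06, 3 * sigma_iqr) count as outliers.
    cut_criterion = np.maximum(0.06, 3.0 * sigma_iqr)
    mask = np.fabs(ez) > cut_criterion
    return float(np.sum(mask)) / float(ez.size)
```

The parallel version trades this exact per-point count for the centroid-weight approximation, which is why its result depends on the t-digest `compression` setting.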
