From f4ec7915925862bdf67970eecb3b192563a8bfd3 Mon Sep 17 00:00:00 2001
From: Felix <65565033+fexfl@users.noreply.github.com>
Date: Wed, 25 Dec 2024 14:37:47 +0100
Subject: [PATCH] Added deltas between pseudonymized texts using difflib

---
 notebook/batching_performance.ipynb | 42 ++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/notebook/batching_performance.ipynb b/notebook/batching_performance.ipynb
index 02c8d54..42e2733 100644
--- a/notebook/batching_performance.ipynb
+++ b/notebook/batching_performance.ipynb
@@ -23,7 +23,8 @@
     "import time\n",
     "import datetime\n",
     "import matplotlib.pyplot as plt\n",
-    "from IPython.display import display, HTML"
+    "from IPython.display import display, HTML\n",
+    "import difflib"
    ]
   },
   {
@@ -59,7 +60,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size."
+    "Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size. The text with pseudonymized entities can be displayed by setting ``disp=True``."
    ]
   },
   {
@@ -148,7 +149,6 @@
     "    for key, height in email_bar_height.items():\n",
     "        plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
     "        bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
-    "    #plt.yscale(\"log\")\n",
     "    plt.xlabel(\"Email\")\n",
     "    plt.ylabel(\"t [s]\")\n",
     "    plt.title(\"Computation times for emails, model loading and file reading\")\n",
@@ -215,7 +215,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A bar plot displaying the average email processing times for the different batch sizes."
+    "A bar plot displaying the average email processing times for the different batch sizes."
    ]
   },
   {
@@ -235,6 +235,40 @@
     "plt.title(\"Average email time for different batch sizes\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Testing for quality differences by displaying differences in the pseudonymized text using ``difflib``. Deltas are only printed if there are differences between the pseudonymized texts. A cross-check is performed using ``difflib.SequenceMatcher().ratio()``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use the first result with batch size 1 as the reference that all results are compared to\n",
+    "standard_output = next(output for output in outputs if output[\"batch_size\"] == 1)\n",
+    "standard_ps_texts = [email_dict[\"pseudo_content\"] for email_dict in standard_output[\"email_outputs\"]]\n",
+    "\n",
+    "# compare every result to the reference; nothing is printed for identical texts\n",
+    "for output in outputs:\n",
+    "    label = f\"batch size {output['batch_size']} at sample {output['sample']}\"\n",
+    "    ps_texts = [email_dict[\"pseudo_content\"] for email_dict in output[\"email_outputs\"]]\n",
+    "    for idx, (text, stdtext) in enumerate(zip(ps_texts, standard_ps_texts)):\n",
+    "        # line-wise diff to the reference, keeping only added/removed lines (line endings stripped for printing)\n",
+    "        diff = difflib.ndiff(stdtext.splitlines(keepends=True), text.splitlines(keepends=True))\n",
+    "        deltas = [line.rstrip(\"\\r\\n\") for line in diff if line.startswith((\"+ \", \"- \"))]\n",
+    "        if deltas:\n",
+    "            print(f\"Delta in {label}, email {idx}:\")\n",
+    "            print(*deltas, sep=\"\\n\")\n",
+    "        # cross-check with the character-level matching ratio\n",
+    "        rt = difflib.SequenceMatcher(None, stdtext, text).ratio()\n",
+    "        if rt != 1.0:\n",
+    "            print(f\"Delta in {label}, email {idx}: Matching ratio is {rt}, not 1!\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,