Added deltas between pseudonymized texts using difflib

ssciwr · Dec 25, 2024 · f4ec791 · f4ec791
1 parent 9c2c4f3
commit f4ec791
Showing 1 changed file with 38 additions and 4 deletions.
diff --git a/notebook/batching_performance.ipynb b/notebook/batching_performance.ipynb
@@ -23,7 +23,8 @@
     "import time\n",
     "import datetime\n",
     "import matplotlib.pyplot as plt\n",
-    "from IPython.display import display, HTML"
+    "from IPython.display import display, HTML\n",
+    "import difflib"
    ]
   },
   {
@@ -59,7 +60,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size."
+    "Below is the function for a single performance test. The model is loaded from scratch every iteration, and the CSV file is reread every time, to create equal conditions for every batch size. The text with pseudonymized entities can be displayed by setting ``disp=True``."
    ]
   },
   {
@@ -148,7 +149,6 @@
     "    for key, height in email_bar_height.items():\n",
     "        plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
     "        bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
-    "    #plt.yscale(\"log\")\n",
     "    plt.xlabel(\"Email\")\n",
     "    plt.ylabel(\"t [s]\")\n",
     "    plt.title(\"Computation times for emails, model loading and file reading\")\n",
@@ -215,7 +215,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A bar plot displaying the average email processing times for the different batch sizes."
+    "A bar plot displaying the average email processing time for each batch size."
    ]
   },
   {
@@ -235,6 +235,40 @@
     "plt.title(\"Average email time for different batch sizes\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test for quality differences by comparing the pseudonymized texts of every run against the batch-size-1 result using ``difflib``. Deltas are only printed if the pseudonymized texts differ. As a cross-check, ``difflib.SequenceMatcher().ratio()`` is verified to be 1 for every pair of texts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# declare a result as standard to compare the other results to (the batch size 1 run)\n",
+    "standard_ps_texts = [email_dict[\"pseudo_content\"] for email_dict in next(output[\"email_outputs\"] for output in outputs if output[\"batch_size\"] == 1)]\n",
+    "\n",
+    "# iterate over all results and print diffs to the standard\n",
+    "for output in outputs:\n",
+    "    # only differences are reported; identical texts produce no output\n",
+    "    ps_texts = [email_dict[\"pseudo_content\"] for email_dict in output[\"email_outputs\"]]\n",
+    "    # diff to standard, email by email\n",
+    "    for idx, (text, stdtext) in enumerate(zip(ps_texts, standard_ps_texts)):\n",
+    "        diff = difflib.ndiff(stdtext.splitlines(keepends=True), text.splitlines(keepends=True))\n",
+    "        # keep only added/removed lines (ndiff marks them '+ ' / '- '); strip line endings for clean printing\n",
+    "        deltas = [line.rstrip(\"\\r\\n\") for line in diff if line.startswith((\"+ \", \"- \"))]\n",
+    "        if deltas:\n",
+    "            # print the header once per email instead of once per differing line\n",
+    "            print(f\"Delta in batch size {output['batch_size']} at sample {output['sample']}, email {idx}:\", *deltas, sep=\"\\n\")\n",
+    "        # also test the matching ratio\n",
+    "        rt = difflib.SequenceMatcher(None, stdtext, text).ratio()\n",
+    "        if rt != 1.0:\n",
+    "            print(f\"Delta in batch size {output['batch_size']} at sample {output['sample']}, email {idx}: Matching ratio is not 1!\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,