Removed display option and added comments
fexfl committed Jan 5, 2025
1 parent 8717c05 commit adfcb2d
Showing 1 changed file with 29 additions and 53 deletions.
notebook/batching_performance.ipynb
@@ -27,40 +27,17 @@
"import difflib"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# function for displaying the result using HTML\n",
- "def highlight_ne(text, per_list, org_list, loc_list, misc_list):\n",
- " # create a list of all entities with their positions\n",
- " entities = []\n",
- " for loc in loc_list:\n",
- " entities.append((loc, \"green\"))\n",
- " for org in org_list:\n",
- " entities.append((org, \"blue\"))\n",
- " for misc in misc_list:\n",
- " entities.append((misc, \"yellow\"))\n",
- " for per in per_list:\n",
- " entities.append((per, \"red\"))\n",
- " \n",
- " # sort entities by their positions in the text in reverse order\n",
- " entities.sort(key=lambda x: text.find(x[0]), reverse=True)\n",
- " \n",
- " # replace entities with highlighted spans\n",
- " for entity, color in entities:\n",
- " text = text.replace(entity, f\"<span style=\\\"background-color:{color}\\\">{entity}</span>\")\n",
- "\n",
- " return text"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size. The text with pseudonymized entities can be displayed by setting ``disp=True``."
"Below is the function for a single performance test. The model is loaded from scratch every iteration, and the csv file is reread every time, to create equal conditions for every batching size.\n",
"\n",
"The test returns the total email processing time averaged over all emails, as well as a list of all email output dictionaries as created in the pseudonymization:\n",
"\n",
"Email dict structure: |\n",
"`content`: Original email text |\n",
"`pseudo_content`: Pseudonymized email text |"
]
},
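A minimal sketch of one such output entry (the keys follow the structure above; the placeholder style in `pseudo_content` is illustrative, not necessarily mailcom's exact replacement format):

```python
# Hypothetical example of a single email output dict.
email_dict = {
    "content": "Hi Maria, we will meet in Paris next week.",
    "pseudo_content": "Hi [PER], we will meet in [LOC] next week.",
}

# performance_test (defined below) returns the per-email average
# processing time together with one such dict per email, e.g.:
# avg_time, out_list = performance_test("mails.csv", batch_size=4)
```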
{
@@ -69,16 +46,14 @@
"metadata": {},
"outputs": [],
"source": [
"def performance_test(csv_file, batch_size, disp=True):\n",
"def performance_test(csv_file, batch_size):\n",
" print(\"-----------------------------------\")\n",
" print(\"Starting performance test for batch size\", batch_size)\n",
" # create t0 timestamp\n",
" t0 = time.time()\n",
"\n",
" # import files from csv file\n",
" email_list = pd.read_csv(csv_file)\n",
" #print(email_list)\n",
"\n",
" t_csv_read = time.time()\n",
"\n",
" # create pseudonymization object\n",
@@ -93,18 +68,18 @@
" out_list = []\n",
" ts_list = []\n",
" for idx, row in email_list.iterrows():\n",
" # email start time\n",
" ts_email_start = time.time()\n",
" text = row[\"message\"]\n",
" email_dict = {\"content\": text}\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" # Pseudonymization is usually done using ps.pseudonymize\n",
" # For performance analysis the process is split into its subprocesses here\n",
" ps.reset()\n",
" sentences = ps.get_sentences(text)\n",
" batches = ps.split_batches(sentences)\n",
" ts_email_ppr_done = time.time()\n",
" ts_email_ppr_done = time.time() # preprocessing complete\n",
" pseudonymized_batches = []\n",
" for batch in batches:\n",
" batch = ps.concatenate(batch)\n",
@@ -123,14 +98,6 @@
" ts_email_end = time.time()\n",
" ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])\n",
"\n",
" # display the pseudonymized text\n",
" # display(HTML(output_text))\n",
"\n",
" # display original text and highlight found and replaced NEs\n",
" if disp:\n",
" highlighted_html = highlight_ne(text, ps.per_list, ps.org_list, ps.loc_list, ps.misc_list)\n",
" display(HTML(highlighted_html))\n",
"\n",
" # display timestamps\n",
"\n",
" # bar plot for each individual email\n",
@@ -190,21 +157,20 @@
"metadata": {},
"outputs": [],
"source": [
"batching_sizes = [-1, 1, 2, 3, 4, 6, 8, 10]\n",
"# batching_sizes = [1]\n",
"n_samples = 5\n",
"# batching_sizes = [-1, 1, 2, 3, 4, 6, 8, 10]\n",
"batching_sizes = [1, 10]\n",
"n_samples = 3\n",
"outputs = []\n",
"csv_file = \"../mailcom/test/data/mails_lb_sg.csv\"\n",
"csv_file = \"../mailcom/test/data/mails_lb_sg_copy.csv\"\n",
"\n",
"# first make a dummy run since there seem to be some inconsitencies when loading for the first time\n",
"_ = performance_test(csv_file, -1, disp=False)\n",
"_ = performance_test(csv_file, -1)\n",
"\n",
"# testing\n",
"av_email_times_for_batches = []\n",
"for bs in batching_sizes:\n",
" average_email_time = 0\n",
" for sid in range(n_samples):\n",
" t, out = performance_test(csv_file, bs, disp=False)\n",
" t, out = performance_test(csv_file, bs)\n",
" average_email_time += t\n",
" test_result_dict = {\n",
" \"batch_size\": bs,\n",
@@ -228,19 +194,27 @@
"metadata": {},
"outputs": [],
"source": [
"# list for email-averaged total processing time, now to be averaged over n_samples\n",
"average_email_times_for_batches = [0]*len(batching_sizes)\n",
"# corresponding sample standard deviations\n",
"std_email_times_for_batches = [0]*len(batching_sizes)\n",
"# manual computation of the average over n_samples\n",
"for output in outputs:\n",
" # add times to list position corresponding to batch size\n",
" average_email_times_for_batches[batching_sizes.index(output[\"batch_size\"])] += output[\"average_email_time\"]\n",
"# divide by length\n",
"average_email_times_for_batches = [avt/n_samples for avt in average_email_times_for_batches]\n",
"\n",
"# if n_samples > 1, calculate standard deviation.\n",
"# if n_samples > 1, calculate sample standard deviation\n",
"if n_samples > 1:\n",
" for output in outputs:\n",
" # add squared time deviations to list position corresponding to batch size\n",
" ix = batching_sizes.index(output[\"batch_size\"])\n",
" std_email_times_for_batches[ix] += (output[\"average_email_time\"] - average_email_times_for_batches[ix])**2\n",
" # divide by length-1\n",
" std_email_times_for_batches = [(stdt**(1./2.))/(n_samples-1) for stdt in std_email_times_for_batches]\n",
"\n",
"# plot\n",
"plt.errorbar(batching_sizes, average_email_times_for_batches, yerr=std_email_times_for_batches, linestyle='None', marker='.', capsize=2, elinewidth=1)\n",
"plt.xlabel(\"n batches\")\n",
"plt.ylabel(\"Average Email Time [s]\")\n",
@@ -256,7 +230,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing for quality differences by displaying differences in the pseudonymized text using ``difflib``. Deltas are only printed if there are differences between the pseudonymized texts. A cross-check is performed using ``difflib.SequenceMatcher().ratio()``"
"Testing for quality differences by displaying differences in the pseudonymized text using ``difflib``. Deltas are only printed if there are differences between the pseudonymized texts. Matching ratios are calculated using ``difflib.SequenceMatcher().ratio()``"
]
},
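For reference, a standalone sketch of how ``ndiff`` and ``SequenceMatcher`` behave on two slightly different strings (the example strings are made up):

```python
import difflib

a = "Hello [PER], see you in [LOC] tomorrow.\n"
b = "Hello [PER], see you in [LOC] on Monday.\n"

# ndiff yields a line-by-line delta: "-"/"+" mark differing lines,
# "?" lines point at the changed characters within them.
delta = difflib.ndiff(a.splitlines(keepends=True), b.splitlines(keepends=True))
print("".join(delta))

# ratio() measures similarity in [0, 1]; identical strings give 1.0.
print(difflib.SequenceMatcher(None, a, b).ratio())
```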
{
Expand All @@ -272,9 +246,10 @@
"# iterate over other results and print diffs\n",
"for output in outputs:\n",
" print(f\"----- Comparing batch size {output['batch_size']} sample {output['sample']} to standard {standard_batch_size}: -----\")\n",
" # pseudonymized texts for this output\n",
" ps_texts = [email_dict[\"pseudo_content\"] for email_dict in output[\"email_outputs\"]]\n",
" # diff to standard\n",
" average_sqm_ratio = 0. # SequenceMatcherRatio averaged over all emails for this result\n",
" average_sqm_ratio = 0. # SequenceMatcherRatio averaged over all emails for this output\n",
" for idx, (text, stdtext) in enumerate(zip(ps_texts, standard_ps_texts)):\n",
" print(f\"--- Comparing email text {idx} ---\")\n",
" diff = difflib.ndiff(stdtext.splitlines(keepends=True), text.splitlines(keepends=True))\n",
@@ -300,6 +275,7 @@
"source": [
"# average the SequenceMatcher ratios over n_samples for all batching sizes\n",
"average_sqm_ratio_for_batches = [0.]*len(batching_sizes)\n",
"# manually average the ratios over n_samples\n",
"for output in outputs:\n",
" average_sqm_ratio_for_batches[batching_sizes.index(output[\"batch_size\"])] += output[\"average_sqm_ratio\"]\n",
"average_sqm_ratio_for_batches = [asr/n_samples for asr in average_sqm_ratio_for_batches]\n",
