Skip to content

Commit

Permalink
Added email metadata to df
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Nov 5, 2024
1 parent f38b39d commit 4779ec7
Showing 1 changed file with 28 additions and 13 deletions.
41 changes: 28 additions & 13 deletions notebook/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,18 @@
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import mailcom.inout\n",
"import mailcom.parse\n",
Expand All @@ -13,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -40,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -59,22 +68,22 @@
"out_list = []\n",
"for file in io.email_list:\n",
" print(\"Parsing input file {}\".format(file))\n",
" # creating dict\n",
" email_dict = {}\n",
" text = io.get_text(file)\n",
" # after this function was called, the email metadata can be accessed via io.email_content\n",
" # the dict already has the entries content, date, attachments, attachment type\n",
" email_dict = io.email_content.copy()\n",
" text = io.get_html_text(text)\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" output_text = ps.pseudonymize(text)\n",
" email_dict[\"content\"] = text\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -83,15 +92,21 @@
"text": [
" content \\\n",
"0 J'espère que tu vas bien! Je voulais partager ... \n",
"1 \\nOlá Lino,Espero que este e-mail te encontre ... \n",
"2 From : [email protected] : \"Alejandr... \n",
"3 From : [email protected] : \"Alejand... \n",
"1 <html><head></head><body><div class=\"ydp9a5bdb... \n",
"2 <div style=\"font-size: 10pt; font-family: Verd... \n",
"3 <div style=\"font-size: 10pt; font-family: Verd... \n",
"\n",
" date attachment attachement type \\\n",
"0 2024-04-17 15:13:56+00:00 2 [jpg, jpg] \n",
"1 2024-04-17 15:39:49+00:00 1 [png] \n",
"2 1970-01-01 00:00:00+00:00 0 [] \n",
"3 1970-01-01 00:00:00+00:00 0 [] \n",
"\n",
" pseudo_content \n",
"0 J'espère que tu vas bien! Je voulais partager ... \n",
"1 \\n Olá Claude,Espero que este e-mail te encont... \n",
"2 From : [email protected] : \"Claud... \n",
"3 From : [email protected] : \" Clau... \n"
"2 From : [email] : \"Claude\"< [email] : mié. , [n... \n",
"3 From : [email] : \" Claude\"< [email] Dominique\"... \n"
]
}
],
Expand Down

0 comments on commit 4779ec7

Please sign in to comment.