Skip to content

Commit

Permalink
last changes towards readability
Browse files Browse the repository at this point in the history
  • Loading branch information
Olthoff231381 committed Sep 26, 2024
1 parent 70e5497 commit 66ff067
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 14 deletions.
2 changes: 1 addition & 1 deletion data/out/dict.out
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre</content></email>
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien!</content><content type="str">Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,xxxxxx</content></email>
3 changes: 1 addition & 2 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ def get_text(self, file: Path) -> str:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
# content = parsed_eml["body"][0]["content"]
# cleaning control characters from content
mapping = dict.fromkeys(range(32))
# res = content.translate(mapping)
attachmenttypes = []
# find if there are any attachments, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
Expand Down
18 changes: 7 additions & 11 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def init_spacy(lang):

def init_transformers():
    """Initialise the transformers token-classification (NER) pipeline.

    Returns:
        A Hugging Face ``pipeline`` object for token classification,
        backed by the ``xlm-roberta-large-finetuned-conll03-english``
        model.
    """
    # NOTE(review): device_map="cuda" hard-requires a GPU — confirm
    # there is a CPU fallback for machines without CUDA.
    ner_recognizer = pipeline(
        "token-classification",
        model="xlm-roberta-large-finetuned-conll03-english",
        device_map="cuda",
    )
    return ner_recognizer

Expand Down Expand Up @@ -122,31 +122,27 @@ def make_dir(path: str):
for file in io.email_list:
text = io.get_text(file)
text = io.get_html_text(text)
xml = io.data_to_xml(text)
io.write_file(xml, path_output / output_filename)
# print(text)
# print(io.email_content["date"])
# print(io.email_content["attachment"])
# print(io.email_content["attachement type"])
# skip this text if email could not be parsed
if not text:
continue
### nlp = init_spacy(sprache) done l.108
doc_spacy = nlp_spacy(text) ### fehlt - alte version
doc_spacy = nlp_spacy(text)
text = get_sentences(doc_spacy)
# start with first line
# here you can limit the number of sentences to parse
newlist = []
max_i = len(text) ### weg
### init transformers
max_i = len(text)
for i in range(0, max_i):
# if tool == "transformers": ### gibt nur eins
nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc
nlps = nlp_transformers(text[i])
doc = nlps
newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
newlist[i] = " ".join(newlist[i])
# join the new and old lines for comparison
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
print(printout)
# write_file(printout, path_output + "/" + file)
# parse pseudomized sentences to xml and write to file
xml = io.data_to_xml(newlist)
io.write_file(xml, path_output / output_filename)

0 comments on commit 66ff067

Please sign in to comment.