Skip to content

Commit

Permalink
last changes towards readability
Browse files Browse the repository at this point in the history
  • Loading branch information
Olthoff231381 committed Sep 26, 2024
1 parent 70e5497 commit 66ff067
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 14 deletions.
2 changes: 1 addition & 1 deletion data/out/dict.out
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,Pierre</content></email>
<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">J&apos;espère que tu vas bien!</content><content type="str">Je voulais partager avec toi quelques photos de mon dernier voyage![Inline-Bild][Inline-Bild]À bientôt,xxxxxx</content></email>
3 changes: 1 addition & 2 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ def get_text(self, file: Path) -> str:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
# content = parsed_eml["body"][0]["content"]
# cleaning control characters from content
mapping = dict.fromkeys(range(32))
# res = content.translate(mapping)
attachmenttypes = []
# find if there are any attachments, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
Expand Down
18 changes: 7 additions & 11 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def init_spacy(lang):

def init_transformers():
    """Initialise the transformers token-classification (NER) pipeline.

    Returns:
        A Hugging Face ``pipeline`` object for token classification,
        backed by the ``xlm-roberta-large-finetuned-conll03-english``
        model.
    """
    # NOTE(review): device_map="cuda" hard-requires a GPU — confirm
    # there is a CPU fallback for machines without CUDA.
    ner_recognizer = pipeline(
        "token-classification",
        model="xlm-roberta-large-finetuned-conll03-english",
        device_map="cuda",
    )
    return ner_recognizer

Expand Down Expand Up @@ -122,31 +122,27 @@ def make_dir(path: str):
for file in io.email_list:
text = io.get_text(file)
text = io.get_html_text(text)
xml = io.data_to_xml(text)
io.write_file(xml, path_output / output_filename)
# print(text)
# print(io.email_content["date"])
# print(io.email_content["attachment"])
# print(io.email_content["attachement type"])
# skip this text if email could not be parsed
if not text:
continue
### nlp = init_spacy(sprache) done l.108
doc_spacy = nlp_spacy(text) ### fehlt - alte version
doc_spacy = nlp_spacy(text)
text = get_sentences(doc_spacy)
# start with first line
# here you can limit the number of sentences to parse
newlist = []
max_i = len(text) ### weg
### init transformers
max_i = len(text)
for i in range(0, max_i):
# if tool == "transformers": ### gibt nur eins
nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc
nlps = nlp_transformers(text[i])
doc = nlps
newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
newlist[i] = " ".join(newlist[i])
# join the new and old lines for comparison
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
print(printout)
# write_file(printout, path_output + "/" + file)
# parse pseudomized sentences to xml and write to file
xml = io.data_to_xml(newlist)
io.write_file(xml, path_output / output_filename)

0 comments on commit 66ff067

Please sign in to comment.