From c70f681f2ae008274d7f55522bd2f2ac1cd9203b Mon Sep 17 00:00:00 2001 From: ThoreOlthoff Date: Fri, 30 Aug 2024 11:20:47 +0200 Subject: [PATCH] Change how input files are read and extend the accepted formats from eml only to eml and html --- mailcom/parse.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mailcom/parse.py b/mailcom/parse.py index b6e2cd6..86d84fd 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -2,7 +2,7 @@ import spacy as sp from transformers import pipeline from pathlib import Path -from .inout import get_text, delete_header, list_of_files, write_file +from mailcom import inout # please modify this section depending on your setup # input language - either "es" or "fr" @@ -10,10 +10,10 @@ lang = "es" # lang = "fr" # path where the input files can be found -path_input = Path("./data/in/") +path_input = "./data/in/" # path where the output files should be written to # this is generated if not present yet -path_output = Path("./data/out/") +path_output = "./data/out/" # the ner tool - currently only "transformers" tool = "transformers" # please do not modify below this section unless you know what you are doing @@ -116,14 +116,13 @@ def make_dir(path: str): print("Generating output directory/ies.") make_dir(path_output) # process the text - eml_files = list_of_files(path_input, "eml") - html_files = list_of_files(path_input, "html") - for file in eml_files: - text = get_text(file) + files = inout.list_of_files(path_input) + for file in files: + text = inout.get_text(file) # skip this text if email could not be parsed if not text: continue - text = delete_header(text) + text = inout.delete_header(text) doc_spacy = nlp_spacy(text) text = get_sentences(doc_spacy) # start with first line @@ -139,4 +138,4 @@ def make_dir(path: str): # join the new and old lines for comparison printout = "New: " + " ".join(newlist) + "\n" printout = printout + "Old: " + " ".join(text[0:max_i]) - write_file(printout, path_output + 
"/" + file) + inout.write_file(printout, path_output + "/" + file)