Skip to content

Commit

Permalink
Changed the reading in of files and changed the accepted format from just eml to eml and html
Browse files Browse the repository at this point in the history
  • Loading branch information
Olthoff231381 committed Aug 30, 2024
1 parent 2fd9212 commit c70f681
Showing 1 changed file with 8 additions and 9 deletions.
17 changes: 8 additions & 9 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from .inout import get_text, delete_header, list_of_files, write_file
from mailcom import inout

# please modify this section depending on your setup
# input language - either "es" or "fr"
# will also need pt
lang = "es"
# lang = "fr"
# path where the input files can be found
path_input = Path("./data/in/")
path_input = "./data/in/"
# path where the output files should be written to
# this is generated if not present yet
path_output = Path("./data/out/")
path_output = "./data/out/"
# the ner tool - currently only "transformers"
tool = "transformers"
# please do not modify below this section unless you know what you are doing
Expand Down Expand Up @@ -116,14 +116,13 @@ def make_dir(path: str):
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input, "eml")
html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
files = inout.list_of_files(path_input)
for file in files:
text = inout.get_text(file)
# skip this text if email could not be parsed
if not text:
continue
text = delete_header(text)
text = inout.delete_header(text)
doc_spacy = nlp_spacy(text)
text = get_sentences(doc_spacy)
# start with first line
Expand All @@ -139,4 +138,4 @@ def make_dir(path: str):
# join the new and old lines for comparison
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
write_file(printout, path_output + "/" + file)
inout.write_file(printout, path_output + "/" + file)

0 comments on commit c70f681

Please sign in to comment.