Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better text filters #30

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Getting repository
uses: actions/checkout@v4

- name: Install dependencies and package
run: |
python -m pip install --upgrade pip
pip install -e .

- name: Install pytest
run: |
pip install pytest
pip install pytest pytest-cov

- name: Execute PyTest
run: python -m pytest -svv
run: python -m pytest -svv --cov=mailcom --cov-report=xml:mailcom/coverage_re/coverage.xml

- name: Upload coverage reports to Codecov
uses: codecov/[email protected]
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./mailcom/coverage_re/

2 changes: 2 additions & 0 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ def list_of_files(directory_name: str) -> list[Path]:
mypath = Path(directory_name)
pattern = [".eml", ".html"] # we would not change the file type through user input
email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
if len(email_list) == 0:
raise ValueError("The directory {} does not contain .eml or .html files".format(mypath))
return email_list


Expand Down
25 changes: 13 additions & 12 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from .inout import get_text, delete_header, list_of_files, write_file
from mailcom import inout

# please modify this section depending on your setup
# input language - either "es" or "fr"
# TODO: support for "pt" (presumably Portuguese) still needs to be added
lang = "es"
# lang = "fr"
# path where the input files can be found
path_input = Path("./data/in/")
path_input = "./data/in/"
# path where the output files should be written to
# this is generated if not present yet
path_output = Path("./data/out/")
path_output = "./data/out/"
# the ner tool - currently only "transformers"
tool = "transformers"
# please do not modify below this section unless you know what you are doing
Expand Down Expand Up @@ -91,9 +91,11 @@ def init_transformers():
return ner_recognizer


def check_dir(path: str) -> bool:
    """Check that ``path`` exists on the file system.

    Args:
        path: Path of the file or directory to look up.

    Returns:
        True if the path exists.

    Raises:
        FileNotFoundError: If the path does not exist. FileNotFoundError is a
            subclass of OSError, so callers catching OSError keep working.
    """
    if not os.path.exists(path):
        raise FileNotFoundError("Path {} does not exist".format(path))
    return True


def make_dir(path: str):
Expand All @@ -114,14 +116,13 @@ def make_dir(path: str):
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input, "eml")
html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
files = inout.list_of_files(path_input)
for file in files:
text = inout.get_text(file)
# skip this text if email could not be parsed
if not text:
continue
text = delete_header(text)
text = inout.delete_header(text)
doc_spacy = nlp_spacy(text)
text = get_sentences(doc_spacy)
# start with first line
Expand All @@ -137,4 +138,4 @@ def make_dir(path: str):
# join the new and old lines for comparison
printout = "New: " + " ".join(newlist) + "\n"
printout = printout + "Old: " + " ".join(text[0:max_i])
write_file(printout, path_output + "/" + file)
inout.write_file(printout, path_output + "/" + file)
16 changes: 14 additions & 2 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
from mailcom.inout import list_of_files
import pytest

def test_list_of_files_found(tmp_path):
    """A directory that holds one .eml file yields a non-empty file list."""
    p = tmp_path / "test.eml"
    p.write_text("test")
    assert len(list_of_files(tmp_path)) != 0

def test_list_of_files_empty(tmp_path):
    """A directory without .eml or .html files makes list_of_files raise."""
    # list_of_files raises ValueError instead of returning an empty list,
    # so the call has to happen inside the pytest.raises context.
    with pytest.raises(ValueError):
        list_of_files(tmp_path)


def test_list_of_files_correct_format(tmp_path):
    """Only .eml and .html files are listed; other suffixes are ignored."""
    eml_file = tmp_path / "test.eml"
    eml_file.write_text("test")
    html_file = tmp_path / "test2.html"
    html_file.write_text("test2")
    xml_file = tmp_path / "test3.xml"
    xml_file.write_text("test3")
    found = list_of_files(tmp_path)
    # list_of_files returns resolved paths, so compare against resolved paths;
    # an unresolved path would make the exclusion check pass vacuously
    # whenever tmp_path contains a symlink.
    assert xml_file.resolve() not in found
    assert eml_file.resolve() in found
    assert html_file.resolve() in found
8 changes: 7 additions & 1 deletion mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
from mailcom.parse import make_dir, check_dir
import pytest


# these worked when we were using strings
# with the update to Path, we need to change the tests
def test_check_dir(tmpdir):
    """check_dir returns True for an existing directory and raises otherwise."""
    mydir = tmpdir.mkdir("sub")
    assert check_dir(str(mydir))
    # check_dir raises for a missing path rather than returning False, so the
    # negative case must run inside the pytest.raises context.
    with pytest.raises(OSError):
        check_dir(str(tmpdir.join("sub2")))


def test_make_dir(tmpdir):
    """After make_dir is called on a path, that path exists."""
    target = tmpdir.join("sub")
    make_dir(str(target))
    created = target.check()
    assert created

def test_check_dir_fail():
    """check_dir raises OSError for the relative path "mydir"."""
    # NOTE(review): this relies on "mydir" not existing in the current
    # working directory when the test runs.
    missing_path = "mydir"
    with pytest.raises(OSError):
        check_dir(missing_path)