Skip to content

Commit

Permalink
add nbstripout pre-commit hook, run all hooks
Browse files Browse the repository at this point in the history
  • Loading branch information
iulusoy committed Dec 2, 2024
1 parent e78713a commit dbd0577
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 389 deletions.
12 changes: 8 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ repos:
rev: 24.4.2
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
hooks:
- id: flake8
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
hooks:
- id: flake8
- repo: https://github.com/kynan/nbstripout
rev: 0.8.1
hooks:
- id: nbstripout
65 changes: 39 additions & 26 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,62 @@
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml


class InoutHandler:
def __init__(self, directory_name: str):
"""Constructor for the InoutHandler class.
Args:
Args:
directory_name (str): The directory where the files are located.
"""
"""
self.directory_name = directory_name
# presets
self.pattern = [".eml", ".html"]

def list_of_files(self):
    """Collect the email files found below ``self.directory_name``.

    The directory is searched recursively. Every entry whose suffix is
    listed in ``self.pattern`` is stored, as a resolved ``Path``, in
    ``self.email_list``.

    Raises:
        OSError: If ``self.directory_name`` does not exist.
        ValueError: If no entry with a matching suffix is found.
    """
    if not os.path.exists(self.directory_name):
        raise OSError("Path {} does not exist".format(self.directory_name))
    mypath = Path(self.directory_name)
    self.email_list = [
        mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern
    ]
    if not self.email_list:
        # Adjacent string literals keep the message on a single line; a
        # triple-quoted string would carry the source indentation and
        # line breaks into the message shown to the user.
        raise ValueError(
            "The directory {} does not contain .eml or .html files. "
            "Please check that the directory is containing the "
            "email data files".format(mypath)
        )

def get_html_text(self, text_check: str) -> str:
    """Strip html markup from a string, if it contains any.

    Args:
        text_check (str): The string that may contain html content.

    Returns:
        str: The text extracted by the html parser when it finds markup,
            otherwise the input string unchanged."""
    parsed = BeautifulSoup(text_check, "html.parser")
    # only swap in the extracted text when the parser found an element
    found_element = parsed.find()
    return parsed.get_text() if found_element else text_check

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
with open(file, "rb") as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
Expand All @@ -57,28 +68,30 @@ def get_text(self, file: Path) -> str:
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(self.email_content["content"])
attachmenttypes = [
parsed_eml["attachment"][i]["extension"] for i in range(attachments)
]
self.email_content = {
"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes,
}
return self.email_content["content"]

def validate_data(self):
    """Placeholder for data validation; currently does nothing."""
    return None

def data_to_xml(self, text):
    """Serialize email data to an XML string with the root element ``email``.

    Args:
        text: The data handed to ``dicttoxml``. Despite the parameter
            name, the test suite passes a dictionary of email content
            and metadata.

    Returns:
        str: The decoded XML document.
    """

    def item_name(_parent):
        # dicttoxml calls this to name item elements; always "content"
        return "content"

    xml = dicttoxml(text, custom_root="email", item_func=item_name)
    return xml.decode()

def write_file(self, text: str, name: str) -> None:
    """Write the extracted string to a text file.

    Args:
        text (str): The string to be written to the file.
        name (str): The name of the file to be written; the suffix
            ".out" is appended to it.
    """
    # Explicit UTF-8: without it the locale's default encoding is used,
    # which can fail on or garble non-ASCII email text on some platforms.
    with open("{}.out".format(name), "w", encoding="utf-8") as file:
        file.write(text)

34 changes: 21 additions & 13 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
XML_PATH = Path(pkg / "test" / "data" / "test.out")

TEXT_REF = "J'espère que tu vas bien!"
XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"
XML_REF = '<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">'


@pytest.fixture()
def get_instant(tmp_path):
    """Provide an InoutHandler that works on pytest's temporary directory."""
    handler = inout.InoutHandler(tmp_path)
    return handler


def test_list_of_files(get_instant):
with pytest.raises(ValueError):
get_instant.list_of_files()
Expand All @@ -34,31 +36,37 @@ def test_list_of_files(get_instant):
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list


def test_get_text(get_instant):
    """Check text and metadata extraction for a minimal and a reference file."""
    # a file that holds nothing but a short string
    minimal_mail = get_instant.directory_name / "test.eml"
    minimal_mail.write_text("test")
    assert get_instant.get_text(minimal_mail) == "test"

    # the reference email from the test data directory
    reference_text = get_instant.get_text(FILE_PATH)
    assert reference_text[0:25] == TEXT_REF
    metadata = get_instant.email_content
    expected_date = datetime.datetime(
        2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
    )
    assert metadata["date"] == expected_date
    assert metadata["attachment"] == 2
    assert metadata["attachement type"] == ["jpg", "jpg"]

    # a path that does not point to an existing file is rejected
    missing_mail = get_instant.directory_name / "nonexisting.eml"
    with pytest.raises(OSError):
        get_instant.get_text(missing_mail)


def test_get_html_text(get_instant):
    """Markup is reduced to its text; a plain string passes through unchanged."""
    with_markup = """<html><head><title>Test</title></head></html>"""
    plain = """Test"""
    for candidate in (with_markup, plain):
        assert get_instant.get_html_text(candidate) == "Test"


def test_data_to_xml(get_instant, tmp_path):
    """Serialize a sample dictionary and compare the file with the reference."""
    # NOTE(review): the set literal below collapses to a single "jpg",
    # while get_text produces a list of extensions; confirm this is intended.
    sample = {
        "content": "This is nothing more than a test",
        "date": "2024-04-17T15:13:56+00:00",
        "attachment": 2,
        "attachement type": {"jpg", "jpg"},
    }
    serialized = get_instant.data_to_xml(sample)
    # write_file appends ".out" to the given name
    get_instant.write_file(serialized, tmp_path / "test")
    written_file = tmp_path / "test.out"
    assert filecmp.cmp(XML_PATH, written_file)
Loading

0 comments on commit dbd0577

Please sign in to comment.