Skip to content

Commit

Permalink
Merge pull request #83 from huridocs/input-without-spaces
Browse files Browse the repository at this point in the history
Input without spaces
  • Loading branch information
gabriel-piles authored Aug 9, 2024
2 parents 5f06578 + 2770a85 commit e5b00b9
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ async def get_suggestions(tenant: str, extraction_id: str):
suggestions_list: list[str] = list()

for document in pdf_metadata_extraction_db.suggestions.find(suggestions_filter):
suggestions_list.append(Suggestion(**document).scale_up().to_dict())
suggestions_list.append(Suggestion(**document).scale_up().to_output())

pdf_metadata_extraction_db.suggestions.delete_many(suggestions_filter)
config_logger.info(f"{len(suggestions_list)} suggestions created for {tenant} {extraction_id}")
Expand Down
7 changes: 7 additions & 0 deletions src/data/SegmentBox.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ class SegmentBox(BaseModel):
def to_dict(self):
    """Return this model as a plain dict of JSON-compatible values.

    The model is first serialized to a JSON string and then parsed back,
    so the result contains only what ``json.loads`` produces.
    """
    serialized = self.model_dump_json()
    return json.loads(serialized)

def to_output(self):
    """Return the JSON-compatible dict of this model without page/type fields.

    The model is round-tripped through its JSON dump, then the
    ``page_width``, ``page_height`` and ``segment_type`` entries are removed.

    Raises:
        KeyError: if any of the three keys is absent from the dumped data.
    """
    output = json.loads(self.model_dump_json())
    # pop() without a default keeps the strictness of ``del``: a missing key is an error.
    for dropped_key in ("page_width", "page_height", "segment_type"):
        output.pop(dropped_key)
    return output

def get_bounding_box(self) -> Rectangle:
return Rectangle.from_width_height(
left=int(self.left), top=int(self.top), width=int(self.width), height=int(self.height)
Expand Down
5 changes: 5 additions & 0 deletions src/data/Suggestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ def to_dict(self):
suggestion_dict["segments_boxes"] = [x.to_dict() for x in self.segments_boxes]
return suggestion_dict

def to_output(self):
    """Return the model dump with each segment box replaced by its output form.

    The ``segments_boxes`` entry of ``self.model_dump()`` is overwritten with
    the result of calling ``to_output()`` on every item of
    ``self.segments_boxes``.
    """
    output = self.model_dump()
    output.update(segments_boxes=[segment_box.to_output() for segment_box in self.segments_boxes])
    return output

@staticmethod
def get_empty(extraction_identifier: ExtractionIdentifier, entity_name: str) -> "Suggestion":
return Suggestion(
Expand Down
2 changes: 2 additions & 0 deletions src/extractors/text_to_text_extractor/TextToTextExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from extractors.text_to_text_extractor.TextToTextMethod import TextToTextMethod
from extractors.text_to_text_extractor.methods.DateParserMethod import DateParserMethod
from extractors.text_to_text_extractor.methods.DateParserWithBreaksMethod import DateParserWithBreaksMethod
from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces
from extractors.text_to_text_extractor.methods.MT5TrueCaseEnglishSpanishMethod import MT5TrueCaseEnglishSpanishMethod
from extractors.text_to_text_extractor.methods.RegexMethod import RegexMethod
from extractors.text_to_text_extractor.methods.RegexSubtractionMethod import RegexSubtractionMethod
Expand All @@ -20,6 +21,7 @@ class TextToTextExtractor(ExtractorBase):
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
METHODS: list[type[TextToTextMethod]] = [
SameInputOutputMethod,
InputWithoutSpaces,
RegexMethod,
RegexSubtractionMethod,
DateParserWithBreaksMethod,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from data.ExtractionData import ExtractionData
from data.PredictionSample import PredictionSample
from extractors.text_to_text_extractor.TextToTextMethod import TextToTextMethod


class InputWithoutSpaces(TextToTextMethod):
    """Text-to-text method whose prediction is the input with all whitespace removed."""

    def train(self, extraction_data: ExtractionData):
        """Do nothing: this method has no trainable state."""

    @staticmethod
    def trim_text(tag_texts: list[str]) -> str:
        """Concatenate ``tag_texts`` and strip every whitespace character.

        ``str.split()`` with no arguments splits on runs of whitespace, so
        re-joining the pieces with an empty separator drops all of it.
        """
        return "".join("".join(tag_texts).split())

    def predict(self, predictions_samples: list[PredictionSample]) -> list[str]:
        """Return one whitespace-free string per sample, built from its ``tags_texts``."""
        predictions = []
        for prediction_sample in predictions_samples:
            predictions.append(self.trim_text(prediction_sample.tags_texts))
        return predictions
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from unittest import TestCase

from data.ExtractionData import ExtractionData
from data.ExtractionIdentifier import ExtractionIdentifier
from data.LabeledData import LabeledData
from data.PredictionSample import PredictionSample
from data.TrainingSample import TrainingSample
from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces

# Module-level identifier shared by the tests below when building ExtractionData and InputWithoutSpaces.
extraction_identifier = ExtractionIdentifier(run_name="test", extraction_name="test")


class TestInputWithoutSpaces(TestCase):
    """Unit tests for the ``InputWithoutSpaces`` text-to-text method."""

    def test_performance_100(self):
        """One sample whose label equals its input without spaces is expected to score 100."""
        labeled_data = LabeledData(label_text="abc", language_iso="en")
        sample = TrainingSample(labeled_data=labeled_data, tags_texts=["a b c"])
        extraction_data = ExtractionData(samples=[sample], extraction_identifier=extraction_identifier)

        method = InputWithoutSpaces(extraction_identifier)

        self.assertEqual(100, method.performance(extraction_data))

    def test_performance_50(self):
        """Two samples, only the first labeled with its space-free input, are expected to score 50."""
        sample_1 = TrainingSample(
            labeled_data=LabeledData(label_text="abc", language_iso="en"),
            tags_texts=["a b ", "c"],
        )
        sample_2 = TrainingSample(
            labeled_data=LabeledData(label_text="2", language_iso="en"),
            tags_texts=["a", " b c"],
        )
        extraction_data = ExtractionData(
            samples=[sample_1, sample_2],
            extraction_identifier=extraction_identifier,
        )

        method = InputWithoutSpaces(extraction_identifier)

        self.assertEqual(50, method.performance(extraction_data))

    def test_predict(self):
        """Leading, trailing and inner whitespace is removed from the predicted text."""
        method = InputWithoutSpaces(extraction_identifier)
        prediction_sample = PredictionSample.from_text(" test 1 4 foo ")

        predictions = method.predict([prediction_sample])

        self.assertEqual(["test14foo"], predictions)
4 changes: 2 additions & 2 deletions src/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ def test_pdf_to_multi_option(self):
top=60.0,
width=116.0,
height=21.0,
page_width=612,
page_height=792,
page_width=0,
page_height=0,
page_number=1,
segment_type=TokenType.TEXT,
)
Expand Down

0 comments on commit e5b00b9

Please sign in to comment.