Merge pull request #83 from huridocs/input-without-spaces

Input without spaces
huridocs · Aug 9, 2024 · e5b00b9 · e5b00b9
2 parents 5f06578 + 2770a85
commit e5b00b9
Show file tree

Hide file tree

Showing 7 changed files with 70 additions and 3 deletions.
diff --git a/src/app.py b/src/app.py
@@ -135,7 +135,7 @@ async def get_suggestions(tenant: str, extraction_id: str):
         suggestions_list: list[str] = list()
 
         for document in pdf_metadata_extraction_db.suggestions.find(suggestions_filter):
-            suggestions_list.append(Suggestion(**document).scale_up().to_dict())
+            suggestions_list.append(Suggestion(**document).scale_up().to_output())
 
         pdf_metadata_extraction_db.suggestions.delete_many(suggestions_filter)
         config_logger.info(f"{len(suggestions_list)} suggestions created for {tenant} {extraction_id}")

diff --git a/src/data/SegmentBox.py b/src/data/SegmentBox.py
@@ -23,6 +23,13 @@ class SegmentBox(BaseModel):
     def to_dict(self):
         return json.loads(self.model_dump_json())
 
+    def to_output(self):
+        # Output payload: the JSON dump minus the page dimensions and the segment type.
+        output = json.loads(self.model_dump_json())
+        for hidden_key in ("page_width", "page_height", "segment_type"):
+            del output[hidden_key]
+        return output
+
     def get_bounding_box(self) -> Rectangle:
         return Rectangle.from_width_height(
             left=int(self.left), top=int(self.top), width=int(self.width), height=int(self.height)

diff --git a/src/data/Suggestion.py b/src/data/Suggestion.py
@@ -27,6 +27,11 @@ def to_dict(self):
         suggestion_dict["segments_boxes"] = [x.to_dict() for x in self.segments_boxes]
         return suggestion_dict
 
+    def to_output(self):
+        # Dump all fields, then swap in the `to_output()` form of every segment box.
+        dumped = self.model_dump()
+        return {**dumped, "segments_boxes": [box.to_output() for box in self.segments_boxes]}
+
     @staticmethod
     def get_empty(extraction_identifier: ExtractionIdentifier, entity_name: str) -> "Suggestion":
         return Suggestion(

diff --git a/src/extractors/text_to_text_extractor/TextToTextExtractor.py b/src/extractors/text_to_text_extractor/TextToTextExtractor.py
@@ -9,6 +9,7 @@
 from extractors.text_to_text_extractor.TextToTextMethod import TextToTextMethod
 from extractors.text_to_text_extractor.methods.DateParserMethod import DateParserMethod
 from extractors.text_to_text_extractor.methods.DateParserWithBreaksMethod import DateParserWithBreaksMethod
+from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces
 from extractors.text_to_text_extractor.methods.MT5TrueCaseEnglishSpanishMethod import MT5TrueCaseEnglishSpanishMethod
 from extractors.text_to_text_extractor.methods.RegexMethod import RegexMethod
 from extractors.text_to_text_extractor.methods.RegexSubtractionMethod import RegexSubtractionMethod
@@ -20,6 +21,7 @@ class TextToTextExtractor(ExtractorBase):
     SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
     METHODS: list[type[TextToTextMethod]] = [
         SameInputOutputMethod,
+        InputWithoutSpaces,
         RegexMethod,
         RegexSubtractionMethod,
         DateParserWithBreaksMethod,

diff --git a/src/extractors/text_to_text_extractor/methods/InputWithoutSpaces.py b/src/extractors/text_to_text_extractor/methods/InputWithoutSpaces.py
@@ -0,0 +1,17 @@
+from data.ExtractionData import ExtractionData
+from data.PredictionSample import PredictionSample
+from extractors.text_to_text_extractor.TextToTextMethod import TextToTextMethod
+
+
+class InputWithoutSpaces(TextToTextMethod):
+    """Text-to-text method that predicts the joined tag texts with all whitespace removed."""
+
+    def train(self, extraction_data: ExtractionData):
+        return None  # training is a no-op for this method
+
+    @staticmethod
+    def trim_text(tag_texts: list[str]) -> str:
+        return "".join("".join(tag_texts).split())
+
+    def predict(self, predictions_samples: list[PredictionSample]) -> list[str]:
+        return [self.trim_text(sample.tags_texts) for sample in predictions_samples]
diff --git a/src/extractors/text_to_text_extractor/methods/test/test_input_without_spaces.py b/src/extractors/text_to_text_extractor/methods/test/test_input_without_spaces.py
@@ -0,0 +1,36 @@
+from unittest import TestCase
+
+from data.ExtractionData import ExtractionData
+from data.ExtractionIdentifier import ExtractionIdentifier
+from data.LabeledData import LabeledData
+from data.PredictionSample import PredictionSample
+from data.TrainingSample import TrainingSample
+from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces
+
+extraction_identifier = ExtractionIdentifier(run_name="test", extraction_name="test")  # identifier shared by the tests in this module
+
+
+class TestInputWithoutSpaces(TestCase):
+    def test_performance_100(self):
+        # The label equals the input without spaces; the expected performance is 100.
+        labeled_data = LabeledData(label_text="abc", language_iso="en")
+        sample = TrainingSample(labeled_data=labeled_data, tags_texts=["a b c"])
+        extraction_data = ExtractionData(samples=[sample], extraction_identifier=extraction_identifier)
+
+        self.assertEqual(100, InputWithoutSpaces(extraction_identifier).performance(extraction_data))
+
+    def test_performance_50(self):
+        # Only the first sample's label equals its input without spaces; the expected performance is 50.
+        matching = TrainingSample(labeled_data=LabeledData(label_text="abc", language_iso="en"), tags_texts=["a b ", "c"])
+        mismatching = TrainingSample(labeled_data=LabeledData(label_text="2", language_iso="en"), tags_texts=["a", " b c"])
+        extraction_data = ExtractionData(samples=[matching, mismatching], extraction_identifier=extraction_identifier)
+
+        method = InputWithoutSpaces(extraction_identifier)
+        self.assertEqual(50, method.performance(extraction_data))
+
+    def test_predict(self):
+        # Leading, trailing and inner spaces are all removed from the prediction.
+        method = InputWithoutSpaces(extraction_identifier)
+        prediction_sample = PredictionSample.from_text(" test 1 4 foo ")
+
+        self.assertEqual(["test14foo"], method.predict([prediction_sample]))
diff --git a/src/test_end_to_end.py b/src/test_end_to_end.py
@@ -250,8 +250,8 @@ def test_pdf_to_multi_option(self):
                     top=60.0,
                     width=116.0,
                     height=21.0,
-                    page_width=612,
-                    page_height=792,
+                    page_width=0,
+                    page_height=0,
                     page_number=1,
                     segment_type=TokenType.TEXT,
                 )