Skip to content

Commit

Permalink
Accept Portuguese for gliner
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-piles committed Dec 19, 2024
1 parent f4f99c3 commit 6883a54
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "trainable-entity-extractor"
version = "2024.12.19.4"
version = "2024.12.19.5"
description = "This tool is a trainable text/PDF to entity extractor"
license = { file = "LICENSE" }
authors = [{ name = "HURIDOCS" }]
Expand Down
2 changes: 1 addition & 1 deletion src/trainable_entity_extractor/data/TrainingSample.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def from_pdf(pdf_path: str | Path, label_text: str, language_iso: str = "en"):
segmentation_data = SegmentationData(
page_width=pdf_features.pages[0].page_width if pdf_features.pages else 0,
page_height=pdf_features.pages[0].page_height if pdf_features.pages else 0,
xml_segments_boxes=[],
xml_segments_boxes=[SegmentBox(left=0, top=0, width=343434342, height=300, page_number=1)],
label_segments_boxes=[],
)
pdf_data.set_segments_from_segmentation_data(segmentation_data=segmentation_data)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ class GlinerDateParserMethod(ToTextExtractorMethod):

@staticmethod
def get_alphanumeric_text_with_spaces(text):
    """Return ``text`` with every character that is neither alphanumeric nor whitespace removed.

    Characters are classified with ``str.isalnum`` and ``str.isspace``, so
    accented letters such as "Ç" are kept. An ASCII-only pattern like
    ``[A-Za-z0-9 ]`` would drop them and turn "MARÇO" into "MARO".
    Any whitespace character (not only the plain space) is preserved.

    Args:
        text: The raw string to clean.

    Returns:
        The kept characters, in their original order, joined into one string.
    """
    return "".join(letter for letter in text if letter.isalnum() or letter.isspace())

@staticmethod
def get_date(tags_texts: list[str]):
Expand All @@ -36,3 +35,7 @@ def predict(self, predictions_samples: list[PredictionSample]) -> list[str]:
]
predictions = [date.strftime("%Y-%m-%d") if date else "" for date in predictions_dates]
return predictions


# Ad-hoc manual check: when this module is executed as a script, print the result of
# cleaning a Portuguese date string that contains a non-ASCII letter ("Ç").
if __name__ == "__main__":
print(GlinerDateParserMethod.get_alphanumeric_text_with_spaces("21 DE MARÇO DE 2023"))
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ def test_predict_special_character(self):
predictions = gliner_method.predict([PredictionSample.from_text("SENTENÇA DE 1° DE JULHO DE 2009")])
self.assertEqual(["2009-07-01"], predictions)

def test_predict_portuguese(self):
    """A Portuguese date whose month name contains a non-ASCII letter yields "2010-03-01"."""
    extractor = GlinerDateParserMethod(extraction_identifier)
    samples = [PredictionSample.from_text("SENTENÇA DE 1° DE MARÇO DE 2010")]

    self.assertEqual(["2010-03-01"], extractor.predict(samples))

def test_predict_multiple_dates_spanish(self):
text = "Informe no. 52/16 caso 12.521 fondo Maria Laura órdenes guerra y otros Chile 30 de noviembre de 2016"
sample = TrainingSample(
Expand Down

0 comments on commit 6883a54

Please sign in to comment.