Skip to content

Commit

Permalink
Remove superscripts that appear in the wrong order
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Dec 19, 2024
1 parent d7dc123 commit f4f99c3
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "trainable-entity-extractor"
-version = "2024.12.19.2"
+version = "2024.12.19.4"
description = "This tool is a trainable text/PDF to entity extractor"
license = { file = "LICENSE" }
authors = [{ name = "HURIDOCS" }]
Expand Down
7 changes: 6 additions & 1 deletion src/trainable_entity_extractor/data/PdfData.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,20 @@ def contains_text(self):

@staticmethod
def remove_super_scripts(segment_tokens: list[PdfToken]) -> list[PdfToken]:
if not segment_tokens:
return []

font_sizes = [token.font.font_size for token in segment_tokens]

if PdfData.similar_font_sizes(font_sizes):
return segment_tokens

tokens_no_super_scripts = []

min_left = min([token.bounding_box.left for token in segment_tokens])

for i, token in enumerate(segment_tokens):
if token == segment_tokens[0]:
if token.bounding_box.left == min_left:
tokens_no_super_scripts.append(token)
continue

Expand Down
21 changes: 11 additions & 10 deletions src/trainable_entity_extractor/test/test_pdf_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@

class TestPDFData(TestCase):
@staticmethod
def create_token(content: str, font_size: int):
def create_token(content: str, font_size: int, left: int = 0):
font_12 = PdfFont(font_id="1", font_size=font_size, bold=False, italics=False, color="black")
bounding_box = Rectangle.from_width_height(0, 0, 0, 0)
bounding_box = Rectangle.from_width_height(left, 0, 0, 0)
token = PdfToken(
page_number=1,
tag_id="tag",
Expand All @@ -25,26 +25,27 @@ def create_token(content: str, font_size: int):
return token

def test_no_remove_super_scripts(self):
token_1 = self.create_token("bu", 12)
token_2 = self.create_token("1", 12)
token_3 = self.create_token("2", 12)
token_1 = self.create_token("bu", 12, left=1)
token_2 = self.create_token("1", 12, left=2)
token_3 = self.create_token("2", 12, left=3)
tokens = PdfData.remove_super_scripts([token_1, token_2, token_3])

self.assertEqual(3, len(tokens))

def test_remove_super_scripts(self):
token_1 = self.create_token("first", 12)
token_2 = self.create_token("1", 10)
token_2 = self.create_token("1", 10, left=1)

tokens = PdfData.remove_super_scripts([token_1, token_2])

self.assertEqual(1, len(tokens))
self.assertEqual("first", tokens[0].content)

def test_no_remove_super_scripts_when_bigger(self):
token_1 = self.create_token("1", 12)
token_2 = self.create_token("first", 10)
token_1 = self.create_token("foo", 12)
token_2 = self.create_token("1", 12, left=1)
token_3 = self.create_token("first", 10, left=2)

tokens = PdfData.remove_super_scripts([token_1, token_2])
tokens = PdfData.remove_super_scripts([token_1, token_2, token_3])

self.assertEqual(2, len(tokens))
self.assertEqual(3, len(tokens))

0 comments on commit f4f99c3

Please sign in to comment.