Skip to content

Commit

Permalink
Take the spaces from the bounding boxes
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-piles committed Dec 17, 2024
1 parent ff97c65 commit 9f26eae
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdf-document-layout-analysis"
version = "2024.12.17.3"
version = "2024.12.17.4"
description = "This tool is for PDF document layout analysis"
license = { file = "LICENSE" }
authors = [{ name = "HURIDOCS" }]
Expand Down
2 changes: 1 addition & 1 deletion src/pdf_features/PdfToken.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def from_poppler_etree(page_number: int, xml_tag: ElementBase, pdf_font: PdfFont
else:
tag_id = "tag"

content = "".join(xml_tag.itertext()).strip()
reading_order_no = int(xml_tag.attrib["reading_order_no"]) if "reading_order_no" in xml_tag.attrib else -1
bounding_box = Rectangle.from_poppler_tag_etree(xml_tag)
token_type = TokenType.TEXT

content = "".join(xml_tag.itertext()).strip()
return PdfToken(page_number, tag_id, content, pdf_font, reading_order_no, bounding_box, token_type)

def get_label_intersection_percentage(self, label: Label):
Expand Down
9 changes: 9 additions & 0 deletions src/pdf_features/Rectangle.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,20 @@ def __init__(self, left: int, top: int, right: int, bottom: int):

@staticmethod
def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle":
    """Build a Rectangle from an XML tag, excluding edge spaces from the box.

    The box is read from the tag's ``left``, ``top``, ``width`` and
    ``height`` attributes, each parsed with ``int``. When the tag's text
    starts and/or ends with a space, the matching horizontal edge is moved
    inwards by an estimated width of one character.

    Args:
        tag: Element exposing ``itertext()`` and an ``attrib`` mapping with
            the four geometry attributes listed above.

    Returns:
        A Rectangle built as ``Rectangle(x_min, y_min, x_max, y_max)``.
    """
    content = "".join(tag.itertext())

    x_min = int(tag.attrib["left"])
    y_min = int(tag.attrib["top"])
    x_max = x_min + int(tag.attrib["width"])
    y_max = y_min + int(tag.attrib["height"])

    # A tag without text has no edge spaces to trim. Returning early also
    # avoids dividing by len(content) == 0 and indexing an empty string.
    if not content:
        return Rectangle(x_min, y_min, x_max, y_max)

    # Average width per character, with a floor of 2 so that a trimmed
    # space always moves the edge by a visible amount.
    one_character_length = max((x_max - x_min) // len(content), 2)

    # NOTE(review): when both edges are trimmed on a box narrower than two
    # character widths (e.g. content is a single space), x_min ends up
    # greater than x_max. Confirm that callers tolerate this.
    if content.startswith(" "):
        x_min += one_character_length

    if content.endswith(" "):
        x_max -= one_character_length

    return Rectangle(x_min, y_min, x_max, y_max)

def fix_wrong_areas(self):
Expand Down

0 comments on commit 9f26eae

Please sign in to comment.