Skip to content

Commit

Permalink
Update version
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Nov 21, 2024
1 parent 29a3193 commit 46cdb40
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "trainable-entity-extractor"
-version = "2024.11.21.1"
+version = "2024.11.21.2"
description = "This tool is a trainable text/PDF to entity extractor"
license = { file = "LICENSE" }
authors = [{ name = "HURIDOCS" }]
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
-git+https://github.com/huridocs/pdf-document-layout-analysis@a490e280aa168e12c0211d4f03a5264168512ba4
python-Levenshtein==0.25.1
tdda==2.0.9
datasets==2.19.0
Expand Down
Expand Up
@@ -28,4 +27,5 @@ typer==0.12.3
sentence-transformers==3.0.1
py-markdown-table==1.1.0
flair==0.14.0
-graypy==2.1.0
+graypy==2.1.0
+git+https://github.com/huridocs/pdf-document-layout-analysis@a490e280aa168e12c0211d4f03a5264168512ba4
15 changes: 15 additions & 0 deletions src/trainable_entity_extractor/data/PdfData.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Optional

from pdf_features.PdfToken import PdfToken
from pdf_token_type_labels.TokenType import TokenType

from trainable_entity_extractor.data.SegmentationData import SegmentationData
from pdf_features.PdfFeatures import PdfFeatures
Expand Down
Expand Up
@@ -103,6 +104,20 @@ def remove_super_scripts(tokens: list[PdfToken]) -> list[PdfToken]:
tokens_no_super_scripts = []

for token in tokens:
if token == tokens[0]:
tokens_no_super_scripts.append(token)
continue

if token.token_type in [
TokenType.FORMULA,
TokenType.FOOTNOTE,
TokenType.TABLE,
TokenType.PICTURE,
TokenType.PAGE_FOOTER,
]:
tokens_no_super_scripts.append(token)
continue

if token.font.font_size == min_font_size and token.content.isnumeric() and float(token.content) < 999:
continue

Expand Down

0 comments on commit 46cdb40

Please sign in to comment.