-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #44 from huridocs/text_extraction
Add text extraction
- Loading branch information
Showing 4 changed files with 62 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from configuration import service_logger | ||
from pdf_token_type_labels.TokenType import TokenType | ||
|
||
|
||
def extract_text(segment_boxes: list[dict], types: list[TokenType]):
    """Join the text of every segment whose token type was requested.

    Args:
        segment_boxes: Segment dictionaries; each one is read through its
            ``"text"`` and ``"type"`` keys.
        types: Token types whose segments should be kept.

    Returns:
        The kept segment texts, in input order, separated by newlines.
    """
    requested_names = [token_type.name for token_type in types]
    service_logger.info(f"Extracted types: {requested_names}")

    selected_texts = []
    for box in segment_boxes:
        # Spaces in the segment's type label are replaced with underscores
        # before the TokenType lookup.
        box_type = TokenType.from_text(box["type"].replace(" ", "_"))
        if box_type in types:
            selected_texts.append(box["text"])
    return "\n".join(selected_texts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from fastapi import UploadFile | ||
from pdf_token_type_labels.TokenType import TokenType | ||
from pdf_layout_analysis.run_pdf_layout_analysis import analyze_pdf | ||
from pdf_layout_analysis.run_pdf_layout_analysis_fast import analyze_pdf_fast | ||
from text_extraction.extract_text import extract_text | ||
|
||
|
||
def get_text_extraction(file: UploadFile, fast: bool, types: str):
    """Analyze an uploaded file and return the text of the requested token types.

    Args:
        file: Uploaded file; its whole content is read from ``file.file``.
        fast: When true, ``analyze_pdf_fast`` is used instead of ``analyze_pdf``.
        types: Either the literal ``"all"`` (every ``TokenType`` member) or a
            comma-separated list of token type names.

    Returns:
        Whatever ``extract_text`` returns for the analyzed segments.
    """
    content = file.file.read()

    if types == "all":
        token_types: list[TokenType] = list(TokenType)
    else:
        # Each name is stripped and its spaces replaced with underscores before
        # the lookup; the set collapses names that resolve to the same type.
        unique_types = {TokenType.from_text(name.strip().replace(" ", "_")) for name in types.split(",")}
        token_types = list(unique_types)

    segments = analyze_pdf_fast(content) if fast else analyze_pdf(content, "")
    return extract_text(segments, token_types)