-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
95c06c8
commit e6e2d7d
Showing
11 changed files
with
410 additions
and
291 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,47 @@ | ||
import io | ||
|
||
import fitz as pymupdf | ||
import json | ||
import tempfile | ||
from bs4 import BeautifulSoup | ||
|
||
from benchmarks.overall.scoring import score_blocks | ||
from benchmarks.overall.schema import BlockScores | ||
from marker.converters.pdf import PdfConverter | ||
|
||
def open_pymupdf(pdf_bytes):
    """Open an in-memory PDF with PyMuPDF.

    Args:
        pdf_bytes: Raw bytes of the document to open.

    Returns:
        Whatever ``pymupdf.open`` returns for the wrapped byte stream.
    """
    # PyMuPDF is given a file-like object rather than a path, so nothing
    # touches the filesystem here.
    stream = io.BytesIO(pdf_bytes)
    return pymupdf.open(stream=stream)
def get_marker_html(marker_models: dict, pdf_bytes: bytes):
    """Convert the first page of a PDF to HTML with marker.

    Args:
        marker_models: Passed to ``PdfConverter`` as ``artifact_dict``.
        pdf_bytes: Raw bytes of the PDF to convert.

    Returns:
        The inner HTML of the ``<body>`` element of the rendered output,
        as a string.
    """
    block_converter = PdfConverter(
        artifact_dict=marker_models,
        config={"page_range": [0], "disable_tqdm": True},
        renderer="marker.renderers.html.HTMLRenderer",
    )
    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
        f.write(pdf_bytes)
        # The converter re-opens the file by name, so the buffered bytes
        # must reach the disk before it runs.
        f.flush()
        rendered = block_converter(f.name)

    html = rendered.html
    soup = BeautifulSoup(html, "html.parser")
    # Keep only what is inside <body>; the wrapping document markup is
    # dropped.
    inner_html = str(soup.find("body").decode_contents())
    return inner_html
|
||
|
||
def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
    """Score marker's HTML output for one sample against its ground truth.

    Args:
        model_dict: Forwarded to ``get_marker_html`` as the marker models.
        sample: Mapping with a ``"gt_blocks"`` JSON string and a ``"pdf"``
            entry holding the PDF bytes.
        **kwargs: Accepted and ignored.

    Returns:
        The result of ``score_blocks`` for the ground-truth HTML blocks and
        the marker HTML.
    """
    gt_blocks = json.loads(sample["gt_blocks"])
    pdf_bytes = sample["pdf"]  # This is a single page PDF
    marker_html = get_marker_html(model_dict, pdf_bytes)
    gt_html = [block["html"] for block in gt_blocks]
    scores = score_blocks(gt_html, marker_html)
    return scores
|
||
def clip_pdf_to_bbox(doc, bbox, padding=1):
    """Redact everything outside ``bbox`` on the first page, then crop to it.

    Args:
        doc: An opened PyMuPDF document; only ``doc[0]`` is modified.
        bbox: Sequence ``[x0, y0, x1, y1]`` of the region to keep.
        padding: Margin kept around ``bbox`` when redacting, so content on
            the edge of the box is not removed.

    Returns:
        The same ``doc`` object, modified in place.
    """
    page = doc[0]
    height, width = page.bound().height, page.bound().width

    # Four strips that together cover the page area outside the padded box.
    remove_left = [0, 0, bbox[0] - padding, height]
    remove_top = [0, 0, width, bbox[1] - padding]
    remove_right = [bbox[2] + padding, 0, width, height]
    remove_bottom = [0, bbox[3] + padding, width, height]
    # NOTE(review): a bbox touching the page edge yields a zero or negative
    # size strip here - confirm PyMuPDF accepts such rectangles.
    for remove in [remove_left, remove_top, remove_right, remove_bottom]:
        clip_rect = pymupdf.Rect(*remove)
        page.add_redact_annot(clip_rect)
    page.apply_redactions()

    clip_rect = pymupdf.Rect(*bbox)
    page.set_cropbox(clip_rect)
    return doc
def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
    """Score stored Mathpix output for one sample against its ground truth.

    Args:
        model_dict: Unused by this function.
        sample: Mapping with a ``"uuid"`` and a ``"gt_blocks"`` JSON string.
        mathpix_ds: Iterable of rows, each with ``"uuid"`` and ``"md"``.
        **kwargs: Accepted and ignored.

    Returns:
        The result of ``score_blocks`` for the ground-truth HTML blocks and
        the Mathpix markdown, called with ``convert=False``.

    Raises:
        ValueError: If no row in ``mathpix_ds`` matches the sample's uuid.
    """
    uuid = sample["uuid"]
    # Compare as strings so differing uuid representations still match.
    wanted = str(uuid)
    data = next(
        (row for row in mathpix_ds if str(row["uuid"]) == wanted),
        None,
    )
    if data is None:
        raise ValueError(f"Could not find data for uuid {uuid}")

    mathpix_md = data["md"]
    gt_blocks = json.loads(sample["gt_blocks"])
    gt_html = [block["html"] for block in gt_blocks]
    scores = score_blocks(gt_html, mathpix_md, convert=False)
    return scores


def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes):
    """Run marker on each ground-truth block in isolation.

    For every block the PDF is re-opened, clipped to the block's bounding
    box, and converted with the layout forced to the block's type.

    Args:
        marker_models: Passed to ``PdfConverter`` as ``artifact_dict``.
        gt_blocks: Blocks, each with a ``"bbox"`` and a ``"block_type"``.
        pdf_bytes: Raw bytes of the PDF the blocks belong to.

    Returns:
        One ``<body>`` inner-HTML string per block, in input order.
    """
    block_html = []
    for block in gt_blocks:
        bbox = block["bbox"]
        # A fresh document per block: clipping mutates the page in place.
        doc2 = open_pymupdf(pdf_bytes)
        try:
            clip_pdf_to_bbox(doc2, bbox)
            block_converter = PdfConverter(
                artifact_dict=marker_models,
                config={
                    "page_range": [0],
                    "force_layout_block": block["block_type"],
                    "disable_tqdm": True,
                },
                renderer="marker.renderers.html.HTMLRenderer",
            )
            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
                doc2.save(f)
                # The converter re-opens the file by name, so buffered
                # bytes must reach the disk before it runs.
                f.flush()
                rendered = block_converter(f.name)
        finally:
            # Release the PyMuPDF document even if conversion fails.
            doc2.close()

        html = rendered.html
        soup = BeautifulSoup(html, "html.parser")
        inner_html = str(soup.find("body").decode_contents())
        block_html.append(inner_html)
    return block_html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from typing import TypedDict, List, Dict | ||
|
||
|
||
class BlockScores(TypedDict):
    """Scoring result for one sample produced by one method."""

    # NOTE(review): field meanings below are inferred from the names only -
    # confirm against the scoring code that fills this dict.
    scores: List[float]  # presumably one score per ground-truth block
    order_score: float  # presumably a score for block ordering
    gt: List[str]  # presumably the ground-truth block contents
    method: str  # presumably the method's output that was scored
    overall_score: float  # presumably the aggregate score for the sample
|
||
|
||
class FullResult(TypedDict):
    """Aggregated benchmark output across samples."""

    # NOTE(review): key meanings below are inferred from the names only -
    # confirm against the code that builds this dict.
    raw_scores: Dict[int, BlockScores]  # presumably keyed by sample index
    averages_by_type: Dict[str, List[float]]  # presumably keyed by document type
    averages_by_block_type: Dict[str, List[float]]  # presumably keyed by block type
Oops, something went wrong.