Clean up benchmarks
VikParuchuri committed Jan 30, 2025
1 parent 95c06c8 commit e6e2d7d
Showing 11 changed files with 410 additions and 291 deletions.
README.md (4 changes: 2 additions & 2 deletions)

@@ -219,11 +219,11 @@ rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```

-This takes all the same configuration as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
+This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes.

You can also run this via the CLI with
```shell
-python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
+marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```
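
The Python equivalent is a small sketch along these lines (assuming `TableConverter` takes the same `artifact_dict`/`config` arguments as `PdfConverter`; the config keys mirror the CLI flags above):
```python
# Sketch: Python equivalent of the CLI call above.
# Assumes TableConverter shares PdfConverter's constructor signature.
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = TableConverter(
    artifact_dict=create_model_dict(),
    config={"force_layout_block": "Table", "output_format": "json"},
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```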

# Output Formats
benchmarks/overall/inference.py (78 changes: 39 additions & 39 deletions)

@@ -1,47 +1,47 @@
-import io

-import fitz as pymupdf
import json
import tempfile
from bs4 import BeautifulSoup

+from benchmarks.overall.scoring import score_blocks
+from benchmarks.overall.schema import BlockScores
from marker.converters.pdf import PdfConverter

-def open_pymupdf(pdf_bytes):
-    stream = io.BytesIO(pdf_bytes)
-    return pymupdf.open(stream=stream)
+def get_marker_html(marker_models: dict, pdf_bytes: bytes):
+    block_converter = PdfConverter(
+        artifact_dict=marker_models,
+        config={"page_range": [0], "disable_tqdm": True},
+        renderer="marker.renderers.html.HTMLRenderer"
+    )
+    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
+        f.write(pdf_bytes)
+        rendered = block_converter(f.name)
+        html = rendered.html
+    soup = BeautifulSoup(html, "html.parser")
+    inner_html = str(soup.find("body").decode_contents())
+    return inner_html


+def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
+    gt_blocks = json.loads(sample["gt_blocks"])
+    pdf_bytes = sample["pdf"] # This is a single page PDF
+    marker_html = get_marker_html(model_dict, pdf_bytes)
+    gt_html = [block["html"] for block in gt_blocks]
+    scores = score_blocks(gt_html, marker_html)
+    return scores

-def clip_pdf_to_bbox(doc, bbox, padding=1):
-    page = doc[0]
-    height, width = page.bound().height, page.bound().width
-    remove_left = [0, 0, bbox[0] - padding, height]
-    remove_top = [0, 0, width, bbox[1] - padding]
-    remove_right = [bbox[2] + padding, 0, width, height]
-    remove_bottom = [0, bbox[3] + padding, width, height]
-    for remove in [remove_left, remove_top, remove_right, remove_bottom]:
-        clip_rect = pymupdf.Rect(*remove)
-        page.add_redact_annot(clip_rect)
-    page.apply_redactions()

-    clip_rect = pymupdf.Rect(*bbox)
-    page.set_cropbox(clip_rect)
-    return doc
+def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
+    uuid = sample["uuid"]
+    data = None
+    for row in mathpix_ds:
+        if str(row["uuid"]) == str(uuid):
+            data = row
+            break
+    if not data:
+        raise ValueError(f"Could not find data for uuid {uuid}")

-def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes):
-    block_html = []
-    for block in gt_blocks:
-        bbox = block["bbox"]
-        doc2 = open_pymupdf(pdf_bytes)
-        clip_pdf_to_bbox(doc2, bbox)
-        block_converter = PdfConverter(
-            artifact_dict=marker_models,
-            config={"page_range": [0], "force_layout_block": block["block_type"], "disable_tqdm": True},
-            renderer="marker.renderers.html.HTMLRenderer"
-        )
-        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
-            doc2.save(f)
-            rendered = block_converter(f.name)
-            html = rendered.html
-        soup = BeautifulSoup(html, "html.parser")
-        inner_html = str(soup.find("body").decode_contents())
-        block_html.append(inner_html)
-    return block_html
+    mathpix_md = data["md"]
+    gt_blocks = json.loads(sample["gt_blocks"])
+    gt_html = [block["html"] for block in gt_blocks]
+    scores = score_blocks(gt_html, mathpix_md, convert=False)
+    return scores
benchmarks/overall/overall.py (107 changes: 66 additions & 41 deletions)

@@ -1,5 +1,6 @@
import json
import os
+import traceback
from collections import defaultdict
from pathlib import Path

@@ -8,64 +9,53 @@
import tabulate
from tqdm import tqdm

+from benchmarks.overall.inference import marker_html_func, mathpix_html_func
+from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
-from inference import get_marker_block_html
from marker.settings import settings
-from scoring import score_blocks

configure_logging()

-@click.command(help="Benchmark PDF to MD conversion.")
-@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
-@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values:", default="")
-@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
-@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
-def main(
-    dataset: str,
-    other_methods: str,
-    result_path: str,
-    max_rows: int
-):
-    allowed_methods = [""]
-    methods = other_methods.split(",")
-    for method in methods:
-        if method not in allowed_methods:
-            raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")

-    model_dict = create_model_dict()
-    ds = datasets.load_dataset(dataset, split="train")

+def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult:
    bench_scores = {}
    averages_by_type = defaultdict(list)
    averages_by_block_type = defaultdict(list)
    for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
-        if max_rows is not None and idx >= max_rows:
-            break

        gt_blocks = json.loads(sample["gt_blocks"])
        doc_type = sample["classification"]
-        pdf_bytes = sample["pdf"] # This is a single page PDF
-        marker_html = get_marker_block_html(model_dict, gt_blocks, pdf_bytes)
-        gt_html = [block["html"] for block in gt_blocks]
-        scores = score_blocks(gt_html, marker_html)
-        gt_weights = [len(ht) for ht in gt_html]
-        overall_score = sum([s * w for s, w in zip(scores, gt_weights)]) / sum(gt_weights)
-        bench_scores[idx] = {
-            "scores": scores,
-            "weights": gt_weights,
-            "overall_score": overall_score # Weighted score, weighted by length of GT block
-        }

-        averages_by_type[doc_type].append(overall_score)

-        for score, gt_block in zip(scores, gt_blocks):
+        try:
+            scores = html_func(model_dict, sample, **kwargs)
+        except ValueError as e:
+            print(f"Error with sample {idx}: {e}")
+            continue
+        averages_by_type[doc_type].append(scores["overall_score"])

+        for score, gt_block in zip(scores["scores"], gt_blocks):
            averages_by_block_type[gt_block["block_type"]].append(score)

+        if max_rows is not None and idx >= max_rows:
+            break
+        bench_scores[idx] = scores

+    return {
+        "raw_scores": bench_scores,
+        "averages_by_type": averages_by_type,
+        "averages_by_block_type": averages_by_block_type
+    }

+def print_scores(scores: FullResult, method: str):
+    averages_by_type = scores["averages_by_type"]
+    averages_by_block_type = scores["averages_by_block_type"]
+    bench_scores = scores["raw_scores"]

    for k in averages_by_type:
        averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k])
    averages_by_type = sorted(averages_by_type.items())

+    print(f"Scores for method {method}:")
    print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github"))

    for k in averages_by_block_type:

@@ -76,10 +66,45 @@ def main(

    overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
    print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))
+    print()

+@click.command(help="Benchmark PDF to MD conversion.")
+@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
+@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+def main(
+    dataset: str,
+    other_methods: str,
+    result_path: str,
+    max_rows: int
+):
+    allowed_methods = ["mathpix", ""]
+    methods = other_methods.split(",")
+    for method in methods:
+        if method not in allowed_methods:
+            raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")

+    model_dict = create_model_dict()
+    ds = datasets.load_dataset(dataset, split="train")

+    marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows)
+    all_scores = {
+        "marker": marker_scores
+    }

+    if "mathpix" in methods:
+        mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
+        mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds)
+        all_scores["mathpix"] = mathpix_scores

+    for k,v in all_scores.items():
+        print_scores(v, k)

-    out_path = Path(result_path) / "overall.json"
-    with open(out_path, "w") as f:
-        json.dump(bench_scores, f, indent=2)
+    out_path = Path(result_path)
+    out_path.mkdir(parents=True, exist_ok=True)
+    with open(out_path / "overall.json", "w") as f:
+        json.dump(all_scores, f, indent=2)

    print(f"Results saved to {out_path}.")
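
A minimal usage sketch of the new helpers, assuming the `benchmarks` package is importable from the repository root (the function names and dataset id are taken from the code above, while the exact invocation is an assumption):
```python
# Sketch: driving the refactored benchmark functions directly from Python.
import datasets

from benchmarks.overall.overall import get_method_scores, print_scores
from marker.models import create_model_dict

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
model_dict = create_model_dict()

# Score marker on a small subset of the benchmark and print the per-type tables.
scores = get_method_scores(ds, model_dict, max_rows=10)
print_scores(scores, "marker")
```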

benchmarks/overall/schema.py (15 changes: 15 additions & 0 deletions)

@@ -0,0 +1,15 @@
+from typing import TypedDict, List, Dict


+class BlockScores(TypedDict):
+    scores: List[float]
+    order_score: float
+    gt: List[str]
+    method: str
+    overall_score: float


+class FullResult(TypedDict):
+    raw_scores: Dict[int, BlockScores]
+    averages_by_type: Dict[str, List[float]]
+    averages_by_block_type: Dict[str, List[float]]