Clean up benchmarks
VikParuchuri committed Jan 30, 2025
1 parent 95c06c8 commit e6e2d7d
Showing 11 changed files with 410 additions and 291 deletions.
README.md (4 changes: 2 additions & 2 deletions)

@@ -219,11 +219,11 @@ rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```

-This takes all the same configuration as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
+This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes.

You can also run this via the CLI with
```shell
-python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
+marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```
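
The Python equivalent is a small sketch along these lines (assuming `TableConverter` takes the same `artifact_dict`/`config` arguments as `PdfConverter`; the config keys mirror the CLI flags above):
```python
# Sketch: Python equivalent of the CLI call above.
# Assumes TableConverter shares PdfConverter's constructor signature.
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = TableConverter(
    artifact_dict=create_model_dict(),
    config={"force_layout_block": "Table", "output_format": "json"},
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```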

# Output Formats
benchmarks/overall/inference.py (78 changes: 39 additions & 39 deletions)

@@ -1,47 +1,47 @@
-import io

-import fitz as pymupdf
import json
import tempfile
from bs4 import BeautifulSoup

+from benchmarks.overall.scoring import score_blocks
+from benchmarks.overall.schema import BlockScores
from marker.converters.pdf import PdfConverter

-def open_pymupdf(pdf_bytes):
-    stream = io.BytesIO(pdf_bytes)
-    return pymupdf.open(stream=stream)
+def get_marker_html(marker_models: dict, pdf_bytes: bytes):
+    block_converter = PdfConverter(
+        artifact_dict=marker_models,
+        config={"page_range": [0], "disable_tqdm": True},
+        renderer="marker.renderers.html.HTMLRenderer"
+    )
+    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
+        f.write(pdf_bytes)
+        rendered = block_converter(f.name)
+        html = rendered.html
+    soup = BeautifulSoup(html, "html.parser")
+    inner_html = str(soup.find("body").decode_contents())
+    return inner_html


+def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
+    gt_blocks = json.loads(sample["gt_blocks"])
+    pdf_bytes = sample["pdf"] # This is a single page PDF
+    marker_html = get_marker_html(model_dict, pdf_bytes)
+    gt_html = [block["html"] for block in gt_blocks]
+    scores = score_blocks(gt_html, marker_html)
+    return scores

-def clip_pdf_to_bbox(doc, bbox, padding=1):
-    page = doc[0]
-    height, width = page.bound().height, page.bound().width
-    remove_left = [0, 0, bbox[0] - padding, height]
-    remove_top = [0, 0, width, bbox[1] - padding]
-    remove_right = [bbox[2] + padding, 0, width, height]
-    remove_bottom = [0, bbox[3] + padding, width, height]
-    for remove in [remove_left, remove_top, remove_right, remove_bottom]:
-        clip_rect = pymupdf.Rect(*remove)
-        page.add_redact_annot(clip_rect)
-    page.apply_redactions()

-    clip_rect = pymupdf.Rect(*bbox)
-    page.set_cropbox(clip_rect)
-    return doc
+def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
+    uuid = sample["uuid"]
+    data = None
+    for row in mathpix_ds:
+        if str(row["uuid"]) == str(uuid):
+            data = row
+            break
+    if not data:
+        raise ValueError(f"Could not find data for uuid {uuid}")

-def get_marker_block_html(marker_models: dict, gt_blocks: list, pdf_bytes: bytes):
-    block_html = []
-    for block in gt_blocks:
-        bbox = block["bbox"]
-        doc2 = open_pymupdf(pdf_bytes)
-        clip_pdf_to_bbox(doc2, bbox)
-        block_converter = PdfConverter(
-            artifact_dict=marker_models,
-            config={"page_range": [0], "force_layout_block": block["block_type"], "disable_tqdm": True},
-            renderer="marker.renderers.html.HTMLRenderer"
-        )
-        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
-            doc2.save(f)
-            rendered = block_converter(f.name)
-            html = rendered.html
-        soup = BeautifulSoup(html, "html.parser")
-        inner_html = str(soup.find("body").decode_contents())
-        block_html.append(inner_html)
-    return block_html
+    mathpix_md = data["md"]
+    gt_blocks = json.loads(sample["gt_blocks"])
+    gt_html = [block["html"] for block in gt_blocks]
+    scores = score_blocks(gt_html, mathpix_md, convert=False)
+    return scores
benchmarks/overall/overall.py (107 changes: 66 additions & 41 deletions)

@@ -1,5 +1,6 @@
import json
import os
+import traceback
from collections import defaultdict
from pathlib import Path

@@ -8,64 +9,53 @@
import tabulate
from tqdm import tqdm

+from benchmarks.overall.inference import marker_html_func, mathpix_html_func
+from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
-from inference import get_marker_block_html
from marker.settings import settings
-from scoring import score_blocks

configure_logging()

-@click.command(help="Benchmark PDF to MD conversion.")
-@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
-@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values:", default="")
-@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
-@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
-def main(
-    dataset: str,
-    other_methods: str,
-    result_path: str,
-    max_rows: int
-):
-    allowed_methods = [""]
-    methods = other_methods.split(",")
-    for method in methods:
-        if method not in allowed_methods:
-            raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")

-    model_dict = create_model_dict()
-    ds = datasets.load_dataset(dataset, split="train")

+def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult:
    bench_scores = {}
    averages_by_type = defaultdict(list)
    averages_by_block_type = defaultdict(list)
    for idx, sample in tqdm(enumerate(ds), desc="Running benchmark"):
-        if max_rows is not None and idx >= max_rows:
-            break

        gt_blocks = json.loads(sample["gt_blocks"])
        doc_type = sample["classification"]
-        pdf_bytes = sample["pdf"] # This is a single page PDF
-        marker_html = get_marker_block_html(model_dict, gt_blocks, pdf_bytes)
-        gt_html = [block["html"] for block in gt_blocks]
-        scores = score_blocks(gt_html, marker_html)
-        gt_weights = [len(ht) for ht in gt_html]
-        overall_score = sum([s * w for s, w in zip(scores, gt_weights)]) / sum(gt_weights)
-        bench_scores[idx] = {
-            "scores": scores,
-            "weights": gt_weights,
-            "overall_score": overall_score # Weighted score, weighted by length of GT block
-        }

-        averages_by_type[doc_type].append(overall_score)

-        for score, gt_block in zip(scores, gt_blocks):
+        try:
+            scores = html_func(model_dict, sample, **kwargs)
+        except ValueError as e:
+            print(f"Error with sample {idx}: {e}")
+            continue
+        averages_by_type[doc_type].append(scores["overall_score"])

+        for score, gt_block in zip(scores["scores"], gt_blocks):
            averages_by_block_type[gt_block["block_type"]].append(score)

+        if max_rows is not None and idx >= max_rows:
+            break
+        bench_scores[idx] = scores

+    return {
+        "raw_scores": bench_scores,
+        "averages_by_type": averages_by_type,
+        "averages_by_block_type": averages_by_block_type
+    }

+def print_scores(scores: FullResult, method: str):
+    averages_by_type = scores["averages_by_type"]
+    averages_by_block_type = scores["averages_by_block_type"]
+    bench_scores = scores["raw_scores"]

    for k in averages_by_type:
        averages_by_type[k] = sum(averages_by_type[k]) / len(averages_by_type[k])
    averages_by_type = sorted(averages_by_type.items())

+    print(f"Scores for method {method}:")
    print(tabulate.tabulate(averages_by_type, headers=["Document Type", "Average Score"], tablefmt="github"))

    for k in averages_by_block_type:

@@ -76,10 +66,45 @@ def main(

    overall_average = sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
    print(tabulate.tabulate([["Overall Average", overall_average]], tablefmt="github"))
+    print()

+@click.command(help="Benchmark PDF to MD conversion.")
+@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
+@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
+@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
+@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
+def main(
+    dataset: str,
+    other_methods: str,
+    result_path: str,
+    max_rows: int
+):
+    allowed_methods = ["mathpix", ""]
+    methods = other_methods.split(",")
+    for method in methods:
+        if method not in allowed_methods:
+            raise ValueError(f"Method {method} not allowed. Allowed methods are {allowed_methods}")

+    model_dict = create_model_dict()
+    ds = datasets.load_dataset(dataset, split="train")

+    marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows)
+    all_scores = {
+        "marker": marker_scores
+    }

+    if "mathpix" in methods:
+        mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
+        mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds)
+        all_scores["mathpix"] = mathpix_scores

+    for k,v in all_scores.items():
+        print_scores(v, k)

-    out_path = Path(result_path) / "overall.json"
-    with open(out_path, "w") as f:
-        json.dump(bench_scores, f, indent=2)
+    out_path = Path(result_path)
+    out_path.mkdir(parents=True, exist_ok=True)
+    with open(out_path / "overall.json", "w") as f:
+        json.dump(all_scores, f, indent=2)

    print(f"Results saved to {out_path}.")
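
A minimal usage sketch of the new helpers, assuming the `benchmarks` package is importable from the repository root (the function names and dataset id are taken from the code above, while the exact invocation is an assumption):
```python
# Sketch: driving the refactored benchmark functions directly from Python.
import datasets

from benchmarks.overall.overall import get_method_scores, print_scores
from marker.models import create_model_dict

ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")
model_dict = create_model_dict()

# Score marker on a small subset of the benchmark and print the per-type tables.
scores = get_method_scores(ds, model_dict, max_rows=10)
print_scores(scores, "marker")
```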

benchmarks/overall/schema.py (15 changes: 15 additions & 0 deletions)

@@ -0,0 +1,15 @@
+from typing import TypedDict, List, Dict


+class BlockScores(TypedDict):
+    scores: List[float]
+    order_score: float
+    gt: List[str]
+    method: str
+    overall_score: float


+class FullResult(TypedDict):
+    raw_scores: Dict[int, BlockScores]
+    averages_by_type: Dict[str, List[float]]
+    averages_by_block_type: Dict[str, List[float]]