From 1a8a35d7407226e2021bba2f95a3f446d8dbbaea Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 30 Jan 2025 12:23:58 -0500 Subject: [PATCH] Additional benchmark cleanup --- .github/workflows/benchmark.yml | 25 +++++++++++++------------ benchmarks/overall/inference.py | 21 +++++++++++---------- benchmarks/overall/overall.py | 24 +++++++++++++++--------- benchmarks/overall/schema.py | 5 ++++- benchmarks/overall/scoring.py | 21 +++++++++++++++++++-- benchmarks/table/table.py | 4 ++-- benchmarks/verify_scores.py | 8 +++----- 7 files changed, 67 insertions(+), 41 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 5d49aa1c..5e7785c0 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -2,12 +2,12 @@ name: Integration test with benchmark on: [push] -env: - TORCH_DEVICE: "cpu" - jobs: benchmark: - runs-on: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] steps: - uses: actions/checkout@v3 - name: Set up Python 3.11 @@ -18,16 +18,17 @@ jobs: run: | pip install poetry poetry install - poetry remove torch - poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu - - name: Download benchmark data - run: | - wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" - unzip -o benchmark_data.zip - name: Run benchmark test + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + poetry run python benchmarks/overall/overall.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker + - name: Run benchmark test + - name: Run table benchmark run: | - poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json - poetry run python benchmarks/verify_scores.py report.json --type marker + poetry run python benchmarks/table/table.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py index f312429b..1b504cff 100644 --- a/benchmarks/overall/inference.py +++ b/benchmarks/overall/inference.py @@ -1,15 +1,16 @@ -import json import tempfile +import time + from bs4 import BeautifulSoup from benchmarks.overall.scoring import score_blocks from benchmarks.overall.schema import BlockScores from marker.converters.pdf import PdfConverter -def get_marker_html(marker_models: dict, pdf_bytes: bytes): +def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool): block_converter = PdfConverter( artifact_dict=marker_models, - config={"page_range": [0], "disable_tqdm": True}, + config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}, renderer="marker.renderers.html.HTMLRenderer" ) with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: @@ -21,16 +22,17 @@ def get_marker_html(marker_models: dict, pdf_bytes: bytes): return inner_html -def marker_html_func(model_dict, sample, **kwargs) -> BlockScores: - gt_blocks = json.loads(sample["gt_blocks"]) +def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores: pdf_bytes = sample["pdf"] # This is a single page PDF - marker_html = get_marker_html(model_dict, pdf_bytes) - gt_html = [block["html"] for block in gt_blocks] + start = time.time() + marker_html = get_marker_html(model_dict, pdf_bytes, use_llm) + total = time.time() - start scores = 
score_blocks(gt_html, marker_html) + scores["time"] = total return scores -def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores: +def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores: uuid = sample["uuid"] data = None for row in mathpix_ds: @@ -41,7 +43,6 @@ def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores: raise ValueError(f"Could not find data for uuid {uuid}") mathpix_md = data["md"] - gt_blocks = json.loads(sample["gt_blocks"]) - gt_html = [block["html"] for block in gt_blocks] scores = score_blocks(gt_html, mathpix_md, convert=False) + scores["time"] = data["time"] return scores diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py index bdb1fc7c..9cf6fb01 100644 --- a/benchmarks/overall/overall.py +++ b/benchmarks/overall/overall.py @@ -9,7 +9,7 @@ import tabulate from tqdm import tqdm -from benchmarks.overall.inference import marker_html_func, mathpix_html_func +from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func from benchmarks.overall.schema import FullResult from marker.logger import configure_logging from marker.models import create_model_dict @@ -18,7 +18,7 @@ configure_logging() -def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult: +def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult: bench_scores = {} averages_by_type = defaultdict(list) averages_by_block_type = defaultdict(list) @@ -29,7 +29,8 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, gt_blocks = json.loads(sample["gt_blocks"]) doc_type = sample["classification"] try: - scores = html_func(model_dict, sample, **kwargs) + gt_html = [block["html"] for block in gt_blocks] + scores = score_func(model_dict, sample, gt_html, **kwargs) except ValueError as e: print(f"Error with sample {idx}: {e}") continue @@ -40,10 +41,13 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, bench_scores[idx] = scores + avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores) return { "raw_scores": bench_scores, "averages_by_type": averages_by_type, - "averages_by_block_type": averages_by_block_type + "averages_by_block_type": averages_by_block_type, + "average_time": avg_time, + "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) } def print_scores(scores: FullResult, method: str): @@ -73,11 +77,13 @@ def print_scores(scores: FullResult, method: str): @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. 
Possible values: mathpix", default="") @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") +@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.") def main( dataset: str, other_methods: str, result_path: str, - max_rows: int + max_rows: int, + use_llm: bool ): allowed_methods = ["mathpix", ""] methods = other_methods.split(",") @@ -88,14 +94,14 @@ def main( model_dict = create_model_dict() ds = datasets.load_dataset(dataset, split="train") - marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows) + marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm) all_scores = { "marker": marker_scores } if "mathpix" in methods: mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train") - mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds) + mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds) all_scores["mathpix"] = mathpix_scores for k,v in all_scores.items(): @@ -103,8 +109,8 @@ def main( out_path = Path(result_path) out_path.mkdir(parents=True, exist_ok=True) - with open(out_path / "overall.json", "w") as f: - json.dump(all_scores, f, indent=2) + with open(out_path / "overall.json", "w", encoding="utf-8") as f: + json.dump(all_scores, f, indent=2, ensure_ascii=False) print(f"Results saved to {out_path}.") diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py index 98ffc1b8..8af5bf28 100644 --- a/benchmarks/overall/schema.py +++ b/benchmarks/overall/schema.py @@ -1,4 +1,4 @@ -from typing import TypedDict, List, Dict +from typing import TypedDict, List, Dict, Optional class BlockScores(TypedDict): @@ -7,9 +7,12 @@ class BlockScores(TypedDict): gt: List[str] method: str overall_score: float + time: Optional[float] class FullResult(TypedDict): raw_scores: Dict[int, BlockScores] averages_by_type: Dict[str, List[float]] averages_by_block_type: Dict[str, List[float]] + average_time: float + average_score: float diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py index 1ba78bc9..713e5fef 100644 --- a/benchmarks/overall/scoring.py +++ b/benchmarks/overall/scoring.py @@ -12,6 +12,9 @@ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: concordant = 0 discordant = 0 + if n <= 1: + return 100 + for i in range(n): for j in range(i + 1, n): correct_sign = correct_order[i] - correct_order[j] @@ -61,18 +64,27 @@ def convert_to_md(html): return markdown def standardize_markdown(markdown): + # Replace math expressions pattern = r'(?", "\n") markdown = re.sub(r"(.*?)", r"\1", markdown) markdown = re.sub(r"(.*?)", r"\1", markdown) + markdown = re.sub(r"(.*?)", r"\1", markdown) # Remove span tags and keep content + # Clean up markdown markdown = re.sub(r"\s+", " ", markdown) markdown = re.sub(r"\n+", "\n", markdown) markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters + markdown = markdown.encode().decode('unicode-escape') # Decode unicode characters properly return markdown.strip().lower() @@ -116,10 
+128,14 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
     gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
     alignments = find_fuzzy_alignments(method_html, gt)
     scores = [alignment["score"] for alignment in alignments]
+
+    # Find order score
     orders = [alignment["start"] for alignment in alignments]
-    correct_order = range(len(gt))
+    correct_order = list(range(len(gt)))
     actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
     order_score = kendall_tau(correct_order, actual_order)
+
+    # Weight score by sequence length
     gt_weights = [len(g) for g in gt]
     weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]

@@ -131,5 +147,6 @@
         "order_score": order_score,
         "gt": gt,
         "method": method_html,
-        "overall_score": overall_score
+        "overall_score": overall_score,
+        "time": None
     }
\ No newline at end of file
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 448e32fe..130f862d 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -222,9 +222,9 @@ def main(
         "gemini": gemini_results
     }

-    out_path = Path(result_path) / "table.json"
+    out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w+") as f:
+    with open(out_path / "table.json", "w+") as f:
         json.dump(results, f, indent=2)

     print(f"Results saved to {out_path}.")
diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py
index 913081e9..1ce2fa07 100644
--- a/benchmarks/verify_scores.py
+++ b/benchmarks/verify_scores.py
@@ -6,11 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)

-    multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
-    switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-
-    if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
-        raise ValueError("One or more scores are below the required threshold of 0.4")
+    marker_score = data["marker"]["average_score"]
+    if marker_score < 90:
+        raise ValueError("Marker score below 90")


 def verify_table_scores(file_path):
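
For reference, here is a minimal sketch (not part of the patch) of how the new verification gate fits the reworked output: it builds a throwaway results dict in the shape that get_method_scores() now returns in overall.py, writes it to a temporary JSON file, and applies the same threshold check that verify_scores.py runs for --type marker. The helper name check_marker_score and all of the numbers below are illustrative assumptions, not code or values from the repository.

import json
import tempfile

def check_marker_score(file_path, threshold=90):
    # Mirrors the check in the updated verify_scores(): load the results file
    # and fail if the aggregate marker score is under the threshold.
    with open(file_path, "r") as f:
        data = json.load(f)
    marker_score = data["marker"]["average_score"]
    if marker_score < threshold:
        raise ValueError(f"Marker score {marker_score} is below {threshold}")

# Toy results in the shape produced by get_method_scores() (values made up).
fake_results = {
    "marker": {
        "raw_scores": {},
        "averages_by_type": {},
        "averages_by_block_type": {},
        "average_time": 3.2,    # seconds per sample, illustrative only
        "average_score": 94.5,  # 0-100 fuzzy-alignment score, illustrative only
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(fake_results, f)
    result_file = f.name

check_marker_score(result_file)  # passes; drop average_score below 90 to see it raise

In CI, the equivalent check runs via poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker, as added to the workflow above.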