
Additional benchmark cleanup
VikParuchuri committed Jan 30, 2025
1 parent e6e2d7d commit 1a8a35d
Showing 7 changed files with 67 additions and 41 deletions.
25 changes: 13 additions & 12 deletions .github/workflows/benchmark.yml
@@ -2,12 +2,12 @@ name: Integration test with benchmark

on: [push]

env:
TORCH_DEVICE: "cpu"

jobs:
benchmark:
runs-on: [ubuntu-latest, windows-latest]
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, windows-latest ]
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
@@ -18,16 +18,17 @@ jobs:
run: |
pip install poetry
poetry install
poetry remove torch
poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
- name: Download benchmark data
run: |
wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
unzip -o benchmark_data.zip
- name: Run benchmark test
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
poetry run python benchmarks/overall/overall.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker
- name: Run benchmark test
- name: Run table benchmark
run: |
poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
poetry run python benchmarks/verify_scores.py report.json --type marker
poetry run python benchmarks/table/table.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
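
The updated workflow runs the overall and table benchmarks and then verifies their scores. As a rough local equivalent, the sketch below mirrors those four commands with subprocess; it assumes a working poetry environment and the same output paths the workflow uses.

# Sketch: run the overall and table benchmarks locally the way the workflow does.
# Assumes a working poetry environment; paths mirror the workflow and may differ locally.
import subprocess

commands = [
    ["poetry", "run", "python", "benchmarks/overall/overall.py", "--max_rows", "5"],
    ["poetry", "run", "python", "benchmarks/verify_scores.py",
     "conversion_results/benchmark/overall/overall.json", "--type", "marker"],
    ["poetry", "run", "python", "benchmarks/table/table.py", "--max_rows", "5"],
    ["poetry", "run", "python", "benchmarks/verify_scores.py",
     "conversion_results/benchmark/table/table.json", "--type", "table"],
]

for cmd in commands:
    # check=True stops on the first failing step, like a failing CI job
    subprocess.run(cmd, check=True)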
21 changes: 11 additions & 10 deletions benchmarks/overall/inference.py
@@ -1,15 +1,16 @@
import json
import tempfile
import time

from bs4 import BeautifulSoup

from benchmarks.overall.scoring import score_blocks
from benchmarks.overall.schema import BlockScores
from marker.converters.pdf import PdfConverter

def get_marker_html(marker_models: dict, pdf_bytes: bytes):
def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
block_converter = PdfConverter(
artifact_dict=marker_models,
config={"page_range": [0], "disable_tqdm": True},
config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
renderer="marker.renderers.html.HTMLRenderer"
)
with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
@@ -21,16 +22,17 @@ def get_marker_html(marker_models: dict, pdf_bytes: bytes):
return inner_html


def marker_html_func(model_dict, sample, **kwargs) -> BlockScores:
gt_blocks = json.loads(sample["gt_blocks"])
def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
pdf_bytes = sample["pdf"] # This is a single page PDF
marker_html = get_marker_html(model_dict, pdf_bytes)
gt_html = [block["html"] for block in gt_blocks]
start = time.time()
marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
total = time.time() - start
scores = score_blocks(gt_html, marker_html)
scores["time"] = total
return scores


def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
uuid = sample["uuid"]
data = None
for row in mathpix_ds:
@@ -41,7 +43,6 @@ def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores:
raise ValueError(f"Could not find data for uuid {uuid}")

mathpix_md = data["md"]
gt_blocks = json.loads(sample["gt_blocks"])
gt_html = [block["html"] for block in gt_blocks]
scores = score_blocks(gt_html, mathpix_md, convert=False)
scores["time"] = data["time"]
return scores
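
Both renamed functions now share a common shape: they take the model dict, the dataset sample, and the pre-extracted ground-truth HTML, and return a BlockScores dict with a time entry. The sketch below shows a hypothetical scoring function written against that same contract; null_scoring_func is not part of the repository and only illustrates the interface.

# Sketch of a scoring function conforming to the interface used above:
# (model_dict, sample, gt_html, **kwargs) -> BlockScores.
# "null_scoring_func" is hypothetical and only illustrates the contract.
import time

from benchmarks.overall.scoring import score_blocks
from benchmarks.overall.schema import BlockScores


def null_scoring_func(model_dict, sample, gt_html, **kwargs) -> BlockScores:
    start = time.time()
    method_output = "\n".join(gt_html)  # stand-in for a real converter's output
    scores = score_blocks(gt_html, method_output)
    scores["time"] = time.time() - start
    return scores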
24 changes: 15 additions & 9 deletions benchmarks/overall/overall.py
@@ -9,7 +9,7 @@
import tabulate
from tqdm import tqdm

from benchmarks.overall.inference import marker_html_func, mathpix_html_func
from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
from marker.models import create_model_dict
@@ -18,7 +18,7 @@
configure_logging()


def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult:
def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult:
bench_scores = {}
averages_by_type = defaultdict(list)
averages_by_block_type = defaultdict(list)
@@ -29,7 +29,8 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,
gt_blocks = json.loads(sample["gt_blocks"])
doc_type = sample["classification"]
try:
scores = html_func(model_dict, sample, **kwargs)
gt_html = [block["html"] for block in gt_blocks]
scores = score_func(model_dict, sample, gt_html, **kwargs)
except ValueError as e:
print(f"Error with sample {idx}: {e}")
continue
@@ -40,10 +41,13 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func,

bench_scores[idx] = scores

avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores)
return {
"raw_scores": bench_scores,
"averages_by_type": averages_by_type,
"averages_by_block_type": averages_by_block_type
"averages_by_block_type": averages_by_block_type,
"average_time": avg_time,
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
}

def print_scores(scores: FullResult, method: str):
@@ -73,11 +77,13 @@ def print_scores(scores: FullResult, method: str):
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
def main(
dataset: str,
other_methods: str,
result_path: str,
max_rows: int
max_rows: int,
use_llm: bool
):
allowed_methods = ["mathpix", ""]
methods = other_methods.split(",")
@@ -88,23 +94,23 @@ def main(
model_dict = create_model_dict()
ds = datasets.load_dataset(dataset, split="train")

marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows)
marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm)
all_scores = {
"marker": marker_scores
}

if "mathpix" in methods:
mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train")
mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds)
mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds)
all_scores["mathpix"] = mathpix_scores

for k,v in all_scores.items():
print_scores(v, k)

out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
with open(out_path / "overall.json", "w") as f:
json.dump(all_scores, f, indent=2)
with open(out_path / "overall.json", "w", encoding="utf-8") as f:
json.dump(all_scores, f, indent=2, ensure_ascii=False)

print(f"Results saved to {out_path}.")

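The results file now carries average_score and average_time for each method. A minimal sketch for inspecting those aggregates, assuming the default output path matches the one used in the workflow above:

# Sketch: inspect the aggregate fields added to each method's results.
# Structure follows what main() writes above; the path is an assumption
# based on the workflow's default result location.
import json

with open("conversion_results/benchmark/overall/overall.json", encoding="utf-8") as f:
    all_scores = json.load(f)

for method, result in all_scores.items():
    # average_score and average_time are the aggregates added in get_method_scores
    print(f"{method}: avg score {result['average_score']:.2f}, avg time {result['average_time']:.2f}s")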
5 changes: 4 additions & 1 deletion benchmarks/overall/schema.py
@@ -1,4 +1,4 @@
from typing import TypedDict, List, Dict
from typing import TypedDict, List, Dict, Optional


class BlockScores(TypedDict):
@@ -7,9 +7,12 @@ class BlockScores(TypedDict):
gt: List[str]
method: str
overall_score: float
time: Optional[float]


class FullResult(TypedDict):
raw_scores: Dict[int, BlockScores]
averages_by_type: Dict[str, List[float]]
averages_by_block_type: Dict[str, List[float]]
average_time: float
average_score: float
21 changes: 19 additions & 2 deletions benchmarks/overall/scoring.py
@@ -12,6 +12,9 @@ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
concordant = 0
discordant = 0

if n <= 1:
return 100

for i in range(n):
for j in range(i + 1, n):
correct_sign = correct_order[i] - correct_order[j]
@@ -61,18 +64,27 @@ def convert_to_md(html):
return markdown

def standardize_markdown(markdown):
# Replace math expressions
pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
markdown = re.sub(pattern, standardize_math, markdown)

# Replace image urls
pattern = r'!\[(.*?)\]\((.*?)(?:\?.*?width=(\d+).*?height=(\d+).*?)\)'
markdown = re.sub(pattern, r'![/api/placeholder]', markdown)

# Clean up html tags
markdown = markdown.replace("<br>", "\n")
markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content

# Clean up markdown
markdown = re.sub(r"\s+", " ", markdown)
markdown = re.sub(r"\n+", "\n", markdown)
markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents
markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters
markdown = markdown.encode().decode('unicode-escape') # Decode unicode characters properly
return markdown.strip().lower()


@@ -116,10 +128,14 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
alignments = find_fuzzy_alignments(method_html, gt)
scores = [alignment["score"] for alignment in alignments]

# Find order score
orders = [alignment["start"] for alignment in alignments]
correct_order = range(len(gt))
correct_order = list(range(len(gt)))
actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
order_score = kendall_tau(correct_order, actual_order)

# Weight score by sequence length
gt_weights = [len(g) for g in gt]
weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]

@@ -131,5 +147,6 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
"order_score": order_score,
"gt": gt,
"method": method_html,
"overall_score": overall_score
"overall_score": overall_score,
"time": None
}
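
A toy walk-through of the order-score inputs computed above, with hypothetical alignment offsets:

# Worked toy example of the order-score inputs built in score_blocks above.
# The offsets are made up; they stand in for alignment["start"] values.
orders = [120, 5, 430]                     # match start offset for each GT block
correct_order = list(range(len(orders)))   # [0, 1, 2]
actual_order = sorted(range(len(orders)), key=lambda x: orders[x])  # [1, 0, 2]
# kendall_tau(correct_order, actual_order) then rewards blocks appearing in the
# same relative order as the ground truth; with the new guard, a single block scores 100.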
4 changes: 2 additions & 2 deletions benchmarks/table/table.py
@@ -222,9 +222,9 @@ def main(
"gemini": gemini_results
}

out_path = Path(result_path) / "table.json"
out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
with open(out_path, "w+") as f:
with open(out_path / "table.json", "w+") as f:
json.dump(results, f, indent=2)

print(f"Results saved to {out_path}.")
8 changes: 3 additions & 5 deletions benchmarks/verify_scores.py
@@ -6,11 +6,9 @@ def verify_scores(file_path):
with open(file_path, 'r') as file:
data = json.load(file)

multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]

if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
raise ValueError("One or more scores are below the required threshold of 0.4")
marker_score = data["marker"]["overall_score"]
if marker_score < 90:
raise ValueError("Marker score below 90")


def verify_table_scores(file_path):
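A small sketch exercising the tightened check, assuming verify_scores is importable from the benchmarks package; the nested keys mirror what the function reads above and the numbers are made up.

# Sketch: exercise the updated check with a temporary results file.
import json
import tempfile

from benchmarks.verify_scores import verify_scores  # assumed import path

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"marker": {"overall_score": 92.5}}, f)

verify_scores(f.name)  # passes; a value below 90 would raise ValueError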
