From 1a8a35d7407226e2021bba2f95a3f446d8dbbaea Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 30 Jan 2025 12:23:58 -0500 Subject: [PATCH] Additional benchmark cleanup --- .github/workflows/benchmark.yml | 25 +++++++++++++------------ benchmarks/overall/inference.py | 21 +++++++++++---------- benchmarks/overall/overall.py | 24 +++++++++++++++--------- benchmarks/overall/schema.py | 5 ++++- benchmarks/overall/scoring.py | 21 +++++++++++++++++++-- benchmarks/table/table.py | 4 ++-- benchmarks/verify_scores.py | 8 +++----- 7 files changed, 67 insertions(+), 41 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 5d49aa1c..5e7785c0 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -2,12 +2,12 @@ name: Integration test with benchmark on: [push] -env: - TORCH_DEVICE: "cpu" - jobs: benchmark: - runs-on: [ubuntu-latest, windows-latest] + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] steps: - uses: actions/checkout@v3 - name: Set up Python 3.11 @@ -18,16 +18,17 @@ jobs: run: | pip install poetry poetry install - poetry remove torch - poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu - - name: Download benchmark data - run: | - wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" - unzip -o benchmark_data.zip - name: Run benchmark test + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + poetry run python benchmarks/overall/overall.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker + - name: Run benchmark test + - name: Run table benchmark run: | - poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json - poetry run python benchmarks/verify_scores.py report.json --type marker + poetry run python benchmarks/table/table.py --max_rows 5 + poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table diff --git a/benchmarks/overall/inference.py b/benchmarks/overall/inference.py index f312429b..1b504cff 100644 --- a/benchmarks/overall/inference.py +++ b/benchmarks/overall/inference.py @@ -1,15 +1,16 @@ -import json import tempfile +import time + from bs4 import BeautifulSoup from benchmarks.overall.scoring import score_blocks from benchmarks.overall.schema import BlockScores from marker.converters.pdf import PdfConverter -def get_marker_html(marker_models: dict, pdf_bytes: bytes): +def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool): block_converter = PdfConverter( artifact_dict=marker_models, - config={"page_range": [0], "disable_tqdm": True}, + config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}, renderer="marker.renderers.html.HTMLRenderer" ) with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: @@ -21,16 +22,17 @@ def get_marker_html(marker_models: dict, pdf_bytes: bytes): return inner_html -def marker_html_func(model_dict, sample, **kwargs) -> BlockScores: - gt_blocks = json.loads(sample["gt_blocks"]) +def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores: pdf_bytes = sample["pdf"] # This is a single page PDF - marker_html = get_marker_html(model_dict, pdf_bytes) - gt_html = [block["html"] for block in gt_blocks] + start = time.time() + marker_html = get_marker_html(model_dict, pdf_bytes, use_llm) + total = time.time() - start scores = 
score_blocks(gt_html, marker_html) + scores["time"] = total return scores -def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores: +def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores: uuid = sample["uuid"] data = None for row in mathpix_ds: @@ -41,7 +43,6 @@ def mathpix_html_func(model_dict, sample, mathpix_ds, **kwargs) -> BlockScores: raise ValueError(f"Could not find data for uuid {uuid}") mathpix_md = data["md"] - gt_blocks = json.loads(sample["gt_blocks"]) - gt_html = [block["html"] for block in gt_blocks] scores = score_blocks(gt_html, mathpix_md, convert=False) + scores["time"] = data["time"] return scores diff --git a/benchmarks/overall/overall.py b/benchmarks/overall/overall.py index bdb1fc7c..9cf6fb01 100644 --- a/benchmarks/overall/overall.py +++ b/benchmarks/overall/overall.py @@ -9,7 +9,7 @@ import tabulate from tqdm import tqdm -from benchmarks.overall.inference import marker_html_func, mathpix_html_func +from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func from benchmarks.overall.schema import FullResult from marker.logger import configure_logging from marker.models import create_model_dict @@ -18,7 +18,7 @@ configure_logging() -def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, **kwargs) -> FullResult: +def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_func, **kwargs) -> FullResult: bench_scores = {} averages_by_type = defaultdict(list) averages_by_block_type = defaultdict(list) @@ -29,7 +29,8 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, gt_blocks = json.loads(sample["gt_blocks"]) doc_type = sample["classification"] try: - scores = html_func(model_dict, sample, **kwargs) + gt_html = [block["html"] for block in gt_blocks] + scores = score_func(model_dict, sample, gt_html, **kwargs) except ValueError as e: print(f"Error with sample {idx}: {e}") continue @@ -40,10 +41,13 @@ def get_method_scores(ds, model_dict, max_rows=None, html_func=marker_html_func, bench_scores[idx] = scores + avg_time = sum([bench_scores[k]["time"] for k in bench_scores]) / len(bench_scores) return { "raw_scores": bench_scores, "averages_by_type": averages_by_type, - "averages_by_block_type": averages_by_block_type + "averages_by_block_type": averages_by_block_type, + "average_time": avg_time, + "average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores) } def print_scores(scores: FullResult, method: str): @@ -73,11 +77,13 @@ def print_scores(scores: FullResult, method: str): @click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. 
Possible values: mathpix", default="") @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") +@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.") def main( dataset: str, other_methods: str, result_path: str, - max_rows: int + max_rows: int, + use_llm: bool ): allowed_methods = ["mathpix", ""] methods = other_methods.split(",") @@ -88,14 +94,14 @@ def main( model_dict = create_model_dict() ds = datasets.load_dataset(dataset, split="train") - marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows) + marker_scores = get_method_scores(ds, model_dict, max_rows=max_rows, use_llm=use_llm) all_scores = { "marker": marker_scores } if "mathpix" in methods: mathpix_ds = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train") - mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, html_func=mathpix_html_func, mathpix_ds=mathpix_ds) + mathpix_scores = get_method_scores(ds, model_dict, max_rows=max_rows, score_func=mathpix_scoring_func, mathpix_ds=mathpix_ds) all_scores["mathpix"] = mathpix_scores for k,v in all_scores.items(): @@ -103,8 +109,8 @@ def main( out_path = Path(result_path) out_path.mkdir(parents=True, exist_ok=True) - with open(out_path / "overall.json", "w") as f: - json.dump(all_scores, f, indent=2) + with open(out_path / "overall.json", "w", encoding="utf-8") as f: + json.dump(all_scores, f, indent=2, ensure_ascii=False) print(f"Results saved to {out_path}.") diff --git a/benchmarks/overall/schema.py b/benchmarks/overall/schema.py index 98ffc1b8..8af5bf28 100644 --- a/benchmarks/overall/schema.py +++ b/benchmarks/overall/schema.py @@ -1,4 +1,4 @@ -from typing import TypedDict, List, Dict +from typing import TypedDict, List, Dict, Optional class BlockScores(TypedDict): @@ -7,9 +7,12 @@ class BlockScores(TypedDict): gt: List[str] method: str overall_score: float + time: Optional[float] class FullResult(TypedDict): raw_scores: Dict[int, BlockScores] averages_by_type: Dict[str, List[float]] averages_by_block_type: Dict[str, List[float]] + average_time: float + average_score: float diff --git a/benchmarks/overall/scoring.py b/benchmarks/overall/scoring.py index 1ba78bc9..713e5fef 100644 --- a/benchmarks/overall/scoring.py +++ b/benchmarks/overall/scoring.py @@ -12,6 +12,9 @@ def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: concordant = 0 discordant = 0 + if n <= 1: + return 100 + for i in range(n): for j in range(i + 1, n): correct_sign = correct_order[i] - correct_order[j] @@ -61,18 +64,27 @@ def convert_to_md(html): return markdown def standardize_markdown(markdown): + # Replace math expressions pattern = r'(?", "\n") markdown = re.sub(r"(.*?)", r"\1", markdown) markdown = re.sub(r"(.*?)", r"\1", markdown) + markdown = re.sub(r"(.*?)", r"\1", markdown) # Remove span tags and keep content + # Clean up markdown markdown = re.sub(r"\s+", " ", markdown) markdown = re.sub(r"\n+", "\n", markdown) markdown = re.sub("\\.+", ".", markdown) # Replace repeated periods with a single period, like in table of contents markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header markdown = re.sub(r"\$", "", markdown) # Remove equation delimiters + markdown = markdown.encode().decode('unicode-escape') # Decode unicode characters properly return markdown.strip().lower() @@ -116,10 
+128,14 @@ def score_blocks(gt_html, method_html, convert=True) -> BlockScores:
     gt = [standardize_markdown(convert_to_md(gt)) for gt in gt_html]
     alignments = find_fuzzy_alignments(method_html, gt)
     scores = [alignment["score"] for alignment in alignments]
+
+    # Find order score
     orders = [alignment["start"] for alignment in alignments]
-    correct_order = range(len(gt))
+    correct_order = list(range(len(gt)))
     actual_order = sorted(range(len(gt)), key=lambda x: orders[x])
     order_score = kendall_tau(correct_order, actual_order)
+
+    # Weight score by sequence length
     gt_weights = [len(g) for g in gt]
     weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]

@@ -131,5 +147,6 @@
         "order_score": order_score,
         "gt": gt,
         "method": method_html,
-        "overall_score": overall_score
+        "overall_score": overall_score,
+        "time": None
     }
\ No newline at end of file
diff --git a/benchmarks/table/table.py b/benchmarks/table/table.py
index 448e32fe..130f862d 100644
--- a/benchmarks/table/table.py
+++ b/benchmarks/table/table.py
@@ -222,9 +222,9 @@ def main(
         "gemini": gemini_results
     }

-    out_path = Path(result_path) / "table.json"
+    out_path = Path(result_path)
     out_path.mkdir(parents=True, exist_ok=True)
-    with open(out_path, "w+") as f:
+    with open(out_path / "table.json", "w+") as f:
         json.dump(results, f, indent=2)

     print(f"Results saved to {out_path}.")
diff --git a/benchmarks/verify_scores.py b/benchmarks/verify_scores.py
index 913081e9..1ce2fa07 100644
--- a/benchmarks/verify_scores.py
+++ b/benchmarks/verify_scores.py
@@ -6,11 +6,9 @@ def verify_scores(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)

-    multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
-    switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-
-    if multicolcnn_score <= 0.34 or switch_trans_score <= 0.40:
-        raise ValueError("One or more scores are below the required threshold of 0.4")
+    marker_score = data["marker"]["average_score"]
+    if marker_score < 90:
+        raise ValueError("Marker score below 90")


 def verify_table_scores(file_path):
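
For reference, here is a minimal sketch (not part of the patch) of how the new verification gate fits the reworked output: it builds a throwaway results dict in the shape that get_method_scores() now returns in overall.py, writes it to a temporary JSON file, and applies the same threshold check that verify_scores.py runs for --type marker. The helper name check_marker_score and all of the numbers below are illustrative assumptions, not code or values from the repository.

import json
import tempfile

def check_marker_score(file_path, threshold=90):
    # Mirrors the check in the updated verify_scores(): load the results file
    # and fail if the aggregate marker score is under the threshold.
    with open(file_path, "r") as f:
        data = json.load(f)
    marker_score = data["marker"]["average_score"]
    if marker_score < threshold:
        raise ValueError(f"Marker score {marker_score} is below {threshold}")

# Toy results in the shape produced by get_method_scores() (values made up).
fake_results = {
    "marker": {
        "raw_scores": {},
        "averages_by_type": {},
        "averages_by_block_type": {},
        "average_time": 3.2,    # seconds per sample, illustrative only
        "average_score": 94.5,  # 0-100 fuzzy-alignment score, illustrative only
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(fake_results, f)
    result_file = f.name

check_marker_score(result_file)  # passes; drop average_score below 90 to see it raise

In CI, the equivalent check runs via poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/overall.json --type marker, as added to the workflow above.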