From 1a9734f4f78d80080a9af31ad1b131a9aa0fdd0f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 09:55:55 -0500
Subject: [PATCH] Add test for partial row splitting

---
Notes (ignored by `git am`):
  * marker/processors/table.py: in split_combined_rows, shift_up now grows by
    max(line_lens) - 1 for a split row instead of line_lens[0] - 1.
  * tests/processors/test_table_processor.py: test_split_rows runs
    TableProcessor with page_range [11], takes the last Table block, and
    asserts that its cells carry 6 distinct row_id values.
  * NOTE(review): indentation of the hunk lines was reconstructed from the
    nesting; confirm it against the target files before applying.

 marker/processors/table.py               |  3 ++-
 tests/processors/test_table_processor.py | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/marker/processors/table.py b/marker/processors/table.py
index 1f455848..783a73ce 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -226,7 +226,8 @@ def split_combined_rows(self, tables: List[TableResult]):
                             new_cell_count += 1
 
                     # For each new row we add, shift up subsequent rows
-                    shift_up += line_lens[0] - 1
+                    # Use the max line count (not the first cell's) to account for partial rows
+                    shift_up += max(line_lens) - 1
                 else:
                     for cell in row_cells:
                         cell.row_id += shift_up
diff --git a/tests/processors/test_table_processor.py b/tests/processors/test_table_processor.py
index 79224a58..72e2a04b 100644
--- a/tests/processors/test_table_processor.py
+++ b/tests/processors/test_table_processor.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 
 from marker.renderers.json import JSONRenderer
@@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
     table_output = renderer(pdf_document)
     assert "1.2E-38" in table_output.markdown
 
+
+@pytest.mark.config({"page_range": [11]})
+def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+
+    table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
+    cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
+    unique_rows = len(set([cell.row_id for cell in cells]))
+    assert unique_rows == 6
+
+