Skip to content

Commit

Permalink
Add test for partial row splitting
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 29, 2025
1 parent 597db72 commit 1a9734f
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
3 changes: 2 additions & 1 deletion marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ def split_combined_rows(self, tables: List[TableResult]):
new_cell_count += 1

# For each new row we add, shift up subsequent rows
shift_up += line_lens[0] - 1
# The max is to account for partial rows
shift_up += max(line_lens) - 1
else:
for cell in row_cells:
cell.row_id += shift_up
Expand Down
14 changes: 14 additions & 0 deletions tests/processors/test_table_processor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

import pytest
from marker.renderers.json import JSONRenderer

Expand Down Expand Up @@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
table_output = renderer(pdf_document)
assert "1.2E-38" in table_output.markdown


@pytest.mark.config({"page_range": [11]})
def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
    """Verify the row count of the last table after table processing.

    Runs ``TableProcessor`` over ``pdf_document`` (restricted by the
    ``page_range`` config marker above) and checks that the cells of the
    final ``Table`` block carry exactly six distinct ``row_id`` values.
    """
    # The processor is called for its effect on the document; its return
    # value is not used.
    table_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
    table_processor(pdf_document)

    # Only the last table block found in the document is inspected.
    last_table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
    table_cells: List[TableCell] = last_table.contained_blocks(pdf_document, (BlockTypes.TableCell,))

    # Count rows by collecting the distinct row ids across all cells.
    distinct_row_ids = {table_cell.row_id for table_cell in table_cells}
    assert len(distinct_row_ids) == 6


0 comments on commit 1a9734f

Please sign in to comment.