Format Lines: Fix line merging by tarun-menta · Pull Request #712 · datalab-to/marker · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Format Lines: Fix line merging #712

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ Options:
- `--debug`: Enable debug mode for additional logging and diagnostic information.
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
- `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`.
Expand Down
31 changes: 31 additions & 0 deletions marker/builders/layout.py
8000
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ class LayoutBuilder(BaseBuilder):
bool,
"Disable tqdm progress bars.",
] = False
expand_block_types: Annotated[
List[BlockTypes],
"Block types whose bounds should be expanded to accomodate missing regions"
] = [BlockTypes.Picture, BlockTypes.Figure, BlockTypes.ComplexRegion] # Does not include groups since they are only injected later
max_expand_frac: Annotated[
float,
"The maximum fraction to expand the layout box bounds by"
] = 0.05

def __init__(self, layout_model: LayoutPredictor, config=None):
self.layout_model = layout_model
Expand All @@ -44,6 +52,7 @@ def __call__(self, document: Document, provider: PdfProvider):
else:
layout_results = self.surya_layout(document.pages)
self.add_blocks_to_pages(document.pages, layout_results)
self.expand_layout_blocks(document)

def get_batch_size(self):
if self.layout_batch_size is not None:
Expand Down Expand Up @@ -79,6 +88,28 @@ def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
)
return layout_results

def expand_layout_blocks(self, document: Document):
    """
    Grow selected layout blocks into the empty space that surrounds them.

    Only blocks whose ``block_type`` is listed in ``self.expand_block_types``
    are modified; their ``polygon`` is replaced by the result of
    ``polygon.expand(x_frac, y_frac)``.

    * A block that is alone on its page is expanded by ``self.max_expand_frac``
      on both axes.
    * Otherwise the smallest ``minimum_gap`` to any other block on the page is
      converted into a fraction of the block's width and height. Each fraction
      is clamped to ``self.max_expand_frac`` so that an isolated block cannot
      grow without bound.
    * A block that already touches or overlaps a neighbour (gap <= 0) is left
      unchanged.

    Args:
        document: Document whose page blocks are updated in place.
    """
    for page in document.pages:
        # Tolerate a page without any structure instead of failing on iteration
        block_ids = page.structure or []

        # Collect all blocks on this page once, for the neighbour gap checks
        page_blocks = [document.get_block(bid) for bid in block_ids]

        for block_id in block_ids:
            block = document.get_block(block_id)
            if block.block_type not in self.expand_block_types:
                continue

            other_blocks = [b for b in page_blocks if b != block]
            if not other_blocks:
                # No neighbours to run into: use the full configured allowance
                block.polygon = block.polygon.expand(
                    self.max_expand_frac, self.max_expand_frac
                )
                continue

            min_gap = min(
                block.polygon.minimum_gap(other.polygon) for other in other_blocks
            )
            if min_gap <= 0:
                continue

            width = block.polygon.width
            height = block.polygon.height

            # Express the free gap relative to the block size, but never exceed
            # the configured maximum expansion fraction on either axis
            x_expand_frac = (
                min(min_gap / width, self.max_expand_frac) if width > 0 else 0
            )
            y_expand_frac = (
                min(min_gap / height, self.max_expand_frac) if height > 0 else 0
            )

            block.polygon = block.polygon.expand(x_expand_frac, y_expand_frac)

def add_blocks_to_pages(
self, pages: List[PageGroup], layout_results: List[LayoutResult]
):
Expand Down
72 changes: 61 additions & 11 deletions marker/builders/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,16 +210,17 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
document_page.text_extraction_method = "pdftext"

# Add in the provider lines - merge ones that get broken by pdftext
merged_provider_lines = self.merge_provider_lines_detected_lines(
provider_lines, detection_boxes, image_size, page_size
# Also track the lines that were detected but have no provider overlaps
merged_provider_lines, detected_only_lines = self.merge_provider_lines_detected_lines(
provider_lines, detection_boxes, image_size, page_size, document_page.page_id
)

# If fixing lines, mark every line to be passed to the OCR model
for provider_line in merged_provider_lines:
provider_line.line.text_extraction_method = (
"surya" if self.format_lines else "pdftext"
"hybrid" if self.format_lines else "pdftext"
)
page_lines[document_page.page_id] = merged_provider_lines
page_lines[document_page.page_id] = merged_provider_lines + detected_only_lines
else:
document_page.text_extraction_method = "surya"
boxes_to_ocr[document_page.page_id].extend(detection_boxes)
Expand Down Expand Up @@ -381,7 +382,7 @@ def merge_blocks(
# Text extraction method is overridden later for OCRed documents
document_page.merge_blocks(
merged_lines,
text_extraction_method="pdftext",
text_extraction_method="pdftext" if provider_lines else "surya",
keep_chars=self.keep_chars,
)

Expand All @@ -391,10 +392,31 @@ def merge_provider_lines_detected_lines(
text_lines: List[PolygonBox],
image_size,
page_size,
page_id,
):
# When provider lines is empty or no lines detected, return provider lines
if not provider_lines or not text_lines:
return provider_lines
# If no lines detected, skip the merging
if not text_lines:
return provider_lines, []

# If no provider lines, return all detected text lines
if not provider_lines:
detected_only_lines = []
LineClass: Line = get_block_class(BlockTypes.Line)
for text_line in text_lines:
text_line_polygon = PolygonBox(polygon=text_line.polygon).rescale(image_size, page_size)
detected_only_lines.append(
ProviderOutput(
line=LineClass(
polygon=text_line_polygon,
page_id=page_id,
text_extraction_method="surya"
),
spans=[],
chars=[]
)
)

return out_provider_lines, detected_only_lines

out_provider_lines = []
horizontal_provider_lines = []
Expand All @@ -414,7 +436,7 @@ def merge_provider_lines_detected_lines(
]

overlaps = matrix_intersection_area(provider_line_boxes, detected_line_boxes)

# Find potential merges
merge_lines = defaultdict(list)
for i in range(len(provider_line_boxes)):
Expand Down Expand Up @@ -478,7 +500,16 @@ def bbox_for_merge_section(
):
# Don't just take the whole detected line if we have multiple sections inside
if len(all_merge_sections) == 1:
return text_line.rescale(image_size, page_size)
# This is to cover for the special case where multiple detected lines fall under the same provider line
# This happens sometimes since providers lines are long, and will merge across whitespace
# We merge in all detected lines into a single polygon before assigning to the provider line
idx = merge_section[0]
overlap_idxs = np.nonzero(overlaps[idx])[0]
# Account for lines that overlap, but have been assigned to a different provider line already
lines = [text_line] + [text_lines[i] for i in overlap_idxs if i not in merge_lines]

merged = lines[0] if len(lines) == 1 else lines[0].merge(lines[1:])
return merged.rescale(image_size, page_size)
else:
poly = None
for section_idx in merge_section:
Expand Down Expand Up @@ -532,4 +563,23 @@ def bbox_for_merge_section(
# Sort to preserve original order
out_provider_lines = sorted(out_provider_lines, key=lambda x: x[0])
out_provider_lines = [p for _, p in out_provider_lines]
return out_provider_lines

# Detected lines that do not overlap with any provider lines should be output as-is
detected_only_lines = []
LineClass: Line = get_block_class(BlockTypes.Line)
for j in range(len(detected_line_boxes)):
if np.max(overlaps[:, j]) == 0:
detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j])
detected_only_lines.append(
ProviderOutput(
line=LineClass(
polygon=detected_line_polygon,
page_id=page_id,
text_extraction_method="surya",
),
spans=[],
chars=[],
)
)

return out_provider_lines, detected_only_lines
7 changes: 4 additions & 3 deletions marker/builders/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,12 @@ class OcrBuilder(BaseBuilder):
bool,
"Disable tqdm progress bars.",
] = False
# We can skip tables here, since the TableProcessor will re-OCR
skip_ocr_blocks: Annotated[
List[BlockTypes],
"Blocktypes for which contained lines are not processed by the OCR model"
"By default, this avoids recognizing lines inside equations",
] = BlockTypes.Equation
"By default, this avoids recognizing lines inside equations, figures, and pictures",
] = [BlockTypes.Equation, BlockTypes.Figure, BlockTypes.FigureGroup, BlockTypes.Picture, BlockTypes.PictureGroup, BlockTypes.Table]
ocr_task_name: Annotated[
str,
"The OCR mode to use, see surya for details. Set to 'ocr_without_boxes' for potentially better performance, at the expense of formatting.",
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_ocr_images_polygons_ids(
block_lines_to_ocr = [
block_line
for block_line in block_lines
if block_line.text_extraction_method == "surya"
if block_line.text_extraction_method in ["surya", "hybrid"]
]

# Set extraction method of OCR-only pages
Expand Down
16 changes: 13 additions & 3 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from marker.schema.blocks.base import BlockMetadata
from marker.schema.groups.base import Group
from marker.schema.polygon import PolygonBox
from marker.util import matrix_intersection_area
from marker.util import matrix_intersection_area, sort_text_lines

LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]]

Expand Down Expand Up @@ -244,9 +244,19 @@ def add_initial_blocks(
):
# Add lines to the proper blocks, sorted in order
for block_id, lines in block_lines.items():
lines = sorted(lines, key=lambda x: x[0])
line_extraction_methods = set([l[1].line.text_extraction_method for l in lines])
if len(line_extraction_methods) == 1:
lines = sorted(lines, key=lambda x: x[0])
lines = [l for _, l in lines]
else:
lines = [l for _, l in lines]
line_polygons = [l.line.polygon for l in lines]
sorted_line_polygons = sort_text_lines(line_polygons)
argsort = [line_polygons.index(p) for p in sorted_line_polygons]
lines = [lines[i] for i in argsort]

block = self.get_block(block_id)
for line_idx, provider_output in lines:
for provider_output in lines:
line = provider_output.line
spans = provider_output.spans
self.add_full_block(line)
Expand Down
24 changes: 24 additions & 0 deletions tests/builders/test_line_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pytest

from marker.schema import BlockTypes

# On this page the provider lines are longer than the detected lines, so a
# faulty merge breaks the final OCR output when format_lines is enabled
@pytest.mark.filename("mixed_eng_hindi.pdf")
@pytest.mark.config({"page_range": [2], "format_lines": True})
def test_provider_detected_line_merge(pdf_document):
    """Check the number of Line blocks on the page after line merging."""
    first_page = pdf_document.pages[0]
    line_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    line_count = len(line_blocks)

    # Detected lines that were merged into provider lines count towards the total
    assert line_count == 83

# The provider lines for this page hold only the English text and miss the
# Hindi text; format_lines is expected to fill in the missing lines
@pytest.mark.filename("mixed_eng_hindi.pdf")
@pytest.mark.config({"page_range": [0], "format_lines": True})
def test_fill_missing_provider_lines(pdf_document):
    """Check that Hindi text absent from the provider lines is in the raw text."""
    page_text = pdf_document.pages[0].raw_text(pdf_document)
    for expected_phrase in ("प्राधिकार से प्रकाशित", "खान मंत्रालय"):
        assert expected_phrase in page_text
0