datalab-to · VikParuchuri · Jun 2, 2025 · May 27, 2025 · May 27, 2025 · May 27, 2025
diff --git a/README.md b/README.md
@@ -113,7 +113,6 @@ Options:
 - `--debug`: Enable debug mode for additional logging and diagnostic information.
 - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
 - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
-- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
 - `config --help`: List all available builders, processors, and converters, and their associated configuration.  These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
 - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`.  The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
 - `--llm_service`: Which llm service to use if `--use_llm` is passed.  This defaults to `marker.services.gemini.GoogleGeminiService`.

diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -31,6 +31,14 @@ class LayoutBuilder(BaseBuilder):
         bool,
         "Disable tqdm progress bars.",
     ] = False
+    expand_block_types: Annotated[
+        List[BlockTypes],
+        "Block types whose bounds should be expanded to accomodate missing regions"
+    ] = [BlockTypes.Picture, BlockTypes.Figure, BlockTypes.ComplexRegion] # Does not include groups since they are only injected later
+    max_expand_frac: Annotated[
+        float,
+        "The maximum fraction to expand the layout box bounds by"
+    ] = 0.05
 
     def __init__(self, layout_model: LayoutPredictor, config=None):
         self.layout_model = layout_model
@@ -44,6 +52,7 @@ def __call__(self, document: Document, provider: PdfProvider):
         else:
             layout_results = self.surya_layout(document.pages)
         self.add_blocks_to_pages(document.pages, layout_results)
+        self.expand_layout_blocks(document)
 
     def get_batch_size(self):
         if self.layout_batch_size is not None:
@@ -79,6 +88,28 @@ def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
         )
         return layout_results
 
+    def expand_layout_blocks(self, document: Document):
+        """Grow blocks of the types in expand_block_types into nearby free space, by at most max_expand_frac."""
+        for page in document.pages:
+            # Fetch every block listed in page.structure once so each candidate can be compared with the others
+            page_blocks = [document.get_block(bid) for bid in page.structure]
+
+            for block_id in page.structure:
+                block = document.get_block(block_id)
+                if block.block_type in self.expand_block_types:
+                    other_blocks = [b for b in page_blocks if b != block]
+                    if not other_blocks:
+                        block.polygon = block.polygon.expand(self.max_expand_frac, self.max_expand_frac)
+                        continue
+
+                    min_gap = min(block.polygon.minimum_gap(other.polygon) for other in other_blocks)
+                    if min_gap <= 0:
+                        continue
+
+                    # Express the gap as a fraction of each dimension, capped so a distant neighbour cannot cause a huge expansion
+                    x_expand_frac = min(self.max_expand_frac, min_gap / block.polygon.width) if block.polygon.width > 0 else 0
+                    y_expand_frac = min(self.max_expand_frac, min_gap / block.polygon.height) if block.polygon.height > 0 else 0
+                    block.polygon = block.polygon.expand(x_expand_frac, y_expand_frac)
+
     def add_blocks_to_pages(
         self, pages: List[PageGroup], layout_results: List[LayoutResult]
     ):

diff --git a/marker/builders/line.py b/marker/builders/line.py
@@ -210,16 +210,17 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
                 document_page.text_extraction_method = "pdftext"
 
                 # Add in the provider lines - merge ones that get broken by pdftext
-                merged_provider_lines = self.merge_provider_lines_detected_lines(
-                    provider_lines, detection_boxes, image_size, page_size
+                # Also track the lines that were detected but have no provider overlaps
+                merged_provider_lines, detected_only_lines = self.merge_provider_lines_detected_lines(
+                    provider_lines, detection_boxes, image_size, page_size, document_page.page_id
                 )
 
                 # If fixing lines, mark every line to be passed to the OCR model
                 for provider_line in merged_provider_lines:
                     provider_line.line.text_extraction_method = (
-                        "surya" if self.format_lines else "pdftext"
+                        "hybrid" if self.format_lines else "pdftext"
                     )
-                page_lines[document_page.page_id] = merged_provider_lines
+                page_lines[document_page.page_id] = merged_provider_lines + detected_only_lines
             else:
                 document_page.text_extraction_method = "surya"
                 boxes_to_ocr[document_page.page_id].extend(detection_boxes)
@@ -381,7 +382,7 @@ def merge_blocks(
             # Text extraction method is overridden later for OCRed documents
             document_page.merge_blocks(
                 merged_lines,
-                text_extraction_method="pdftext",
+                text_extraction_method="pdftext" if provider_lines else "surya",
                 keep_chars=self.keep_chars,
             )
 
@@ -391,10 +392,31 @@ def merge_provider_lines_detected_lines(
         text_lines: List[PolygonBox],
         image_size,
         page_size,
+        page_id,
     ):
-        # When provider lines is empty or no lines detected, return provider lines
-        if not provider_lines or not text_lines:
-            return provider_lines
+        # If no lines detected, skip the merging
+        if not text_lines:
+            return provider_lines, []
+
+        # If no provider lines, there is nothing to merge: return every detected line as a surya-only line
+        if not provider_lines:
+            detected_only_lines = []
+            LineClass: Line = get_block_class(BlockTypes.Line)
+            for text_line in text_lines:
+                text_line_polygon = PolygonBox(polygon=text_line.polygon).rescale(image_size, page_size)
+                detected_only_lines.append(
+                    ProviderOutput(
+                        line=LineClass(
+                            polygon=text_line_polygon,
+                            page_id=page_id,
+                            text_extraction_method="surya"
+                        ),
+                        spans=[],
+                        chars=[]
+                    )
+                )
+            # out_provider_lines is only assigned further down, so return an explicit empty list on this path
+            return [], detected_only_lines
 
         out_provider_lines = []
         horizontal_provider_lines = []
@@ -414,7 +436,7 @@ def merge_provider_lines_detected_lines(
         ]
 
         overlaps = matrix_intersection_area(provider_line_boxes, detected_line_boxes)
-
+        
         # Find potential merges
         merge_lines = defaultdict(list)
         for i in range(len(provider_line_boxes)):
@@ -478,7 +500,16 @@ def bbox_for_merge_section(
         ):
             # Don't just take the whole detected line if we have multiple sections inside
             if len(all_merge_sections) == 1:
-                return text_line.rescale(image_size, page_size)
+                # This covers the special case where multiple detected lines fall under the same provider line
+                # This happens sometimes since provider lines are long, and will merge across whitespace
+                # We merge all such detected lines into a single polygon before assigning it to the provider line
+                idx = merge_section[0]
+                overlap_idxs = np.nonzero(overlaps[idx])[0]
+                # Account for lines that overlap, but have been assigned to a different provider line already
+                lines = [text_line] + [text_lines[i] for i in overlap_idxs if i not in merge_lines]
+
+                merged = lines[0] if len(lines) == 1 else lines[0].merge(lines[1:])
+                return merged.rescale(image_size, page_size)
             else:
                 poly = None
                 for section_idx in merge_section:
@@ -532,4 +563,23 @@ def bbox_for_merge_section(
         # Sort to preserve original order
         out_provider_lines = sorted(out_provider_lines, key=lambda x: x[0])
         out_provider_lines = [p for _, p in out_provider_lines]
-        return out_provider_lines
+
+        # Detected lines that do not overlap with any provider line should be output as-is
+        detected_only_lines = []
+        LineClass: Line = get_block_class(BlockTypes.Line)
+        for j in range(len(detected_line_boxes)):
+            if np.max(overlaps[:, j]) == 0:
+                detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j])
+                detected_only_lines.append(
+                    ProviderOutput(
+                        line=LineClass(
+                            polygon=detected_line_polygon,
+                            page_id=page_id,
+                            text_extraction_method="surya",
+                        ),
+                        spans=[],
+                        chars=[],
+                    )
+                )
+
+        return out_provider_lines, detected_only_lines
diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py
@@ -35,11 +35,12 @@ class OcrBuilder(BaseBuilder):
         bool,
         "Disable tqdm progress bars.",
     ] = False
+    # We can skip tables here, since the TableProcessor will re-OCR
     skip_ocr_blocks: Annotated[
         List[BlockTypes],
         "Blocktypes for which contained lines are not processed by the OCR model"
-        "By default, this avoids recognizing lines inside equations",
-    ] = BlockTypes.Equation
+        "By default, this avoids recognizing lines inside equations, figures, and pictures",
+    ] = [BlockTypes.Equation, BlockTypes.Figure, BlockTypes.FigureGroup, BlockTypes.Picture, BlockTypes.PictureGroup, BlockTypes.Table]
     ocr_task_name: Annotated[
         str,
         "The OCR mode to use, see surya for details.  Set to 'ocr_without_boxes' for potentially better performance, at the expense of formatting.",
@@ -99,7 +100,7 @@ def get_ocr_images_polygons_ids(
                 block_lines_to_ocr = [
                     block_line
                     for block_line in block_lines
-                    if block_line.text_extraction_method == "surya"
+                    if block_line.text_extraction_method in ["surya", "hybrid"]
                 ]
 
                 # Set extraction method of OCR-only pages

diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -12,7 +12,7 @@
 from marker.schema.blocks.base import BlockMetadata
 from marker.schema.groups.base import Group
 from marker.schema.polygon import PolygonBox
-from marker.util import matrix_intersection_area
+from marker.util import matrix_intersection_area, sort_text_lines
 
 LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]]
 
@@ -244,9 +244,19 @@ def add_initial_blocks(
     ):
         # Add lines to the proper blocks, sorted in order
         for block_id, lines in block_lines.items():
-            lines = sorted(lines, key=lambda x: x[0])
+            line_extraction_methods = set([l[1].line.text_extraction_method for l in lines])
+            if len(line_extraction_methods) == 1:
+                lines = sorted(lines, key=lambda x: x[0])
+                lines = [l for _, l in lines]
+            else:
+                lines = [l for _, l in lines]
+                line_polygons = [l.line.polygon for l in lines]
+                sorted_line_polygons = sort_text_lines(line_polygons)
+                argsort = [line_polygons.index(p) for p in sorted_line_polygons]
+                lines = [lines[i] for i in argsort]
+
             block = self.get_block(block_id)
-            for line_idx, provider_output in lines:
+            for provider_output in lines:
                 line = provider_output.line
                 spans = provider_output.spans
                 self.add_full_block(line)

diff --git a/tests/builders/test_line_builder.py b/tests/builders/test_line_builder.py
@@ -0,0 +1,24 @@
+import pytest
+
+from marker.schema import BlockTypes
+
+# Page contains provider lines that are longer than detected lines
+# Any bad merging will cause broken final OCR results with format lines
+@pytest.mark.filename("mixed_eng_hindi.pdf")
+@pytest.mark.config({"page_range": [2], "format_lines": True})
+def test_provider_detected_line_merge(pdf_document):
+    page = pdf_document.pages[0]
+    text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
+
+    # This count includes detected lines merged in with provider lines
+    assert len(text_lines) == 83
+
+# The provider lines on this page only contain English; the Hindi text is missing
+# format_lines should fill in the missing lines
+@pytest.mark.filename("mixed_eng_hindi.pdf")
+@pytest.mark.config({"page_range": [0], "format_lines": True})
+def test_fill_missing_provider_lines(pdf_document):
+    page = pdf_document.pages[0]
+    raw_text = page.raw_text(pdf_document)
+    assert "प्राधिकार से प्रकाशित" in raw_text
+    assert "खान मंत्रालय" in raw_text