From 9474a4169fe7a484364311aeaea63573b635d451 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Tue, 27 May 2025 13:06:58 -0400 Subject: [PATCH 01/13] Merge missed text detection boxes in format lines When merging provider and detection lines, some boxes may be missing, but the layout check fails. This catches and merges in these boxes too. --- marker/builders/line.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 53b0f45b..3583111f 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -211,7 +211,7 @@ def get_all_lines(self, document: Document, provider: PdfProvider): # Add in the provider lines - merge ones that get broken by pdftext merged_provider_lines = self.merge_provider_lines_detected_lines( - provider_lines, detection_boxes, image_size, page_size + provider_lines, detection_boxes, image_size, page_size, document_page.page_id ) # If fixing lines, mark every line to be passed to the OCR model @@ -391,6 +391,7 @@ def merge_provider_lines_detected_lines( text_lines: List[PolygonBox], image_size, page_size, + page_id, ): # When provider lines is empty or no lines detected, return provider lines if not provider_lines or not text_lines: @@ -414,7 +415,7 @@ def merge_provider_lines_detected_lines( ] overlaps = matrix_intersection_area(provider_line_boxes, detected_line_boxes) - + # Find potential merges merge_lines = defaultdict(list) for i in range(len(provider_line_boxes)): @@ -532,4 +533,22 @@ def bbox_for_merge_section( # Sort to preserve original order out_provider_lines = sorted(out_provider_lines, key=lambda x: x[0]) out_provider_lines = [p for _, p in out_provider_lines] + + # Detected lines that do not overlap with any provider lines shoudl be outputted as-is + LineClass: Line = get_block_class(BlockTypes.Line) + for j in range(len(detected_line_boxes)): + if np.max(overlaps[:, j]) == 0: + detected_line_polygon = 
PolygonBox.from_bbox(detected_line_boxes[j]) + out_provider_lines.append( + ProviderOutput( + line=LineClass( + polygon=detected_line_polygon, + page_id=page_id, + text_extraction_method="surya", + ), + spans=[], + chars=[], + ) + ) + return out_provider_lines From af6a381abf572768fc1d8c7e971e456b7696abc3 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Tue, 27 May 2025 13:24:11 -0400 Subject: [PATCH 02/13] Avoid OCR in certain block types --- marker/builders/ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index df680ef3..1278beb1 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -39,7 +39,7 @@ class OcrBuilder(BaseBuilder): List[BlockTypes], "Blocktypes for which contained lines are not processed by the OCR model" "By default, this avoids recognizing lines inside equations", - ] = BlockTypes.Equation + ] = [BlockTypes.Equation, BlockTypes.Figure, BlockTypes.FigureGroup, BlockTypes.Picture, BlockTypes.PictureGroup, BlockTypes.Table] ocr_task_name: Annotated[ str, "The OCR mode to use, see surya for details. 
Set to 'ocr_without_boxes' for potentially better performance, at the expense of formatting.", From a2ecb6bfa72ede66da920fdf37ca835af8f98eef Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Tue, 27 May 2025 13:25:42 -0400 Subject: [PATCH 03/13] Cleanup --- marker/builders/ocr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index 1278beb1..ec86322f 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -35,6 +35,7 @@ class OcrBuilder(BaseBuilder): bool, "Disable tqdm progress bars.", ] = False + # We can skip tables here, since the TableProcessor will re-OCR skip_ocr_blocks: Annotated[ List[BlockTypes], "Blocktypes for which contained lines are not processed by the OCR model" From 29bf761d8998a54a14603d6dddd0eaafde33565a Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Tue, 27 May 2025 13:38:55 -0400 Subject: [PATCH 04/13] Cleanup --- marker/builders/ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index ec86322f..c10b3369 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -39,7 +39,7 @@ class OcrBuilder(BaseBuilder): skip_ocr_blocks: Annotated[ List[BlockTypes], "Blocktypes for which contained lines are not processed by the OCR model" - "By default, this avoids recognizing lines inside equations", + "By default, this avoids recognizing lines inside equations, figures, and pictures", ] = [BlockTypes.Equation, BlockTypes.Figure, BlockTypes.FigureGroup, BlockTypes.Picture, BlockTypes.PictureGroup, BlockTypes.Table] ocr_task_name: Annotated[ str, From c60087f0b21639f662fd07d2b8c111d9753acab5 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Tue, 27 May 2025 15:17:24 -0400 Subject: [PATCH 05/13] Merge multiple detected lines into common provider line --- marker/builders/line.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 
3583111f..264e4dbc 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -479,7 +479,11 @@ def bbox_for_merge_section( ): # Don't just take the whole detected line if we have multiple sections inside if len(all_merge_sections) == 1: - return text_line.rescale(image_size, page_size) + text_line_overlaps = np.nonzero(overlaps[merge_section[0]])[0].tolist() + merged_text_line: PolygonBox = text_lines[text_line_overlaps[0]] + if len(text_line_overlaps) > 1: + merged_text_line = merged_text_line.merge([text_lines[k] for k in text_line_overlaps[1:]]) + return merged_text_line.rescale(image_size, page_size) else: poly = None for section_idx in merge_section: From 58fe85057ff04a5561bfb98854fb942dc72ec261 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 11:52:04 -0400 Subject: [PATCH 06/13] Add tests for line merging logic --- tests/builders/test_line_builder.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/builders/test_line_builder.py diff --git a/tests/builders/test_line_builder.py b/tests/builders/test_line_builder.py new file mode 100644 index 00000000..53584c8a --- /dev/null +++ b/tests/builders/test_line_builder.py @@ -0,0 +1,24 @@ +import pytest + +from marker.schema import BlockTypes + +# Page contains provider lines that are longer than detected lines +# Any bad merging will cause broken final OCR results with format lines +@pytest.mark.filename("mixed_eng_hindi.pdf") +@pytest.mark.config({"page_range": [2], "format_lines": True}) +def test_provider_detected_line_merge(pdf_document): + page = pdf_document.pages[0] + text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,)) + + # This count includes detected lines merged in with provider lines + assert len(text_lines) == 83 + +# Page provider lines only contain english, while the hindi is missing +# format_lines should fill in the missing lines +@pytest.mark.filename("mixed_eng_hindi.pdf") +@pytest.mark.config({"page_range": [0], 
"format_lines": True}) +def test_fill_missing_provider_lines(pdf_document): + page = pdf_document.pages[0] + raw_text = page.raw_text(pdf_document) + assert "प्राधिकार से प्रकाशित" in raw_text + assert "खान मंत्रालय" in raw_text \ No newline at end of file From 4a37789630422d5c57786b4a8924d1ac5cb706aa Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 12:48:35 -0400 Subject: [PATCH 07/13] Expand layout boxes for figures and pictures Avoid slight cutting off of the layout boxes --- marker/builders/layout.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/marker/builders/layout.py b/marker/builders/layout.py index 6976af0b..91b725f1 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -31,6 +31,14 @@ class LayoutBuilder(BaseBuilder): bool, "Disable tqdm progress bars.", ] = False + expand_block_types: Annotated[ + List[BlockTypes], + "Block types whose bounds should be expanded to accomodate missing regions" + ] = [BlockTypes.Picture, BlockTypes.Figure, BlockTypes.ComplexRegion] # Does not include groups since they are only injected later + expand_frac: Annotated[ + float, + "The fraction to expand the layout box bounds by" + ] = 0.02 def __init__(self, layout_model: LayoutPredictor, config=None): self.layout_model = layout_model @@ -96,6 +104,8 @@ def add_blocks_to_pages( layout_block.polygon = layout_block.polygon.rescale( layout_page_size, provider_page_size ) + if layout_block.block_type in self.expand_block_types: + layout_block.polygon = layout_block.polygon.expand(self.expand_frac, self.expand_frac) layout_block.top_k = { BlockTypes[label]: prob for (label, prob) in bbox.top_k.items() } From 713fff5fcbcd84552c85cf0483ef81a6aa633412 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 14:11:02 -0400 Subject: [PATCH 08/13] Better sorting of lines from mixed sources (pdftext + surya) In format lines mode, we include lines from surya which were not present in the provider lines. 
However, we do not have ordering of these relative to the provider lines. This commit identifies blocks which contain lines from both sources, and sort with a different method within those blocks (Unchanged for all other blocks) --- marker/builders/line.py | 14 ++++++++------ marker/builders/ocr.py | 2 +- marker/schema/groups/page.py | 16 +++++++++++++--- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 264e4dbc..cb4df216 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -210,16 +210,17 @@ def get_all_lines(self, document: Document, provider: PdfProvider): document_page.text_extraction_method = "pdftext" # Add in the provider lines - merge ones that get broken by pdftext - merged_provider_lines = self.merge_provider_lines_detected_lines( + # Also track the lines that were detected but have no provider overlaps + merged_provider_lines, detected_only_lines = self.merge_provider_lines_detected_lines( provider_lines, detection_boxes, image_size, page_size, document_page.page_id ) # If fixing lines, mark every line to be passed to the OCR model for provider_line in merged_provider_lines: provider_line.line.text_extraction_method = ( - "surya" if self.format_lines else "pdftext" + "hybrid" if self.format_lines else "pdftext" ) - page_lines[document_page.page_id] = merged_provider_lines + page_lines[document_page.page_id] = merged_provider_lines + detected_only_lines else: document_page.text_extraction_method = "surya" boxes_to_ocr[document_page.page_id].extend(detection_boxes) @@ -381,7 +382,7 @@ def merge_blocks( # Text extraction method is overridden later for OCRed documents document_page.merge_blocks( merged_lines, - text_extraction_method="pdftext", + text_extraction_method="pdftext" if provider_lines else "surya", keep_chars=self.keep_chars, ) @@ -539,11 +540,12 @@ def bbox_for_merge_section( out_provider_lines = [p for _, p in out_provider_lines] # Detected lines that do not 
overlap with any provider lines shoudl be outputted as-is + detected_only_lines = [] LineClass: Line = get_block_class(BlockTypes.Line) for j in range(len(detected_line_boxes)): if np.max(overlaps[:, j]) == 0: detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j]) - out_provider_lines.append( + detected_only_lines.append( ProviderOutput( line=LineClass( polygon=detected_line_polygon, @@ -555,4 +557,4 @@ def bbox_for_merge_section( ) ) - return out_provider_lines + return out_provider_lines, detected_only_lines \ No newline at end of file diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index c10b3369..debeb36b 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -100,7 +100,7 @@ def get_ocr_images_polygons_ids( block_lines_to_ocr = [ block_line for block_line in block_lines - if block_line.text_extraction_method == "surya" + if block_line.text_extraction_method in ["surya", "hybrid"] ] # Set extraction method of OCR-only pages diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 7d623bd8..7ea76773 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -12,7 +12,7 @@ from marker.schema.blocks.base import BlockMetadata from marker.schema.groups.base import Group from marker.schema.polygon import PolygonBox -from marker.util import matrix_intersection_area +from marker.util import matrix_intersection_area, sort_text_lines LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]] @@ -244,9 +244,19 @@ def add_initial_blocks( ): # Add lines to the proper blocks, sorted in order for block_id, lines in block_lines.items(): - lines = sorted(lines, key=lambda x: x[0]) + line_extraction_methods = set([l[1].line.text_extraction_method for l in lines]) + if len(line_extraction_methods) == 1: + lines = sorted(lines, key=lambda x: x[0]) + lines = [l for _, l in lines] + else: + lines = [l for _, l in lines] + line_polygons = [l.line.polygon for l in lines] + sorted_line_polygons = 
sort_text_lines(line_polygons) + argsort = [line_polygons.index(p) for p in sorted_line_polygons] + lines = [lines[i] for i in argsort] + block = self.get_block(block_id) - for line_idx, provider_output in lines: + for provider_output in lines: line = provider_output.line spans = provider_output.spans self.add_full_block(line) From 634242eead1debfc790f56173c75968dd2da390d Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 15:06:10 -0400 Subject: [PATCH 09/13] Bugfix - Fix return type when merging is skipped --- marker/builders/line.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index cb4df216..d6df9328 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -394,9 +394,29 @@ def merge_provider_lines_detected_lines( page_size, page_id, ): - # When provider lines is empty or no lines detected, return provider lines - if not provider_lines or not text_lines: - return provider_lines + # If no lines detected, skip the merging + if not text_lines: + return provider_lines, [] + + # If no provider lines, return all detected text lines + if not provider_lines: + detected_only_lines = [] + LineClass: Line = get_block_class(BlockTypes.Line) + for text_line in text_lines: + text_line_polygon = PolygonBox(polygon=text_line.polygon).rescale(image_size, page_size) + detected_only_lines.append( + ProviderOutput( + line=LineClass( + polygon=text_line_polygon, + page_id=page_id, + text_extraction_method="surya" + ), + spans=[], + chars=[] + ) + ) + + return out_provider_lines, detected_only_lines out_provider_lines = [] horizontal_provider_lines = [] From 262b48dc039230797480f60ad84453d1563d53d0 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 15:46:14 -0400 Subject: [PATCH 10/13] Fix merging logic When merging multiple detected lines into a single provider line, skip detected lines which have already been assigned to a different provider
line Was causing repeated text otherwise --- marker/builders/line.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index d6df9328..38ef5d84 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -500,11 +500,16 @@ def bbox_for_merge_section( ): # Don't just take the whole detected line if we have multiple sections inside if len(all_merge_sections) == 1: - text_line_overlaps = np.nonzero(overlaps[merge_section[0]])[0].tolist() - merged_text_line: PolygonBox = text_lines[text_line_overlaps[0]] - if len(text_line_overlaps) > 1: - merged_text_line = merged_text_line.merge([text_lines[k] for k in text_line_overlaps[1:]]) - return merged_text_line.rescale(image_size, page_size) + # This is to cover for the special case where multiple detected lines fall under the same provider line + # This happens sometimes since providers lines are long, and will merge across whitespace + # We merge in all detected lines into a single polygon before assigning to the provider line + idx = merge_section[0] + overlap_idxs = np.nonzero(overlaps[idx])[0] + # Account for lines that overlap, but have been assigned to a different provider line already + lines = [text_line] + [text_lines[i] for i in overlap_idxs if i not in merge_lines] + + merged = lines[0] if len(lines) == 1 else lines[0].merge(lines[1:]) + return merged.rescale(image_size, page_size) else: poly = None for section_idx in merge_section: From 933a3e3bc2322db59197a63c1f176a9ee7ef8c5d Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 15:54:30 -0400 Subject: [PATCH 11/13] Update README - Remove deprecated flag [no ci] --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 928600df..79c1663f 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,6 @@ Options: - `--debug`: Enable debug mode for additional logging and diagnostic information. 
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"` - `--config_json PATH`: Path to a JSON configuration file containing additional settings. -- `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German. - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults. - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables. - `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`. 
From b3ef1ca3801380f638d69c4bf370cd3a7b622ca9 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 17:56:04 -0400 Subject: [PATCH 12/13] Improve layout box expansion Ensure expansion doesn't cut into other layout blocks, still upper bounded by the max fraction --- marker/builders/layout.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/marker/builders/layout.py b/marker/builders/layout.py index 91b725f1..cdb5d137 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -35,10 +35,10 @@ class LayoutBuilder(BaseBuilder): List[BlockTypes], "Block types whose bounds should be expanded to accomodate missing regions" ] = [BlockTypes.Picture, BlockTypes.Figure, BlockTypes.ComplexRegion] # Does not include groups since they are only injected later - expand_frac: Annotated[ + max_expand_frac: Annotated[ float, - "The fraction to expand the layout box bounds by" - ] = 0.02 + "The maximum fraction to expand the layout box bounds by" + ] = 0.05 def __init__(self, layout_model: LayoutPredictor, config=None): self.layout_model = layout_model @@ -52,6 +52,7 @@ def __call__(self, document: Document, provider: PdfProvider): else: layout_results = self.surya_layout(document.pages) self.add_blocks_to_pages(document.pages, layout_results) + self.expand_layout_blocks(document) def get_batch_size(self): if self.layout_batch_size is not None: @@ -87,6 +88,31 @@ def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]: ) return layout_results + def expand_layout_blocks(self, document: Document): + for page in document.pages: + # Collect all blocks on this page as PolygonBox for easy access + page_blocks = [document.get_block(bid) for bid in page.structure] + + for block_id in page.structure: + block = document.get_block(block_id) + if block.block_type in self.expand_block_types: + other_blocks = [b for b in page_blocks if b != block] + if not other_blocks: + print(f'Expanding {block_id} by 
{(self.max_expand_frac, self.max_expand_frac)}') + block.polygon = block.polygon.expand(self.max_expand_frac, self.max_expand_frac) + continue + + min_gap = min(block.polygon.minimum_gap(other.polygon) for other in other_blocks) + if min_gap <= 0: + print(f'Cannot expand {block_id}') + continue + + x_expand_frac = min_gap / block.polygon.width if block.polygon.width > 0 else 0 + y_expand_frac = min_gap / block.polygon.height if block.polygon.height > 0 else 0 + + print(f'Expanding {block_id} by {(min(x_expand_frac, self.max_expand_frac), min(y_expand_frac, self.max_expand_frac))}') + block.polygon = block.polygon.expand(x_expand_frac, y_expand_frac) + def add_blocks_to_pages( self, pages: List[PageGroup], layout_results: List[LayoutResult] ): @@ -104,8 +130,6 @@ def add_blocks_to_pages( layout_block.polygon = layout_block.polygon.rescale( layout_page_size, provider_page_size ) - if layout_block.block_type in self.expand_block_types: - layout_block.polygon = layout_block.polygon.expand(self.expand_frac, self.expand_frac) layout_block.top_k = { BlockTypes[label]: prob for (label, prob) in bbox.top_k.items() } From 7c31c9f841bb042463d4e062c9b86d9e4d876e59 Mon Sep 17 00:00:00 2001 From: Tarun Menta Date: Fri, 30 May 2025 18:38:24 -0400 Subject: [PATCH 13/13] Cleanup --- marker/builders/layout.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/marker/builders/layout.py b/marker/builders/layout.py index cdb5d137..2b9847c6 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -98,19 +98,16 @@ def expand_layout_blocks(self, document: Document): if block.block_type in self.expand_block_types: other_blocks = [b for b in page_blocks if b != block] if not other_blocks: - print(f'Expanding {block_id} by {(self.max_expand_frac, self.max_expand_frac)}') block.polygon = block.polygon.expand(self.max_expand_frac, self.max_expand_frac) continue min_gap = min(block.polygon.minimum_gap(other.polygon) for other in other_blocks) if min_gap <= 0: - print(f'Cannot 
expand {block_id}') continue x_expand_frac = min_gap / block.polygon.width if block.polygon.width > 0 else 0 y_expand_frac = min_gap / block.polygon.height if block.polygon.height > 0 else 0 - print(f'Expanding {block_id} by {(min(x_expand_frac, self.max_expand_frac), min(y_expand_frac, self.max_expand_frac))}') block.polygon = block.polygon.expand(x_expand_frac, y_expand_frac) def add_blocks_to_pages(