datalab-to · VikParuchuri · Jun 2, 2025 · May 30, 2025 · May 30, 2025 · May 30, 2025
diff --git a/marker/builders/line.py b/marker/builders/line.py
@@ -110,7 +110,7 @@ def get_detection_batch_size(self):
         if self.detection_batch_size is not None:
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 12
+            return 10
         return 4
 
     def get_ocr_error_batch_size(self):
@@ -156,13 +156,14 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
         for document_page, ocr_error_detection_label in zip(
             document.pages, ocr_error_detection_results.labels
         ):
+            document_page.ocr_errors_detected = ocr_error_detection_label == "bad"
             provider_lines: List[ProviderOutput] = provider.page_lines.get(
                 document_page.page_id, []
             )
             provider_lines_good = all(
                 [
                     bool(provider_lines),
-                    ocr_error_detection_label != "bad",
+                    not document_page.ocr_errors_detected,
                     self.check_layout_coverage(document_page, provider_lines),
                 ]
             )

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -47,7 +47,7 @@ def get_batch_size(self):
         if self.equation_batch_size is not None:
             return self.equation_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 32
+            return 16
         elif settings.TORCH_DEVICE_MODEL == "mps":
             return 6
         return 6

diff --git a/marker/processors/table.py b/marker/processors/table.py
@@ -101,8 +101,13 @@ def __call__(self, document: Document):
                         "table_image": image,
                         "table_bbox": image_poly.bbox,
                         "img_size": page.get_image(highres=True).size,
-                        "ocr_block": page.text_extraction_method == "surya"
-                        or self.format_lines,
+                        "ocr_block": any(
+                            [
+                                page.text_extraction_method == "surya",
+                                page.ocr_errors_detected,
+                                self.format_lines,
+                            ]
+                        ),
                     }
                 )
 
@@ -488,7 +493,7 @@ def get_detection_batch_size(self):
         if self.detection_batch_size is not None:
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 12
+            return 10
         return 4
 
     def get_table_rec_batch_size(self):

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
@@ -76,6 +76,10 @@ class PdfProvider(BaseProvider):
         bool,
         "Whether to disable links.",
     ] = False
+    keep_chars: Annotated[
+        bool,
+        "Whether to keep character-level information in the output.",
+    ] = False
 
     def __init__(self, filepath: str, config=None):
         super().__init__(filepath, config)
@@ -200,7 +204,7 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
         page_char_blocks = dictionary_output(
             self.filepath,
             page_range=self.page_range,
-            keep_chars=True,
+            keep_chars=self.keep_chars,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
             quote_loosebox=False,
@@ -237,16 +241,6 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
                         polygon = PolygonBox.from_bbox(
                             span["bbox"], ensure_nonzero_area=True
                         )
-                        span_chars = [
-                            CharClass(
-                                text=c["char"],
-                                polygon=PolygonBox.from_bbox(
-                                    c["bbox"], ensure_nonzero_area=True
-                                ),
-                                idx=c["char_idx"],
-                            )
-                            for c in span["chars"]
-                        ]
                         superscript = span.get("superscript", False)
                         subscript = span.get("subscript", False)
                         text = self.normalize_spaces(fix_text(span["text"]))
@@ -270,11 +264,29 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
                                 has_subscript=subscript,
                             )
                         )
-                        chars.append(span_chars)
+
+                        if self.keep_chars:
+                            span_chars = [
+                                CharClass(
+                                    text=c["char"],
+                                    polygon=PolygonBox.from_bbox(
+                                        c["bbox"], ensure_nonzero_area=True
+                                    ),
+                                    idx=c["char_idx"],
+                                )
+                                for c in span["chars"]
+                            ]
+                            chars.append(span_chars)
+                        else:
+                            chars.append([])
+
                     polygon = PolygonBox.from_bbox(
                         line["bbox"], ensure_nonzero_area=True
                     )
-                    assert len(spans) == len(chars)
+
+                    assert len(spans) == len(chars), (
+                        f"Spans and chars length mismatch on page {page_id}: {len(spans)} spans, {len(chars)} chars"
+                    )
                     lines.append(
                         ProviderOutput(
                             line=LineClass(polygon=polygon, page_id=page_id),

diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
@@ -33,6 +33,7 @@ class PageGroup(Group):
     maximum_assignment_distance: float = 20  # pixels
     block_description: str = "A single page in the document."
     refs: List[Reference] | None = None
+    ocr_errors_detected: bool = False
 
     def incr_block_id(self):
         if self.block_id is None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.7.4"
+version = "1.7.5"
 description = "Convert documents to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"