From 1a7424b68337a30ad281db896b9679f646db3f93 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 30 May 2025 15:34:15 -0400
Subject: [PATCH 1/5] Reduce default detection batch sizes on CUDA from 12 to 10

---
 marker/builders/line.py    | 2 +-
 marker/processors/table.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index 53b0f45b..e2d7cd19 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -110,7 +110,7 @@ def get_detection_batch_size(self):
         if self.detection_batch_size is not None:
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 12
+            return 10
         return 4
 
     def get_ocr_error_batch_size(self):
diff --git a/marker/processors/table.py b/marker/processors/table.py
index 3fde4d0c..55e537ab 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -488,7 +488,7 @@ def get_detection_batch_size(self):
         if self.detection_batch_size is not None:
             return self.detection_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 12
+            return 10
         return 4
 
     def get_table_rec_batch_size(self):

From 95fc0b381459584e8515985cb3489101a6940bac Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 30 May 2025 15:44:00 -0400
Subject: [PATCH 2/5] Ensure bad ocr in tables is redone

---
 marker/builders/line.py      |  3 ++-
 marker/processors/table.py   | 14 ++++++++++++--
 marker/schema/groups/page.py |  1 +
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index e2d7cd19..a92cc904 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -156,13 +156,14 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
         for document_page, ocr_error_detection_label in zip(
             document.pages, ocr_error_detection_results.labels
         ):
+            document_page.ocr_errors_detected = ocr_error_detection_label == "bad"
             provider_lines: List[ProviderOutput] = provider.page_lines.get(
                 document_page.page_id, []
             )
             provider_lines_good = all(
                 [
                     bool(provider_lines),
-                    ocr_error_detection_label != "bad",
+                    not document_page.ocr_errors_detected,
                     self.check_layout_coverage(document_page, provider_lines),
                 ]
             )
diff --git a/marker/processors/table.py b/marker/processors/table.py
index 55e537ab..18a06e0b 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -68,6 +68,11 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to format the lines.",
     ] = False
+    ocr_error_batch_size: Annotated[
+        int,
+        "The batch size to use for the ocr error detection model.",
+        "Default is None, which will use the default batch size for the model.",
+    ] = None
 
     def __init__(
         self,
@@ -101,8 +106,13 @@ def __call__(self, document: Document):
                         "table_image": image,
                         "table_bbox": image_poly.bbox,
                         "img_size": page.get_image(highres=True).size,
-                        "ocr_block": page.text_extraction_method == "surya"
-                        or self.format_lines,
+                        "ocr_block": any(
+                            [
+                                page.text_extraction_method == "surya",
+                                page.ocr_errors_detected,
+                                self.format_lines,
+                            ]
+                        ),
                     }
                 )
 
diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py
index 7d623bd8..6672424c 100644
--- a/marker/schema/groups/page.py
+++ b/marker/schema/groups/page.py
@@ -33,6 +33,7 @@ class PageGroup(Group):
     maximum_assignment_distance: float = 20  # pixels
     block_description: str = "A single page in the document."
     refs: List[Reference] | None = None
+    ocr_errors_detected: bool = False
 
     def incr_block_id(self):
         if self.block_id is None:

From 9feb37a47aecb87ffc158384c508dd0eb9a6dac4 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 30 May 2025 15:50:44 -0400
Subject: [PATCH 3/5] Make char-level extraction opt-in (keep_chars); rewrite is_blank_slice

---
 marker/builders/line.py | 42 ++++++++++++++++-------------------------
 marker/providers/pdf.py | 33 +++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index a92cc904..d314ec82 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -313,36 +313,26 @@ def check_layout_coverage(
             text_okay = True
         return text_okay
 
-    def is_blank_slice(self, slice_image: Image.Image):
-        image = np.asarray(slice_image)
-        if (
-            image is None
-            or image.size == 0
-            or image.shape[0] == 0
-            or image.shape[1] == 0
-        ):
-            # Handle empty image case
+    def is_blank_slice(
+        self, slice_image: Image.Image, std_thresh: float = 2, noise_area: int = 30
+    ):
+        gray = np.asarray(slice_image.convert("L"))
+        if gray.size == 0:
             return True
 
-        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
-        gray = cv2.GaussianBlur(gray, (3, 3), 0)
+        if gray.std() < std_thresh:
+            return True
 
-        # Adaptive threshold (inverse for text as white)
-        binarized = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
-        )
+        _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
 
-        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
-            binarized, connectivity=8
-        )
-        cleaned = np.zeros_like(binarized)
-        for i in range(1, num_labels):  # skip background
-            cleaned[labels == i] = 255
-
-        kernel = np.ones((1, 5), np.uint8)
-        dilated = cv2.dilate(cleaned, kernel, iterations=3)
-        b = dilated / 255
-        return b.sum() == 0
+        if cv2.countNonZero(bw) < noise_area:
+            return True
+
+        kernel_size = max(3, int(np.sqrt(noise_area)))  # Scale kernel to noise_area
+        kernel = np.ones((kernel_size, kernel_size), np.uint8)
+        bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
+
+        return cv2.countNonZero(bw) == 0
 
     def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]):
         page_size = (page.polygon.width, page.polygon.height)
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index fed2318c..6e034c47 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -76,6 +76,10 @@ class PdfProvider(BaseProvider):
         bool,
         "Whether to disable links.",
     ] = False
+    keep_chars: Annotated[
+        bool,
+        "Whether to keep character-level information in the output.",
+    ] = False
 
     def __init__(self, filepath: str, config=None):
         super().__init__(filepath, config)
@@ -200,7 +204,7 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
         page_char_blocks = dictionary_output(
             self.filepath,
             page_range=self.page_range,
-            keep_chars=True,
+            keep_chars=self.keep_chars,
             workers=self.pdftext_workers,
             flatten_pdf=self.flatten_pdf,
             quote_loosebox=False,
@@ -237,16 +241,6 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
                         polygon = PolygonBox.from_bbox(
                             span["bbox"], ensure_nonzero_area=True
                         )
-                        span_chars = [
-                            CharClass(
-                                text=c["char"],
-                                polygon=PolygonBox.from_bbox(
-                                    c["bbox"], ensure_nonzero_area=True
-                                ),
-                                idx=c["char_idx"],
-                            )
-                            for c in span["chars"]
-                        ]
                         superscript = span.get("superscript", False)
                         subscript = span.get("subscript", False)
                         text = self.normalize_spaces(fix_text(span["text"]))
@@ -270,11 +264,24 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
                                 has_subscript=subscript,
                             )
                         )
-                        chars.append(span_chars)
+
+                        if self.keep_chars:
+                            span_chars = [
+                                CharClass(
+                                    text=c["char"],
+                                    polygon=PolygonBox.from_bbox(
+                                        c["bbox"], ensure_nonzero_area=True
+                                    ),
+                                    idx=c["char_idx"],
+                                )
+                                for c in span["chars"]
+                            ]
+                            chars.append(span_chars)
+
                     polygon = PolygonBox.from_bbox(
                         line["bbox"], ensure_nonzero_area=True
                     )
-                    assert len(spans) == len(chars)
+
                     lines.append(
                         ProviderOutput(
                             line=LineClass(polygon=polygon, page_id=page_id),

From 787f617a84274059f0f7aa6ca4a6ad8af9639c17 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 30 May 2025 16:16:30 -0400
Subject: [PATCH 4/5] Fix blank slice issue: revert is_blank_slice rewrite; keep spans and chars aligned

---
 marker/builders/line.py | 42 +++++++++++++++++++++++++----------------
 marker/providers/pdf.py |  5 +++++
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/marker/builders/line.py b/marker/builders/line.py
index d314ec82..a92cc904 100644
--- a/marker/builders/line.py
+++ b/marker/builders/line.py
@@ -313,26 +313,36 @@ def check_layout_coverage(
             text_okay = True
         return text_okay
 
-    def is_blank_slice(
-        self, slice_image: Image.Image, std_thresh: float = 2, noise_area: int = 30
-    ):
-        gray = np.asarray(slice_image.convert("L"))
-        if gray.size == 0:
-            return True
-
-        if gray.std() < std_thresh:
+    def is_blank_slice(self, slice_image: Image.Image):
+        image = np.asarray(slice_image)
+        if (
+            image is None
+            or image.size == 0
+            or image.shape[0] == 0
+            or image.shape[1] == 0
+        ):
+            # Handle empty image case
             return True
 
-        _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+        gray = cv2.GaussianBlur(gray, (3, 3), 0)
 
-        if cv2.countNonZero(bw) < noise_area:
-            return True
-
-        kernel_size = max(3, int(np.sqrt(noise_area)))  # Scale kernel to noise_area
-        kernel = np.ones((kernel_size, kernel_size), np.uint8)
-        bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
+        # Adaptive threshold (inverse for text as white)
+        binarized = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
+        )
 
-        return cv2.countNonZero(bw) == 0
+        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
+            binarized, connectivity=8
+        )
+        cleaned = np.zeros_like(binarized)
+        for i in range(1, num_labels):  # skip background
+            cleaned[labels == i] = 255
+
+        kernel = np.ones((1, 5), np.uint8)
+        dilated = cv2.dilate(cleaned, kernel, iterations=3)
+        b = dilated / 255
+        return b.sum() == 0
 
     def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]):
         page_size = (page.polygon.width, page.polygon.height)
diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
index 6e034c47..dc02ea23 100644
--- a/marker/providers/pdf.py
+++ b/marker/providers/pdf.py
@@ -277,11 +277,16 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
                                 for c in span["chars"]
                             ]
                             chars.append(span_chars)
+                        else:
+                            chars.append([])
 
                     polygon = PolygonBox.from_bbox(
                         line["bbox"], ensure_nonzero_area=True
                     )
 
+                    assert len(spans) == len(chars), (
+                        f"Spans and chars length mismatch on page {page_id}: {len(spans)} spans, {len(chars)} chars"
+                    )
                     lines.append(
                         ProviderOutput(
                             line=LineClass(polygon=polygon, page_id=page_id),

From 2b949a0b47efb54f0a858b4bcf3dd193dac4bbb3 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 30 May 2025 16:34:39 -0400
Subject: [PATCH 5/5] Cleanups: lower CUDA equation batch size, drop ocr_error_batch_size option, bump to 1.7.5

---
 marker/processors/equation.py | 2 +-
 marker/processors/table.py    | 5 -----
 pyproject.toml                | 2 +-
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index e48ecca3..f95a7023 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -47,7 +47,7 @@ def get_batch_size(self):
         if self.equation_batch_size is not None:
             return self.equation_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
-            return 32
+            return 16
         elif settings.TORCH_DEVICE_MODEL == "mps":
             return 6
         return 6
diff --git a/marker/processors/table.py b/marker/processors/table.py
index 18a06e0b..7b149cb3 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -68,11 +68,6 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to format the lines.",
     ] = False
-    ocr_error_batch_size: Annotated[
-        int,
-        "The batch size to use for the ocr error detection model.",
-        "Default is None, which will use the default batch size for the model.",
-    ] = None
 
     def __init__(
         self,
diff --git a/pyproject.toml b/pyproject.toml
index 0f048955..3b2da6cc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.7.4"
+version = "1.7.5"
 description = "Convert documents to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"