From 1a7424b68337a30ad281db896b9679f646db3f93 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 30 May 2025 15:34:15 -0400 Subject: [PATCH 1/5] Det batch sizes --- marker/builders/line.py | 2 +- marker/processors/table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index 53b0f45b..e2d7cd19 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -110,7 +110,7 @@ def get_detection_batch_size(self): if self.detection_batch_size is not None: return self.detection_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": - return 12 + return 10 return 4 def get_ocr_error_batch_size(self): diff --git a/marker/processors/table.py b/marker/processors/table.py index 3fde4d0c..55e537ab 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -488,7 +488,7 @@ def get_detection_batch_size(self): if self.detection_batch_size is not None: return self.detection_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": - return 12 + return 10 return 4 def get_table_rec_batch_size(self): From 95fc0b381459584e8515985cb3489101a6940bac Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 30 May 2025 15:44:00 -0400 Subject: [PATCH 2/5] Ensure bad ocr in tables is redone --- marker/builders/line.py | 3 ++- marker/processors/table.py | 14 ++++++++++++-- marker/schema/groups/page.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index e2d7cd19..a92cc904 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -156,13 +156,14 @@ def get_all_lines(self, document: Document, provider: PdfProvider): for document_page, ocr_error_detection_label in zip( document.pages, ocr_error_detection_results.labels ): + document_page.ocr_errors_detected = ocr_error_detection_label == "bad" provider_lines: List[ProviderOutput] = provider.page_lines.get( document_page.page_id, [] ) provider_lines_good = all( [ bool(provider_lines), - ocr_error_detection_label != "bad", + not document_page.ocr_errors_detected, self.check_layout_coverage(document_page, provider_lines), ] ) diff --git a/marker/processors/table.py b/marker/processors/table.py index 55e537ab..18a06e0b 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -68,6 +68,11 @@ class TableProcessor(BaseProcessor): bool, "Whether to format the lines.", ] = False + ocr_error_batch_size: Annotated[ + int, + "The batch size to use for the ocr error detection model.", + "Default is None, which will use the default batch size for the model.", + ] = None def __init__( self, @@ -101,8 +106,13 @@ def __call__(self, document: Document): "table_image": image, "table_bbox": image_poly.bbox, "img_size": page.get_image(highres=True).size, - "ocr_block": page.text_extraction_method == "surya" - or self.format_lines, + "ocr_block": any( + [ + page.text_extraction_method == "surya", + page.ocr_errors_detected, + self.format_lines, + ] + ), } ) diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 7d623bd8..6672424c 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -33,6 +33,7 @@ class PageGroup(Group): maximum_assignment_distance: float = 20 # pixels block_description: str = "A single page in the document." refs: List[Reference] | None = None + ocr_errors_detected: bool = False def incr_block_id(self): if self.block_id is None: From 9feb37a47aecb87ffc158384c508dd0eb9a6dac4 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 30 May 2025 15:50:44 -0400 Subject: [PATCH 3/5] Remove chars --- marker/builders/line.py | 42 ++++++++++++++++------------------------- marker/providers/pdf.py | 33 +++++++++++++++++++------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index a92cc904..d314ec82 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -313,36 +313,26 @@ def check_layout_coverage( text_okay = True return text_okay - def is_blank_slice(self, slice_image: Image.Image): - image = np.asarray(slice_image) - if ( - image is None - or image.size == 0 - or image.shape[0] == 0 - or image.shape[1] == 0 - ): - # Handle empty image case + def is_blank_slice( + self, slice_image: Image.Image, std_thresh: float = 2, noise_area: int = 30 + ): + gray = np.asarray(slice_image.convert("L")) + if gray.size == 0: return True - gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) - gray = cv2.GaussianBlur(gray, (3, 3), 0) + if gray.std() < std_thresh: + return True - # Adaptive threshold (inverse for text as white) - binarized = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 - ) + _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) - num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( - binarized, connectivity=8 - ) - cleaned = np.zeros_like(binarized) - for i in range(1, num_labels): # skip background - cleaned[labels == i] = 255 - - kernel = np.ones((1, 5), np.uint8) - dilated = cv2.dilate(cleaned, kernel, iterations=3) - b = dilated / 255 - return b.sum() == 0 + if cv2.countNonZero(bw) < noise_area: + return True + + kernel_size = max(3, int(np.sqrt(noise_area))) # Scale kernel to noise_area + kernel = np.ones((kernel_size, kernel_size), np.uint8) + bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel) + + return cv2.countNonZero(bw) == 0 def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]): page_size = (page.polygon.width, page.polygon.height) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index fed2318c..6e034c47 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -76,6 +76,10 @@ class PdfProvider(BaseProvider): bool, "Whether to disable links.", ] = False + keep_chars: Annotated[ + bool, + "Whether to keep character-level information in the output.", + ] = False def __init__(self, filepath: str, config=None): super().__init__(filepath, config) @@ -200,7 +204,7 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines: page_char_blocks = dictionary_output( self.filepath, page_range=self.page_range, - keep_chars=True, + keep_chars=self.keep_chars, workers=self.pdftext_workers, flatten_pdf=self.flatten_pdf, quote_loosebox=False, @@ -237,16 +241,6 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines: polygon = PolygonBox.from_bbox( span["bbox"], ensure_nonzero_area=True ) - span_chars = [ - CharClass( - text=c["char"], - polygon=PolygonBox.from_bbox( - c["bbox"], ensure_nonzero_area=True - ), - idx=c["char_idx"], - ) - for c in span["chars"] - ] superscript = span.get("superscript", False) subscript = span.get("subscript", False) text = self.normalize_spaces(fix_text(span["text"])) @@ -270,11 +264,24 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines: has_subscript=subscript, ) ) - chars.append(span_chars) + + if self.keep_chars: + span_chars = [ + CharClass( + text=c["char"], + polygon=PolygonBox.from_bbox( + c["bbox"], ensure_nonzero_area=True + ), + idx=c["char_idx"], + ) + for c in span["chars"] + ] + chars.append(span_chars) + polygon = PolygonBox.from_bbox( line["bbox"], ensure_nonzero_area=True ) - assert len(spans) == len(chars) + lines.append( ProviderOutput( line=LineClass(polygon=polygon, page_id=page_id), From 787f617a84274059f0f7aa6ca4a6ad8af9639c17 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 30 May 2025 16:16:30 -0400 Subject: [PATCH 4/5] Fix blank slice issue --- marker/builders/line.py | 42 +++++++++++++++++++++++++---------------- marker/providers/pdf.py | 5 +++++ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/marker/builders/line.py b/marker/builders/line.py index d314ec82..a92cc904 100644 --- a/marker/builders/line.py +++ b/marker/builders/line.py @@ -313,26 +313,36 @@ def check_layout_coverage( text_okay = True return text_okay - def is_blank_slice( - self, slice_image: Image.Image, std_thresh: float = 2, noise_area: int = 30 - ): - gray = np.asarray(slice_image.convert("L")) - if gray.size == 0: - return True - - if gray.std() < std_thresh: + def is_blank_slice(self, slice_image: Image.Image): + image = np.asarray(slice_image) + if ( + image is None + or image.size == 0 + or image.shape[0] == 0 + or image.shape[1] == 0 + ): + # Handle empty image case return True - _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) + gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + gray = cv2.GaussianBlur(gray, (3, 3), 0) - if cv2.countNonZero(bw) < noise_area: - return True - - kernel_size = max(3, int(np.sqrt(noise_area))) # Scale kernel to noise_area - kernel = np.ones((kernel_size, kernel_size), np.uint8) - bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel) + # Adaptive threshold (inverse for text as white) + binarized = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 + ) - return cv2.countNonZero(bw) == 0 + num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( + binarized, connectivity=8 + ) + cleaned = np.zeros_like(binarized) + for i in range(1, num_labels): # skip background + cleaned[labels == i] = 255 + + kernel = np.ones((1, 5), np.uint8) + dilated = cv2.dilate(cleaned, kernel, iterations=3) + b = dilated / 255 + return b.sum() == 0 def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]): page_size = (page.polygon.width, page.polygon.height) diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index 6e034c47..dc02ea23 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -277,11 +277,16 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines: for c in span["chars"] ] chars.append(span_chars) + else: + chars.append([]) polygon = PolygonBox.from_bbox( line["bbox"], ensure_nonzero_area=True ) + assert len(spans) == len(chars), ( + f"Spans and chars length mismatch on page {page_id}: {len(spans)} spans, {len(chars)} chars" + ) lines.append( ProviderOutput( line=LineClass(polygon=polygon, page_id=page_id), From 2b949a0b47efb54f0a858b4bcf3dd193dac4bbb3 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 30 May 2025 16:34:39 -0400 Subject: [PATCH 5/5] Cleanups --- marker/processors/equation.py | 2 +- marker/processors/table.py | 5 ----- pyproject.toml | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/marker/processors/equation.py b/marker/processors/equation.py index e48ecca3..f95a7023 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -47,7 +47,7 @@ def get_batch_size(self): if self.equation_batch_size is not None: return self.equation_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": - return 32 + return 16 elif settings.TORCH_DEVICE_MODEL == "mps": return 6 return 6 diff --git a/marker/processors/table.py b/marker/processors/table.py index 18a06e0b..7b149cb3 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -68,11 +68,6 @@ class TableProcessor(BaseProcessor): bool, "Whether to format the lines.", ] = False - ocr_error_batch_size: Annotated[ - int, - "The batch size to use for the ocr error detection model.", - "Default is None, which will use the default batch size for the model.", - ] = None def __init__( self, diff --git a/pyproject.toml b/pyproject.toml index 0f048955..3b2da6cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.7.4" +version = "1.7.5" description = "Convert documents to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"