Vik remove chars by VikParuchuri · Pull Request #720 · datalab-to/marker · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Vik remove chars #720

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions marker/builders/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_detection_batch_size(self):
if self.detection_batch_size is not None:
return self.detection_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
- return 12
+ return 10
return 4

def get_ocr_error_batch_size(self):
Expand Down Expand Up @@ -156,13 +156,14 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
for document_page, ocr_error_detection_label in zip(
document.pages, ocr_error_detection_results.labels
):
+ document_page.ocr_errors_detected = ocr_error_detection_label == "bad"
provider_lines: List[ProviderOutput] = provider.page_lines.get(
document_page.page_id, []
)
provider_lines_good = all(
[
bool(provider_lines),
- ocr_error_detection_label != "bad",
+ not document_page.ocr_errors_detected,
self.check_layout_coverage(document_page, provider_lines),
]
)
Expand Down
2 changes: 1 addition & 1 deletion marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_batch_size(self):
if self.equation_batch_size is not None:
return self.equation_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
- return 32
+ return 16
elif settings.TORCH_DEVICE_MODEL == "mps":
return 6
return 6
Expand Down
11 changes: 8 additions & 3 deletions marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,13 @@ def __call__(self, document: Document):
"table_image": image,
"table_bbox": image_poly.bbox,
"img_size": page.get_image(highres=True).size,
- "ocr_block": page.text_extraction_method == "surya"
- or self.format_lines,
+ "ocr_block": any(
+ [
+ page.text_extraction_method == "surya",
+ page.ocr_errors_detected,
+ self.format_lines,
+ ]
+ ),
}
)

Expand Down Expand Up @@ -488,7 +493,7 @@ def get_detection_batch_size(self):
if self.detection_batch_size is not None:
return self.detection_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
- return 12
+ return 10
return 4

def get_table_rec_batch_size(self):
Expand Down
38 changes: 25 additions & 13 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ class PdfProvider(BaseProvider):
bool,
"Whether to disable links.",
] = False
+ keep_chars: Annotated[
+ bool,
+ "Whether to keep character-level information in the output.",
+ ] = False

def __init__(self, filepath: str, config=None):
super().__init__(filepath, config)
Expand Down Expand Up @@ -200,7 +204,7 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
page_char_blocks = dictionary_output(
self.filepath,
page_range=self.page_range,
- keep_chars=True,
+ keep_chars=self.keep_chars,
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf,
quote_loosebox=False,
Expand Down Expand Up @@ -237,16 +241,6 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
polygon = PolygonBox.from_bbox(
span["bbox"], ensure_nonzero_area=True
)
- span_chars = [
- CharClass(
- text=c["char"],
- polygon=PolygonBox.from_bbox(
- c["bbox"], ensure_nonzero_area=True
- ),
- idx=c["char_idx"],
- )
- for c in span["chars"]
- ]
superscript = span.get("superscript", False)
subscript = span.get("subscript", False)
text = self.normalize_spaces(fix_text(span["text"]))
Expand All @@ -270,11 +264,29 @@ def pdftext_extraction(self, doc: PdfDocument) -> ProviderPageLines:
has_subscript=subscript,
)
)
- chars.append(span_chars)
+
+ if self.keep_chars:
+ span_chars = [
+ CharClass(
+ text=c["char"],
+ polygon=PolygonBox.from_bbox(
+ c["bbox"], ensure_nonzero_area=True
+ ),
+ idx=c["char_idx"],
+ )
+ for c in span["chars"]
+ ]
+ chars.append(span_chars)
+ else:
+ chars.append([])
+
polygon = PolygonBox.from_bbox(
line["bbox"], ensure_nonzero_area=True
)
- assert len(spans) == len(chars)
+
+ assert len(spans) == len(chars), (
+ f"Spans and chars length mismatch on page {page_id}: {len(spans)} spans, {len(chars)} chars"
+ )
lines.append(
ProviderOutput(
line=LineClass(polygon=polygon, page_id=page_id),
Expand Down
1 change: 1 addition & 0 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class PageGroup(Group):
maximum_assignment_distance: float = 20 # pixels
block_description: str = "A single page in the document."
refs: List[Reference] | None = None
+ ocr_errors_detected: bool = False

def incr_block_id(self):
if self.block_id is None:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "1.7.4"
+ version = "1.7.5"
description = "Convert documents to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <github@vikas.sh>"]
readme = "README.md"
Expand Down
0