stripping punctuation from OCR results before reference matching by EmilyFitzgerald · Pull Request #4 · rbturnbull/hespi · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

stripping punctuation from OCR results before reference matching #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hespi/data/authority.txt
Original file line number Diff line number Diff line change
Expand Up @@ -269996,4 +269996,4 @@ woodii
Žíla
Žíla & H.E.Weber
Žíla & Trávn.
Șik & Erol
Șik & Erol
17 changes: 13 additions & 4 deletions hespi/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict
import pandas as pd
import numpy as np
import string
from rich.console import Console
from difflib import get_close_matches, SequenceMatcher
from rich.table import Column, Table
Expand Down Expand Up @@ -35,6 +36,13 @@ def adjust_case(field, value):

return value

def strip_punctuation(field, value):
    """Strip leading/trailing punctuation and whitespace from taxon-name fields.

    Only the ``genus``, ``family`` and ``species`` fields are cleaned; the
    value of any other field is returned unchanged.

    Square brackets and parentheses are not part of the strip set, so they
    are kept when they appear at either end of the value. Punctuation in
    the interior of the value is never touched.

    Args:
        field: Name of the label field the value belongs to.
        value: The recognised text for that field.

    Returns:
        The cleaned text for taxon-name fields, otherwise ``value`` as given.
    """
    if field not in ("genus", "family", "species"):
        return value

    # Every ASCII punctuation character except the four bracket characters.
    strippable = "".join(ch for ch in string.punctuation if ch not in "[]()")

    # Punctuation and whitespace are removed in a single pass so that
    # interleaved runs such as " .Acacia. " or "dealbata. ," are fully
    # cleaned; stripping punctuation first and whitespace second would stop
    # at the first space and leave the inner punctuation behind.
    # The trailing strip() also removes non-ASCII whitespace at the ends.
    return value.strip(strippable + string.whitespace).strip()


def read_reference(field):
path = DATA_DIR / f"{field}.txt"
Expand All @@ -49,8 +57,9 @@ def mk_reference() -> Dict:


def label_sort_key(s) -> int:
    """Return the sort position of a column name based on ``label_fields``.

    The column name is matched against ``label_fields`` by its longest
    underscore-delimited prefix: the full name is tried first, then the
    name with its last ``_``-separated segment removed, and so on. This
    lets a suffixed column (e.g. ``<field>_image``) sort with its field
    while still resolving field names that themselves contain underscores,
    which taking only the text before the first underscore would truncate.
    NOTE(review): assumes ``label_fields`` holds multi-word names such as
    ``collector_number`` -- confirm against its definition.

    Args:
        s: Column name to rank.

    Returns:
        The index of the matching entry in ``label_fields``, or
        ``len(label_fields)`` when no prefix matches, so that unknown
        columns sort last.
    """
    candidate = s
    while candidate:
        try:
            return label_fields.index(candidate)
        except ValueError:
            # Drop the last "_segment"; rpartition yields "" as the head
            # when no underscore is left, which ends the loop.
            candidate = candidate.rpartition('_')[0]
    return len(label_fields)

Expand Down Expand Up @@ -136,8 +145,7 @@ def ocr_data_df(data: dict, output_path: Path=None) -> pd.DataFrame:
image_files_cols = ['image_links-->'] + sorted([col for col in df.columns if '_image' in col or 'predictions' in col], key=label_sort_key)
result_cols = ['ocr_results_split-->'] + sorted([col for col in df.columns if 'Tesseract' in col or 'TrOCR' in col], key=label_sort_key)

cols = col_options + score_cols + ['label_classification'] + ocr_cols + image_files_cols + result_cols

cols = col_options + score_cols + ['label_classification'] + ocr_cols + result_cols + image_files_cols
extra_cols = [col for col in df.columns if col not in cols]

cols = cols + extra_cols
Expand Down Expand Up @@ -223,7 +231,8 @@ def ocr_data_print_tables(df: pd.DataFrame) -> None:


def adjust_text(field:str, recognised_text:str, fuzzy:bool, fuzzy_cutoff:float, reference:Dict):
text_adjusted = adjust_case(field, recognised_text)
text_stripped = strip_punctuation(field, recognised_text)
text_adjusted = adjust_case(field, text_stripped)
match_score = ""

# Match with database
Expand Down
3 changes: 2 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_ocr_data_df():
7714 'id', 'family', 'genus', 'species',
'infrasp_taxon', 'authority', 'collector_number', 'collector',
'locality', 'geolocation', 'year', 'month', 'day',
'<--results|ocr_details-->', 'image_links-->', 'ocr_results_split-->'
'<--results|ocr_details-->', 'ocr_results_split-->', 'image_links-->'
]
df = util.ocr_data_df(
{
Expand All @@ -76,6 +76,7 @@ def test_ocr_data_df():
}
}
)

assert (df.columns == required_columns).all()
assert len(df) == 1

Expand Down
Loading
0