diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 794538752..2a08d9710 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -67,6 +67,12 @@ def get_help(self, ctx): \b {config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)} \b +{config.describe('OCRD_MAX_MISSING_OUTPUTS')} +\b +{config.describe('OCRD_MAX_PARALLEL_PAGES')} +\b +{config.describe('OCRD_PROCESSING_PAGE_TIMEOUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 2d9cbf0fd..24d7e16cd 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -29,8 +29,7 @@ # this is where the fixes came from: from loky import Future, ProcessPoolExecutor import multiprocessing as mp -from threading import Timer -from _thread import interrupt_main +from multiprocessing.pool import ThreadPool from click import wrap_text from deprecated import deprecated @@ -783,11 +782,16 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: page_id = input_files[input_pos].pageId self._base_logger.info("processing page %s", page_id) for i, input_file in enumerate(input_files): + grp = self.input_file_grp.split(',')[i] if input_file is None: - grp = self.input_file_grp.split(',')[i] self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}") continue assert isinstance(input_file, get_args(OcrdFileType)) + if not input_file.local_filename: + self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}') + if config.OCRD_MISSING_INPUT == 'ABORT': + raise MissingInputFile(grp, page_id, input_file.mimetype) + continue self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") try: page_ = page_from_file(input_file) @@ -796,6 +800,9 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: except ValueError as err: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + if not any(input_pcgts): + self._base_logger.warning(f'skipping page {page_id}') + return output_file_id = make_file_id(input_files[input_pos], self.output_file_grp) if input_files[input_pos].fileGrp == self.output_file_grp: # input=output fileGrp: re-use ID exactly @@ -1107,7 +1114,11 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n" f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") ifts = [] - for page, ifiles in pages.items(): + # use physical page order + for page in self.workspace.mets.physical_pages: + if page not in pages: + continue + ifiles = pages[page] for i, ifg in enumerate(ifgs): if not ifiles[i]: # could be from non-unique with on_error=skip or from true gap @@ -1150,18 +1161,15 @@ def _page_worker(timeout, *input_files): """ page_id = next((file.pageId for file in input_files if hasattr(file, 'pageId')), "") - if timeout > 0: - timer = Timer(timeout, interrupt_main) - timer.start() + pool = ThreadPool(processes=1) try: - _page_worker_processor.process_page_file(*input_files) + #_page_worker_processor.process_page_file(*input_files) + async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files) + async_result.get(timeout or None) _page_worker_processor.logger.debug("page worker completed for page %s", page_id) - except KeyboardInterrupt: + except mp.TimeoutError: _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) - raise TimeoutError() - finally: - if timeout > 0: - timer.cancel() + raise def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 70721acbe..024fe0af9 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -254,7 +254,7 @@ def guess_media_type(input_file : str, fallback : str = None, application_xml : if mimetype is None: mimetype = EXT_TO_MIME.get(''.join(Path(input_file).suffixes), fallback) if mimetype is None: - raise ValueError("Could not determine MIME type of input_file must") + raise ValueError(f"Could not determine MIME type of {input_file}") if mimetype == 'application/xml': mimetype = application_xml return mimetype diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index e66302725..a214cb9fe 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -6,6 +6,7 @@ from io import StringIO from contextlib import contextmanager import sys +from packaging.version import Version from click.testing import CliRunner import pytest @@ -13,7 +14,7 @@ # pylint: disable=import-error, no-name-in-module from tests.base import CapturingTestCase as TestCase, assets, copy_of_directory, main -from ocrd_utils import initLogging, pushd_popd, setOverrideLogLevel, disableLogging +from ocrd_utils import initLogging, pushd_popd, setOverrideLogLevel, disableLogging, dist_version from ocrd.cli.workspace import workspace_cli from ocrd import Resolver @@ -31,7 +32,10 @@ def setUp(self): disableLogging() self.maxDiff = None self.resolver = Resolver() - self.runner = CliRunner(mix_stderr=False) + if Version(dist_version('click')) >= Version('8.2'): + self.runner = CliRunner() + else: + self.runner = CliRunner(mix_stderr=False) def test_add(self): """ diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 03edbb2a8..8d496623b 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -3,6 +3,7 @@ from PIL import Image from io import BytesIO from contextlib import ExitStack +import multiprocessing as mp from tempfile import TemporaryDirectory from pathlib import Path @@ -232,8 +233,16 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') + file1 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001', + url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0001_FULLTEXT.xml')) + file2 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002', + url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0002_FULLTEXT.xml')) + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="GRP1", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0, "no output because no download" + ws.download_file(file1) + ws.download_file(file2) run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -303,7 +312,7 @@ def test_run_output_timeout(self): assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 - with pytest.raises(TimeoutError) as exc: + with pytest.raises(mp.TimeoutError) as exc: run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", @@ -312,8 +321,12 @@ def test_run_output_timeout(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') + file1 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001', + url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0001_FULLTEXT.xml')) + file2 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002', + url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0002_FULLTEXT.xml')) + ws.download_file(file1) + ws.download_file(file2) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') config.OCRD_EXISTING_OUTPUT = 'ABORT'