Evaluator running over pipelines & TEDS by JaMe76 · Pull Request #38 · deepdoctection/deepdoctection

Evaluator running over pipelines & TEDS #38


Merged 31 commits on Jun 30, 2022.

Commits (31, all by JaMe76)
5f181d6  update evaluator [eval]  (Jun 22, 2022)
f57b219  let evaluator run on a pipeline [eval]  (Jun 22, 2022)
8f18706  start teds [eval]  (Jun 23, 2022)
cf68002  add dd_pipe_like arg for pubtabnet [dataset]  (Jun 23, 2022)
b91496a  provide meta data from predictors and components [pipe][extern]  (Jun 24, 2022)
8d88468  update meta annotations [pipe]  (Jun 24, 2022)
9a29815  update evaluator [eval]  (Jun 24, 2022)
332b0df  continue with teds [eval]  (Jun 28, 2022)
ef0e19b  update docs [docs]  (Jun 28, 2022)
96225ca  update evaluator [eval]  (Jun 22, 2022)
307cbde  let evaluator run on a pipeline [eval]  (Jun 22, 2022)
38b7787  start teds [eval]  (Jun 23, 2022)
d0cc38c  add dd_pipe_like arg for pubtabnet [dataset]  (Jun 23, 2022)
94d5616  provide meta data from predictors and components [pipe][extern]  (Jun 24, 2022)
447d6b3  update meta annotations [pipe]  (Jun 24, 2022)
924dd70  update evaluator [eval]  (Jun 24, 2022)
c641a08  continue with teds [eval]  (Jun 28, 2022)
43a8090  update docs [docs]  (Jun 28, 2022)
275c7f7  update -gitignore [gitignore]  (Jun 29, 2022)
3fa34ab  update eval testing [tests]  (Jun 29, 2022)
4f4ad88  fix lint issues [lint]  (Jun 29, 2022)
57dfba6  fix mypy [mypy]  (Jun 29, 2022)
7b115c8  fix test issues [tests]  (Jun 29, 2022)
a1691c1  Merge remote-tracking branch 'origin/eval' into eval  (Jun 29, 2022)
b6c1324  fix test issues [test]  (Jun 29, 2022)
62c61a6  fix test issues [test]  (Jun 29, 2022)
a11d8fa  fix test issues [test]  (Jun 29, 2022)
8e1aa93  fix test issues [test]  (Jun 29, 2022)
90abea8  fix test issues [test]  (Jun 29, 2022)
73192b0  fix test issues [test]  (Jun 29, 2022)
d5c694c  update docs [docs]  (Jun 30, 2022)

(The first nine commit messages appear twice with different hashes, apparently an artifact of merging 'origin/eval' back into 'eval' in commit a1691c1.)
1 change: 1 addition & 0 deletions .gitignore

@@ -17,3 +17,4 @@ snippets.py
 =3.0.0
 /notebooks/own.ipynb
 /notebooks/layout_parsing_structure.ipynb
+/snipptes.py
11 changes: 10 additions & 1 deletion README.md
@@ -115,7 +115,16 @@ Alternatively, consult the
 
 ### Installation from source
 
-Download the repository or clone via
+In a virtual environment, run for PyTorch:
+
+```
+pip install git+https://github.com/deepdoctection/deepdoctection#egg=deepdoctection[source-pt]
+```
+
+This will install the package without cloning the whole repo. Do a similar command with `source-tf` to
+install the package for Tensorflow.
+
+Otherwise, download the repository or clone via
 
 ```
 git clone https://github.com/deepdoctection/deepdoctection.git
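Per the README's note about `source-tf`, the analogous TensorFlow install command would presumably be:

```
pip install git+https://github.com/deepdoctection/deepdoctection#egg=deepdoctection[source-tf]
```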
2 changes: 2 additions & 0 deletions deepdoctection/dataflow/__init__.py

@@ -20,8 +20,10 @@
     MapData,
     MapDataComponent,
     MultiProcessMapData,
+    MultiThreadMapData,
     ProxyDataFlow,
     RepeatedData,
+    MultiThreadMapData,
 )
 
 from .common import *
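The newly re-exported `MultiThreadMapData` follows the tensorpack-style dataflow API; a hedged sketch of how it would be used (the dataflow `df` and the mapped function `load_page` are illustrative placeholders, not from this PR):

```
from deepdoctection.dataflow import MultiThreadMapData

def load_page(dp):
    # placeholder for an I/O-bound mapping, e.g. reading an image from disk
    return dp

# Four threads consume an existing dataflow `df` and apply the map in parallel.
df = MultiThreadMapData(df, num_thread=4, map_func=load_page)
```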
3 changes: 2 additions & 1 deletion deepdoctection/datapoint/annotation.py

@@ -237,7 +237,8 @@ def remove_sub_category(self, key: str) -> None:
         :param key: A key to a sub category.
         """
 
-        self.sub_categories.pop(key)
+        if key in self.sub_categories:
+            self.sub_categories.pop(key)
 
     def dump_relationship(self, key: str, annotation_id: str) -> None:
         """
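This makes removal idempotent; a minimal sketch, assuming `ann` is an annotation object from this module and `"row_number"` a hypothetical sub category key:

```
# Sketch only; `ann` and the key "row_number" are illustrative assumptions.
ann.remove_sub_category("row_number")  # removes the sub category
ann.remove_sub_category("row_number")  # now a no-op instead of raising KeyError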
2 changes: 2 additions & 0 deletions deepdoctection/datasets/instances/fintabnet.py

@@ -189,6 +189,8 @@ def _map_filename(dp: JsonDict, workdir: Path) -> JsonDict:
             load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
+            dd_pipe_like=False,
+            is_fintabnet=True,
         )
         if use_multi_proc:
             df = MultiProcessMapData(
2 changes: 1 addition & 1 deletion deepdoctection/datasets/instances/iiitar13k.py

@@ -54,7 +54,7 @@
 from ..registry import dataset_registry
 
 if lxml_available():
-    from lxml import etree  # type: ignore
+    from lxml import etree
 
 _NAME = "iiitar13k"
 
2 changes: 1 addition & 1 deletion deepdoctection/datasets/instances/pubtables1m.py

@@ -53,7 +53,7 @@
 from ..registry import dataset_registry
 
 if lxml_available():
-    from lxml import etree  # type: ignore
+    from lxml import etree
 
 _NAME = "pubtables1m"
 
12 changes: 10 additions & 2 deletions deepdoctection/datasets/instances/pubtabnet.py

@@ -36,7 +36,7 @@
 from ...mapper.cats import cat_to_sub_cat, filter_cat
 from ...mapper.pubstruct import pub_to_image
 from ...utils.detection_types import JsonDict
-from ...utils.logger import log_once
+from ...utils.logger import log_once, logger
 from ...utils.settings import names
 from ..base import _BuiltInDataset
 from ..dataflow_builder import DataFlowBaseBuilder
@@ -73,7 +73,7 @@
 _LOCATION = "pubtabnet"
 _ANNOTATION_FILES: Mapping[str, str] = {"all": "PubTabNet_2.0.0.jsonl"}
 
-_INIT_CATEGORIES = [names.C.CELL, names.C.ITEM]
+_INIT_CATEGORIES = [names.C.CELL, names.C.ITEM, names.C.TAB, names.C.WORD]
 _SUB_CATEGORIES: Dict[str, Dict[str, List[str]]]
 _SUB_CATEGORIES = {
     names.C.ITEM: {"row_col": [names.C.ROW, names.C.COL]},
@@ -86,6 +86,8 @@
     },
     names.C.HEAD: {names.C.RN: [], names.C.CN: [], names.C.RS: [], names.C.CS: []},
     names.C.BODY: {names.C.RN: [], names.C.CN: [], names.C.RS: [], names.C.CS: []},
+    names.C.TAB: {names.C.HTAB: [names.C.HTAB]},
+    names.C.WORD: {names.C.CHARS: [names.C.CHARS]},
 }
 
 
@@ -139,6 +141,10 @@ def build(self, **kwargs: Union[str, int]) -> DataFlow:
         load_image = kwargs.get("load_image", False)
         rows_and_cols = kwargs.get("rows_and_cols", False)
         fake_score = kwargs.get("fake_score", False)
+        dd_pipe_like = kwargs.get("dd_pipe_like", False)
+        if dd_pipe_like:
+            logger.info("When 'dd_pipe_like' is set to True, 'load_image' will be reset to True")
+            load_image = True
 
         # Load
         dataset_split = self.annotation_files["all"]
@@ -158,6 +164,8 @@ def replace_filename(dp: JsonDict) -> JsonDict:
             load_image,
             fake_score=fake_score,
             rows_and_cols=rows_and_cols,
+            dd_pipe_like=dd_pipe_like,
+            is_fintabnet=False,
         )
 
         df = MapData(df, pub_mapper)
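A hedged usage sketch of the new `dd_pipe_like` build option, assuming the usual registry access via `get_dataset` from the deepdoctection dataset API:

```
from deepdoctection.datasets import get_dataset

pubtabnet = get_dataset("pubtabnet")
# dd_pipe_like=True shapes the ground truth like deepdoctection pipeline
# output and, as the change above shows, forces load_image=True.
df = pubtabnet.dataflow.build(dd_pipe_like=True)
```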
4 changes: 4 additions & 0 deletions deepdoctection/eval/__init__.py

@@ -20,8 +20,12 @@
 for training.
 """
 
+from ..utils.file_utils import apted_available
 from .accmetric import *
 from .base import *
 from .cocometric import *
 from .eval import *
 from .registry import *
+
+if apted_available():
+    from .tedsmetric import *
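Since `tedsmetric` is only exported when the `apted` tree-edit-distance package is installed, downstream code can guard on the same helper; a sketch, where the class name `TedsMetric` is an assumption about what the module exports:

```
from deepdoctection.utils.file_utils import apted_available

if apted_available():
    from deepdoctection.eval import TedsMetric  # export name assumed
else:
    TedsMetric = None  # skip TEDS evaluation when apted is missing
```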
23 changes: 14 additions & 9 deletions deepdoctection/eval/accmetric.py

@@ -53,9 +53,9 @@ def accuracy(label_gt: List[int], label_predictions: List[int], masks: Optional[
     :return: Accuracy score with only unmasked values to be considered
     """
 
-    np_label_gt, np_label_predictions = np.asarray(label_gt), np.asarray(label_predictions)
+    np_label_gt, np_label_pr = np.asarray(label_gt), np.asarray(label_predictions)
     assert len(np_label_gt) == len(
-        np_label_predictions
+        np_label_pr
     ), f"length of label_gt ({len(np_label_gt)}) and label_predictions ({len(np_label_pr)}) must be equal"
 
     if masks is not None:
@@ -65,8 +65,8 @@ def accuracy(label_gt: List[int], label_predictions: List[int], masks: Optional[
         np_masks = np.asarray(masks)
         np_masks.astype(bool)
         np_label_gt = np_label_gt[np_masks]
-        np_label_predictions = np_label_predictions[np_masks]
-    return np.array(accuracy_score(np_label_gt, np_label_predictions), dtype=float32)
+        np_label_pr = np_label_pr[np_masks]
+    return np.array(accuracy_score(np_label_gt, np_label_pr), dtype=float32)
 
 
 @metric_registry.register("accuracy")
@@ -111,11 +111,11 @@ def get_distance(
         cls, dataflow_gt: DataFlow, dataflow_predictions: DataFlow, categories: DatasetCategories
     ) -> List[JsonDict]:
 
-        labels_gt, labels_predictions = cls.dump(dataflow_gt, dataflow_predictions, categories)
+        labels_gt, labels_pr = cls.dump(dataflow_gt, dataflow_predictions, categories)
 
         results = []
         for key in labels_gt:  # pylint: disable=C0206
-            res = cls.metric(labels_gt[key], labels_predictions[key])
+            res = cls.metric(labels_gt[key], labels_pr[key])
             results.append({"key": key, "val": res, "num_samples": len(labels_gt[key])})
         return results
@@ -168,6 +168,11 @@ def _category_sanity_checks(cls, categories: DatasetCategories) -> None:
     def get_requirements(cls) -> List[Requirement]:
         return [get_sklearn_requirement()]
 
+    @property
+    def sub_cats(self) -> Optional[Union[Dict[str, str], Dict[str, List[str]]]]:
+        """Sub categories the metric runs over."""
+        return self._sub_cats
+
 
 def confusion(label_gt: List[int], label_predictions: List[int], masks: Optional[List[int]] = None) -> NDArray[float32]:
     """
@@ -181,15 +186,15 @@ def confusion(label_gt: List[int], label_predictions: List[int], masks: Optional[
     :return: numpy array
     """
 
-    np_label_gt, np_label_predictions = np.asarray(label_gt), np.asarray(label_predictions)
+    np_label_gt, np_label_pr = np.asarray(label_gt), np.asarray(label_predictions)
 
     if masks is not None:
         np_masks = np.asarray(masks)
         np_masks.astype(bool)
         np_label_gt = np_label_gt[np_masks]
-        np_label_predictions = np_label_predictions[np_masks]
+        np_label_pr = np_label_pr[np_masks]
 
-    return confusion_matrix(np_label_gt, np_label_predictions)
+    return confusion_matrix(np_label_gt, np_label_pr)
 
 
 @metric_registry.register("confusion")
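A small worked example of the masked `accuracy` above. Values are illustrative; the masks are passed as booleans because, as written, `np_masks.astype(bool)` does not reassign `np_masks`, so integer masks would be applied as fancy indices rather than as a boolean filter:

```
from deepdoctection.eval.accmetric import accuracy

gt = [1, 2, 2, 1]
pred = [1, 2, 1, 1]

print(accuracy(gt, pred))                                   # 0.75, one mismatch
print(accuracy(gt, pred, masks=[True, True, False, True]))  # 1.0, mismatch masked out
```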
6 changes: 5 additions & 1 deletion deepdoctection/eval/cocometric.py

@@ -26,6 +26,7 @@
 from ..dataflow import DataFlow
 from ..datasets.info import DatasetCategories
+from ..mapper.cats import re_assign_cat_ids
 from ..mapper.cocostruct import image_to_coco
 from ..utils.detection_types import JsonDict
 from ..utils.file_utils import Requirement, cocotools_available, get_cocotools_requirement
@@ -126,14 +127,17 @@ class CocoMetric(MetricBase):
     def dump(
         cls, dataflow_gt: DataFlow, dataflow_predictions: DataFlow, categories: DatasetCategories
     ) -> Tuple["COCO", "COCO"]:
-        cats = [{"id": int(k), "name": v} for k, v in categories.get_categories(as_dict=True).items()]
+        cats = [{"id": int(k), "name": v} for k, v in categories.get_categories(as_dict=True, filtered=True).items()]
         imgs_gt, imgs_pr = [], []
         anns_gt, anns_pr = [], []
 
         dataflow_gt.reset_state(), dataflow_predictions.reset_state()  # pylint: disable=W0106
 
         for dp_gt, dp_pred in zip(dataflow_gt, dataflow_predictions):
             img_gt, ann_gt = cls.mapper(dp_gt)  # type: ignore
+            dp_pred = re_assign_cat_ids(categories.get_categories(as_dict=True, filtered=True, name_as_key=True))(  # pylint: disable=E1120
+                dp_pred
+            )
             img_pr, ann_pr = cls.mapper(dp_pred)  # type: ignore
             imgs_gt.append(img_gt)
             imgs_pr.append(img_pr)
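Why the re-assignment is needed: with `filtered=True`, category ids are re-indexed, so predictions still carrying ids from the unfiltered category set must be re-mapped by category name. A simplified sketch of the idea (not the real `re_assign_cat_ids` implementation; accessor names are assumptions):

```
# Hypothetical, stripped-down illustration of re_assign_cat_ids.
def re_assign_cat_ids_sketch(name_to_filtered_id):
    def _map(dp):
        for ann in dp.get_annotation():  # assumed Image accessor
            ann.category_id = name_to_filtered_id[ann.category_name]
        return dp
    return _map
```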