From e8e768809eb8dd053f20a396c5717ec54055119e Mon Sep 17 00:00:00 2001 From: fh Date: Wed, 21 Aug 2024 11:29:30 +0200 Subject: [PATCH 1/2] fix normalize and clean transforms config management --- eole/transforms/clean.py | 34 +++++++++++++++++++++++----------- eole/transforms/normalize.py | 20 +++++++++++--------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/eole/transforms/clean.py b/eole/transforms/clean.py index 3e96d7b6c..c8bf02ab0 100644 --- a/eole/transforms/clean.py +++ b/eole/transforms/clean.py @@ -18,6 +18,10 @@ class CleanConfig(TransformConfig): ) scripts_ok: List[str] | None = Field( default=["Latin", "Common"], + description="list of unicodata scripts accepted", + ) + scripts_nok: List[str] | None = Field( + default=[], description="list of unicodata scripts not accepted", ) src_tgt_ratio: float | None = Field( @@ -63,8 +67,8 @@ def _parse_config(self): @staticmethod def _get_param(corpus, param, def_val): """Get param string of a `corpus`.""" - if "clean" in corpus["transforms"]: - value = corpus.get(param, def_val) + if "clean" in getattr(corpus, "transforms", []): + value = getattr(corpus, param, def_val) clean = value else: clean = None @@ -88,17 +92,25 @@ def warm_up(self, vocabs=None): super().warm_up(None) import fasttext - self.src_eq_tgt_dict = self.get_config_dict(self.config, "src_eq_tgt", True) - self.same_char_dict = self.get_config_dict(self.config, "same_char", True) - self.same_word_dict = self.get_config_dict(self.config, "same_word", True) + self.src_eq_tgt_dict = self.get_config_dict( + self.full_config, "src_eq_tgt", True + ) + self.same_char_dict = self.get_config_dict(self.full_config, "same_char", True) + self.same_word_dict = self.get_config_dict(self.full_config, "same_word", True) self.scripts_ok_dict = self.get_config_dict( - self.config, "scripts_ok", ["Latin", "Common"] + self.full_config, "scripts_ok", ["Latin", "Common"] + ) + self.scripts_nok_dict = self.get_config_dict( + self.full_config, "scripts_nok", [] + ) + self.src_tgt_ratio_dict = self.get_config_dict( + self.full_config, "src_tgt_ratio", 2 + ) + self.avg_tok_min_dict = self.get_config_dict(self.full_config, "avg_tok_min", 3) + self.avg_tok_max_dict = self.get_config_dict( + self.full_config, "avg_tok_max", 20 ) - self.scripts_nok_dict = self.get_config_dict(self.config, "scripts_nok", []) - self.src_tgt_ratio_dict = self.get_config_dict(self.config, "src_tgt_ratio", 2) - self.avg_tok_min_dict = self.get_config_dict(self.config, "avg_tok_min", 3) - self.avg_tok_max_dict = self.get_config_dict(self.config, "avg_tok_max", 20) - self.langid_dict = self.get_config_dict(self.config, "langid", []) + self.langid_dict = self.get_config_dict(self.full_config, "langid", []) fasttext_loc = f"{os.path.dirname(os.path.abspath(__file__))}/lid.176.ftz" if not os.path.exists(fasttext_loc): diff --git a/eole/transforms/normalize.py b/eole/transforms/normalize.py index 2179c076f..0b43fe345 100644 --- a/eole/transforms/normalize.py +++ b/eole/transforms/normalize.py @@ -247,8 +247,8 @@ def _parse_config(self): @staticmethod def _get_param(corpus, param, def_val): """Get opt string of a `corpus`.""" - if "normalize" in corpus["transforms"]: - value = corpus.get(param, def_val) + if "normalize" in getattr(corpus, "transforms", []): + value = getattr(corpus, param, def_val) normalize = value else: normalize = None @@ -271,18 +271,20 @@ def get_config_dict(cls, config, param, def_val): def warm_up(self, vocabs=None): """Set options for each dataset.""" super().warm_up(None) - self.src_lang_dict = self.get_config_dict(self.config, "src_lang", "") - self.tgt_lang_dict = self.get_config_dict(self.config, "tgt_lang", "") - self.penn_dict = self.get_config_dict(self.config, "penn", True) + self.src_lang_dict = self.get_config_dict(self.full_config, "src_lang", "") + self.tgt_lang_dict = self.get_config_dict(self.full_config, "tgt_lang", "") + self.penn_dict = self.get_config_dict(self.full_config, "penn", True) self.norm_quote_commas_dict = self.get_config_dict( - self.config, "norm_quote_commas", True + self.full_config, "norm_quote_commas", True + ) + self.norm_numbers_dict = self.get_config_dict( + self.full_config, "norm_numbers", True ) - self.norm_numbers_dict = self.get_config_dict(self.config, "norm_numbers", True) self.pre_dict = self.get_config_dict( - self.config, "pre_replace_unicode_punct", False + self.full_config, "pre_replace_unicode_punct", False ) self.post_dict = self.get_config_dict( - self.config, "post_remove_control_chars", False + self.full_config, "post_remove_control_chars", False ) self.src_lang_dict["infer"] = self.src_lang self.tgt_lang_dict["infer"] = self.tgt_lang From 385c5c0eefa1d380f6461fd06dff2fa17a6f2585 Mon Sep 17 00:00:00 2001 From: fh Date: Fri, 23 Aug 2024 10:35:20 +0200 Subject: [PATCH 2/2] forgot config.data patch --- eole/config/data.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/eole/config/data.py b/eole/config/data.py index a299465f9..5237fca4d 100644 --- a/eole/config/data.py +++ b/eole/config/data.py @@ -87,10 +87,29 @@ class Dataset(Config): ) path_align: str | None = None # optional stuff for some transforms + # TODO: define a better mechanism to support such settings src_prefix: str | None = None tgt_prefix: str | None = None src_suffix: str | None = None tgt_suffix: str | None = None + # normalize + src_lang: str | None = None + tgt_lang: str | None = None + penn: bool | None = True + norm_quote_commas: bool | None = True + norm_numbers: bool | None = True + pre_replace_unicode_punct: bool | None = False + post_remove_control_chars: bool | None = False + # clean + src_eq_tgt: bool | None = True + same_char: bool | None = True + same_word: bool | None = True + scripts_ok: List[str] | None = ["Latin", "Common"] + scripts_nok: List[str] | None = [] + src_tgt_ratio: float | None = 2 + avg_tok_min: float | None = 3 + avg_tok_max: float | None = 20 + lang_id: List[str] | None = ["en", "fr"] # add all opts from all transforms (like in eole.opts._add_transform_opt)