From 3ad9386b57b394621d5a2c6fd9830e24677b276e Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Fri, 2 Apr 2021 13:33:02 -0700 Subject: [PATCH 1/7] add record checking for allennlp and spacy --- forte_wrapper/allennlp/allennlp_processors.py | 13 ++++++++++++- forte_wrapper/spacy/spacy_processors.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index e5eeec1..0a1c5cb 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -13,7 +13,7 @@ # limitations under the License. import logging -from typing import List, Dict +from typing import List, Dict, Set from allennlp.predictors import Predictor from forte.common import ProcessorConfigError @@ -186,3 +186,14 @@ def _create_srl(input_pack: DataPack, tokens: List[Token], tokens[arg_span.end].end) link = PredicateLink(input_pack, pred, arg) link.arg_type = label + + @classmethod + def expected_types_and_attributes(cls) -> Dict[str, Set[str]]: + r"""Method to add expected type for current processor input which + would be checked before running the processor if + :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for + the pipeline. + """ + expectation_dict: Dict[str, Set[str]] = dict() + expectation_dict["ft.onto.base_ontology.Sentence"] = set() + return expectation_dict diff --git a/forte_wrapper/spacy/spacy_processors.py b/forte_wrapper/spacy/spacy_processors.py index e580c4a..a7c92e6 100644 --- a/forte_wrapper/spacy/spacy_processors.py +++ b/forte_wrapper/spacy/spacy_processors.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import Optional, Dict, Set import spacy from spacy.language import Language @@ -125,3 +125,13 @@ def _process(self, input_pack: DataPack): # Process sentence parses. self._process_parser(result.sents, input_pack) + + def record(self, record_meta: Dict[str, Set[str]]): + r"""Method to add output type record of current processor + to :attr:`forte.data.data_pack.Meta.record`. + + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + record_meta["ft.onto.base_ontology.Sentence"] = set() From ec5fab96c2bcd470a7c99da3f185016b222751be Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Mon, 5 Apr 2021 14:15:49 -0700 Subject: [PATCH 2/7] add checking for configs.processors and record writing for allennlp --- forte_wrapper/allennlp/allennlp_processors.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 0a1c5cb..f58c4f0 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -50,7 +50,10 @@ class AllenNLPProcessor(PackProcessor): # pylint: disable=attribute-defined-outside-init,unused-argument def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) - + if "tokenize" not in self.configs.processors: + raise ProcessorConfigError('tokenize is necessary in ' + 'configs.processors for ' + 'tokenize, pos, depparse or srl') if configs.tag_formalism not in MODEL2URL: raise ProcessorConfigError('Incorrect value for tag_formalism') if configs.tag_formalism == 'stanford': @@ -197,3 +200,20 @@ def expected_types_and_attributes(cls) -> Dict[str, Set[str]]: expectation_dict: Dict[str, Set[str]] = dict() expectation_dict["ft.onto.base_ontology.Sentence"] = set() return expectation_dict + + def 
record(self, record_meta: Dict[str, Set[str]]): + r"""Method to add output type record of current processor + to :attr:`forte.data.data_pack.Meta.record`. + + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + if "tokenize" in self.configs.processors: + record_meta["ft.onto.base_ontology.Token"] = set() + if "pos" in self.configs.processors: + record_meta["ft.onto.base_ontology.Token"] = {"pos"} + if "depparse" in self.configs.processors: + record_meta["ft.onto.base_ontology.Dependency"] = set() + + record_meta["ft.onto.base_ontology.Sentence"] = set() From 0f12fb6227e8a9d1b745abc1cf92c6ff64a2b81a Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Mon, 5 Apr 2021 14:24:08 -0700 Subject: [PATCH 3/7] remove and unnecessary type --- forte_wrapper/allennlp/allennlp_processors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 7cc329e..4c60e44 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -240,5 +240,3 @@ def record(self, record_meta: Dict[str, Set[str]]): record_meta["ft.onto.base_ontology.Token"] = {"pos"} if "depparse" in self.configs.processors: record_meta["ft.onto.base_ontology.Dependency"] = set() - - record_meta["ft.onto.base_ontology.Sentence"] = set() From 03f08d9c7935985712eda15d53ac55ea543851f5 Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Mon, 5 Apr 2021 14:49:26 -0700 Subject: [PATCH 4/7] minor fix --- forte_wrapper/allennlp/allennlp_processors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 4c60e44..8d25215 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ 
b/forte_wrapper/allennlp/allennlp_processors.py @@ -222,8 +222,9 @@ def expected_types_and_attributes(cls) -> Dict[str, Set[str]]: :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for the pipeline. """ - expectation_dict: Dict[str, Set[str]] = dict() - expectation_dict["ft.onto.base_ontology.Sentence"] = set() + expectation_dict: Dict[str, Set[str]] = { + "ft.onto.base_ontology.Sentence": set() + } return expectation_dict def record(self, record_meta: Dict[str, Set[str]]): From e6c420af09846be055f4a4b1e9057105a9099578 Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Mon, 5 Apr 2021 16:08:51 -0700 Subject: [PATCH 5/7] add all the record types for spacy and a check of spacy config --- forte_wrapper/allennlp/allennlp_processors.py | 2 +- forte_wrapper/spacy/spacy_processors.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 8d25215..1c8229a 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -238,6 +238,6 @@ def record(self, record_meta: Dict[str, Set[str]]): if "tokenize" in self.configs.processors: record_meta["ft.onto.base_ontology.Token"] = set() if "pos" in self.configs.processors: - record_meta["ft.onto.base_ontology.Token"] = {"pos"} + record_meta["ft.onto.base_ontology.Token"].add("pos") if "depparse" in self.configs.processors: record_meta["ft.onto.base_ontology.Dependency"] = set() diff --git a/forte_wrapper/spacy/spacy_processors.py b/forte_wrapper/spacy/spacy_processors.py index a7c92e6..1662ef8 100644 --- a/forte_wrapper/spacy/spacy_processors.py +++ b/forte_wrapper/spacy/spacy_processors.py @@ -16,7 +16,7 @@ import spacy from spacy.language import Language from spacy.cli.download import download -from forte.common import ProcessExecutionException +from forte.common import 
ProcessExecutionException, ProcessorConfigError from forte.common.configuration import Config from forte.common.resources import Resources from forte.data.data_pack import DataPack @@ -48,6 +48,11 @@ def set_up(self): # pylint: disable=unused-argument def initialize(self, resources: Resources, configs: Config): + if "pos" in configs.processors or "lemma" in configs.processors: + if "tokenize" not in configs.processors: + raise ProcessorConfigError('tokenize is necessary in ' + 'configs.processors for ' + 'pos or lemma') self.processors = configs.processors self.lang_model = configs.lang self.set_up() @@ -135,3 +140,10 @@ def record(self, record_meta: Dict[str, Set[str]]): fill in for consistency checking. """ record_meta["ft.onto.base_ontology.Sentence"] = set() + record_meta["ft.onto.base_ontology.EntityMention"] = set() + if "tokenize" in self.processors: + record_meta["ft.onto.base_ontology.Token"] = set() + if "pos" in self.processors: + record_meta["ft.onto.base_ontology.Token"].add("pos") + if "lemma" in self.processors: + record_meta["ft.onto.base_ontology.Token"].add("lemma") From 04aa5f1ed3515a280146f8d4e9ea107f7ecfbe87 Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Tue, 6 Apr 2021 15:19:28 -0700 Subject: [PATCH 6/7] record writing add srl and make config checking right --- forte_wrapper/allennlp/allennlp_processors.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 1c8229a..b972351 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -49,10 +49,12 @@ class AllenNLPProcessor(PackProcessor): # pylint: disable=attribute-defined-outside-init,unused-argument def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) - if "tokenize" not in self.configs.processors: - 
raise ProcessorConfigError('tokenize is necessary in ' - 'configs.processors for ' - 'tokenize, pos, depparse or srl') + if ("pos" in configs.processors or "depparse" in configs.processors + or "srl" in configs.processors): + if "tokenize" not in self.configs.processors: + raise ProcessorConfigError('tokenize is necessary in ' + 'configs.processors for ' + 'pos, depparse or srl') cuda_devices = itertools.cycle(configs['cuda_devices']) if configs.tag_formalism not in MODEL2URL: raise ProcessorConfigError('Incorrect value for tag_formalism') @@ -240,4 +242,7 @@ def record(self, record_meta: Dict[str, Set[str]]): if "pos" in self.configs.processors: record_meta["ft.onto.base_ontology.Token"].add("pos") if "depparse" in self.configs.processors: - record_meta["ft.onto.base_ontology.Dependency"] = set() + record_meta["ft.onto.base_ontology.Dependency"] = {"rel_type"} + if "srl" in self.configs.processors: + record_meta["ft.onto.base_ontology.PredicateArgument"] = set() + record_meta["ft.onto.base_ontology.PredicateMention"] = set() From c0b05737fced9fed31446f0213773f5acf341a51 Mon Sep 17 00:00:00 2001 From: jennyzhang-petuum <75236447+jennyzhang-petuum@users.noreply.github.com> Date: Tue, 6 Apr 2021 16:57:59 -0700 Subject: [PATCH 7/7] add PredicateLink --- forte_wrapper/allennlp/allennlp_processors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index ce7f3ba..7d58345 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -261,3 +261,5 @@ def record(self, record_meta: Dict[str, Set[str]]): if "srl" in self.configs.processors: record_meta["ft.onto.base_ontology.PredicateArgument"] = set() record_meta["ft.onto.base_ontology.PredicateMention"] = set() + record_meta["ft.onto.base_ontology.PredicateLink"] = \ + {"arg_type"}