diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json
index 30f493b085..48f320b977 100644
--- a/deeppavlov/core/common/registry.json
+++ b/deeppavlov/core/common/registry.json
@@ -68,6 +68,7 @@
   "hybrid_ner_model": "deeppavlov.models.ner.NER_model:HybridNerModel",
   "imdb_reader": "deeppavlov.dataset_readers.imdb_reader:ImdbReader",
   "input_splitter": "deeppavlov.models.multitask_bert.multitask_bert:InputSplitter",
+  "intent_catcher_iterator": "deeppavlov.dataset_iterators.intent_catcher_iterator:IntentCatcherIterator",
   "insurance_reader": "deeppavlov.dataset_readers.insurance_reader:InsuranceReader",
   "jieba_tokenizer": "deeppavlov.models.tokenizers.jieba_tokenizer:JiebaTokenizer",
   "joint_tagger_parser": "deeppavlov.models.syntax_parser.joint:JointTaggerParser",
@@ -207,5 +208,10 @@
   "wiki_sqlite_vocab": "deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab",
   "wikitionary_100K_vocab": "deeppavlov.vocabs.typos:Wiki100KDictionary",
   "intent_catcher_reader": "deeppavlov.dataset_readers.intent_catcher_reader:IntentCatcherReader",
-  "intent_catcher": "deeppavlov.models.intent_catcher.intent_catcher:IntentCatcher"
-}
+  "intent_catcher": "deeppavlov.models.intent_catcher.intent_catcher:IntentCatcher",
+  "mem_classification_model": "deeppavlov.models.classifiers.memorizing_classifier:MemClassificationModel",
+  "md_yaml_dialogs_iterator": "deeppavlov.dataset_iterators.md_yaml_dialogs_iterator:MD_YAML_DialogsDatasetIterator",
+  "md_yaml_dialogs_ner_iterator": "deeppavlov.dataset_iterators.md_yaml_dialogs_ner_iterator:MD_YAML_DialogsDatasetNERIterator",
+  "md_yaml_dialogs_intents_iterator": "deeppavlov.dataset_iterators.md_yaml_dialogs_ner_iterator:MD_YAML_DialogsDatasetIntentsIterator",
+  "slotfill_raw_memorizing": "deeppavlov.models.slotfill.slotfill_raw:RASA_MemorizingSlotFillingComponent"
+}
\ No newline at end of file
diff --git a/deeppavlov/dataset_iterators/intent_catcher_iterator.py b/deeppavlov/dataset_iterators/intent_catcher_iterator.py
new file mode 100644
index 0000000000..9cbc5a2160
--- /dev/null
+++ b/deeppavlov/dataset_iterators/intent_catcher_iterator.py
@@ -0,0 +1,124 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+import re
+from logging import getLogger
+from typing import Tuple, List, Dict, Any, Iterator
+
+from xeger import Xeger
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
+from deeppavlov.dataset_readers.dto.rasa.nlu import Intents, IntentDesc
+
+log = getLogger(__name__)
+
+
+@register('intent_catcher_iterator')
+class IntentCatcherIterator(DataLearningIterator):
+    """
+    Iterates over data for Intent Catcher training.
+    A subclass of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`.
+
+    Args:
+        seed: random seed for data shuffling
+        shuffle: whether to shuffle data during batching
+        limit: maximum number of phrases generated from each input regexp
+
+    """
+
+    def __init__(self,
+                 data: Dict[str, List[Tuple[Any, Any]]],
+                 seed: int = None,
+                 shuffle: bool = True,
+                 limit: int = 10) -> None:
+        self.limit = limit
+        super().__init__(data, seed, shuffle)
+
+    def gen_batches(self,
+                    batch_size: int,
+                    data_type: str = 'train',
+                    shuffle: bool = None) -> Iterator[Tuple]:
+        """Generate batches of inputs and expected outputs to train
+        Intent Catcher
+
+        Args:
+            batch_size: number of samples in batch
+            data_type: can be either 'train', 'test', or 'valid'
+            shuffle: whether to shuffle dataset before batching
+
+        Yields:
+            a tuple of (regexp, generated sentence) pairs and the list of
+            the generated sentences' labels
+        """
+
+        if shuffle is None:
+            shuffle = self.shuffle
+
+        ic_file_content: Intents = self.data[data_type]["nlu_lines"]
+        sentences, labels = [], []
+        for intent in ic_file_content.intents:
+            for intent_line in intent.lines:
+                sentences.append(intent_line.text)
+                labels.append(intent.title)
+
+        assert len(sentences) == len(labels), \
+            "Number of labels is not equal to the number of sentences"
+
+        try:
+            # validate that every sentence is a correct regular expression
+            for sentence in sentences:
+                re.compile(sentence)
+        except re.error as e:
+            log.error("Some sentences are not valid regular expressions")
+            raise e
+
+        proto_entries_indices = list(range(len(sentences)))
+        if shuffle:
+            self.random.shuffle(proto_entries_indices)
+
+        if batch_size < 0:
+            batch_size = len(proto_entries_indices)
+
+        xeger = Xeger(self.limit)
+
+        regexps, generated_sentences, generated_labels = [], [], []
+        generated_cnt = 0
+        for proto_entry_ix in proto_entries_indices:
+            sent, lab = sentences[proto_entry_ix], labels[proto_entry_ix]
+            regex_ = re.compile(sent)
+
+            gx = {xeger.xeger(sent) for _ in range(self.limit)}
+            generated_sentences.extend(gx)
+            generated_labels.extend([lab for _ in range(len(gx))])
+            regexps.extend([regex_ for _ in range(len(gx))])
+
+            # one regexp can add several sentences at once, so the batch can
+            # jump over the threshold; >= makes sure it is still flushed
+            if len(generated_sentences) >= batch_size:
+                # tuple(zip) below does [r1, r2, ..], [s1, s2, ..] -> ((r1, s1), (r2, s2), ..)
+                yield tuple(zip(regexps, generated_sentences)), generated_labels
+                generated_cnt += len(generated_sentences)
+                regexps, generated_sentences, generated_labels = [], [], []
+
+        if generated_sentences:
+            yield tuple(zip(regexps, generated_sentences)), generated_labels
+            generated_cnt += len(generated_sentences)
+            regexps, generated_sentences, generated_labels = [], [], []
+
+        log.info(f"Original number of samples: {len(sentences)}"
+                 f", generated samples: {generated_cnt}")
+
+    def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]:
+        res = tuple(map(lambda it: tuple(itertools.chain(*it)),
+                        zip(*self.gen_batches(batch_size=-1,
+                                              data_type=data_type,
+                                              shuffle=False))))
+        return res
\ No newline at end of file
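Note: the iterator above leans on the `xeger` library to sample concrete
phrases from each intent regexp; each pattern is expanded at most `limit`
times and duplicate expansions collapse via the set comprehension. A minimal
sketch of that mechanism (toy pattern assumed; real patterns come from the
NLU data):

    from xeger import Xeger

    limit = 10
    xeger = Xeger(limit)
    # at most `limit` samples; the set removes duplicate expansions
    samples = {xeger.xeger(r"(hi|hello)( there)?") for _ in range(limit)}
    print(samples)  # e.g. {'hi', 'hello', 'hi there', 'hello there'}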
diff --git a/deeppavlov/dataset_iterators/md_yaml_dialogs_iterator.py b/deeppavlov/dataset_iterators/md_yaml_dialogs_iterator.py
new file mode 100644
index 0000000000..ffaddbcc4d
--- /dev/null
+++ b/deeppavlov/dataset_iterators/md_yaml_dialogs_iterator.py
@@ -0,0 +1,480 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+import json
+import random
+import re
+from logging import getLogger
+from typing import Dict, List, Tuple, Any, Iterator
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
+from deeppavlov.dataset_readers.dstc2_reader import DSTC2DatasetReader
+from deeppavlov.dataset_readers.dto.rasa.domain_knowledge import DomainKnowledge
+from deeppavlov.dataset_readers.dto.rasa.stories import Story, Turn, Stories
+from deeppavlov.dataset_readers.dto.rasa.nlu import Intents
+
+log = getLogger(__name__)
+
+
+class RASADict(dict):
+    def __add__(self, oth):
+        return RASADict()
+
+
+@register('md_yaml_dialogs_iterator')
+class MD_YAML_DialogsDatasetIterator(DataLearningIterator):
+    """
+    Iterates over RASA Markdown/YAML dialog data: stories, NLU examples and
+    the domain knowledge are combined into go-bot-formatted dialog turns
+    on the fly.
+    """
+
+    def __init__(self,
+                 data: Dict[str, List[Tuple[Any, Any]]],
+                 seed: int = None,
+                 shuffle: bool = True,
+                 limit: int = 10) -> None:
+        self.limit = limit
+        super().__init__(data, seed, shuffle)
+
+    def gen_batches(self,
+                    batch_size: int,
+                    data_type: str = 'train',
+                    shuffle: bool = None) -> Iterator[Tuple]:
+        if shuffle is None:
+            shuffle = self.shuffle
+
+        data = self.data[data_type]
+        domain_knowledge = data["domain"]
+        intents = data["nlu_lines"]
+        stories = data["story_lines"]
+
+        dialogs = False
+        ignore_slots = False
+
+        story_iterator = StoriesGenerator(stories,
+                                          intents,
+                                          domain_knowledge,
+                                          ignore_slots,
+                                          batch_size)
+
+        for batch in story_iterator.generate():
+            gobot_formatted_stories = DSTC2DatasetReader._read_from_batch(
+                list(itertools.chain(*[v + [{}] for v in batch.values()])),
+                dialogs=dialogs)
+            ds = []
+            prev_resp_act = None
+            for x, y in gobot_formatted_stories:
+                if x.get('episode_done'):
+                    del x['episode_done']
+                    prev_resp_act = None
+                    ds.append(([], []))
+                x['prev_resp_act'] = prev_resp_act
+                prev_resp_act = y['act']
+                ds[-1][0].append(x)
+                ds[-1][1].append(y)
+            yield zip(*ds)
+
+    def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]:
+        batches = self.gen_batches(batch_size=-1,
+                                   data_type=data_type,
+                                   shuffle=False)
+        res = tuple(e for el in batches
+                    for e in el)
+        return res
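A minimal consumption sketch for the iterator above (the `data` dict is
hypothetical here, assumed to come from MD_YAML_DialogsDatasetReader.read()):

    iterator = MD_YAML_DialogsDatasetIterator(data, seed=42, shuffle=False)
    for xs, ys in iterator.gen_batches(batch_size=2, data_type='train'):
        # xs: per-dialog lists of user turns, each with 'prev_resp_act' filled in
        # ys: per-dialog lists of system turns, each carrying an 'act' label
        print(len(xs), len(ys))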
+
+
+class TurnIterator:
+    _USER_SPEAKER_ID = 1
+    _SYSTEM_SPEAKER_ID = 2
+
+    def __init__(self, turn: Turn, nlu: Intents,
+                 domain_knowledge: DomainKnowledge, ignore_slots: bool = False):
+        self.turn = turn
+        self.intents: Intents = nlu
+        self.domain_knowledge = domain_knowledge
+        self.ignore_slots = ignore_slots
+
+    def _clarify_slots_values(self, slots_dstc2formatted):
+        slots_key = []
+        for slot_name, slot_value in slots_dstc2formatted:
+            slot_actual_value = self.intents.slot_name2text2value.get(
+                slot_name, {}).get(slot_value, slot_value)
+            slots_key.append((slot_name, slot_actual_value))
+        slots_key = tuple(sorted(slots_key))
+        return slots_key
+
+    def parse_user_intent(self):
+        """
+        Given the turn description in RASA stories.md format, return the name
+        of the intent and the slots described on this line
+        Returns:
+            the pair of the intent name and the slots info
+            ([[slot name, slot value], ..])
+        """
+        intent = self.turn.turn_description.strip('*').strip()
+        if '{' not in intent:
+            intent = intent + "{}"  # the prototypical intent is "intent_name{slot1: value1, slotN: valueN}"
+        user_action, slots_info = intent.split('{', 1)
+        slots_info = json.loads('{' + slots_info)
+        slots_dstc2formatted = [[slot_name, slot_value] for
+                                slot_name, slot_value in slots_info.items()]
+        if self.ignore_slots:
+            slots_dstc2formatted = dict()
+        return user_action, slots_dstc2formatted
+
+    def choose_slots_for_whom_exists_text(self, slots_actual_values,
+                                          user_action):
+        """
+        Args:
+            slots_actual_values: the slot values information to look utterance for
+            user_action: the intent to look utterance for
+        Returns:
+            the slots omitted to find an NLU candidate, the slots represented
+            in the candidate, the intent name used
+        """
+        possible_keys = [k for k in self.intents.intent2slots2text.keys() if
+                         user_action in k]
+        possible_keys = possible_keys + [user_action]
+        possible_keys = sorted(possible_keys,
+                               key=lambda action_s: action_s.count('+'))
+        for possible_action_key in possible_keys:
+            if self.intents.intent2slots2text[possible_action_key].get(
+                    slots_actual_values):
+                slots_used_values = slots_actual_values
+                slots_to_exclude = []
+                return slots_to_exclude, slots_used_values, possible_action_key
+            else:
+                slots_lazy_key = set(e[0] for e in slots_actual_values)
+                slots_lazy_key -= {"intent"}
+                fake_keys = []
+                for known_key in self.intents.intent2slots2text[
+                        possible_action_key].keys():
+                    if slots_lazy_key.issubset(set(e[0] for e in known_key)):
+                        fake_keys.append(known_key)
+                        break
+
+                if fake_keys:
+                    slots_used_values = sorted(fake_keys, key=lambda elem: (
+                        len(set(slots_actual_values) ^ set(elem)),
+                        len([e for e in elem
+                             if e[0] not in slots_lazy_key]))
+                                               )[0]
+
+                    slots_to_exclude = [e[0] for e in slots_used_values if
+                                        e[0] not in slots_lazy_key]
+                    return slots_to_exclude, slots_used_values, possible_action_key
+
+        raise KeyError("no possible NLU candidates found")
+
+    def user_action2text(self, user_action: str, slots_li=None):
+        """
+        Given the user intent, return the text representing this intent with
+        the passed slots
+        Args:
+            user_action: the name of intent to generate text for
+            slots_li: the slot values to provide
+        Returns:
+            the text of utterance relevant to the passed intent and slots
+        """
+        if slots_li is None:
+            slots_li = tuple()
+        return self.intents.intent2slots2text[user_action][slots_li]
+
+    def process_user_turn(self):
+        user_action, slots_dstc2formatted = self.parse_user_intent()
+        slots_actual_values = self._clarify_slots_values(slots_dstc2formatted)
+        slots_to_exclude, slots_used_values, action_for_text = \
+            self.choose_slots_for_whom_exists_text(slots_actual_values,
+                                                   user_action)
+        possible_user_response_infos = self.user_action2text(action_for_text,
+                                                             slots_used_values)
+        random.shuffle(possible_user_response_infos)
+        for user_response_info in possible_user_response_infos[:2]:
+            user_utter = {"speaker": self._USER_SPEAKER_ID,
+                          "text": user_response_info["text"],
+                          "dialog_acts": [{"act": user_action,
+                                           "slots": user_response_info[
+                                               "slots"]}],
+                          "slots to exclude": slots_to_exclude}
+            yield user_utter
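The `system_action2text` method below rewrites RASA-style `name={slot}`
placeholders in response templates into `#slot` tokens; a quick illustration
of that substitution (toy template assumed):

    import re

    template = "the restaurant is in the price={price} range"
    print(re.sub(r"(\w+)\=\{(.*?)\}", r"#\2", template))
    # -> "the restaurant is in the #price range"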
+
+    def system_action2text(self, system_action):
+        """
+        Given the system action name, return the relevant template text
+        Args:
+            system_action: the name of the action to get the template for
+        Returns:
+            template relevant to the passed action
+        """
+        possible_system_responses = self.domain_knowledge.response_templates.get(
+            system_action,
+            [{"text": system_action}])
+
+        response_text = possible_system_responses[0]["text"]
+        response_text = re.sub(r"(\w+)\=\{(.*?)\}", r"#\2",
+                               response_text)  # TODO: straightforward regex string
+
+        return response_text
+
+    def parse_system_turn(self):
+        """
+        Given the RASA stories.md system turn, returns the dstc2-formatted
+        json (dict) for this turn
+        Returns:
+            the dstc2-formatted passed turn
+        """
+        # system actions are started in dataset with -
+        system_action_name = self.turn.turn_description.strip('-').strip()
+        curr_action_text = self.system_action2text(system_action_name)
+        system_action = {"speaker": self._SYSTEM_SPEAKER_ID,
+                         "text": curr_action_text,
+                         "dialog_acts": [
+                             {"act": system_action_name, "slots": []}]}
+        if system_action_name.startswith("action"):
+            system_action["db_result"] = {}
+        return system_action
+
+    def process_system_utter(self):
+        """
+        Yields: all the possible dstc2 versions of the passed story line
+        TODO: SUPPORT FORMS
+        """
+        system_action = self.parse_system_turn()
+        yield system_action
+
+    def __call__(self):
+        if self.turn.is_user_turn():
+            for possible_turn in self.process_user_turn():
+                yield possible_turn
+        elif self.turn.is_system_turn():
+            for possible_turn in self.process_system_utter():
+                yield possible_turn
+
+
+def iterProduct(ic):
+    # https://stackoverflow.com/a/12094245
+    if not ic:
+        yield []
+        return
+
+    for i in ic[0]():
+        for js in iterProduct(ic[1:]):
+            yield [i] + js
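`iterProduct` walks the cartesian product of per-turn alternatives lazily,
calling each element to obtain a fresh iterator; a tiny self-contained
illustration with two fake turn generators:

    first = lambda: iter(["hi", "hello"])
    second = lambda: iter(["bye"])
    print(list(iterProduct([first, second])))
    # -> [['hi', 'bye'], ['hello', 'bye']]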
+
+
+class StoryGenerator:
+    def __init__(self, story: Story, nlu: Intents,
+                 domain_knowledge: DomainKnowledge, ignore_slots=False):
+        self.story: Story = story
+        self.turn_iterators = []
+        for turn in story.turns:
+            turn_iterator = TurnIterator(turn, nlu, domain_knowledge,
+                                         ignore_slots)
+            self.turn_iterators.append(turn_iterator)
+
+    def gen_story_sample(self):
+        yield from iterProduct(self.turn_iterators)
+
+
+class StoriesGenerator:
+    def __init__(self, stories: Stories, intents: Intents,
+                 domain_knowledge: DomainKnowledge, ignore_slots: bool = False,
+                 batch_size=1):
+        self.stories = stories
+        self.intents = intents
+        self.domain_knowledge = domain_knowledge
+        self.ignore_slots = ignore_slots
+        self.batch_size = batch_size
+
+    def generate(self):
+        batch = dict()
+        for story in self.stories.stories:
+            story_generator = StoryGenerator(story, self.intents,
+                                             self.domain_knowledge,
+                                             self.ignore_slots)
+            for sample_ix, story_data in enumerate(
+                    story_generator.gen_story_sample()):
+                # key by story title and sample index so that variants
+                # of the same story do not overwrite each other
+                batch[f"{story.title}_{sample_ix}"] = story_data
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = dict()
+        yield batch
diff --git a/deeppavlov/dataset_iterators/md_yaml_dialogs_ner_iterator.py b/deeppavlov/dataset_iterators/md_yaml_dialogs_ner_iterator.py
new file mode 100644
index 0000000000..07aa08c1f6
--- /dev/null
+++ b/deeppavlov/dataset_iterators/md_yaml_dialogs_ner_iterator.py
@@ -0,0 +1,162 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from logging import getLogger
+from typing import Dict, List, Tuple, Any, Iterator
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.dataset_iterators.md_yaml_dialogs_iterator import \
+    MD_YAML_DialogsDatasetIterator
+
+log = getLogger(__name__)
+
+
+@register('md_yaml_dialogs_ner_iterator')
+class MD_YAML_DialogsDatasetNERIterator(MD_YAML_DialogsDatasetIterator):
+
+    def __init__(self,
+                 data: Dict[str, List[Tuple[Any, Any]]],
+                 seed: int = None,
+                 shuffle: bool = True,
+                 limit: int = 10) -> None:
+        super().__init__(data, seed, shuffle, limit)
+
+    def gen_batches(self,
+                    batch_size: int,
+                    data_type: str = 'train',
+                    shuffle: bool = None) -> Iterator[Tuple]:
+
+        for batch in super().gen_batches(batch_size,
+                                         data_type,
+                                         shuffle):
+            processed_data = list()
+            processed_texts = dict()
+
+            for xs, ys in zip(*batch):
+                for x, y in zip(xs, ys):
+                    text = x['text']
+                    if not text.strip():
+                        continue
+                    intents = []
+                    if 'intents' in x:
+                        intents = x['intents']
+                    elif 'slots' in x:
+                        intents = [x]
+                    # aggregate slots from different intents
+                    slots = list()
+                    for intent in intents:
+                        current_slots = intent.get('slots', [])
+                        for slot_type, slot_val in current_slots:
+                            slots.append((slot_type, slot_val,))
+                    # remove duplicate pairs (text, slots)
+                    if (text in processed_texts) and (
+                            slots in processed_texts[text]):
+                        continue
+                    processed_texts[text] = processed_texts.get(text, []) + [
+                        slots]
+                    processed_data.append(self._add_bio_markup(text, slots))
+            yield processed_data
+
+    def _add_bio_markup(self,
+                        utterance: str,
+                        slots: List[Tuple[str, str]]) -> Tuple[List, List]:
+        tokens = utterance.split()
+        n_toks = len(tokens)
+        tags = ['O' for _ in range(n_toks)]
+        for n in range(n_toks):
+            for slot_type, slot_val in slots:
+                for entity in [slot_val]:
+                    slot_tokens = entity.split()
+                    slot_len = len(slot_tokens)
+                    if n + slot_len <= n_toks and \
+                            self._is_equal_sequences(tokens[n: n + slot_len],
                                                     slot_tokens):
+                        tags[n] = 'B-' + slot_type
+                        for k in range(1, slot_len):
+                            tags[n + k] = 'I-' + slot_type
+                        break
+        return tokens, tags
+
+    @staticmethod
+    def _is_equal_sequences(seq1, seq2):
+        equality_list = [tok1 == tok2 for tok1, tok2 in zip(seq1, seq2)]
+        return all(equality_list)
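`_add_bio_markup` produces standard BIO tags by scanning the utterance for
each slot value; a worked illustration (`iterator` is a hypothetical instance
of the class above):

    tokens, tags = iterator._add_bio_markup("book a table in new york",
                                            [("city", "new york")])
    print(list(zip(tokens, tags)))
    # -> [('book', 'O'), ('a', 'O'), ('table', 'O'), ('in', 'O'),
    #     ('new', 'B-city'), ('york', 'I-city')]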
+
+
+@register("md_yaml_dialogs_intents_iterator")
+class MD_YAML_DialogsDatasetIntentsIterator(MD_YAML_DialogsDatasetIterator):
+
+    def __init__(self,
+                 data: Dict[str, List[Tuple[Any, Any]]],
+                 seed: int = None,
+                 shuffle: bool = True,
+                 limit: int = 10) -> None:
+        super().__init__(data, seed, shuffle, limit)
+
+    def gen_batches(self,
+                    batch_size: int,
+                    data_type: str = 'train',
+                    shuffle: bool = None) -> Iterator[Tuple]:
+
+        for batch in super().gen_batches(batch_size,
+                                         data_type,
+                                         shuffle):
+            texts, intents = list(), list()
+            for users, syss in zip(*batch):
+                for user, sys in zip(users, syss):
+                    reply = user
+                    curr_intents = []
+                    if reply.get('intents'):
+                        for intent in reply['intents']:
+                            for slot in intent['slots']:
+                                if slot[0] == 'slot':
+                                    curr_intents.append(
+                                        intent['act'] + '_' + slot[1])
+                                else:
+                                    curr_intents.append(
+                                        intent['act'] + '_' + slot[0])
+                            if len(intent['slots']) == 0:
+                                curr_intents.append(intent['act'])
+                    else:
+                        if reply['text']:
+                            curr_intents.append('unknown')
+                        else:
+                            continue
+                    texts.append(reply["text"])
+                    intents.append(curr_intents)
+            yield texts, intents
\ No newline at end of file
diff --git a/deeppavlov/dataset_readers/dstc2_reader.py b/deeppavlov/dataset_readers/dstc2_reader.py
index 55127f297a..199b5cb20d 100644
--- a/deeppavlov/dataset_readers/dstc2_reader.py
+++ b/deeppavlov/dataset_readers/dstc2_reader.py
@@ -120,6 +120,20 @@ def _read_from_file(cls, file_path, dialogs=False):
             return [data[idx['start']:idx['end']] for idx in dialog_indices]
         return data
 
+    @classmethod
+    def _read_from_batch(cls, batch, dialogs=False):
+        """Returns data from a single batch"""
+        log.debug(f"[loading dialogs from batch of len {len(batch)}]")
+
+        utterances, responses, dialog_indices = \
+            cls._get_turns(batch, with_indices=True)
+
+        data = list(map(cls._format_turn, zip(utterances, responses)))
+
+        if dialogs:
+            return [data[idx['start']:idx['end']] for idx in dialog_indices]
+        return data
+
     @staticmethod
     def _format_turn(turn):
         turn_x, turn_y = turn
diff --git a/deeppavlov/dataset_readers/dto/__init__.py b/deeppavlov/dataset_readers/dto/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deeppavlov/dataset_readers/dto/rasa/__init__.py b/deeppavlov/dataset_readers/dto/rasa/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/deeppavlov/dataset_readers/dto/rasa/domain_knowledge.py b/deeppavlov/dataset_readers/dto/rasa/domain_knowledge.py
new file mode 100644
index 0000000000..38ca9fd76f
--- /dev/null
+++ b/deeppavlov/dataset_readers/dto/rasa/domain_knowledge.py
@@ -0,0 +1,28 @@
+from pathlib import Path
+from typing import Dict, List, Union
+
+from deeppavlov.core.common.file import read_yaml
+
+
+class DomainKnowledge:
+    """The DTO-like class to store the domain knowledge from the domain yaml config."""
+
+    def __init__(self, domain_knowledge_di: Dict):
+        self.known_entities: List = domain_knowledge_di.get("entities", [])
+        self.known_intents: List = domain_knowledge_di.get("intents", [])
+        self.known_actions: List = domain_knowledge_di.get("actions", [])
+        self.known_slots: Dict = domain_knowledge_di.get("slots", {})
+        self.response_templates: Dict = domain_knowledge_di.get("responses", {})
+        self.session_config: Dict = domain_knowledge_di.get("session_config", {})
+        self.forms: Dict = domain_knowledge_di.get("forms", {})
+
+    @classmethod
+    def from_yaml(cls, domain_yml_fpath: Union[str, Path] = "domain.yml"):
+        """
+        Parses the domain.yml config file into a DomainKnowledge object
+        Args:
+            domain_yml_fpath: path to the domain config file, defaults to domain.yml
+        Returns:
+            the loaded DomainKnowledge object
+        """
+        return cls(read_yaml(domain_yml_fpath))
\ No newline at end of file
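A minimal usage sketch for the DTO above (hypothetical path):

    domain = DomainKnowledge.from_yaml("my_bot/domain.yml")
    print(domain.known_intents)       # e.g. ['greet', 'inform']
    print(domain.response_templates)  # action name -> response templates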
diff --git a/deeppavlov/dataset_readers/dto/rasa/nlu.py b/deeppavlov/dataset_readers/dto/rasa/nlu.py
new file mode 100644
index 0000000000..ad43c4ca8c
--- /dev/null
+++ b/deeppavlov/dataset_readers/dto/rasa/nlu.py
@@ -0,0 +1,180 @@
+import re
+from collections import defaultdict
+from typing import List, Tuple, Dict
+
+from deeppavlov.core.common.file import read_yaml
+
+SLOTS_MARKUP_PATTERN = r"\[" + \
+                       r"(?P<slot_value>.*?)" + \
+                       r"\]" + \
+                       r"\(" + \
+                       r"(?P<slot_name>.*?)" + \
+                       r"\)"
+
+
+class IntentLine:
+    def __init__(self, text, cleaned_text_slots: List[Tuple] = None):
+        if cleaned_text_slots is None:
+            cleaned_text_slots = list()
+        self.text = text
+        self.slots_key = tuple(sorted((slot[0], slot[1])
+                                      for slot in cleaned_text_slots))
+        self.slots_di = cleaned_text_slots
+        self.slot_name2text2value = None
+
+    @classmethod
+    def from_line(cls, line, ignore_slots=False):
+        line = line.strip()
+        if line.startswith('-'):
+            intent_text_w_markup = line.lstrip('-').strip()
+        else:
+            intent_text_w_markup = line
+        line_slots_found = re.finditer(SLOTS_MARKUP_PATTERN,
+                                       intent_text_w_markup)
+        if ignore_slots:
+            line_slots_found = []
+
+        curr_char_ix = 0
+        intent_text_without_markup = ''
+        cleaned_text_slots = []  # intent text can contain slots highlighted
+
+        slot_name2text2value = defaultdict(lambda: defaultdict(list))
+
+        for line_slot in line_slots_found:
+            line_slot_l_span, line_slot_r_span = line_slot.span()
+            # intent w.o. markup for "some [entity](entity_example) text" is "some entity text"
+            # so we should remove brackets and the parentheses content
+            intent_text_without_markup += intent_text_w_markup[
+                curr_char_ix:line_slot_l_span]
+
+            slot_value_text = str(line_slot["slot_value"])
+            slot_name = line_slot["slot_name"]
+            slot_value = slot_value_text
+            if ':' in slot_name:
+                # e.g. [moderately](price:moderate)
+                slot_name, slot_value = slot_name.split(':', 1)
+
+            slot_value_new_l_span = len(
+                intent_text_without_markup)  # l span in cleaned text
+            slot_value_new_r_span = slot_value_new_l_span + len(
+                slot_value_text)  # r span in cleaned text
+            intent_text_without_markup += slot_value_text
+
+            cleaned_text_slots.append((slot_name, slot_value))
+
+            slot_name2text2value[slot_name][slot_value_text].append(slot_value)
+
+            curr_char_ix = line_slot_r_span
+        intent_text_without_markup += intent_text_w_markup[
+            curr_char_ix: len(intent_text_w_markup)]
+
+        intent_l = cls(intent_text_without_markup, cleaned_text_slots)
+        intent_l.slot_name2text2value = slot_name2text2value
+
+        return intent_l
+
+
+class IntentDesc:
+    def __init__(self, title):
+        self.title = title
+        self.lines: List[IntentLine] = list()
+
+    def add_line(self, intent_line: IntentLine):
+        self.lines.append(intent_line)
+
+
+class Intents:
+    def __init__(self):
+        self.intents: List[IntentDesc] = list()
+        self.lines = None
+        self._slot_name2text2value = None
+        self._intent2slot2text = None
+
+    @property
+    def slot_name2text2value(self) -> Dict:
+        if self._slot_name2text2value is not None:
+            return self._slot_name2text2value
+        sn2t2v = dict()
+        for intent in self.intents:
+            for intent_l in intent.lines:
+                for slot_name, slot_text2value in intent_l.slot_name2text2value.items():
+                    if slot_name not in sn2t2v.keys():
+                        sn2t2v[slot_name] = dict()
+                    for slot_text, slot_values_li in slot_text2value.items():
+                        if slot_text not in sn2t2v[slot_name].keys():
+                            sn2t2v[slot_name][slot_text] = list()
+                        sn2t2v[slot_name][slot_text].extend(slot_values_li)
+        self._slot_name2text2value = sn2t2v
+        return sn2t2v
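A quick illustration of the markup parsing above (toy NLU line assumed):

    line = "i want some [moderately](price:moderate) priced food"
    m = re.search(SLOTS_MARKUP_PATTERN, line)
    print(m["slot_value"], "|", m["slot_name"])
    # -> moderately | price:moderate
    # IntentLine.from_line() splits "price:moderate" on ':' into the slot
    # name "price" and the value "moderate", and strips the markup:
    print(IntentLine.from_line(line).text)
    # -> i want some moderately priced food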
+
+    @property
+    def intent2slots2text(self) -> Dict:
+        if self._intent2slot2text is not None:
+            return self._intent2slot2text
+
+        intent2slots2text = dict()
+        for intent in self.intents:
+            slots2text = defaultdict(list)
+            intent_title = intent.title
+            for intent_l in intent.lines:
+                slots2text[intent_l.slots_key].append({"text": intent_l.text,
+                                                       "slots_di": intent_l.slots_di,
+                                                       "slots": intent_l.slots_key})
+            intent2slots2text[intent_title] = slots2text
+        self._intent2slot2text = intent2slots2text
+        return intent2slots2text
+
+    @classmethod
+    def from_nlu_md(cls, lines):
+        intents = cls()
+        ignore_slots = False
+        for line in lines:
+            if line.startswith("##"):
+                # lines starting with ## start a section describing a new intent type
+                curr_intent_name = line.strip("##").strip().split("intent:", 1)[-1]
+                curr_intent = IntentDesc(curr_intent_name)
+                intents.intents.append(curr_intent)
+            if line.strip().startswith('-'):
+                # lines starting with - list the example texts of the current intent type
+                intent_l = IntentLine.from_line(line, ignore_slots)
+                # noinspection PyUnboundLocalVariable
+                curr_intent.add_line(intent_l)
+        return intents
+
+    @classmethod
+    def from_file(cls, fpath):
+        fmt = str(fpath).split('.')[-1]
+        if fmt in ("yml", "yaml"):
+            ic_file_content = read_yaml(fpath)
+            with open(fpath, encoding="utf-8") as f:
+                dp_version_present = r'# dp_version: "2.0"' in f.read()
+            intents = cls()
+            for part in ic_file_content['nlu']:
+                if "intent" in part:
+                    intent_title = part['intent']
+                    curr_intent = IntentDesc(intent_title)
+                    for example in part.get('examples', '').split("\n"):
+                        example = example.strip().lstrip("*-_").strip()
+                        intent_line = IntentLine.from_line(example)
+                        curr_intent.add_line(intent_line)
+                elif 'regex' in part:
+                    intent_title = part['regex']
+                    curr_intent = IntentDesc(intent_title)
+                    for example in part.get('examples', '').split("\n"):
+                        intent_line = IntentLine(example[2:])
+                        curr_intent.add_line(intent_line)
+                else:
+                    continue
+
+                if dp_version_present:
+                    for example in part.get('regex_examples', '').split("\n"):
+                        intent_line = IntentLine(example[2:])
+                        curr_intent.add_line(intent_line)
+                intents.intents.append(curr_intent)
+        elif fmt in ("md", "markdown"):
+            with open(fpath, encoding="utf-8") as f:
+                nlu_lines = f.readlines()
+            intents = cls.from_nlu_md(nlu_lines)
+        return intents
diff --git a/deeppavlov/dataset_readers/dto/rasa/stories.py b/deeppavlov/dataset_readers/dto/rasa/stories.py
new file mode 100644
index 0000000000..3fb572581e
--- /dev/null
+++ b/deeppavlov/dataset_readers/dto/rasa/stories.py
@@ -0,0 +1,89 @@
+from typing import List
+
+from ruamel.yaml import YAML
+
+USER = "usr"
+SYSTEM = "sys"
+
+
+class Turn:
+    def __init__(self, turn_description: str, whose_turn: str):
+        self.turn_description = turn_description
+        self.whose_turn = whose_turn
+
+    def is_user_turn(self):
+        return self.whose_turn == USER
+
+    def is_system_turn(self):
+        return self.whose_turn == SYSTEM
+
+
+class Story:
+    def __init__(self, title, turns: List[Turn] = None):
+        self.title = title
+        if turns is None:
+            turns = list()
+        self.turns = turns.copy()
+
+
+class Stories:
+    def __init__(self):
+        self.stories: List[Story] = list()
+        self.lines = None
+
+    @classmethod
+    def from_stories_lines_md(cls, lines: List[str], fmt="md"):
+        if fmt != "md":
+            raise Exception(f"Support of fmt {fmt} is not implemented")
+
+        stories = cls()
+        lines = [line.strip() for line in lines if line.strip()]
+        stories.lines = lines.copy()
+        for line in lines:
+            if line.startswith('#'):
+                # '#' marks the beginning of a new story
+                curr_story_title = line.strip('#')
+                curr_story = Story(curr_story_title)
+                stories.stories.append(curr_story)
+            elif line.startswith('*'):
+                line_content = line.lstrip('*').strip()
+                # noinspection PyUnboundLocalVariable
+                curr_story.turns.append(Turn(line_content, USER))
+            elif line.startswith('-'):
+                line_content = line.strip('-').strip()
+                # noinspection PyUnboundLocalVariable
+                curr_story.turns.append(Turn(line_content, SYSTEM))
+        return stories
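A toy illustration of the Markdown story parsing above:

    lines = [
        "# greet and bye",  # '#' opens a new story
        "* greet",          # '*' is a user turn
        "- utter_greet",    # '-' is a system turn
    ]
    stories = Stories.from_stories_lines_md(lines)
    story = stories.stories[0]
    print([(t.whose_turn, t.turn_description) for t in story.turns])
    # -> [('usr', 'greet'), ('sys', 'utter_greet')]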
+
+    @classmethod
+    def from_stories_lines_yml(cls, lines: List[str], fmt="yml"):
+        lines_text = '\n'.join(lines)
+        stories_yml = YAML().load(lines_text)
+        stories_lines = []
+        for story in stories_yml.get("stories", []):
+            story_title = story.get("story", 'todo')
+            stories_lines.append(f"# {story_title}")
+            for step in story.get("steps", []):
+                is_usr_step = "intent" in step.keys()
+                is_sys_step = "action" in step.keys()
+                if is_usr_step:
+                    curr_story_line = step["intent"]
+                    stories_lines.append(f"* {curr_story_line}")
+                if is_sys_step:
+                    curr_story_line = step["action"]
+                    stories_lines.append(f"- {curr_story_line}")
+        return cls.from_stories_lines_md(stories_lines)
+
+    @classmethod
+    def from_stories_lines(cls, lines: List[str]):
+        try:
+            lines_text = '\n'.join(lines)
+            YAML().load(lines_text)
+            is_yaml = True
+            is_md = False
+        except Exception:
+            is_yaml = False
+            is_md = True
+
+        if is_yaml:
+            return cls.from_stories_lines_yml(lines)
+        if is_md:
+            return cls.from_stories_lines_md(lines)
\ No newline at end of file
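A minimal sketch of the YAML path (RASA 2.x story schema assumed):

    yml_lines = [
        "stories:",
        "- story: greet user",
        "  steps:",
        "  - intent: greet",
        "  - action: utter_greet",
    ]
    stories = Stories.from_stories_lines_yml(yml_lines)
    print(stories.stories[0].title.strip())  # -> greet user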
") + + train_file = kwargs.get('train', f'train.{fmt}') if not Path(data_path, train_file).exists(): raise Exception( @@ -39,16 +56,23 @@ def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str "test": []} for data_type in data_types: - file_name = kwargs.get(data_type, '{}.{}'.format(data_type, "json")) + file_name = kwargs.get(data_type, '{}.{}'.format(data_type, fmt)) if file_name is None: continue file = Path(data_path).joinpath(file_name) if file.exists(): - with open(file) as fp: - file = load(fp) - for label in file: - data[data_type].extend([(phrase, label) for phrase in file[label]]) + ic_file_content = None + if format == 'json': + ic_file_content = read_json(file) + raise Exception("json is not supported anymore." + " Use RASA reader and YAML instead") + + elif format == 'yaml': + raise Exception("Use RASA reader instead") + + # noinspection PyUnboundLocalVariable + data[data_type] = ic_file_content else: log.warning("Cannot find {} file".format(file)) diff --git a/deeppavlov/dataset_readers/md_yaml_dialogs_reader.py b/deeppavlov/dataset_readers/md_yaml_dialogs_reader.py index 29a1b3f699..1ee3cd304a 100644 --- a/deeppavlov/dataset_readers/md_yaml_dialogs_reader.py +++ b/deeppavlov/dataset_readers/md_yaml_dialogs_reader.py @@ -11,53 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - -import json -import os -import re -import tempfile -from collections import defaultdict from logging import getLogger from overrides import overrides from pathlib import Path -from typing import Dict, List, Tuple, Union, Any, Optional +from typing import Dict -from deeppavlov.core.common.file import read_yaml from deeppavlov.core.common.registry import register from deeppavlov.core.data.dataset_reader import DatasetReader -from deeppavlov.dataset_readers.dstc2_reader import DSTC2DatasetReader - - -SLOT2VALUE_PAIRS_TUPLE = Tuple[Tuple[str, Any], ...] 
+from deeppavlov.dataset_readers.dto.rasa.domain_knowledge import DomainKnowledge
+from deeppavlov.dataset_readers.dto.rasa.nlu import Intents
+from deeppavlov.dataset_readers.dto.rasa.stories import Stories
 
 log = getLogger(__name__)
 
 
-class DomainKnowledge:
-    """the DTO-like class to store the domain knowledge from the domain yaml config."""
-
-    def __init__(self, domain_knowledge_di: Dict):
-        self.known_entities: List = domain_knowledge_di.get("entities", [])
-        self.known_intents: List = domain_knowledge_di.get("intents", [])
-        self.known_actions: List = domain_knowledge_di.get("actions", [])
-        self.known_slots: Dict = domain_knowledge_di.get("slots", {})
-        self.response_templates: Dict = domain_knowledge_di.get("responses", {})
-        self.session_config: Dict = domain_knowledge_di.get("session_config", {})
-        self.forms: Dict = domain_knowledge_di.get("forms", {})
-
-    @classmethod
-    def from_yaml(cls, domain_yml_fpath: Union[str, Path] = "domain.yml"):
-        """
-        Parses domain.yml domain config file into the DomainKnowledge object
-        Args:
-            domain_yml_fpath: path to the domain config file, defaults to domain.yml
-        Returns:
-            the loaded DomainKnowledge obect
-        """
-        return cls(read_yaml(domain_yml_fpath))
-
-
+class RASADict(dict):
+    def __add__(self, oth):
+        return RASADict()
 
 @register('md_yaml_dialogs_reader')
 class MD_YAML_DialogsDatasetReader(DatasetReader):
@@ -84,19 +54,16 @@ class MD_YAML_DialogsDatasetReader(DatasetReader):
     DOMAIN_FNAME = "domain.yml"
 
     @classmethod
-    def _data_fname(cls, datatype: str) -> str:
+    def _data_fname(cls, datatype: str, fmt: str = "md") -> str:
         assert datatype in cls.VALID_DATATYPES, f"wrong datatype name: {datatype}"
-        return f"stories-{datatype}.md"
+        return f"stories-{datatype}.{fmt}"
 
     @classmethod
     @overrides
-    def read(cls, data_path: str, dialogs: bool = False, ignore_slots: bool = False) -> Dict[str, List]:
+    def read(cls, data_path: str, fmt: str = "md") -> Dict[str, Dict]:
         """
         Parameters:
             data_path: path to read dataset from
-            dialogs: flag which indicates whether to output list of turns or
-                list of dialogs
-            ignore_slots: whether to ignore slots information provided in stories.md or not
 
         Returns:
             dictionary that contains
             ``'train'`` field with dialogs from ``'stories-trn.md'``,
             ``'valid'`` field with dialogs from ``'stories-val.md'`` and
             ``'test'`` field with dialogs from ``'stories-tst.md'``.
             Each field is a list of tuples ``(x_i, y_i)``.
""" domain_fname = cls.DOMAIN_FNAME - nlu_fname = cls.NLU_FNAME - stories_fnames = tuple(cls._data_fname(dt) for dt in cls.VALID_DATATYPES) + nlu_fname = cls.NLU_FNAME if fmt in ("md", "markdown") else cls.NLU_FNAME.replace('.md', f'.{fmt}') + stories_fnames = tuple(cls._data_fname(dt, fmt) for dt in cls.VALID_DATATYPES) required_fnames = stories_fnames + (nlu_fname, domain_fname) for required_fname in required_fnames: required_path = Path(data_path, required_fname) @@ -117,547 +84,23 @@ def read(cls, data_path: str, dialogs: bool = False, ignore_slots: bool = False) domain_path = Path(data_path, domain_fname) domain_knowledge = DomainKnowledge.from_yaml(domain_path) - intent2slots2text, slot_name2text2value = cls._read_intent2text_mapping(Path(data_path, nlu_fname), - domain_knowledge, ignore_slots) + nlu_fpath = Path(data_path, nlu_fname) + intents = Intents.from_file(nlu_fpath) short2long_subsample_name = {"trn": "train", "val": "valid", "tst": "test"} - data = {short2long_subsample_name[subsample_name_short]: - cls._read_story(Path(data_path, cls._data_fname(subsample_name_short)), - dialogs, domain_knowledge, intent2slots2text, slot_name2text2value, - ignore_slots=ignore_slots) - for subsample_name_short in cls.VALID_DATATYPES} - - return data - - @classmethod - def _read_intent2text_mapping(cls, nlu_fpath: Path, domain_knowledge: DomainKnowledge, ignore_slots: bool = False) \ - -> Tuple[Dict[str, Dict[SLOT2VALUE_PAIRS_TUPLE, List]], - Dict[str, Dict[str, str]]]: - - slots_markup_pattern = r"\[" + \ - r"(?P.*?)" + \ - r"\]" + \ - r"\(" + \ - r"(?P.*?)" + \ - r"\)" - - intent2slots2text = defaultdict(lambda: defaultdict(list)) - slot_name2text2value = defaultdict(lambda: defaultdict(list)) - - curr_intent_name = None - - with open(nlu_fpath) as nlu_f: - for line in nlu_f: - if line.startswith("##"): - # lines starting with ## are starting section describing new intent type - curr_intent_name = line.strip("##").strip().split("intent:", 1)[-1] - - if line.strip().startswith('-'): - # lines starting with - are listing the examples of intent texts of the current intent type - intent_text_w_markup = line.strip().strip('-').strip() - line_slots_found = re.finditer(slots_markup_pattern, intent_text_w_markup) - if ignore_slots: - line_slots_found = [] - - curr_char_ix = 0 - intent_text_without_markup = '' - cleaned_text_slots = [] # intent text can contain slots highlighted - for line_slot in line_slots_found: - line_slot_l_span, line_slot_r_span = line_slot.span() - # intent w.o. markup for "some [entity](entity_example) text" is "some entity text" - # so we should remove brackets and the parentheses content - intent_text_without_markup += intent_text_w_markup[curr_char_ix:line_slot_l_span] - - slot_value_text = str(line_slot["slot_value"]) - slot_name = line_slot["slot_name"] - slot_value = slot_value_text - if ':' in slot_name: - slot_name, slot_value = slot_name.split(':', 1) # e.g. [moderately](price:moderate) - - assert slot_name in domain_knowledge.known_slots, f"{slot_name} from {nlu_fpath}" + \ - " was not listed as slot " + \ - "in domain knowledge config" - - slot_value_new_l_span = len(intent_text_without_markup) # l span in cleaned text - slot_value_new_r_span = slot_value_new_l_span + len(slot_value_text) # r span in cleaned text - # intent w.o. 
markup for "some [entity](entity_example) text" is "some entity text" - # so we should remove brackets and the parentheses content - intent_text_without_markup += slot_value_text - - cleaned_text_slots.append((slot_name, slot_value)) - - slot_name2text2value[slot_name][slot_value_text].append(slot_value) - - curr_char_ix = line_slot_r_span - intent_text_without_markup += intent_text_w_markup[curr_char_ix: len(intent_text_w_markup)] - - slots_key = tuple(sorted((slot[0], slot[1]) for slot in cleaned_text_slots)) - intent2slots2text[curr_intent_name][slots_key].append({"text": intent_text_without_markup, - "slots_di": cleaned_text_slots, - "slots": slots_key}) - - # defaultdict behavior is no more needed - intent2slots2text = {k: dict(v) for k, v in intent2slots2text.items()} - slot_name2text2value = dict(slot_name2text2value) - - return intent2slots2text, slot_name2text2value - - @classmethod - def _read_story(cls, - story_fpath: Path, - dialogs: bool, - domain_knowledge: DomainKnowledge, - intent2slots2text: Dict[str, Dict[SLOT2VALUE_PAIRS_TUPLE, List]], - slot_name2text2value: Dict[str, Dict[str, str]], - ignore_slots: bool = False) \ - -> Union[List[List[Tuple[Dict[str, bool], Dict[str, Any]]]], List[Tuple[Dict[str, bool], Dict[str, Any]]]]: - """ - Reads stories from the specified path converting them to go-bot format on the fly. - - Args: - story_fpath: path to the file containing the stories dataset - dialogs: flag which indicates whether to output list of turns or - list of dialogs - domain_knowledge: the domain knowledge, usually inferred from domain.yml - intent2slots2text: the mapping allowing given the intent class and - slotfilling values of utterance, restore utterance text. - slot_name2text2value: the mapping of possible slot values spellings to the values themselves. - Returns: - stories read as if it was done with DSTC2DatasetReader._read_from_file() - """ - log.debug(f"BEFORE MLU_MD_DialogsDatasetReader._read_story(): " - f"story_fpath={story_fpath}, " - f"dialogs={dialogs}, " - f"domain_knowledge={domain_knowledge}, " - f"intent2slots2text={intent2slots2text}, " - f"slot_name2text2value={slot_name2text2value}") - - default_system_start = { - "speaker": cls._SYSTEM_SPEAKER_ID, - "text": "start", - "dialog_acts": [{"act": "start", "slots": []}]} - default_system_goodbye = { - "text": "goodbye :(", - "dialog_acts": [{"act": "utter_goodbye", "slots": []}], - "speaker": cls._SYSTEM_SPEAKER_ID} # TODO infer from dataset - - stories_parsed = {} - - curr_story_title = None - curr_story_utters_batch = [] - nonlocal_curr_story_bad = False # can be modified as a nonlocal variable - - def process_user_utter(line: str) -> List[List[Dict[str, Any]]]: - """ - given the stories.md user line, returns the batch of all the dstc2 ways to represent it - Args: - line: the system line to generate dstc2 versions for - - Returns: - all the possible dstc2 versions of the passed story line - """ - nonlocal intent2slots2text, slot_name2text2value, curr_story_utters_batch, nonlocal_curr_story_bad - try: - possible_user_utters = cls.augment_user_turn(intent2slots2text, line, slot_name2text2value) - # dialogs MUST start with system replics - for curr_story_utters in curr_story_utters_batch: - if not curr_story_utters: - curr_story_utters.append(default_system_start) - - utters_to_append_batch = [] - for user_utter in possible_user_utters: - utters_to_append_batch.append([user_utter]) - - except KeyError: - log.debug(f"INSIDE MLU_MD_DialogsDatasetReader._read_story(): " - f"Skipping story w. 
line {line} because of no NLU candidates found") - nonlocal_curr_story_bad = True - utters_to_append_batch = [] - return utters_to_append_batch - - def process_system_utter(line: str) -> List[List[Dict[str, Any]]]: - """ - given the stories.md system line, returns the batch of all the dstc2 ways to represent it - Args: - line: the system line to generate dstc2 versions for - - Returns: - all the possible dstc2 versions of the passed story line - """ - nonlocal intent2slots2text, domain_knowledge, curr_story_utters_batch, nonlocal_curr_story_bad - system_action = cls.parse_system_turn(domain_knowledge, line) - system_action_name = system_action.get("dialog_acts")[0].get("act") - - for curr_story_utters in curr_story_utters_batch: - if cls.last_turn_is_systems_turn(curr_story_utters): - # deal with consecutive system actions by inserting the last user replics in between - curr_story_utters.append(cls.get_last_users_turn(curr_story_utters)) - - def parse_form_name(story_line: str) -> str: - """ - if the line (in stories.md utterance format) contains a form name, return it - Args: - story_line: line to extract form name from - - Returns: - the extracted form name or None if no form name found - """ - form_name = None - if story_line.startswith("form"): - form_di = json.loads(story_line[len("form"):]) - form_name = form_di["name"] - return form_name - - if system_action_name.startswith("form"): - form_name = parse_form_name(system_action_name) - augmented_utters = cls.augment_form(form_name, domain_knowledge, intent2slots2text) - - utters_to_append_batch = [[]] - for user_utter in augmented_utters: - new_curr_story_utters_batch = [] - for curr_story_utters in utters_to_append_batch: - possible_extensions = process_story_line(user_utter) - for possible_extension in possible_extensions: - new_curr_story_utters = curr_story_utters.copy() - new_curr_story_utters.extend(possible_extension) - new_curr_story_utters_batch.append(new_curr_story_utters) - utters_to_append_batch = new_curr_story_utters_batch - else: - utters_to_append_batch = [[system_action]] - return utters_to_append_batch - - def process_story_line(line: str) -> List[List[Dict[str, Any]]]: - """ - given the stories.md line, returns the batch of all the dstc2 ways to represent it - Args: - line: the line to generate dstc2 versions - - Returns: - all the possible dstc2 versions of the passed story line - """ - if line.startswith('*'): - utters_to_extend_with_batch = process_user_utter(line) - elif line.startswith('-'): - utters_to_extend_with_batch = process_system_utter(line) - else: - # todo raise an exception - utters_to_extend_with_batch = [] - return utters_to_extend_with_batch - - story_file = open(story_fpath) - for line in story_file: - line = line.strip() - if not line: - continue - if line.startswith('#'): - # #... 
marks the beginning of new story - if curr_story_utters_batch and curr_story_utters_batch[0] and curr_story_utters_batch[0][-1]["speaker"] == cls._USER_SPEAKER_ID: - for curr_story_utters in curr_story_utters_batch: - curr_story_utters.append(default_system_goodbye) # dialogs MUST end with system replics - - if not nonlocal_curr_story_bad: - for curr_story_utters_ix, curr_story_utters in enumerate(curr_story_utters_batch): - stories_parsed[curr_story_title+f"_{curr_story_utters_ix}"] = curr_story_utters - - curr_story_title = line.strip('#') - curr_story_utters_batch = [[]] - nonlocal_curr_story_bad = False - else: - new_curr_story_utters_batch = [] - possible_extensions = process_story_line(line) - for curr_story_utters in curr_story_utters_batch: - for user_utter in possible_extensions: - new_curr_story_utters = curr_story_utters.copy() - new_curr_story_utters.extend(user_utter) - new_curr_story_utters_batch.append(new_curr_story_utters) - curr_story_utters_batch = new_curr_story_utters_batch - # curr_story_utters.extend(process_story_line(line)) - story_file.close() - - if not nonlocal_curr_story_bad: - for curr_story_utters_ix, curr_story_utters in enumerate(curr_story_utters_batch): - stories_parsed[curr_story_title + f"_{curr_story_utters_ix}"] = curr_story_utters - - tmp_f = tempfile.NamedTemporaryFile(delete=False, mode='w', encoding="utf-8") - for story_id, story in stories_parsed.items(): - for replics in story: - print(json.dumps(replics), file=tmp_f) - print(file=tmp_f) - tmp_f.close() - # noinspection PyProtectedMember - gobot_formatted_stories = DSTC2DatasetReader._read_from_file(tmp_f.name, dialogs=dialogs) - os.remove(tmp_f.name) - - log.debug(f"AFTER MLU_MD_DialogsDatasetReader._read_story(): " - f"story_fpath={story_fpath}, " - f"dialogs={dialogs}, " - f"domain_knowledge={domain_knowledge}, " - f"intent2slots2text={intent2slots2text}, " - f"slot_name2text2value={slot_name2text2value}") - - return gobot_formatted_stories - - @classmethod - def augment_form(cls, form_name: str, domain_knowledge: DomainKnowledge, intent2slots2text: Dict) -> List[str]: - """ - Replaced the form mention in stories.md with the actual turns relevant to the form - Args: - form_name: the name of form to generate turns for - domain_knowledge: the domain knowledge (see domain.yml in RASA) relevant to the processed config - intent2slots2text: the mapping of intents and particular slots onto text - - Returns: - the story turns relevant to the passed form - """ - form = domain_knowledge.forms[form_name] # todo handle keyerr - augmended_story = [] - for slot_name, slot_info_li in form.items(): - if slot_info_li and slot_info_li[0].get("type", '') == "from_entity": - # we only handle from_entity slots - known_responses = list(domain_knowledge.response_templates) - known_intents = list(intent2slots2text.keys()) - augmended_story.extend(cls.augment_slot(known_responses, known_intents, slot_name, form_name)) - return augmended_story - - @classmethod - def augment_slot(cls, known_responses: List[str], known_intents: List[str], slot_name: str, form_name: str) \ - -> List[str]: - """ - Given the slot name, generates a sequence of system turn asking for a slot and user' turn providing this slot - - Args: - known_responses: responses known to the system from domain.yml - known_intents: intents known to the system from domain.yml - slot_name: the name of the slot to augment for - form_name: the name of the form for which the turn is augmented - - Returns: - the list of stories.md alike turns - """ - 
ask_slot_act_name = cls.get_augmented_ask_slot_utter(form_name, known_responses, slot_name) - inform_slot_user_utter = cls.get_augmented_ask_intent_utter(known_intents, slot_name) - - return [f"- {ask_slot_act_name}", f"* {inform_slot_user_utter}"] - - @classmethod - def get_augmented_ask_intent_utter(cls, known_intents: List[str], slot_name: str) -> Optional[str]: - """ - if the system knows the inform_{slot} intent, return this intent name, otherwise return None - Args: - known_intents: intents known to the system - slot_name: the slot to look inform intent for - - Returns: - the slot informing intent or None - """ - inform_slot_user_utter_hypothesis = f"inform_{slot_name}" - if inform_slot_user_utter_hypothesis in known_intents: - inform_slot_user_utter = inform_slot_user_utter_hypothesis - else: - # todo raise an exception - inform_slot_user_utter = None - pass - return inform_slot_user_utter - - @classmethod - def get_augmented_ask_slot_utter(cls, form_name: str, known_responses: List[str], slot_name: str): - """ - if the system knows the ask_{slot} action, return this action name, otherwise return None - Args: - form_name: the name of the currently processed form - known_responses: actions known to the system - slot_name: the slot to look asking action for - - Returns: - the slot asking action or None - """ - ask_slot_act_name_hypothesis1 = f"utter_ask_{form_name}_{slot_name}" - ask_slot_act_name_hypothesis2 = f"utter_ask_{slot_name}" - if ask_slot_act_name_hypothesis1 in known_responses: - ask_slot_act_name = ask_slot_act_name_hypothesis1 - elif ask_slot_act_name_hypothesis2 in known_responses: - ask_slot_act_name = ask_slot_act_name_hypothesis2 - else: - # todo raise an exception - ask_slot_act_name = None - pass - return ask_slot_act_name - - @classmethod - def get_last_users_turn(cls, curr_story_utters: List[Dict]) -> Dict: - """ - Given the dstc2 story, return the last user utterance from it - Args: - curr_story_utters: the dstc2-formatted stoyr - - Returns: - the last user utterance from the passed story - """ - *_, last_user_utter = filter(lambda x: x["speaker"] == cls._USER_SPEAKER_ID, curr_story_utters) - return last_user_utter - - @classmethod - def last_turn_is_systems_turn(cls, curr_story_utters): - return curr_story_utters and curr_story_utters[-1]["speaker"] == cls._SYSTEM_SPEAKER_ID - - @classmethod - def parse_system_turn(cls, domain_knowledge: DomainKnowledge, line: str) -> Dict: - """ - Given the RASA stories.md line, returns the dstc2-formatted json (dict) for this line - Args: - domain_knowledge: the domain knowledge relevant to the processed stories config (from which line is taken) - line: the story system step representing line from stories.md - - Returns: - the dstc2-formatted passed turn - """ - # system actions are started in dataset with - - system_action_name = line.strip('-').strip() - curr_action_text = cls._system_action2text(domain_knowledge, system_action_name) - system_action = {"speaker": cls._SYSTEM_SPEAKER_ID, - "text": curr_action_text, - "dialog_acts": [{"act": system_action_name, "slots": []}]} - if system_action_name.startswith("action"): - system_action["db_result"] = {} - return system_action - - @classmethod - def augment_user_turn(cls, intent2slots2text, line: str, slot_name2text2value) -> List[Dict[str, Any]]: - """ - given the turn information generate all the possible stories representing it - Args: - intent2slots2text: the intents and slots to natural language utterances mapping known to the system - line: the line representing used 
utterance in stories.md format - slot_name2text2value: the slot names to values mapping known o the system - - Returns: - the batch of all the possible dstc2 representations of the passed intent - """ - # user actions are started in dataset with * - user_action, slots_dstc2formatted = cls._parse_user_intent(line) - slots_actual_values = cls._clarify_slots_values(slot_name2text2value, slots_dstc2formatted) - slots_to_exclude, slots_used_values, action_for_text = cls._choose_slots_for_whom_exists_text( - intent2slots2text, slots_actual_values, - user_action) - possible_user_response_infos = cls._user_action2text(intent2slots2text, action_for_text, slots_used_values) - possible_user_utters = [] - for user_response_info in possible_user_response_infos: - user_utter = {"speaker": cls._USER_SPEAKER_ID, - "text": user_response_info["text"], - "dialog_acts": [{"act": user_action, "slots": user_response_info["slots"]}], - "slots to exclude": slots_to_exclude} - possible_user_utters.append(user_utter) - return possible_user_utters - - @staticmethod - def _choose_slots_for_whom_exists_text(intent2slots2text: Dict[str, Dict[SLOT2VALUE_PAIRS_TUPLE, List]], - slots_actual_values: SLOT2VALUE_PAIRS_TUPLE, - user_action: str) -> Tuple[List, SLOT2VALUE_PAIRS_TUPLE, str]: - """ - - Args: - intent2slots2text: the mapping of intents and slots to natural language utterances representing them - slots_actual_values: the slot values information to look utterance for - user_action: the intent to look utterance for - - Returns: - the slots ommitted to find an NLU candidate, the slots represented in the candidate, the intent name used - """ - possible_keys = [k for k in intent2slots2text.keys() if user_action in k] - possible_keys = possible_keys + [user_action] - possible_keys = sorted(possible_keys, key=lambda action_s: action_s.count('+')) - for possible_action_key in possible_keys: - if intent2slots2text[possible_action_key].get(slots_actual_values): - slots_used_values = slots_actual_values - slots_to_exclude = [] - return slots_to_exclude, slots_used_values, possible_action_key - else: - slots_lazy_key = set(e[0] for e in slots_actual_values) - slots_lazy_key -= {"intent"} - fake_keys = [] - for known_key in intent2slots2text[possible_action_key].keys(): - if slots_lazy_key.issubset(set(e[0] for e in known_key)): - fake_keys.append(known_key) - break - - if fake_keys: - slots_used_values = sorted(fake_keys, key=lambda elem: (len(set(slots_actual_values) ^ set(elem)), - len([e for e in elem - if e[0] not in slots_lazy_key])) - )[0] - - slots_to_exclude = [e[0] for e in slots_used_values if e[0] not in slots_lazy_key] - return slots_to_exclude, slots_used_values, possible_action_key - - raise KeyError("no possible NLU candidates found") - - @staticmethod - def _clarify_slots_values(slot_name2text2value: Dict[str, Dict[str, Any]], - slots_dstc2formatted: List[List]) -> SLOT2VALUE_PAIRS_TUPLE: - slots_key = [] - for slot_name, slot_value in slots_dstc2formatted: - slot_actual_value = slot_name2text2value.get(slot_name, {}).get(slot_value, slot_value) - slots_key.append((slot_name, slot_actual_value)) - slots_key = tuple(sorted(slots_key)) - return slots_key - - @staticmethod - def _parse_user_intent(line: str, ignore_slots=False) -> Tuple[str, List[List]]: - """ - Given the intent line in RASA stories.md format, return the name of the intent and slots described with this line - Args: - line: the line to parse - ignore_slots: whether to ignore slots information - - Returns: - the pair of the intent name and slots 
([[slot name, slot value],.. ]) info - """ - intent = line.strip('*').strip() - if '{' not in intent: - intent = intent + "{}" # the prototypical intent is "intent_name{slot1: value1, slotN: valueN}" - user_action, slots_info = intent.split('{', 1) - slots_info = json.loads('{' + slots_info) - slots_dstc2formatted = [[slot_name, slot_value] for slot_name, slot_value in slots_info.items()] - if ignore_slots: - slots_dstc2formatted = dict() - return user_action, slots_dstc2formatted - - @staticmethod - def _user_action2text(intent2slots2text: Dict[str, Dict[SLOT2VALUE_PAIRS_TUPLE, List]], - user_action: str, - slots_li: Optional[SLOT2VALUE_PAIRS_TUPLE] = None) -> List[str]: - """ - given the user intent, return the text representing this intent with passed slots - Args: - intent2slots2text: the mapping of intents and slots to natural language utterances - user_action: the name of intent to generate text for - slots_li: the slot values to provide - - Returns: - the text of utterance relevant to the passed intent and slots - """ - if slots_li is None: - slots_li = tuple() - return intent2slots2text[user_action][slots_li] - - @staticmethod - def _system_action2text(domain_knowledge: DomainKnowledge, system_action: str) -> str: - """ - given the system action name return the relevant template text - Args: - domain_knowledge: the domain knowledge relevant to the currently processed config - system_action: the name of the action to get intent for - - Returns: - template relevant to the passed action - """ - possible_system_responses = domain_knowledge.response_templates.get(system_action, - [{"text": system_action}]) - - response_text = possible_system_responses[0]["text"] - response_text = re.sub(r"(\w+)\=\{(.*?)\}", r"#\2", response_text) # TODO: straightforward regex string - - return response_text + data = dict() + for subsample_name_short in cls.VALID_DATATYPES: + story_fpath = Path(data_path, cls._data_fname(subsample_name_short, fmt)) + with open(story_fpath) as f: + story_lines = f.read().splitlines() + stories = Stories.from_stories_lines(story_lines) + + data[short2long_subsample_name[subsample_name_short]] = RASADict({ + "story_lines": stories, + "domain": domain_knowledge, + "nlu_lines": intents}) + data = RASADict(data) + return data \ No newline at end of file diff --git a/deeppavlov/models/classifiers/memorizing_classifier.py b/deeppavlov/models/classifiers/memorizing_classifier.py new file mode 100644 index 0000000000..ce0b977a63 --- /dev/null +++ b/deeppavlov/models/classifiers/memorizing_classifier.py @@ -0,0 +1,121 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from pathlib import Path +from typing import List, Union, Optional + +import numpy as np +from overrides import overrides + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.core.common.file import save_json, read_json +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.nn_model import NNModel + +log = logging.getLogger(__name__) + + +@register('mem_classification_model') +class MemClassificationModel(NNModel): + + def __init__(self, n_classes: int, save_path: Optional[Union[str, Path]], + return_probas: bool = True, *args, **kwargs): + super().__init__(save_path, *args, **kwargs) + if n_classes == 0: + raise ConfigError("Please provide n_classes: the number of classes to predict") + self.n_classes = n_classes + self.opt = { + "return_probas": return_probas, + } + self.save_path = save_path + self.text2label = dict() + self.classes = list() + self.is_trained = False + self.load() + + def __call__(self: "MemClassificationModel", texts: List[str], *args) -> Union[ + List[List[float]], List[int]]: + """Infer on the given data. + + Args: + texts: list of text samples + + Returns: + for each sentence, either the vector of probabilities of + belonging to each class or the predicted class index + """ + outputs = np.zeros((len(texts), self.n_classes)) + for text_ix, text in enumerate(texts): + label = self.text2label.get(text) + if label is not None: + outputs[text_ix][self.label2ix(label)] = 1. + if self.opt["return_probas"]: + return outputs.tolist() + else: + return np.argmax(outputs, axis=-1).tolist() + + def label2ix(self, label: str): + if label not in self.classes: + return -1 + return self.classes.index(label) + + def train_on_batch(self, texts: List[str], + labels: list) -> Union[float, List[float]]: + """Train the model on the given batch.
+ + Args: + texts: batch of text samples + labels: list of classes + + Returns: + metrics values on the given batch + """ + if isinstance(labels, np.ndarray): + labels = labels.tolist() + if labels and isinstance(labels[0], np.ndarray): + labels_ = [] + for lab in labels: + label_ixes = np.where(lab)[0].tolist() + if len(label_ixes) != 1: + log.warning("Expected a one-hot label with exactly one active class") + label_ix = label_ixes[0] + labels_.append(label_ix) + labels = labels_ + self.text2label.update(dict(zip(texts, labels))) + self.classes = list(sorted(set(self.classes + labels))) + pseudo_loss = 0 if self.is_trained else 1 + self.is_trained = True + self.save() + return pseudo_loss + + @overrides + def save(self, *args, **kwargs): + save_json({"classes": self.classes, + "text2label": self.text2label}, + self.save_path) + + @overrides + def load(self, *args, **kwargs): + try: + loaded = read_json(self.save_path) + self.classes = loaded["classes"] + self.text2label = loaded["text2label"] + except Exception: + log.info("No saved MemClassificationModel state found, starting with an empty memory") \ No newline at end of file diff --git a/deeppavlov/models/go_bot/dto/dataset_features.py b/deeppavlov/models/go_bot/dto/dataset_features.py index bb5a673684..6af845770f 100644 --- a/deeppavlov/models/go_bot/dto/dataset_features.py +++ b/deeppavlov/models/go_bot/dto/dataset_features.py @@ -35,6 +35,7 @@ def __init__(self, tokens_vectorized = nlu_response.tokens_vectorized # todo proper oop self.tokens_embeddings_padded = tokens_vectorized.tokens_embeddings_padded self.features = features.concat_feats + self._nlu_response = nlu_response class UtteranceTarget: @@ -84,12 +85,14 @@ def __init__(self): self.attn_keys = [] self.tokens_embeddings_paddeds = [] self.featuress = [] + self._nlu_response = [] def append(self, utterance_features: UtteranceFeatures): self.action_masks.append(utterance_features.action_mask) self.attn_keys.append(utterance_features.attn_key) self.tokens_embeddings_paddeds.append(utterance_features.tokens_embeddings_padded) self.featuress.append(utterance_features.features) + self._nlu_response.append(utterance_features._nlu_response) def __len__(self): return len(self.featuress) @@ -156,6 +159,7 @@ def __init__(self, dialogue_features: DialogueFeatures, sequence_length): dialogue_features.tokens_embeddings_paddeds[0])] * padding_length self.featuress = dialogue_features.featuress + [np.zeros_like(dialogue_features.featuress[0])] * padding_length + self._nlu_response = dialogue_features._nlu_response class PaddedDialogueTargets(DialogueTargets): @@ -203,6 +207,7 @@ def __init__(self, max_dialogue_length): self.b_tokens_embeddings_paddeds = [] self.b_featuress = [] self.b_padded_dialogue_length_mask = [] + self._nlu_responses = [] self.max_dialogue_length = max_dialogue_length def append(self, padded_dialogue_features: PaddedDialogueFeatures): @@ -211,6 +216,7 @@ def append(self, padded_dialogue_features: PaddedDialogueFeatures): self.b_tokens_embeddings_paddeds.append(padded_dialogue_features.tokens_embeddings_paddeds) self.b_featuress.append(padded_dialogue_features.featuress) self.b_padded_dialogue_length_mask.append(padded_dialogue_features.padded_dialogue_length_mask) + self._nlu_responses.append(padded_dialogue_features._nlu_response) def __len__(self): return len(self.b_featuress) diff --git a/deeppavlov/models/go_bot/dto/shared_gobot_params.py b/deeppavlov/models/go_bot/dto/shared_gobot_params.py index 0472c37333..9cac390437 100644 --- a/deeppavlov/models/go_bot/dto/shared_gobot_params.py +++
b/deeppavlov/models/go_bot/dto/shared_gobot_params.py @@ -1,4 +1,4 @@ -from deeppavlov.models.go_bot.nlu.nlu_manager import NLUManagerInterface +from deeppavlov.models.go_bot.nlu.nlu_manager import NLUManagerInterface, NLUManager from deeppavlov.models.go_bot.nlg.nlg_manager import NLGManagerInterface from deeppavlov.models.go_bot.tracker.featurized_tracker import FeaturizedTracker @@ -22,3 +22,17 @@ def from_configured(nlg_manager: NLGManagerInterface, nlu_manager: NLUManagerInt return SharedGoBotParams(nlg_manager.num_of_known_actions(), nlu_manager.num_of_known_intents(), tracker.num_features) + +class MemorizingGoBotParams(SharedGoBotParams): + intent_ids2intents: dict + intents2intent_ids: dict + + @staticmethod + def from_configured(nlg_manager: NLGManagerInterface, nlu_manager: NLUManager, tracker: FeaturizedTracker): + """builds the params object given some GO-bot units that are already configured""" + res = SharedGoBotParams(nlg_manager.num_of_known_actions(), + nlu_manager.num_of_known_intents(), + tracker.num_features) + res.intent_ids2intents = dict(enumerate(nlu_manager.intents)) + res.intents2intent_ids = {v:k for k, v in res.intent_ids2intents.items()} + return res \ No newline at end of file diff --git a/deeppavlov/models/go_bot/go_bot.py b/deeppavlov/models/go_bot/go_bot.py index ce47cf2577..5dba8caaac 100644 --- a/deeppavlov/models/go_bot/go_bot.py +++ b/deeppavlov/models/go_bot/go_bot.py @@ -13,7 +13,7 @@ # limitations under the License. from logging import getLogger -from typing import Dict, Any, List, Optional, Union, Tuple +from typing import Dict, Any, List, Optional, Union, Tuple, Type import numpy as np @@ -25,17 +25,19 @@ from deeppavlov.models.go_bot.nlu.tokens_vectorizer import TokensVectorizer from deeppavlov.models.go_bot.dto.dataset_features import UtteranceDataEntry, DialogueDataEntry, \ BatchDialoguesDataset, UtteranceFeatures, UtteranceTarget, BatchDialoguesFeatures -from deeppavlov.models.go_bot.dto.shared_gobot_params import SharedGoBotParams +from deeppavlov.models.go_bot.dto.shared_gobot_params import SharedGoBotParams, MemorizingGoBotParams from deeppavlov.models.go_bot.nlg.nlg_manager import NLGManagerInterface from deeppavlov.models.go_bot.nlu.nlu_manager import NLUManager -from deeppavlov.models.go_bot.policy.policy_network import PolicyNetwork, PolicyNetworkParams +from deeppavlov.models.go_bot.policy.policy_network import PolicyNetwork, PolicyNetworkParams, MemorizingPolicy from deeppavlov.models.go_bot.policy.dto.policy_prediction import PolicyPrediction from deeppavlov.models.go_bot.tracker.featurized_tracker import FeaturizedTracker -from deeppavlov.models.go_bot.tracker.dialogue_state_tracker import DialogueStateTracker, MultipleUserStateTrackersPool +from deeppavlov.models.go_bot.tracker.dialogue_state_tracker import DialogueStateTracker, MultipleUserStateTrackersPool, \ + MemorizingDialogueStateTracker from pathlib import Path log = getLogger(__name__) +UtteranceT = Union[dict, str] # todo logging @register("go_bot") @@ -138,19 +140,29 @@ def __init__(self, self.data_handler = TokensVectorizer(debug, word_vocab, bow_embedder, embedder) # todo make mor abstract - self.dialogue_state_tracker = DialogueStateTracker.from_gobot_params(tracker, self.nlg_manager, + tracker_class: Type = type(tracker) + if tracker.mode == "MEM": + tracker_class = MemorizingDialogueStateTracker + features_params_class: Type = MemorizingGoBotParams + policy_class: Type = MemorizingPolicy + elif tracker.mode == "NN": + tracker_class = DialogueStateTracker + 
features_params_class: Type = SharedGoBotParams + policy_class: Type = PolicyNetwork + + self.dialogue_state_tracker = tracker_class.from_gobot_params(tracker, self.nlg_manager, policy_network_params, database) # todo make mor abstract self.multiple_user_state_tracker = MultipleUserStateTrackersPool(base_tracker=self.dialogue_state_tracker) tokens_dims = self.data_handler.get_dims() - features_params = SharedGoBotParams.from_configured(self.nlg_manager, self.nlu_manager, + features_params = features_params_class.from_configured(self.nlg_manager, self.nlu_manager, self.dialogue_state_tracker) policy_save_path = Path(save_path, self.POLICY_DIR_NAME) policy_load_path = Path(load_path, self.POLICY_DIR_NAME) - self.policy = PolicyNetwork(policy_network_params, tokens_dims, features_params, - policy_load_path, policy_save_path, **kwargs) + self.policy = policy_class(policy_network_params, tokens_dims, features_params, + policy_load_path, policy_save_path, **kwargs) self.dialogues_cached_features = dict() @@ -262,7 +274,7 @@ def prepare_utterance_training_data(self, utterance_data_entry = UtteranceDataEntry.from_features_and_target(utterance_features, utterance_target) return utterance_data_entry - def extract_features_from_utterance_text(self, text, tracker, keep_tracker_state=False) -> UtteranceFeatures: + def extract_features_from_utterance_text(self, text: UtteranceT, tracker, keep_tracker_state=False) -> UtteranceFeatures: """ Extract ML features for the input text and the respective tracker. Features are aggregated from the @@ -314,7 +326,7 @@ def extract_features_from_utterance_text(self, text, tracker, keep_tracker_state return UtteranceFeatures(nlu_response, tracker_knowledge, digitized_policy_features) - def _infer(self, user_utterance_text: str, user_tracker: DialogueStateTracker, + def _infer(self, user_utterance_text: UtteranceT, user_tracker: DialogueStateTracker, keep_tracker_state=False) -> Tuple[BatchDialoguesFeatures, PolicyPrediction]: """ Predict the action to perform in response to given text. 
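To make the widened utterance type concrete: `UtteranceT = Union[dict, str]`, so batches may mix plain strings with pre-annotated dicts. Below is a minimal sketch of the two accepted batch shapes; `bot` stands for an already configured go_bot component and the intent and slot names are hypothetical, while the dict keys follow the "text"/"intents"/"slots" convention that NLUManager.nlu handles.

realtime_batch = ["i want some cheap food"]  # List[str]: one utterance per user
annotated_dialogue_batch = [[                # List[List[dict]]: whole dialogues
    {"text": "i want some cheap food",
     "intents": ["inform_pricerange"],       # hypothetical intent name; bypasses the intent classifier
     "slots": {"pricerange": "cheap"}},      # hypothetical slot values; bypass the slot filler
]]
# responses = bot(realtime_batch, user_ids=[1])
# dialogue_responses = bot(annotated_dialogue_batch)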
@@ -352,7 +364,7 @@ def _infer(self, user_utterance_text: str, user_tracker: DialogueStateTracker, return utterance_batch_features, policy_prediction - def __call__(self, batch: Union[List[List[dict]], List[str]], + def __call__(self, batch: Union[List[List[UtteranceT]], List[UtteranceT]], user_ids: Optional[List] = None) -> Union[List[NLGResponseInterface], List[List[NLGResponseInterface]]]: if isinstance(batch[0], list): @@ -361,7 +373,7 @@ def __call__(self, batch: Union[List[List[dict]], List[str]], # todo unify tracking: no need to distinguish tracking strategies on dialogues and realtime res = [] for dialogue in batch: - dialogue: List[dict] + dialogue: List[UtteranceT] res.append(self._calc_inferences_for_dialogue(dialogue)) else: # batch is a list of utterances possibly came from different users: real-time inference @@ -369,7 +381,7 @@ def __call__(self, batch: Union[List[List[dict]], List[str]], if not user_ids: user_ids = [self.DEFAULT_USER_ID] * len(batch) for user_id, user_text in zip(user_ids, batch): - user_text: str + user_text: UtteranceT res.append(self._realtime_infer(user_id, user_text)) return res diff --git a/deeppavlov/models/go_bot/nlg/mock_json_nlg_manager.py b/deeppavlov/models/go_bot/nlg/mock_json_nlg_manager.py index 655712ab21..071e44d7a8 100644 --- a/deeppavlov/models/go_bot/nlg/mock_json_nlg_manager.py +++ b/deeppavlov/models/go_bot/nlg/mock_json_nlg_manager.py @@ -1,4 +1,6 @@ import json +import re +from collections import defaultdict from itertools import combinations from pathlib import Path from typing import Union, Dict, List, Tuple @@ -7,12 +9,13 @@ from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register, get_model from deeppavlov.dataset_readers.dstc2_reader import DSTC2DatasetReader +from deeppavlov.dataset_readers.dto.rasa.domain_knowledge import DomainKnowledge from deeppavlov.models.go_bot.dto.dataset_features import BatchDialoguesFeatures from deeppavlov.models.go_bot.nlg.dto.json_nlg_response import JSONNLGResponse, VerboseJSONNLGResponse from deeppavlov.models.go_bot.nlg.nlg_manager import log from deeppavlov.models.go_bot.nlg.nlg_manager_interface import NLGManagerInterface from deeppavlov.models.go_bot.policy.dto.policy_prediction import PolicyPrediction - +import random @register("gobot_json_nlg_manager") class MockJSONNLGManager(NLGManagerInterface): @@ -36,10 +39,11 @@ def __init__(self, self._dataset_reader = get_model(dataset_reader_class) individual_actions2slots = self._load_actions2slots_mapping(actions2slots_path) + split2domain_i = self._get_domain_info(data_path) possible_actions_combinations_tuples = sorted( set(actions_combination_tuple for actions_combination_tuple - in self._extract_actions_combinations(data_path)), + in self._extract_actions_combinations(split2domain_i)), key=lambda x: '+'.join(x)) self.action_tuples2ids = {action_tuple: action_tuple_idx @@ -60,6 +64,9 @@ def __init__(self, api_call_action_as_tuple = (api_call_action,) self._api_call_id = self.action_tuples2ids[api_call_action_as_tuple] + self.action2slots2text, self.action2slots2values2text =\ + self._extract_templates(split2domain_i) + if self.debug: log.debug(f"AFTER {self.__class__.__name__} init(): " f"actions2slots_path={actions2slots_path}, " @@ -72,17 +79,73 @@ def get_api_call_action_id(self) -> int: """ return self._api_call_id - def _extract_actions_combinations(self, dataset_path: Union[str, Path]): + def _get_domain_info(self, dataset_path: Union[str, Path]): dataset_path = expand_path(dataset_path) 
- dataset = self._dataset_reader.read(data_path=dataset_path, dialogs=True, ignore_slots=True) + try: + dataset = self._dataset_reader.read(data_path=dataset_path) + except Exception: + dataset = self._dataset_reader.read(data_path=dataset_path, + fmt="yml") + split2domain = dict() + for dataset_split, dataset_split_info in dataset.items(): + domain_i: DomainKnowledge = dataset_split_info["domain"] + split2domain[dataset_split] = domain_i + return split2domain + + def _extract_actions_combinations(self, split2domain: Dict[str, DomainKnowledge]): actions_combinations = set() - for dataset_split in dataset.values(): - for dialogue in dataset_split: - for user_input, system_response in dialogue: - actions_tuple = tuple(system_response["act"].split('+')) - actions_combinations.add(actions_tuple) + for dataset_split, domain_i in split2domain.items(): + actions_combinations.update({(ac,) for ac in domain_i.known_actions}) return actions_combinations + def _extract_templates(self, split2domain: Dict[str, DomainKnowledge]): + slots_pattern = r'\[(?P<value>\w+)\]\((?P<name>\w+)\)' + action2slots2text = defaultdict(lambda: defaultdict(list)) + action2slots2values2text = defaultdict(lambda: defaultdict(list)) + for dataset_split, domain_i in split2domain.items(): + actions2texts = domain_i.response_templates + for action, texts in actions2texts.items(): + action_tuple = (action,) + texts = [text for text in texts if text] + for text in texts: + used_slots, slotvalue_tuples = set(), set() + if isinstance(text, dict): + text = text["text"] + used_slots_di = dict() + for found in re.finditer(slots_pattern, text): + used_slots_di = found.groupdict() + if not ("name" in used_slots_di.keys() and "value" in used_slots_di.keys()): + continue + used_slots.add(used_slots_di["name"]) + slotvalue_tuples.add((used_slots_di["name"], + used_slots_di["value"])) + + used_slots = tuple(sorted(used_slots)) + slotvalue_tuples = tuple(sorted(slotvalue_tuples)) + templated_text = re.sub(slots_pattern, r'##\g<name>', text) + action2slots2text[action_tuple][used_slots].append(templated_text) + action2slots2values2text[action_tuple][slotvalue_tuples].append(templated_text) + + return action2slots2text, action2slots2values2text + + def generate_template(self, response_info: VerboseJSONNLGResponse, mode="slots"): + if mode == "slots": + action_tuple = response_info.actions_tuple + slots = tuple(sorted(response_info.slot_values.keys())) + response_text = self.action2slots2text.get(action_tuple, {}).get(slots, None) + else: + action_tuple = response_info.actions_tuple + slotvalue_tuples = tuple(sorted(response_info.slot_values.items())) + response_text = self.action2slots2values2text.get(action_tuple, {}).get(slotvalue_tuples, None) + if isinstance(response_text, list): + response_text = random.choice(response_text) + if response_text is not None: + for slot_name in response_info.slot_values: + response_text = response_text.replace(f"##{slot_name}", + response_info.slot_values[ + slot_name]) + return response_text + @staticmethod def _load_actions2slots_mapping(actions2slots_json_path) -> Dict[str, str]: actions2slots_json_path = expand_path(actions2slots_json_path) @@ -134,6 +197,9 @@ def decode_response(self, response = JSONNLGResponse(slots_values, actions_tuple) verbose_response = VerboseJSONNLGResponse.from_json_nlg_response(response) verbose_response.policy_prediction = policy_prediction + verbose_response._nlu_responses = utterance_batch_features._nlu_responses + response_text = self.generate_template(verbose_response) + verbose_response.text = response_text
return verbose_response def num_of_known_actions(self) -> int: diff --git a/deeppavlov/models/go_bot/nlu/dto/nlu_response.py b/deeppavlov/models/go_bot/nlu/dto/nlu_response.py index 7570aef386..cf9ec1924e 100644 --- a/deeppavlov/models/go_bot/nlu/dto/nlu_response.py +++ b/deeppavlov/models/go_bot/nlu/dto/nlu_response.py @@ -10,6 +10,7 @@ class NLUResponse(NLUResponseInterface): """ def __init__(self, slots, intents, tokens): self.slots: Union[List[Tuple[str, Any]], Dict[str, Any]] = slots + self._intent_names = None self.intents = intents self.tokens = tokens self.tokens_vectorized: Optional[TextVectorizationResponse] = None diff --git a/deeppavlov/models/go_bot/nlu/nlu_manager.py b/deeppavlov/models/go_bot/nlu/nlu_manager.py index e18d74b48f..3e9b1e945c 100644 --- a/deeppavlov/models/go_bot/nlu/nlu_manager.py +++ b/deeppavlov/models/go_bot/nlu/nlu_manager.py @@ -1,7 +1,12 @@ from logging import getLogger -from typing import List +from typing import List, Union from deeppavlov import Chainer +from deeppavlov.core.data.simple_vocab import SimpleVocabulary +from deeppavlov.models.bert.bert_classifier import BertClassifierModel +from deeppavlov.models.classifiers.memorizing_classifier import \ + MemClassificationModel +from deeppavlov.models.intent_catcher.intent_catcher import IntentCatcher from deeppavlov.models.go_bot.nlu.dto.nlu_response import NLUResponse from deeppavlov.models.go_bot.nlu.nlu_manager_interface import NLUManagerInterface @@ -31,14 +36,22 @@ def __init__(self, tokenizer, slot_filler, intent_classifier, debug=False): self.intent_classifier = intent_classifier self.intents = [] if isinstance(self.intent_classifier, Chainer): - self.intents = self.intent_classifier.get_main_component().classes + component = self.intent_classifier.get_main_component() + if isinstance(component, BertClassifierModel) \ + or isinstance(component, IntentCatcher)\ + or isinstance(component, MemClassificationModel): + intent2labeltools = [el[-1] for el in self.intent_classifier.pipe if isinstance(el[-1], SimpleVocabulary)] + if intent2labeltools: + self.intents = intent2labeltools[-1]._i2t + else: + self.intents = component.classes if self.debug: log.debug(f"AFTER {self.__class__.__name__} init(): " f"tokenizer={tokenizer}, slot_filler={slot_filler}, " f"intent_classifier={intent_classifier}, debug={debug}") - def nlu(self, text: str) -> NLUResponse: + def nlu(self, text: Union[str, dict]) -> NLUResponse: """ Extracts slot values and intents from text. 
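A standalone sketch of the one-hot intent encoding performed by the `_intent_name_to_ohe` and `_intents_to_ohe` helpers added in the hunk below; the intent list here is hypothetical and stands in for `NLUManager.intents`.

known_intents = ["greet", "inform_food", "bye"]

def intent_name_to_ohe(intent_name: str) -> list:
    # unknown intent names yield an all-zero vector rather than raising an error
    ohe = [0.] * len(known_intents)
    if intent_name in known_intents:
        ohe[known_intents.index(intent_name)] = 1.
    return ohe

assert intent_name_to_ohe("inform_food") == [0., 1., 0.]
assert intent_name_to_ohe("unseen_intent") == [0., 0., 0.]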
@@ -49,27 +62,69 @@ def nlu(self, text: Union[str, dict]) -> NLUResponse: an object storing the extracted slos and intents info """ # todo meaningful type hints - tokens = self._tokenize_single_text_entry(text) + text_is_dict = isinstance(text, dict) + if text_is_dict: + _text = text.get("text") + _intents = text.get("intents", []) + _slots = text.get("slots", {}) + else: + _text = text + + tokens = self._tokenize_single_text_entry(_text) slots = None if callable(self.slot_filler): - slots = self._extract_slots_from_tokenized_text_entry(tokens) + if text_is_dict: + slots = _slots + else: + slots = self._extract_slots_from_text_entry(text) intents = [] if callable(self.intent_classifier): - intents = self._extract_intents_from_tokenized_text_entry(tokens) - - return NLUResponse(slots, intents, tokens) + if text_is_dict: + if isinstance(_intents, list): + intents = self._intents_to_ohe(_intents) + else: + intents = self._intent_name_to_ohe(_intents) + else: + intents = self._extract_intents_from_text_entry(text) + resp = NLUResponse(slots, intents, tokens) + resp._intent_names = self.intents + return resp def _extract_intents_from_tokenized_text_entry(self, tokens: List[str]): # todo meaningful type hints, relies on unannotated intent classifier - intent_features = self.intent_classifier([' '.join(tokens)])[1][0] + classifier_output = self.intent_classifier([' '.join(tokens)]) + intent_features = classifier_output[1][0] + return intent_features + + def _extract_intents_from_text_entry(self, text: str): + # todo meaningful type hints, relies on unannotated intent classifier + intent_features = self.intent_classifier([text])[1][0] return intent_features + def _intent_name_to_ohe(self, intent_name): + intents_ohe = [0.] * len(self.intents) + if intent_name in self.intents: + intent_ix = self.intents.index(intent_name) + intents_ohe[intent_ix] = 1. + return intents_ohe + + def _intents_to_ohe(self, intent_names): + ohes = map(self._intent_name_to_ohe, intent_names) + intents_ohe = [0.]
* len(self.intents) + for ohe_ix, ohe_ in enumerate(zip(*ohes)): + intents_ohe[ohe_ix] = float(any(ohe_)) + return intents_ohe + def _extract_slots_from_tokenized_text_entry(self, tokens: List[str]): # todo meaningful type hints, relies on unannotated slot filler return self.slot_filler([tokens])[0] + def _extract_slots_from_text_entry(self, text: str): + # todo meaningful type hints, relies on unannotated slot filler + return self.slot_filler([text])[0] + def _tokenize_single_text_entry(self, text: str): # todo meaningful type hints, relies on unannotated tokenizer return self.tokenizer([text.lower().strip()])[0] diff --git a/deeppavlov/models/go_bot/policy/policy_network.py b/deeppavlov/models/go_bot/policy/policy_network.py index 1e3483203d..c92d1bb7fa 100644 --- a/deeppavlov/models/go_bot/policy/policy_network.py +++ b/deeppavlov/models/go_bot/policy/policy_network.py @@ -18,7 +18,7 @@ from deeppavlov.models.go_bot.dto.dataset_features import BatchDialoguesFeatures, BatchDialoguesTargets # todo -from deeppavlov.models.go_bot.dto.shared_gobot_params import SharedGoBotParams +from deeppavlov.models.go_bot.dto.shared_gobot_params import SharedGoBotParams, MemorizingGoBotParams from deeppavlov.models.go_bot.policy.dto.attn_params import GobotAttnParams from deeppavlov.models.go_bot.policy.dto.digitized_policy_features import DigitizedPolicyFeatures from deeppavlov.models.go_bot.policy.dto.policy_network_params import PolicyNetworkParams @@ -384,7 +384,7 @@ def train_on_batch(self, batch_dialogues_targets: BatchDialoguesTargets) -> dict: feed_dict = { - self._dropout_keep_prob: 1., + self._dropout_keep_prob: 1. - self.dropout_rate, self._utterance_mask: batch_dialogues_features.b_padded_dialogue_length_mask, self._features: batch_dialogues_features.b_featuress, self._action: batch_dialogues_targets.b_action_ids, @@ -453,3 +453,64 @@ def _save_nn_params(self) -> None: if self.debug: log.debug(f"AFTER {self.__class__.__name__} _save_nn_params()") + +class MemorizingPolicy(PolicyNetwork): + def __init__(self, network_params_passed: PolicyNetworkParams, + tokens_dims: TokensVectorRepresentationParams, + features_params: MemorizingGoBotParams, + load_path, + save_path, + debug=False, + **kwargs): + super().__init__(network_params_passed, tokens_dims, features_params, load_path, save_path, debug, **kwargs) + self.intent_ids2intents = features_params.intent_ids2intents + self.intents2intent_ids = features_params.intents2intent_ids + + def digitize_features(self, + nlu_response: NLUResponse, + tracker_knowledge: DSTKnowledge) -> DigitizedPolicyFeatures: + intent_name = self.intent_ids2intents.get(np.argmax(nlu_response.intents)) + # compute the actual prediction + concat_feats = intent_name # todo warning!!! do not merge until rewritten !!!
+ possible_actions = [] + for story_ix, (story_ptr, story) in enumerate(zip(tracker_knowledge.stories_ptrs, tracker_knowledge.stories)): + next_action_ptr = story_ptr + 1 + if next_action_ptr < len(story) and story[next_action_ptr]["utter_needed"] == intent_name: + possible_actions.append((story[next_action_ptr]["action_name"], story[next_action_ptr]["action_ix"])) + elif any(ptr > -1 for ptr in tracker_knowledge.stories_ptrs): + tracker_knowledge.stories_ptrs[story_ix] = len(story) # mark this story as no longer accessible + if len(possible_actions) > 1: + log.debug("STORIES: multiple continuations available, picking the first one") + (action_name, action_ix) = possible_actions[0] if possible_actions else (None, None) + + concat_feats = action_ix + return DigitizedPolicyFeatures(None, concat_feats, None) + + def __call__(self, batch_dialogues_features: BatchDialoguesFeatures, + states_c: np.ndarray, states_h: np.ndarray, prob: bool = False, + *args, **kwargs) -> PolicyPrediction: + + states_c = [[states_c]] # list of list aka batch of dialogues + states_h = [[states_h]] # list of list aka batch of dialogues + + probs = [np.zeros((self.action_size, 1)) for _ in range(len(batch_dialogues_features))] + prediction = [] + for feature_ix, feature in enumerate(batch_dialogues_features.b_featuress): + # each feature already holds the action id inferred from the intent name and the tracker's story state + prediction.extend(feature) + if feature: + feature_ = feature[0] + probs[feature_ix][feature_] = 1. + + policy_prediction = PolicyPrediction(probs, prediction, states_c, states_h) + + return policy_prediction + + def train_on_batch(self, + batch_dialogues_features: BatchDialoguesFeatures, + batch_dialogues_targets: BatchDialoguesTargets) -> dict: + log.debug("not trainable policy chosen") + return {'loss': 0., + 'learning_rate': self.get_learning_rate(), + 'momentum': self.get_momentum()} diff --git a/deeppavlov/models/go_bot/tracker/dialogue_state_tracker.py b/deeppavlov/models/go_bot/tracker/dialogue_state_tracker.py index 9a0cb32c49..ccc51ce0e5 100644 --- a/deeppavlov/models/go_bot/tracker/dialogue_state_tracker.py +++ b/deeppavlov/models/go_bot/tracker/dialogue_state_tracker.py @@ -221,6 +221,94 @@ def fill_current_state_with_db_results(self) -> dict: return slots +class MemorizingDialogueStateTracker(DialogueStateTracker): + def get_current_knowledge(self) -> DSTKnowledge: + res = super().get_current_knowledge() + res.stories = self.stories + res.stories_ptrs = self.stories_ptrs + return res + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.act2act_id: dict = {} + self.act_id2act: dict = {} + self.stories = self._load_stories(self.stories_path) + self._setup_action_ixes(self.stories) + self.stories_ptrs = [-1] * len(self.stories) + + + @staticmethod + def from_gobot_params(parent_tracker: FeaturizedTracker, + nlg_manager: NLGManagerInterface, + policy_network_params: PolicyNetworkParams, + database: Component): + slot_names = parent_tracker.slot_names + + # region set formfilling info + act2act_id = {a_text: nlg_manager.get_action_id(a_text) for a_text in nlg_manager.known_actions()} + action_id2aqd_slots_ids, action_id2req_slots_ids = DialogueStateTracker.extract_reqiured_acquired_slots_ids_mapping( + act2act_id, slot_names, nlg_manager, parent_tracker) + + # todo why so ugly and duplicated in multiple users tracker + dialogue_state_tracker = MemorizingDialogueStateTracker(slot_names, nlg_manager.num_of_known_actions(), + nlg_manager.get_api_call_action_id(), + policy_network_params.hidden_size, +
database, + parent_tracker.domain_yml_path, + parent_tracker.stories_path) + + dialogue_state_tracker.ffill_act_ids2req_slots_ids = action_id2req_slots_ids + dialogue_state_tracker.ffill_act_ids2aqd_slots_ids = action_id2aqd_slots_ids + dialogue_state_tracker.act2act_id = act2act_id + dialogue_state_tracker.act_id2act = {v: k for k, v in act2act_id.items()} + dialogue_state_tracker._setup_action_ixes(dialogue_state_tracker.stories) + # endregion set formfilling info + return dialogue_state_tracker + + def _setup_action_ixes(self, stories_lidi): + for story in stories_lidi: + for el in story: + act_name_k = (el["action_name"],) + if act_name_k in self.act2act_id: + el["action_ix"] = self.act2act_id[act_name_k] + + def _load_stories(self, stories_path: Union[Path, str]): + story_lines = [] + with open(stories_path) as stories_f: + + for line in stories_f: + line = line.strip() + if not line: + continue + if line.startswith("##"): + story_lines.append([]) + else: + story_lines[-1].append(line) + stories = [] + for story in story_lines: + story_adj = [] + for turn_ix, turn in enumerate(story): + if turn_ix % 2 == 0: + continue # even positions are user turns, so we only iterate over system turns + else: + story_adj.append({ + "utter_needed": story[turn_ix-1].strip(" *"), # todo somewhere there exists a special method for this + "action_name": story[turn_ix].strip(" -"), # todo somewhere there exists a special method for this + }) + stories.append(story_adj) + return stories + + def update_previous_action(self, prev_act_id: int) -> None: + super().update_previous_action(prev_act_id) + act_name = self.act_id2act[prev_act_id][0] + for ix, (story_ptr, story) in enumerate(zip(self.stories_ptrs, self.stories)): + next_action_ix = story_ptr + 1 + if next_action_ix < len(story) and story[next_action_ix]["action_name"] == act_name: + self.stories_ptrs[ix] = next_action_ix @@ ... @@ def new_tracker(self): - # todo deprecated and never used? - tracker = DialogueStateTracker(self.base_tracker.slot_names, self.base_tracker.n_actions, - self.base_tracker.api_call_id, self.base_tracker.hidden_size, - self.base_tracker.database) + # todo deprecated and never used?
(response: nope, but should be removed in favor of init_new_tracker) + tracker = self.base_tracker.__class__( + self.base_tracker.slot_names, self.base_tracker.n_actions, + self.base_tracker.api_call_id, self.base_tracker.hidden_size, + self.base_tracker.database) return tracker def get_or_init_tracker(self, user_id: int): @@ -266,7 +355,12 @@ def init_new_tracker(self, user_id: int, tracker_entity: DialogueStateTracker) - ) tracker.ffill_act_ids2req_slots_ids = tracker_entity.ffill_act_ids2req_slots_ids tracker.ffill_act_ids2aqd_slots_ids = tracker_entity.ffill_act_ids2aqd_slots_ids - + if type(tracker_entity) == MemorizingDialogueStateTracker: + # wip: memorizing tracker and policy + tracker.act2act_id = tracker_entity.act2act_id + tracker.act_id2act = tracker_entity.act_id2act + if isinstance(tracker, MemorizingDialogueStateTracker): + tracker._setup_action_ixes(tracker.stories) self._ids_to_trackers[user_id] = tracker def reset(self, user_id: int = None) -> None: diff --git a/deeppavlov/models/go_bot/tracker/featurized_tracker.py b/deeppavlov/models/go_bot/tracker/featurized_tracker.py index ec1314036b..8ca23e17bd 100644 --- a/deeppavlov/models/go_bot/tracker/featurized_tracker.py +++ b/deeppavlov/models/go_bot/tracker/featurized_tracker.py @@ -7,7 +7,9 @@ from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.file import read_yaml from deeppavlov.core.common.registry import register -from deeppavlov.dataset_readers.md_yaml_dialogs_reader import DomainKnowledge, MD_YAML_DialogsDatasetReader +from deeppavlov.dataset_readers.md_yaml_dialogs_reader import \ + MD_YAML_DialogsDatasetReader +from deeppavlov.dataset_readers.dto.rasa.domain_knowledge import DomainKnowledge from deeppavlov.models.go_bot.nlu.dto.nlu_response import NLUResponse from deeppavlov.models.go_bot.tracker.dto.tracker_knowledge_interface import TrackerKnowledgeInterface from deeppavlov.models.go_bot.tracker.tracker_interface import TrackerInterface @@ -36,6 +38,7 @@ def __init__(self, # actions_required_acquired_slots_path: Optional[Union[str, Path]]=None, domain_yml_path: Optional[Union[str, Path]]=None, stories_yml_path: Optional[Union[str, Path]]=None, + tracker_mode: str = "NN", **kwargs) -> None: self.slot_names = list(slot_names) self.domain_yml_path = domain_yml_path @@ -44,6 +47,8 @@ def __init__(self, self._load_actions2slots_formfilling_info_from(domain_yml_path, stories_yml_path) self.history = [] self.current_features = None + assert tracker_mode in {"NN", "MEM"} + self.mode = tracker_mode @property def state_size(self) -> int: @@ -216,7 +221,7 @@ def read_md_story(story_path: Union[Path, str]) -> Dict[str, List[Dict]]: curr_action = step["action"] if curr_action.startswith("form"): curr_action = json.loads(curr_action[len("form"):])["name"] - print(curr_action) + # print(curr_action) if curr_action in form_names: prev_forms.append(curr_action) if curr_action in potential_api_or_db_actions: diff --git a/deeppavlov/models/intent_catcher/intent_catcher.py b/deeppavlov/models/intent_catcher/intent_catcher.py index 87d6d4162a..f49e440ced 100644 --- a/deeppavlov/models/intent_catcher/intent_catcher.py +++ b/deeppavlov/models/intent_catcher/intent_catcher.py @@ -23,7 +23,6 @@ import tensorflow as tf import tensorflow_hub as tfhub from overrides import overrides -from xeger import Xeger from deeppavlov.core.common.registry import register from deeppavlov.core.models.nn_model import NNModel @@ -71,7 +70,7 @@ def __init__(self, save_path: Union[str, Path], load_path: Union[str, Path], } if 
embeddings not in urls: raise Exception(f"Provided embeddings type `{embeddings}` is not available. Available embeddings are: use, use_large.") - self.limit = limit + embedder = tfhub.Module(urls[embeddings]) self.sentences = tf.placeholder(dtype=tf.string) self.embedded = embedder(self.sentences) @@ -151,54 +150,53 @@ def train_on_batch(self, x: list, y: list) -> List[float]: Train classifier on batch of data. Args: - x: List of input sentences + x: List of (regexp, sentence) tuples y: List of input encoded labels Returns: List[float]: list of losses. """ assert len(x) == len(y), "Number of labels is not equal to the number of sentences" - try: - regexps = {(re.compile(s), l) for s, l in zip(x, y)} - except Exception as e: - log.error(f"Some sentences are not a consitent regular expressions") - raise e - xeger = Xeger(self.limit) - self.regexps = self.regexps.union(regexps) - generated_x = [] - generated_y = [] - for s, l in zip(x, y): # generate samples and add regexp - gx = {xeger.xeger(s) for _ in range(self.limit)} - generated_x.extend(gx) - generated_y.extend([l for i in range(len(gx))]) - log.info(f"Original number of samples: {len(y)}, generated samples: {len(generated_y)}") - embedded_x = self.session.run(self.embedded, feed_dict={self.sentences:generated_x}) # actual trainig - loss = self.classifier.train_on_batch(embedded_x, generated_y) + + # zip below does [(r1, s1), (r2, s2), ..] -> [r1, r2, ..], [s1, s2, ..] + passed_regexps, passed_sents = zip(*x) + self.regexps = self.regexps.union(set(zip(passed_regexps, y))) + + # region actual training + embedded_sents = self.session.run(self.embedded, + feed_dict={self.sentences:passed_sents}) + loss = self.classifier.train_on_batch(embedded_sents, y) + # endregion actual training return loss def process_event(self, event_name, data): pass - def __call__(self, x: List[str]) -> List[int]: + def __call__(self, x: Union[List[str], List[tuple]]) -> List[int]: """ Predict probabilities. Args: - x: list of input sentences. + x: list of input sentences or list of (regexp, sentence) tuples Returns: list of probabilities. """ + if x and isinstance(x[0], tuple): + x = [sent for _re, sent in x] return self._predict_proba(x) - def _predict_label(self, sentences: List[str]) -> List[int]: + def _predict_label(self, sentences: Union[List[str], List[tuple]]) -> List[int]: """ Predict labels. Args: - x: list of input sentences. + sentences: list of input sentences or list of (regexp, sentence) tuples Returns: list of labels. """ + if sentences and isinstance(sentences[0], tuple): + sentences = [sent for _re, sent in sentences] + labels = [None for i in range(len(sentences))] indx = [] for i, s in enumerate(sentences): @@ -214,15 +212,17 @@ def _predict_label(self, sentences: List[str]) -> List[int]: labels[indx[i]] = l return labels - def _predict_proba(self, x: List[str]) -> List[float]: + def _predict_proba(self, x: Union[List[str], List[tuple]]) -> List[float]: """ Predict probabilities. Used in __call__. Args: - x: list of input sentences.
+ x: list of input sentences or list of (regexp, sentence) tuples Returns: list of probabilities """ + if x and isinstance(x[0], tuple): + x = [sent for _re, sent in x] x_embedded = self.session.run(self.embedded, feed_dict={self.sentences:x}) probs = self.classifier.predict_proba(x_embedded) _, num_labels = probs.shape diff --git a/deeppavlov/models/slotfill/slotfill_raw.py b/deeppavlov/models/slotfill/slotfill_raw.py index 39c7dff097..8f1d4ab9b5 100644 --- a/deeppavlov/models/slotfill/slotfill_raw.py +++ b/deeppavlov/models/slotfill/slotfill_raw.py @@ -25,7 +25,8 @@ from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component from deeppavlov.core.models.serializable import Serializable -from deeppavlov.dataset_readers.md_yaml_dialogs_reader import MD_YAML_DialogsDatasetReader, DomainKnowledge +from deeppavlov.dataset_readers.md_yaml_dialogs_reader import MD_YAML_DialogsDatasetReader +from deeppavlov.dataset_readers.dto.rasa.domain_knowledge import DomainKnowledge log = getLogger(__name__) @@ -175,7 +176,133 @@ def load(self, *args, **kwargs): """reads the slotfilling info from RASA-styled dataset""" domain_path = Path(self.load_path, MD_YAML_DialogsDatasetReader.DOMAIN_FNAME) nlu_path = Path(self.load_path, MD_YAML_DialogsDatasetReader.NLU_FNAME) - domain_knowledge = DomainKnowledge(read_yaml(domain_path)) + # domain_knowledge = DomainKnowledge(read_yaml(domain_path)) # todo: rewrite MD_YAML_DialogsDatasetReader so that public methods are enough - _, slot_name2text2value = MD_YAML_DialogsDatasetReader._read_intent2text_mapping(nlu_path, domain_knowledge) + data = MD_YAML_DialogsDatasetReader.read(self.load_path) + nlu_lines_trn = dict() + nlu_lines_tst = dict() + nlu_lines_val = dict() + if "train" in data: + nlu_lines_trn = data["train"]["nlu_lines"].slot_name2text2value + if "test" in data: + nlu_lines_tst = data["test"]["nlu_lines"].slot_name2text2value + if "valid" in data: + nlu_lines_val = data["valid"]["nlu_lines"].slot_name2text2value + slot_names = list(nlu_lines_trn.keys()) + \ + list(nlu_lines_tst.keys()) + \ + list(nlu_lines_val.keys()) + slot_name2text2value = dict() + for sname in slot_names: + stext2value = dict() + for sample in [nlu_lines_trn, + nlu_lines_tst, + nlu_lines_val]: + for stext, ssamples in sample.get(sname, {}).items(): + if stext not in stext2value: + stext2value[stext] = list() + stext2value[stext].extend(ssamples) + slot_name2text2value[sname] = stext2value self._slot_vals = slot_name2text2value + + +@register('slotfill_raw_memorizing') +class RASA_MemorizingSlotFillingComponent(SlotFillingComponent): + """Slot filling by memorizing the exact texts of training utterances""" + + def __init__(self, threshold: float = 0.7, return_all: bool = False, + **kwargs): + super().__init__(**kwargs) + self.threshold = threshold + self.return_all = return_all + # self._slot_vals is the dictionary of slot values + self._slot_vals = None + self.load() + + @overrides + def __call__(self, batch, *args, **kwargs): + slots = [{}] * len(batch) + + m = [i for i, v in enumerate(batch) if v] + if m: + batch = [batch[i] for i in m] + # batch example: [['is', 'there', 'anything', 'else']] + for i, text in zip(m, batch): + # tokens are ['is', 'there', 'anything', 'else'] + slots_values_lists = self._predict_slots(text) + if self.return_all: + slots[i] = dict(slots_values_lists) + else: + slots[i] = {slot: val_list[0] for slot, val_list in + slots_values_lists.items()} + # slots[i] example
{'food': 'steakhouse'} + # slots we want, example: [{'pricerange': 'moderate', 'area': 'south'}] + return slots + + def _predict_slots(self, text): + # for the given utterance, look up the memorized slot values + entities, slot_values = self._strict_finder(text) + return slot_values + + def load(self, *args, **kwargs): + """reads the slotfilling info from RASA-styled dataset""" + # todo: rewrite MD_YAML_DialogsDatasetReader so that public methods are enough + data = MD_YAML_DialogsDatasetReader.read(self.load_path) + text2slots = defaultdict(lambda: defaultdict(list)) + for data_split in ("train", "test", "valid"): + if data_split not in data: + continue + intent2slots2text = data[data_split]["nlu_lines"].intent2slots2text + for intent, slots2text in intent2slots2text.items(): + for slots_is in slots2text.values(): + for slots_i in slots_is: + text = slots_i.get("text", '') + slots_di = dict(slots_i.get("slots", [])) + for s, sv in slots_di.items(): + text2slots[text][s].append(sv) + + self._slot_vals = text2slots + + def deserialize(self, data): + self._slot_vals = json.loads(data) + + def save(self): + with open(self.save_path, 'w', encoding='utf8') as f: + json.dump(self._slot_vals, f) + + def _strict_finder(self, text): + slots = self._slot_vals.get(text, {}) + entities = list(slots.keys()) + return entities, slots
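To make the memorizing slot filler's contract concrete, here is a minimal self-contained sketch of the exact-match lookup that `_strict_finder` performs over `_slot_vals`; the utterance and slot values are hypothetical.

from collections import defaultdict

slot_vals = defaultdict(lambda: defaultdict(list))  # stands in for self._slot_vals
slot_vals["i want a cheap steakhouse"]["food"].append("steakhouse")
slot_vals["i want a cheap steakhouse"]["pricerange"].append("cheap")

def strict_find(text: str) -> dict:
    # exact utterance match only: a text unseen at train time produces no slots
    slots = slot_vals.get(text, {})
    return {slot: values[0] for slot, values in slots.items()}

assert strict_find("i want a cheap steakhouse") == {"food": "steakhouse", "pricerange": "cheap"}
assert strict_find("anything else") == {}

Because the lookup keys on the full utterance string, this component is only useful when evaluation utterances repeat the training texts verbatim.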