RasaHQ/rasa_core

View on GitHub
rasa/core/training/structures.py

Summary

Maintainability
F
4 days
Test Coverage
from collections import deque, defaultdict

import io
import sys
import json
import logging
import uuid
from typing import List, Text, Dict, Optional, Tuple, Any, Set, ValuesView

from rasa.core import utils
from rasa.core.actions.action import ACTION_LISTEN_NAME
from rasa.core.conversation import Dialogue
from rasa.core.domain import Domain
from rasa.core.events import (
    UserUttered, ActionExecuted,
    Form, FormValidation,
    SlotSet, Event,
    ActionExecutionRejected)

logger = logging.getLogger(__name__)

# Checkpoint id used to identify story starting blocks
STORY_START = "STORY_START"

# Checkpoint id used to identify story end blocks
STORY_END = None

# need abbreviations otherwise they are not visualized well
GENERATED_CHECKPOINT_PREFIX = "GENR_"
CHECKPOINT_CYCLE_PREFIX = "CYCL_"

GENERATED_HASH_LENGTH = 5

FORM_PREFIX = "form: "
# prefix for storystep ID to get reproducible sorting results
# will get increased with each new instance
STEP_COUNT = 1


class StoryStringHelper(object):
    """A helper class to mark story steps that are inside a form with `form: `
    """

    def __init__(self,
                 active_form=None,
                 form_validation=True,
                 form_rejected=False,
                 form_prefix_string='',
                 no_form_prefix_string=''):
        # track active form
        self.active_form = active_form
        # track whether a from should be validated
        self.form_validation = form_validation
        # track whether a from was rejected
        self.form_rejected = form_rejected
        # save story strings with form prefix for later
        self.form_prefix_string = form_prefix_string
        # save story strings without form prefix for later
        self.no_form_prefix_string = no_form_prefix_string


class Checkpoint(object):
    def __init__(self,
                 name: Optional[Text],
                 conditions: Optional[Dict[Text, Any]] = None) -> None:

        self.name = name
        self.conditions = conditions if conditions else {}

    def as_story_string(self):
        dumped_conds = json.dumps(self.conditions) if self.conditions else ""
        return "{}{}".format(self.name, dumped_conds)

    def filter_trackers(self, trackers):
        """Filters out all trackers that do not satisfy the conditions."""

        if not self.conditions:
            return trackers

        for slot_name, slot_value in self.conditions.items():
            trackers = [t
                        for t in trackers
                        if t.get_slot(slot_name) == slot_value]
        return trackers

    def __repr__(self):
        return "Checkpoint(name={!r}, conditions={})".format(
            self.name, json.dumps(self.conditions))


class StoryStep(object):
    def __init__(self,
                 block_name: Optional[Text] = None,
                 start_checkpoints: Optional[List[Checkpoint]] = None,
                 end_checkpoints: Optional[List[Checkpoint]] = None,
                 events: Optional[List[Event]] = None
                 ) -> None:

        self.end_checkpoints = end_checkpoints if end_checkpoints else []
        self.start_checkpoints = start_checkpoints if start_checkpoints else []
        self.events = events if events else []
        self.block_name = block_name
        # put a counter prefix to uuid to get reproducible sorting results
        global STEP_COUNT
        self.id = "{}_{}".format(STEP_COUNT, uuid.uuid4().hex)
        STEP_COUNT += 1

        self.story_string_helper = StoryStringHelper()

    def create_copy(self, use_new_id):
        copied = StoryStep(self.block_name, self.start_checkpoints,
                           self.end_checkpoints,
                           self.events[:])
        if not use_new_id:
            copied.id = self.id
        return copied

    def add_user_message(self, user_message):
        self.add_event(user_message)

    def add_event(self, event):
        self.events.append(event)

    @staticmethod
    def _checkpoint_string(story_step_element):
        return "> {}\n".format(story_step_element.as_story_string())

    @staticmethod
    def _user_string(story_step_element, e2e, prefix=''):
        return "* {}{}\n".format(prefix,
                                 story_step_element.as_story_string(e2e))

    def _store_user_strings(self, story_step_element, e2e, prefix=''):
        self.story_string_helper.no_form_prefix_string += self._user_string(
            story_step_element, e2e
        )
        self.story_string_helper.form_prefix_string += self._user_string(
            story_step_element, e2e, prefix
        )

    @staticmethod
    def _bot_string(story_step_element, prefix=''):
        return "    - {}{}\n".format(prefix,
                                     story_step_element.as_story_string())

    def _store_bot_strings(self, story_step_element, prefix=''):
        self.story_string_helper.no_form_prefix_string += self._bot_string(
            story_step_element
        )
        self.story_string_helper.form_prefix_string += self._bot_string(
            story_step_element, prefix
        )

    def _reset_stored_strings(self):
        self.story_string_helper.form_prefix_string = ''
        self.story_string_helper.no_form_prefix_string = ''

    def as_story_string(self, flat=False, e2e=False):
        # if the result should be flattened, we
        # will exclude the caption and any checkpoints.

        for s in self.start_checkpoints:
            if s.name == STORY_START:
                # first story step in the story, so reset helper
                self.story_string_helper = StoryStringHelper()

        if flat:
            result = ""
        else:
            result = "\n## {}\n".format(self.block_name)
            for s in self.start_checkpoints:
                if s.name != STORY_START:
                    result += self._checkpoint_string(s)

        for s in self.events:
            if isinstance(s, UserUttered):
                if self.story_string_helper.active_form is None:
                    result += self._user_string(s, e2e)
                else:
                    # form is active
                    # it is not known whether the form will be
                    # successfully executed, so store this
                    # story string for later
                    self._store_user_strings(s, e2e, FORM_PREFIX)

            elif isinstance(s, Form):
                # form got either activated or deactivated
                self.story_string_helper.active_form = s.name

                if self.story_string_helper.active_form is None:
                    # form deactivated, so form succeeded,
                    # so add story string with form prefix
                    result += self.story_string_helper.form_prefix_string
                    # remove all stored story strings
                    self._reset_stored_strings()

                result += self._bot_string(s)

            elif isinstance(s, FormValidation):
                self.story_string_helper.form_validation = s.validate

            elif isinstance(s, ActionExecutionRejected):
                if s.action_name == self.story_string_helper.active_form:
                    # form rejected
                    self.story_string_helper.form_rejected = True

            elif isinstance(s, ActionExecuted):
                if self._is_action_listen(s):
                    pass
                elif self.story_string_helper.active_form is None:
                    result += self._bot_string(s)
                else:
                    # form is active
                    if self.story_string_helper.form_rejected:
                        if (self.story_string_helper.form_validation and
                                s.action_name ==
                                self.story_string_helper.active_form):
                            result += self._bot_string(
                                ActionExecuted(ACTION_LISTEN_NAME))
                            result += (self.story_string_helper.
                                       form_prefix_string)
                        else:
                            result += (self.story_string_helper.
                                       no_form_prefix_string)
                        # form rejected, add story string without form prefix
                        result += self._bot_string(s)
                    else:
                        # form succeeded, so add story string with form prefix
                        result += (self.story_string_helper.
                                   form_prefix_string)
                        result += self._bot_string(s, FORM_PREFIX)

                    # remove all stored story strings
                    self._reset_stored_strings()

                    if s.action_name == self.story_string_helper.active_form:
                        # form was successfully executed
                        self.story_string_helper.form_rejected = False

                self.story_string_helper.form_validation = True

            elif isinstance(s, SlotSet):
                if self.story_string_helper.active_form is None:
                    result += self._bot_string(s)
                else:
                    # form is active
                    # it is not known whether the form will be
                    # successfully executed, so store this
                    # story string for later
                    # slots should be always printed without prefix
                    self._store_bot_strings(s)

            elif isinstance(s, Event):
                converted = s.as_story_string()
                if converted:
                    if self.story_string_helper.active_form is None:
                        result += self._bot_string(s)
                    else:
                        # form is active
                        # it is not known whether the form will be
                        # successfully executed, so store this
                        # story string for later
                        self._store_bot_strings(s, FORM_PREFIX)

            else:
                raise Exception("Unexpected element in story step: "
                                "{}".format(s))

        if (not self.end_checkpoints and
                self.story_string_helper.active_form is not None):
            # there are no end checkpoints
            # form is active
            # add story string with form prefix
            result += self.story_string_helper.form_prefix_string
            # remove all stored story strings
            self._reset_stored_strings()

        if not flat:
            for e in self.end_checkpoints:
                result += "> {}\n".format(e.as_story_string())
        return result

    @staticmethod
    def _is_action_listen(event):
        # this is not an `isinstance` because
        # we don't want to allow subclasses here
        return (type(event) == ActionExecuted and
                event.action_name == ACTION_LISTEN_NAME)

    def _add_action_listen(self, events):
        if not events or not self._is_action_listen(events[-1]):
            # do not add second action_listen
            events.append(ActionExecuted(ACTION_LISTEN_NAME))

    def explicit_events(self,
                        domain: Domain,
                        should_append_final_listen: bool = True) -> List[Event]:
        """Returns events contained in the story step
            including implicit events.

        Not all events are always listed in the story dsl. This
        includes listen actions as well as implicitly
        set slots. This functions makes these events explicit and
        returns them with the rest of the steps events."""

        events = []

        for e in self.events:
            if isinstance(e, UserUttered):
                self._add_action_listen(events)
                events.append(e)
                events.extend(domain.slots_for_entities(e.entities))
            else:
                events.append(e)

        if not self.end_checkpoints and should_append_final_listen:
            self._add_action_listen(events)

        return events

    def __repr__(self):
        return ("StoryStep("
                "block_name={!r}, "
                "start_checkpoints={!r}, "
                "end_checkpoints={!r}, "
                "events={!r})".format(self.block_name,
                                      self.start_checkpoints,
                                      self.end_checkpoints,
                                      self.events))


class Story(object):
    def __init__(self,
                 story_steps: List[StoryStep] = None,
                 story_name: Optional[Text] = None) -> None:
        self.story_steps = story_steps if story_steps else []
        self.story_name = story_name

    @staticmethod
    def from_events(events, story_name=None):
        """Create a story from a list of events."""

        story_step = StoryStep()
        for event in events:
            story_step.add_event(event)
        return Story([story_step], story_name)

    def as_dialogue(self, sender_id, domain):
        events = []
        for step in self.story_steps:
            events.extend(
                step.explicit_events(domain,
                                     should_append_final_listen=False))

        events.append(ActionExecuted(ACTION_LISTEN_NAME))
        return Dialogue(sender_id, events)

    def as_story_string(self, flat=False, e2e=False):
        story_content = ""

        # initialize helper for first story step
        story_string_helper = StoryStringHelper()

        for step in self.story_steps:
            # use helper from previous story step
            step.story_string_helper = story_string_helper
            # create string for current story step
            story_content += step.as_story_string(flat, e2e)
            # override helper for next story step
            story_string_helper = step.story_string_helper

        if flat:
            if self.story_name:
                name = self.story_name
            else:
                name = "Generated Story {}".format(hash(story_content))
            return "## {}\n{}".format(name, story_content)
        else:
            return story_content

    def dump_to_file(self, filename, flat=False, e2e=False):
        with open(filename, "a", encoding="utf-8") as f:
            f.write(self.as_story_string(flat, e2e))


class StoryGraph(object):
    def __init__(self,
                 story_steps: List[StoryStep],
                 story_end_checkpoints: Optional[Dict[Text, Text]] = None
                 ) -> None:
        self.story_steps = story_steps
        self.step_lookup = {s.id: s for s in self.story_steps}
        ordered_ids, cyclic_edges = StoryGraph.order_steps(story_steps)
        self.ordered_ids = ordered_ids
        self.cyclic_edge_ids = cyclic_edges
        if story_end_checkpoints:
            self.story_end_checkpoints = story_end_checkpoints
        else:
            self.story_end_checkpoints = {}

    def ordered_steps(self) -> List[StoryStep]:
        """Returns the story steps ordered by topological order of the DAG."""

        return [self.get(step_id) for step_id in self.ordered_ids]

    def cyclic_edges(
        self
    ) -> List[Tuple[Optional[StoryStep], Optional[StoryStep]]]:
        """Returns the story steps ordered by topological order of the DAG."""

        return [(self.get(source), self.get(target))
                for source, target in self.cyclic_edge_ids]

    @staticmethod
    def overlapping_checkpoint_names(cps: List[Checkpoint],
                                     other_cps: List[Checkpoint]) -> Set[Text]:
        """Find overlapping checkpoints names"""

        return {cp.name for cp in cps} & {cp.name for cp in other_cps}

    def with_cycles_removed(self) -> 'StoryGraph':
        """Create a graph with the cyclic edges removed from this graph."""

        story_end_checkpoints = self.story_end_checkpoints.copy()
        cyclic_edge_ids = self.cyclic_edge_ids
        # we need to remove the start steps and replace them with steps ending
        # in a special end checkpoint

        # as in python 3.5, dict is not ordered, in order to generate
        # reproducible result with random seed in python 3.5, we have
        # to use OrderedDict
        if sys.version_info >= (3, 6):
            story_steps = {s.id: s for s in self.story_steps}
        else:
            from collections import OrderedDict
            story_steps = OrderedDict([(s.id, s) for s in self.story_steps])

        # collect all overlapping checkpoints
        # we will remove unused start ones
        all_overlapping_cps = set()

        if self.cyclic_edge_ids:
            # we are going to do this in a recursive way. we are going to
            # remove one cycle and then we are going to
            # let the cycle detection run again
            # this is not inherently necessary so if this becomes a performance
            # issue, we can change it. It is actually enough to run the cycle
            # detection only once and then remove one cycle after another, but
            # since removing the cycle is done by adding / removing edges and
            #  nodes
            # the logic is a lot easier if we only need to make sure the
            # change is consistent if we only change one compared to
            # changing all of them.

            for s, e in cyclic_edge_ids:
                cid = utils.generate_id(max_chars=GENERATED_HASH_LENGTH)
                prefix = GENERATED_CHECKPOINT_PREFIX + CHECKPOINT_CYCLE_PREFIX
                # need abbreviations otherwise they are not visualized well
                sink_cp_name = prefix + "SINK_" + cid
                connector_cp_name = prefix + "CONN_" + cid
                source_cp_name = prefix + "SRC_" + cid
                story_end_checkpoints[sink_cp_name] = source_cp_name

                overlapping_cps = self.overlapping_checkpoint_names(
                    story_steps[s].end_checkpoints,
                    story_steps[e].start_checkpoints)

                all_overlapping_cps.update(overlapping_cps)

                # change end checkpoints of starts
                start = story_steps[s].create_copy(use_new_id=False)
                start.end_checkpoints = [cp
                                         for cp in start.end_checkpoints
                                         if cp.name not in overlapping_cps]
                start.end_checkpoints.append(Checkpoint(sink_cp_name))
                story_steps[s] = start

                needs_connector = False

                for k, step in list(story_steps.items()):
                    additional_ends = []
                    for original_cp in overlapping_cps:
                        for cp in step.start_checkpoints:
                            if cp.name == original_cp:
                                if k == e:
                                    cp_name = source_cp_name
                                else:
                                    cp_name = connector_cp_name
                                    needs_connector = True

                                if not self._is_checkpoint_in_list(
                                        cp_name, cp.conditions,
                                        step.start_checkpoints):
                                    # add checkpoint only if it was not added
                                    additional_ends.append(
                                        Checkpoint(cp_name, cp.conditions))

                    if additional_ends:
                        updated = step.create_copy(use_new_id=False)
                        updated.start_checkpoints.extend(additional_ends)
                        story_steps[k] = updated

                if needs_connector:
                    start.end_checkpoints.append(Checkpoint(connector_cp_name))

        # the process above may generate unused checkpoints
        # we need to find them and remove them
        self._remove_unused_generated_cps(story_steps,
                                          all_overlapping_cps,
                                          story_end_checkpoints)

        return StoryGraph(list(story_steps.values()),
                          story_end_checkpoints)

    @staticmethod
    def _checkpoint_difference(
        cps: List[Checkpoint],
        cp_name_to_ignore: Set[Text]
    ) -> List[Checkpoint]:
        """Finds checkpoints which names are
            different form names of checkpoints to ignore"""

        return [cp for cp in cps if cp.name not in cp_name_to_ignore]

    def _remove_unused_generated_cps(
        self,
        story_steps: Dict[Text, StoryStep],
        overlapping_cps: Set[Text],
        story_end_checkpoints: Dict[Text, Text]
    ) -> None:
        """Finds unused generated checkpoints
            and remove them from story steps."""

        unused_cps = self._find_unused_checkpoints(story_steps.values(),
                                                   story_end_checkpoints)

        unused_overlapping_cps = unused_cps.intersection(overlapping_cps)

        unused_genr_cps = {cp_name
                           for cp_name in unused_cps
                           if cp_name.startswith(GENERATED_CHECKPOINT_PREFIX)}

        k_to_remove = set()
        for k, step in story_steps.items():
            # changed all ends
            updated = step.create_copy(use_new_id=False)
            updated.start_checkpoints = self._checkpoint_difference(
                updated.start_checkpoints, unused_overlapping_cps)

            # remove generated unused end checkpoints
            updated.end_checkpoints = self._checkpoint_difference(
                updated.end_checkpoints, unused_genr_cps)

            if (step.start_checkpoints and not updated.start_checkpoints or
                    step.end_checkpoints and not updated.end_checkpoints):
                # remove story step if the generated checkpoints
                # were the only ones
                k_to_remove.add(k)

            story_steps[k] = updated

        # remove unwanted story steps
        for k in k_to_remove:
            del story_steps[k]

    @staticmethod
    def _is_checkpoint_in_list(checkpoint_name: Text,
                               conditions: Dict[Text, Any],
                               cps: List[Checkpoint]) -> bool:
        """Checks if checkpoint with name and conditions is
            already in the list of checkpoints."""

        for cp in cps:
            if checkpoint_name == cp.name and conditions == cp.conditions:
                return True
        return False

    @staticmethod
    def _find_unused_checkpoints(
        story_steps: ValuesView[StoryStep],
        story_end_checkpoints: Dict[Text, Text]
    ) -> Set[Text]:
        """Finds all unused checkpoints."""

        collected_start = {STORY_END, STORY_START}
        collected_end = {STORY_END, STORY_START}

        for step in story_steps:
            for start in step.start_checkpoints:
                collected_start.add(start.name)
            for end in step.end_checkpoints:
                start_name = story_end_checkpoints.get(end.name, end.name)
                collected_end.add(start_name)

        return collected_end.symmetric_difference(collected_start)

    def get(self, step_id: Text) -> Optional[StoryStep]:
        """Looks a story step up by its id."""

        return self.step_lookup.get(step_id)

    def as_story_string(self) -> Text:
        """Convert the graph into the story file format."""

        story_content = ""
        for step in self.story_steps:
            story_content += step.as_story_string(flat=False)
        return story_content

    @staticmethod
    def order_steps(
        story_steps: List[StoryStep]
    ) -> Tuple[deque, Set[Tuple[Text, Text]]]:
        """Topological sort of the steps returning the ids of the steps."""

        checkpoints = StoryGraph._group_by_start_checkpoint(story_steps)
        graph = {s.id: {other.id
                        for end in s.end_checkpoints
                        for other in checkpoints[end.name]}
                 for s in story_steps}
        return StoryGraph.topological_sort(graph)

    @staticmethod
    def _group_by_start_checkpoint(
        story_steps: List[StoryStep]
    ) -> Dict[Text, List[StoryStep]]:
        """Returns all the start checkpoint of the steps"""

        checkpoints = defaultdict(list)
        for step in story_steps:
            for start in step.start_checkpoints:
                checkpoints[start.name].append(step)
        return checkpoints

    @staticmethod
    def topological_sort(
        graph: Dict[Text, Set[Text]]
    ) -> Tuple[deque, Set[Tuple[Text, Text]]]:
        """Creates a top sort of a directed graph. This is an unstable sorting!

        The function returns the sorted nodes as well as the edges that need
        to be removed from the graph to make it acyclic (and hence, sortable).

        The graph should be represented as a dictionary, e.g.:

        >>> example_graph = {
        ...         "a": set("b", "c", "d"),
        ...         "b": set(),
        ...         "c": set("d"),
        ...         "d": set(),
        ...         "e": set("f"),
        ...         "f": set()}
        >>> StoryGraph.topological_sort(example_graph)
        (deque([u'e', u'f', u'a', u'c', u'd', u'b']), [])
        """

        # noinspection PyPep8Naming
        GRAY, BLACK = 0, 1

        ordered = deque()
        unprocessed = sorted(set(graph))
        visited_nodes = {}

        removed_edges = set()

        def dfs(node):
            visited_nodes[node] = GRAY
            for k in graph.get(node, set()):
                sk = visited_nodes.get(k, None)
                if sk == GRAY:
                    removed_edges.add((node, k))
                    continue
                if sk == BLACK:
                    continue
                unprocessed.remove(k)
                dfs(k)
            ordered.appendleft(node)
            visited_nodes[node] = BLACK

        while unprocessed:
            dfs(unprocessed.pop())
        return ordered, removed_edges

    def visualize(self, output_file=None):
        import networkx as nx
        from rasa.core.training import visualization
        from colorhash import ColorHash

        graph = nx.MultiDiGraph()
        next_node_idx = [0]
        nodes = {"STORY_START": 0, "STORY_END": -1}

        def ensure_checkpoint_is_drawn(cp):
            if cp.name not in nodes:
                next_node_idx[0] += 1
                nodes[cp.name] = next_node_idx[0]

                if cp.name.startswith(GENERATED_CHECKPOINT_PREFIX):
                    # colors generated checkpoints based on their hash
                    color = ColorHash(cp.name[-GENERATED_HASH_LENGTH:]).hex
                    graph.add_node(next_node_idx[0],
                                   label=utils.cap_length(cp.name),
                                   style="filled",
                                   fillcolor=color)
                else:
                    graph.add_node(next_node_idx[0],
                                   label=utils.cap_length(cp.name))

        graph.add_node(nodes["STORY_START"],
                       label="START",
                       fillcolor="green",
                       style="filled")
        graph.add_node(nodes["STORY_END"],
                       label="END",
                       fillcolor="red",
                       style="filled")

        for step in self.story_steps:
            next_node_idx[0] += 1
            step_idx = next_node_idx[0]

            graph.add_node(next_node_idx[0],
                           label=utils.cap_length(step.block_name),
                           style="filled",
                           fillcolor="lightblue",
                           shape="rect")

            for c in step.start_checkpoints:
                ensure_checkpoint_is_drawn(c)
                graph.add_edge(nodes[c.name], step_idx)
            for c in step.end_checkpoints:
                ensure_checkpoint_is_drawn(c)
                graph.add_edge(step_idx, nodes[c.name])

            if not step.end_checkpoints:
                graph.add_edge(step_idx, nodes["STORY_END"])

        if output_file:
            visualization.persist_graph(graph, output_file)

        return graph