tensorflow/models
research/cognitive_planning/envs/active_vision_dataset_env.py
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gym environment for the ActiveVision Dataset.

   The dataset is captured with a robot moving around and taking picture in
   multiple directions. The actions are moving in four directions, and rotate
   clockwise or counter clockwise. The observations are the output of vision
   pipelines such as object detectors. The goal is to find objects of interest
   in each environment. For more details, refer:
   http://cs.unc.edu/~ammirato/active_vision_dataset_website/.
"""
import tensorflow as tf
import collections
import copy
import json
import os
from io import StringIO
import time
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
from absl import logging
import gin
import cv2
import label_map_util
import visualization_utils as vis_util
from envs import task_env


register(
    id='active-vision-env-v0',
    entry_point=
    'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv',  # pylint: disable=line-too-long
)
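
# Illustrative usage of the registered id (commented out; assumes a gin config
# that binds the ActiveVisionDatasetEnv constructor arguments such as
# dataset_root, worlds, targets, etc.; the config file name below is
# hypothetical):
#
#   gin.parse_config_file('configs/avd_env.gin')
#   env = gym.make('active-vision-env-v0')
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())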

_MAX_DEPTH_VALUE = 12102

SUPPORTED_ACTIONS = [
    'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
SUPPORTED_MODALITIES = [
    task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
    task_env.ModalityTypes.DEPTH,
    task_env.ModalityTypes.OBJECT_DETECTION,
    task_env.ModalityTypes.IMAGE,
    task_env.ModalityTypes.GOAL,
    task_env.ModalityTypes.PREV_ACTION,
    task_env.ModalityTypes.DISTANCE,
]

# Data structure for storing the information related to the graph of the world.
_Graph = collections.namedtuple('_Graph', [
    'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal'
])


def _init_category_index(label_map_path):
  """Creates category index from class indexes to name of the classes.

  Args:
    label_map_path: path to the mapping.
  Returns:
    A map for mapping int keys to string categories.
  """

  label_map = label_map_util.load_labelmap(label_map_path)
  num_classes = max(x.id for x in label_map.item)
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes=num_classes, use_display_name=True)
  category_index = label_map_util.create_category_index(categories)
  return category_index


def _draw_detections(image_np, detections, category_index):
  """Draws detections on to the image.

  Args:
    image_np: Image in the form of uint8 numpy array.
    detections: a dictionary that contains the detection outputs.
    category_index: contains the mapping between indexes and the category names.

  Returns:
    Nothing. The detection boxes are drawn directly on image_np in place.
  """
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      detections['detection_boxes'],
      detections['detection_classes'],
      detections['detection_scores'],
      category_index,
      use_normalized_coordinates=True,
      max_boxes_to_draw=1000,
      min_score_thresh=.0,
      agnostic_mode=False)


def generate_detection_image(detections,
                             image_size,
                             category_map,
                             num_classes,
                             is_binary=True):
  """Generates one_hot vector of the image using the detection boxes.

  Args:
    detections: 2D object detections from the image. It's a dictionary that
      contains detection_boxes, detection_classes, and detection_scores with
      dimensions of nx4, nx1, nx1 where n is the number of detections.
    image_size: The resolution of the output image.
    category_map: dictionary that maps detection class ids to channel indexes.
    num_classes: Number of classes.
    is_binary: If true, it sets the corresponding channels to 0 and 1.
      Otherwise, sets the score in the corresponding channel.
  Returns:
    Returns image_size x image_size x num_classes image for the detection boxes.
  """
  res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
  boxes = detections['detection_boxes']
  labels = detections['detection_classes']
  scores = detections['detection_scores']
  for box, label, score in zip(boxes, labels, scores):
    transformed_boxes = [int(round(t)) for t in box * image_size]
    y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with zero area
    # are placeholders that do not correspond to any real detection, so we
    # skip them.
    if (y2 - y1) * (x2 - x1) == 0:
      continue
    assert category_map[label] < num_classes, 'label = {}'.format(label)
    value = score
    if is_binary:
      value = 1
    res[y1:y2, x1:x2, category_map[label]] = value
  return res
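
# Illustrative example (hypothetical detection values): a single confident
# detection covering the top-left quarter of the view fills the corresponding
# channel of the output with ones.
#
#   detections = {
#       'detection_boxes': np.array([[0., 0., 0.5, 0.5]]),  # normalized ymin, xmin, ymax, xmax
#       'detection_classes': np.array([5]),                 # hypothetical class id
#       'detection_scores': np.array([0.9]),
#   }
#   det_img = generate_detection_image(
#       detections, image_size=64, category_map={5: 0}, num_classes=90)
#   # det_img.shape == (64, 64, 90) and det_img[:32, :32, 0] is all ones.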


def _get_detection_path(root, detection_folder_name, world):
  return os.path.join(root, 'Meta', detection_folder_name, world + '.npy')


def _get_image_folder(root, world):
  return os.path.join(root, world, 'jpg_rgb')


def _get_json_path(root, world):
  return os.path.join(root, world, 'annotations.json')


def _get_image_path(root, world, image_id):
  return os.path.join(_get_image_folder(root, world), image_id + '.jpg')


def _get_image_list(path, worlds):
  """Builds a dictionary for all the worlds.

  Args:
    path: the path to the root of the dataset.
    worlds: list of the worlds.

  Returns:
    Dictionary where the keys are the world names and the values
    are the image ids of that world.
  """
  world_id_dict = {}
  for loc in worlds:
    files = [
        t[:-4] for t in tf.gfile.ListDirectory(_get_image_folder(path, loc))
    ]
    world_id_dict[loc] = files
  return world_id_dict


def read_all_poses(dataset_root, world):
  """Reads all the poses for each world.

  Args:
    dataset_root: the path to the root of the dataset.
    world: string, name of the world.

  Returns:
    Dictionary of poses for all the images in each world. The key is the image
    id of each view and the value is a tuple of (x, z, R, scale), where x and z
    are the first and third coordinates of the translation, R is the 3x3
    rotation matrix, and scale is a float scalar by which x and z need to be
    multiplied in order to get the real world coordinates.

  Raises:
    ValueError: if the number of images does not match the number of poses
      read.
  """
  path = os.path.join(dataset_root, world, 'image_structs.mat')
  with tf.gfile.Open(path) as f:
    data = sio.loadmat(f)
  xyz = data['image_structs']['world_pos']
  image_names = data['image_structs']['image_name'][0]
  rot = data['image_structs']['R'][0]
  scale = data['scale'][0][0]
  n = xyz.shape[1]
  x = [xyz[0][i][0][0] for i in range(n)]
  z = [xyz[0][i][2][0] for i in range(n)]
  names = [name[0][:-4] for name in image_names]
  if len(names) != len(x):
    raise ValueError('number of image names is not equal to the number of '
                     'poses: {} != {}'.format(len(names), len(x)))
  output = {}
  for i in range(n):
    if rot[i].shape[0] != 0:
      assert rot[i].shape[0] == 3
      assert rot[i].shape[1] == 3
      output[names[i]] = (x[i], z[i], rot[i], scale)
    else:
      output[names[i]] = (x[i], z[i], None, scale)

  return output
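
# Illustrative usage (assumes the Active Vision Dataset is available locally;
# 'Home_001_1' is one of the standard AVD scene names and some_image_id is any
# key of the returned dictionary):
#
#   poses = read_all_poses('/path/to/ActiveVisionDataset', 'Home_001_1')
#   x, z, rotation, scale = poses[some_image_id]
#   # x * scale and z * scale give the real world position of the view;
#   # rotation is a 3x3 matrix, or None when it is missing for that view.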


def read_cached_data(should_load_images, dataset_root, segmentation_file_name,
                     targets_file_name, output_size):
  """Reads all the necessary cached data.

  Args:
    should_load_images: whether to load the images or not.
    dataset_root: path to the root of the dataset.
    segmentation_file_name: The name of the file that contains semantic
      segmentation annotations.
    targets_file_name: The name of the file that contains targets annotated for
      each world.
    output_size: Size of the output images. This is used for pre-processing the
      loaded images.
  Returns:
    Dictionary of all the cached data.
  """

  load_start = time.time()
  result_data = {}

  annotated_target_path = os.path.join(dataset_root, 'Meta',
                                       targets_file_name + '.npy')

  logging.info('loading targets: %s', annotated_target_path)
  with tf.gfile.Open(annotated_target_path) as f:
    result_data['targets'] = np.load(f).item()

  depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy')
  logging.info('loading depth: %s', depth_image_path)
  with tf.gfile.Open(depth_image_path) as f:
    depth_data = np.load(f).item()

  logging.info('processing depth')
  for home_id in depth_data:
    images = depth_data[home_id]
    for image_id in images:
      depth = images[image_id]
      depth = cv2.resize(
          depth / _MAX_DEPTH_VALUE, (output_size, output_size),
          interpolation=cv2.INTER_NEAREST)
      depth_mask = (depth > 0).astype(np.float32)
      depth = np.dstack((depth, depth_mask))
      images[image_id] = depth
  result_data[task_env.ModalityTypes.DEPTH] = depth_data

  sseg_path = os.path.join(dataset_root, 'Meta',
                           segmentation_file_name + '.npy')
  logging.info('loading sseg: %s', sseg_path)
  with tf.gfile.Open(sseg_path) as f:
    sseg_data = np.load(f).item()

  logging.info('processing sseg')
  for home_id in sseg_data:
    images = sseg_data[home_id]
    for image_id in images:
      sseg = images[image_id]
      sseg = cv2.resize(
          sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
      images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32)
  result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data

  if should_load_images:
    image_path = os.path.join(dataset_root, 'Meta/imgs.npy')
    logging.info('loading imgs: %s', image_path)
    with tf.gfile.Open(image_path) as f:
      image_data = np.load(f).item()

    result_data[task_env.ModalityTypes.IMAGE] = image_data

  with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f:
    result_data['world_id_dict'] = np.load(f).item()

  logging.info('loading done in %f seconds', time.time() - load_start)
  return result_data
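
# For reference, read_cached_data and the path helpers above assume a dataset
# layout roughly like the following (names in angle brackets are configurable
# arguments, everything else is read verbatim):
#
#   <dataset_root>/
#     <world>/jpg_rgb/*.jpg             # raw views (_get_image_path)
#     <world>/annotations.json          # movement graph (_get_json_path)
#     <world>/image_structs.mat         # camera poses (read_all_poses)
#     Meta/<targets_file_name>.npy      # annotated goal views per world
#     Meta/<segmentation_file_name>.npy
#     Meta/depth_imgs.npy
#     Meta/imgs.npy                     # only read if should_load_images
#     Meta/world_id_dict.npy
#     Meta/<detection_folder_name>/<world>.npy  # cached detections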


@gin.configurable
def get_spec_dtype_map():
  return {gym.spaces.Box: np.float32}


@gin.configurable
class ActiveVisionDatasetEnv(task_env.TaskEnv):
  """Simulates the environment from ActiveVisionDataset."""
  cached_data = None

  def __init__(
      self,
      episode_length,
      modality_types,
      confidence_threshold,
      output_size,
      worlds,
      targets,
      compute_distance,
      should_draw_detections,
      dataset_root,
      labelmap_path,
      reward_collision,
      reward_goal_range,
      num_detection_classes,
      segmentation_file_name,
      detection_folder_name,
      actions,
      targets_file_name,
      eval_init_points_file_name=None,
      shaped_reward=False,
  ):
    """Instantiates the environment for ActiveVision Dataset.

    Args:
      episode_length: the length of each episode.
      modality_types: a list of the strings where each entry indicates the name
        of the modalities to be loaded. Valid entries are "sseg", "det",
        "depth", "image", "distance", and "prev_action". "distance" should be
        used for computing metrics in tf agents.
      confidence_threshold: Consider detections more than confidence_threshold
        for potential targets.
      output_size: Resolution of the output image.
      worlds: List of the name of the worlds.
      targets: List of the target names. Each entry is a string label of the
        target category (e.g. 'fridge', 'microwave', so on).
      compute_distance: If True, outputs the distance of the view to the goal.
      should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
      dataset_root: the path to the root folder of the dataset.
      labelmap_path: path to the dictionary that converts label strings to
        indexes.
      reward_collision: the reward the agents get after hitting an obstacle.
        It should be a non-positive number.
      reward_goal_range: the number of steps from the goal within which the
        agent is considered to have reached the goal. If the agent's distance
        is less than the specified goal range, the episode also finishes by
        setting done = True.
      num_detection_classes: number of classes that detector outputs.
      segmentation_file_name: the name of the file that contains the semantic
        information. The file should be in the dataset_root/Meta/ folder.
      detection_folder_name: Name of the folder that contains the detections
        for each world. The folder should be under dataset_root/Meta/ folder.
      actions: The list of the action names. Valid entries are listed in
        SUPPORTED_ACTIONS.
      targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
      eval_init_points_file_name: The name of the file that contains the initial
        points for evaluating the performance of the agent. If set to None,
        episodes start at random locations. Should be only set for evaluation.
      shaped_reward: Whether to add delta goal distance to the reward each step.

    Raises:
      ValueError: If one of the targets is not available in the annotated
        targets or one of the modality names is not from the set specified
        above.
      ValueError: If one of the actions is not in SUPPORTED_ACTIONS.
      ValueError: If the reward_collision is a positive number.
      ValueError: If there is no action other than stop provided.
    """
    if reward_collision > 0:
      raise ValueError('"reward" for collision should be non positive')

    if reward_goal_range < 0:
      logging.warning('environment does not terminate the episode even when '
                      'the agent gets close to the goal')

    if not modality_types:
      raise ValueError('modality names can not be empty')

    for name in modality_types:
      if name not in SUPPORTED_MODALITIES:
        raise ValueError('invalid modality type: {}'.format(name))

    actions_other_than_stop_found = False
    for a in actions:
      if a != 'stop':
        actions_other_than_stop_found = True
      if a not in SUPPORTED_ACTIONS:
        raise ValueError('invalid action: {}'.format(a))

    if not actions_other_than_stop_found:
      raise ValueError('environment needs to have actions other than stop.')

    super(ActiveVisionDatasetEnv, self).__init__()

    self._episode_length = episode_length
    self._modality_types = set(modality_types)
    self._confidence_threshold = confidence_threshold
    self._output_size = output_size
    self._dataset_root = dataset_root
    self._worlds = worlds
    self._targets = targets
    self._all_graph = {}
    for world in self._worlds:
      with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f:
        file_content = f.read()
        file_content = file_content.replace('.jpg', '')
        io = StringIO(file_content)
        self._all_graph[world] = json.load(io)

    self._cur_world = ''
    self._cur_image_id = ''
    self._cur_graph = None  # Loaded by _update_graph
    self._steps_taken = 0
    self._last_action_success = True
    self._category_index = _init_category_index(labelmap_path)
    self._category_map = dict(
        [(c, i) for i, c in enumerate(self._category_index)])
    self._detection_cache = {}
    if not ActiveVisionDatasetEnv.cached_data:
      ActiveVisionDatasetEnv.cached_data = read_cached_data(
          True, self._dataset_root, segmentation_file_name, targets_file_name,
          self._output_size)
    cached_data = ActiveVisionDatasetEnv.cached_data

    self._world_id_dict = cached_data['world_id_dict']
    self._depth_images = cached_data[task_env.ModalityTypes.DEPTH]
    self._semantic_segmentations = cached_data[
        task_env.ModalityTypes.SEMANTIC_SEGMENTATION]
    self._annotated_targets = cached_data['targets']
    self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE]
    self._graph_cache = {}
    self._compute_distance = compute_distance
    self._should_draw_detections = should_draw_detections
    self._reward_collision = reward_collision
    self._reward_goal_range = reward_goal_range
    self._num_detection_classes = num_detection_classes
    self._actions = actions
    self._detection_folder_name = detection_folder_name
    self._shaped_reward = shaped_reward

    self._eval_init_points = None
    if eval_init_points_file_name is not None:
      self._eval_init_index = 0
      init_points_path = os.path.join(self._dataset_root, 'Meta',
                                      eval_init_points_file_name + '.npy')
      with tf.gfile.Open(init_points_path) as points_file:
        data = np.load(points_file).item()
      self._eval_init_points = []
      for world in self._worlds:
        for goal in self._targets:
          if world in self._annotated_targets[goal]:
            for image_id in data[world]:
              self._eval_init_points.append((world, image_id[0], goal))
        logging.info('loaded %d eval init points', len(self._eval_init_points))

    self.action_space = gym.spaces.Discrete(len(self._actions))

    obs_shapes = {}
    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 1))
    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box(
          low=0,
          high=255,
          shape=(self._output_size, self._output_size,
                 self._num_detection_classes))
    if task_env.ModalityTypes.DEPTH in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box(
          low=0,
          high=_MAX_DEPTH_VALUE,
          shape=(self._output_size, self._output_size, 2))
    if task_env.ModalityTypes.IMAGE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 3))
    if task_env.ModalityTypes.GOAL in self._modality_types:
      obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._targets),))
    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._actions) + 1,))
    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box(
          low=0, high=255, shape=(1,))
    self.observation_space = gym.spaces.Dict(obs_shapes)

    self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32)

    # Loading all the poses.
    all_poses = {}
    for world in self._worlds:
      all_poses[world] = read_all_poses(self._dataset_root, world)
    self._cached_poses = all_poses
    self._vertex_to_pose = {}
    self._pose_to_vertex = {}

  @property
  def actions(self):
    """Returns list of actions for the env."""
    return self._actions

  def _next_image(self, image_id, action):
    """Given the action, returns the name of the image that agent ends up in.

    Args:
      image_id: The image id of the current view.
      action: a movement or rotation action (any entry of SUPPORTED_ACTIONS
        other than 'stop'). Each rotation is 30 degrees.

    Returns:
      The image name for the next location of the agent. If the action results
      in collision or it is not possible for the agent to execute that action,
      returns empty string.
    """
    assert action in self._actions, 'invalid action : {}'.format(action)
    assert self._cur_world in self._all_graph, 'invalid world {}'.format(
        self._cur_world)
    assert image_id in self._all_graph[
        self._cur_world], 'image_id {} is not in {}'.format(
            image_id, self._cur_world)
    return self._all_graph[self._cur_world][image_id][action]

  def _largest_detection_for_image(self, image_id, detections_dict):
    """Assigns area of the largest box for the view with given image id.

    Args:
      image_id: Image id of the view.
      detections_dict: Detections for the view.
    """
    for cls, box, score in zip(detections_dict['detection_classes'],
                               detections_dict['detection_boxes'],
                               detections_dict['detection_scores']):
      if cls not in self._targets:
        continue
      if score < self._confidence_threshold:
        continue
      ymin, xmin, ymax, xmax = box
      area = (ymax - ymin) * (xmax - xmin)
      if abs(area) < 1e-5:
        continue
      if image_id not in self._detection_area:
        self._detection_area[image_id] = area
      else:
        self._detection_area[image_id] = max(self._detection_area[image_id],
                                             area)

  def _compute_goal_indexes(self):
    """Computes the goal indexes for the environment.

    Returns:
      The indexes of the goal vertexes that are closest to the target
      categories. A vertex is a goal vertex if the desired objects are detected
      in its image and the target categories are not seen by moving forward
      from that vertex.
    """
    for image_id in self._world_id_dict[self._cur_world]:
      detections_dict = self._detection_table[image_id]
      self._largest_detection_for_image(image_id, detections_dict)
    goal_indexes = []
    for image_id in self._world_id_dict[self._cur_world]:
      if image_id not in self._detection_area:
        continue
      # Skip views whose largest detection box is too small.
      if self._detection_area[image_id] < 0.01:
        continue
      ok = True
      next_image_id = self._next_image(image_id, 'forward')
      if next_image_id:
        if next_image_id in self._detection_area:
          ok = False
      if ok:
        goal_indexes.append(self._cur_graph.id_to_index[image_id])
    return goal_indexes

  def to_image_id(self, vid):
    """Converts vertex id to the image id.

    Args:
      vid: vertex id of the view.
    Returns:
      image id of the input vertex id.
    """
    return self._cur_graph.index_to_id[vid]

  def to_vertex(self, image_id):
    return self._cur_graph.id_to_index[image_id]

  def observation(self, view_pose):
    """Returns the observation at the given the vertex.

    Args:
      view_pose: pose of the view of interest.

    Returns:
      Observation at the given view point.

    Raises:
      ValueError: if the given view pose is not similar to any of the poses in
        the current world.
    """
    vertex = self.pose_to_vertex(view_pose)
    if vertex is None:
      raise ValueError('The given pose is not close enough to any of the poses'
                       ' in the environment.')
    image_id = self._cur_graph.index_to_id[vertex]
    output = collections.OrderedDict()

    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      output[task_env.ModalityTypes.
             SEMANTIC_SEGMENTATION] = self._semantic_segmentations[
                 self._cur_world][image_id]

    detection = None
    need_det = (
        task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or
        (task_env.ModalityTypes.IMAGE in self._modality_types and
         self._should_draw_detections))
    if need_det:
      detection = self._detection_table[image_id]
      detection_image = generate_detection_image(
          detection,
          self._output_size,
          self._category_map,
          num_classes=self._num_detection_classes)

    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image

    if task_env.ModalityTypes.DEPTH in self._modality_types:
      output[task_env.ModalityTypes.DEPTH] = self._depth_images[
          self._cur_world][image_id]

    if task_env.ModalityTypes.IMAGE in self._modality_types:
      output_img = self._cached_imgs[self._cur_world][image_id]
      if self._should_draw_detections:
        output_img = output_img.copy()
        _draw_detections(output_img, detection, self._category_index)
      output[task_env.ModalityTypes.IMAGE] = output_img

    if task_env.ModalityTypes.GOAL in self._modality_types:
      goal = np.zeros((len(self._targets),), dtype=np.float32)
      goal[self._targets.index(self._cur_goal)] = 1.
      output[task_env.ModalityTypes.GOAL] = goal

    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action

    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      output[task_env.ModalityTypes.DISTANCE] = np.asarray(
          [self.gt_value(self._cur_goal, vertex)], dtype=np.float32)

    return output

  def _step_no_reward(self, action):
    """Performs a step in the environment with given action.

    Args:
      action: Action that is used to step in the environment. Action can be
        string or integer. If the type is integer then it uses the ith element
        from self._actions list. Otherwise, uses the string value as the action.

    Returns:
      observation, done, info
      observation: dictionary that contains all the observations specified in
        modality_types.
        observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the
        detection of the current view.
        observation[task_env.ModalityTypes.IMAGE]: contains the
          image of the current view. Note that if using the images for training,
          should_load_images should be set to false.
        observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the
          semantic segmentation of the current view.
        observation[task_env.ModalityTypes.DEPTH]: If selected, returns the
          depth map for the current view.
        observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
          a numpy array of shape (action_size + 1,). The first action_size
          elements indicate the action and the last element indicates whether
          the previous action was successful or not.
      done: True after episode_length steps have been taken, False otherwise.
      info: Empty dictionary.

    Raises:
      ValueError: for invalid actions.
    """
    # Primarily used for gym interface.
    if not isinstance(action, str):
      if not self.action_space.contains(action):
        raise ValueError('Not a valid action: {}'.format(action))

      action = self._actions[action]

    if action not in self._actions:
      raise ValueError('Not a valid action: {}'.format(action))

    action_index = self._actions.index(action)

    if action == 'stop':
      next_image_id = self._cur_image_id
      done = True
      success = True
    else:
      next_image_id = self._next_image(self._cur_image_id, action)
      self._steps_taken += 1
      done = False
      success = True
    if not next_image_id:
      success = False
    else:
      self._cur_image_id = next_image_id

    if self._steps_taken >= self._episode_length:
      done = True

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))

    # Concatenation of the one-hot previous action and a binary flag for
    # whether the previous action succeeded.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[action_index] = 1.
    self._prev_action[-1] = float(success)

    distance_to_goal = self.gt_value(self._cur_goal, cur_vertex)
    if success:
      if distance_to_goal <= self._reward_goal_range:
        done = True

    return observation, done, {'success': success}
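
  # For reference, the PREV_ACTION vector built above is a one-hot encoding of
  # the executed action followed by a success bit. Illustrative example
  # (assuming actions follow the SUPPORTED_ACTIONS order ['right', 'rotate_cw',
  # 'rotate_ccw', 'forward', 'left', 'backward', 'stop']): after a successful
  # 'forward' step the PREV_ACTION observation would be
  #   [0., 0., 0., 1., 0., 0., 0., 1.]
  # while a collision on the same action would end with 0. instead of 1.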

  @property
  def graph(self):
    return self._cur_graph.graph

  def state(self):
    return self.vertex_to_pose(self.to_vertex(self._cur_image_id))

  def gt_value(self, goal, v):
    """Computes the distance to the goal from vertex v.

    Args:
      goal: name of the goal.
      v: vertex id.

    Returns:
      Minimum number of steps to the given goal.
    """
    assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal)
    assert v in self._cur_graph.distance_to_goal[goal]
    res = self._cur_graph.distance_to_goal[goal][v]
    return res

  def _update_graph(self):
    """Creates the graph for each environment and updates the _cur_graph."""
    if self._cur_world not in self._graph_cache:
      graph = nx.DiGraph()
      id_to_index = {}
      index_to_id = {}
      image_list = self._world_id_dict[self._cur_world]
      for i, image_id in enumerate(image_list):
        id_to_index[image_id] = i
        index_to_id[i] = image_id
        graph.add_node(i)

      for image_id in image_list:
        for action in self._actions:
          if action == 'stop':
            continue
          next_image = self._all_graph[self._cur_world][image_id][action]
          if next_image:
            graph.add_edge(
                id_to_index[image_id], id_to_index[next_image], action=action)
      target_indexes = {}
      number_of_nodes_without_targets = graph.number_of_nodes()
      distance_to_goal = {}
      for goal in self._targets:
        if self._cur_world not in self._annotated_targets[goal]:
          continue
        goal_indexes = [
            id_to_index[i]
            for i in self._annotated_targets[goal][self._cur_world]
            if i
        ]
        super_source_index = graph.number_of_nodes()
        target_indexes[goal] = super_source_index
        graph.add_node(super_source_index)
        index_to_id[super_source_index] = goal
        id_to_index[goal] = super_source_index
        for v in goal_indexes:
          graph.add_edge(v, super_source_index, action='stop')
          graph.add_edge(super_source_index, v, action='stop')
        distance_to_goal[goal] = {}
        for v in range(number_of_nodes_without_targets):
          distance_to_goal[goal][v] = len(
              nx.shortest_path(graph, v, super_source_index)) - 2

      self._graph_cache[self._cur_world] = _Graph(
          graph, id_to_index, index_to_id, target_indexes, distance_to_goal)
    self._cur_graph = self._graph_cache[self._cur_world]

  def reset_for_eval(self, new_world, new_goal, new_image_id):
    """Resets to the given goal and image_id."""
    return self._reset_env(
        new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)

  def get_init_config(self, path):
    """Exposes the initial state of the agent for the given path.

    Args:
      path: sequences of the vertexes that the agent moves.

    Returns:
      image_id of the first view, world, and the goal.
    """
    return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal

  def _reset_env(
      self,
      new_world=None,
      new_goal=None,
      new_image_id=None,
  ):
    """Resets the agent in a random world and random id.

    Args:
      new_world: If not None, sets the new world to new_world.
      new_goal: If not None, sets the new goal to new_goal.
      new_image_id: If not None, sets the first image id to new_image_id.

    Returns:
      observation: dictionary of the observations. Content of the observation
      is similar to that of the step function.
    Raises:
      ValueError: if it can't find a world and annotated goal.
    """
    self._steps_taken = 0
    # The first prev_action is a special all-zero vector with success=1.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[len(self._actions)] = 1.
    if self._eval_init_points is not None:
      if self._eval_init_index >= len(self._eval_init_points):
        self._eval_init_index = 0
      a = self._eval_init_points[self._eval_init_index]
      self._cur_world, self._cur_image_id, self._cur_goal = a
      self._eval_init_index += 1
    elif not new_world:
      attempts = 100
      found = False
      while attempts >= 0:
        attempts -= 1
        self._cur_goal = np.random.choice(self._targets)
        available_worlds = list(
            set(self._annotated_targets[self._cur_goal].keys()).intersection(
                set(self._worlds)))
        if available_worlds:
          found = True
          break
      if not found:
        raise ValueError('could not find a world that has a target annotated')
      self._cur_world = np.random.choice(available_worlds)
    else:
      self._cur_world = new_world
      self._cur_goal = new_goal
      if new_world not in self._annotated_targets[new_goal]:
        return None

    self._cur_goal_index = self._targets.index(self._cur_goal)
    if new_image_id:
      self._cur_image_id = new_image_id
    else:
      self._cur_image_id = np.random.choice(
          self._world_id_dict[self._cur_world])
    if self._cur_world not in self._detection_cache:
      with tf.gfile.Open(
          _get_detection_path(self._dataset_root, self._detection_folder_name,
                              self._cur_world)) as f:
        # Each file contains a dictionary with image ids as keys and detection
        # dicts as values.
        self._detection_cache[self._cur_world] = np.load(f).item()
    self._detection_table = self._detection_cache[self._cur_world]
    self._detection_area = {}
    self._update_graph()
    if self._cur_world not in self._vertex_to_pose:
      # Add a fake pose for the super node of each target category.
      self._vertex_to_pose[self._cur_world] = {
          index: (-index,) for index in self._cur_graph.target_indexes.values()
      }
      # Calling vertex_to_pose for each vertex fills out the dictionaries that
      # contain pose related data.
      for image_id in self._world_id_dict[self._cur_world]:
        self.vertex_to_pose(self.to_vertex(image_id))

      # Filling out pose_to_vertex from vertex_to_pose.
      self._pose_to_vertex[self._cur_world] = {
          tuple(v): k
          for k, v in self._vertex_to_pose[self._cur_world].items()
      }

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))
    return observation

  def cur_vertex(self):
    return self._cur_graph.id_to_index[self._cur_image_id]

  def cur_image_id(self):
    return self._cur_image_id

  def path_to_goal(self, image_id=None):
    """Returns the path from image_id to the self._cur_goal.

    Args:
      image_id: If set to None, computes the path from the current view.
        Otherwise, sets the current view to the given image_id.
    Returns:
      The path to the goal.
    Raises:
      Exception if there's no path from the view to the goal.
    """
    if image_id is None:
      image_id = self._cur_image_id
    super_source = self._cur_graph.target_indexes[self._cur_goal]
    try:
      path = nx.shortest_path(self._cur_graph.graph,
                              self._cur_graph.id_to_index[image_id],
                              super_source)
    except:
      print('path not found in world = {}, image_id = {}'.format(
          self._cur_world, self._cur_image_id))
      raise
    return path[:-1]

  def targets(self):
    return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])]

  def vertex_to_pose(self, v):
    """Returns pose of the view for a given vertex.

    Args:
      v: integer, vertex index.

    Returns:
      (x, z, dir_x, dir_z) where x and z are the translation and dir_x, dir_z
        are a vector giving the direction of the view.
    """
    if v in self._vertex_to_pose[self._cur_world]:
      return np.copy(self._vertex_to_pose[self._cur_world][v])

    x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id(
        v)]
    if rot is None:  # if rotation is not provided for the given vertex.
      self._vertex_to_pose[self._cur_world][v] = np.asarray(
          [x * scale, z * scale, v])
      return np.copy(self._vertex_to_pose[self._cur_world][v])
    # Multiply the transpose of the rotation matrix by [0, 0, 1] to get a unit
    # vector in the direction of the view ray.
    direction = np.zeros((3, 1), dtype=np.float32)
    direction[2][0] = 1
    direction = np.matmul(np.transpose(rot), direction)
    direction = [direction[0][0], direction[2][0]]
    self._vertex_to_pose[self._cur_world][v] = np.asarray(
        [x * scale, z * scale, direction[0], direction[1]])
    return np.copy(self._vertex_to_pose[self._cur_world][v])

  def pose_to_vertex(self, pose):
    """Returns the vertex id for the given pose."""
    if tuple(pose) not in self._pose_to_vertex[self._cur_world]:
      raise ValueError(
          'The given pose is not present in the dictionary: {}'.format(
              tuple(pose)))

    return self._pose_to_vertex[self._cur_world][tuple(pose)]

  def check_scene_graph(self, world, goal):
    """Checks the connectivity of the scene graph.

    Goes over all the views and computes the shortest path to the goal. If this
    crashes, the graph is not connected. Otherwise, the env graph is fine.

    Args:
      world: the string name of the world.
      goal: the string label for the goal.
    Returns:
      Nothing.
    """
    obs = self._reset_env(new_world=world, new_goal=goal)
    if not obs:
      print('{} is not available in {}'.format(goal, world))
      return True
    for image_id in self._world_id_dict[self._cur_world]:
      print('check image_id = {}'.format(image_id))
      self._cur_image_id = image_id
      path = self.path_to_goal()
      actions = []
      for i in range(len(path) - 2):
        actions.append(self.action(path[i], path[i + 1]))
      actions.append('stop')

  @property
  def goal_one_hot(self):
    res = np.zeros((len(self._targets),), dtype=np.float32)
    res[self._cur_goal_index] = 1.
    return res

  @property
  def goal_index(self):
    return self._cur_goal_index

  @property
  def goal_string(self):
    return self._cur_goal

  @property
  def worlds(self):
    return self._worlds

  @property
  def possible_targets(self):
    return self._targets

  def action(self, from_pose, to_pose):
    """Returns the action that takes source vertex to destination vertex.

    Args:
      from_pose: pose of the source.
      to_pose: pose of the destination.
    Returns:
      Returns the index of the action.
    Raises:
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with one action.
    """
    from_index = self.pose_to_vertex(from_pose)
    to_index = self.pose_to_vertex(to_pose)
    if to_index not in self.graph[from_index]:
      from_image_id = self.to_image_id(from_index)
      to_image_id = self.to_image_id(to_index)
      raise ValueError('{},{} is not connected to {},{}'.format(
          from_index, from_image_id, to_index, to_image_id))
    return self._actions.index(self.graph[from_index][to_index]['action'])

  def random_step_sequence(self, min_len=None, max_len=None):
    """Generates random step sequence that takes agent to the goal.

    Args:
      min_len: integer, minimum length of a step sequence. Not yet implemented.
      max_len: integer, the maximum number of steps to take, which bounds both
        the number of observations and the path length.
    Returns:
      Tuple of (path, actions, states, step_outputs).
        path: a random path from a random starting point and random environment.
        actions: actions of the returned path.
        states: viewpoints of all the states in between.
        step_outputs: list of step() return tuples.
    Raises:
      ValueError: if max_len is None or not greater than zero; if min_len is
        not None.
    """
    if max_len is None:
      raise ValueError('max_len can not be set as None')
    if max_len < 1:
      raise ValueError('max_len must be greater than or equal to 1.')
    if min_len is not None:
      raise ValueError('min_len is not yet implemented.')

    path = []
    actions = []
    states = []
    step_outputs = []
    obs = self.reset()
    last_obs_tuple = [obs, 0, False, {}]
    for _ in range(max_len):
      action = np.random.choice(self._actions)
      # We don't want to sample stop action because stop does not add new
      # information.
      while action == 'stop':
        action = np.random.choice(self._actions)
      path.append(self.to_vertex(self._cur_image_id))
      onehot = np.zeros((len(self._actions),), dtype=np.float32)
      onehot[self._actions.index(action)] = 1.
      actions.append(onehot)
      states.append(self.vertex_to_pose(path[-1]))
      step_outputs.append(copy.deepcopy(last_obs_tuple))
      last_obs_tuple = self.step(action)

    return path, actions, states, step_outputs
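
  # Illustrative usage of random_step_sequence (assumes an environment instance
  # `env` that has been configured, e.g. via gin, as sketched near the top of
  # this file):
  #
  #   path, actions, states, step_outputs = env.random_step_sequence(max_len=20)
  #   # path[i] is the vertex visited at step i, actions[i] its one-hot action,
  #   # states[i] the corresponding pose, and step_outputs[i] the
  #   # (observation, reward, done, info) tuple observed before that action.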