tensorflow/models

View on GitHub
official/legacy/detection/executor/distributed_executor.py

Summary

Maintainability
F
3 days
Test Coverage
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Custom training loop for running TensorFlow 2.0 models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from typing import Optional, Dict, List, Text, Callable, Union, Iterator, Any

from absl import flags
from absl import logging

import numpy as np
import tensorflow as tf, tf_keras

# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
from official.common import distribute_utils
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.misc import keras_utils

FLAGS = flags.FLAGS

strategy_flags_dict = hyperparams_flags.strategy_flags_dict
hparam_flags_dict = hyperparams_flags.hparam_flags_dict


def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
  """Saves model to model_dir with provided checkpoint prefix."""

  checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
  saved_path = checkpoint.save(checkpoint_path)
  logging.info('Saving model as TF checkpoint: %s', saved_path)


def _steps_to_run(current_step, total_steps, steps_per_loop):
  """Calculates steps to run on device."""
  if steps_per_loop <= 0:
    raise ValueError('steps_per_loop should be positive integer.')
  return min(total_steps - current_step, steps_per_loop)


def _no_metric():
  return None


def metrics_as_dict(metric):
  """Puts input metric(s) into a list.

  Args:
    metric: metric(s) to be put into the list. `metric` could be an object, a
      list, or a dict of tf_keras.metrics.Metric or has the `required_method`.

  Returns:
    A dictionary of valid metrics.
  """
  if isinstance(metric, tf_keras.metrics.Metric):
    metrics = {metric.name: metric}
  elif isinstance(metric, list):
    metrics = {m.name: m for m in metric}
  elif isinstance(metric, dict):
    metrics = metric
  elif not metric:
    return {}
  else:
    metrics = {'metric': metric}
  return metrics


def metric_results(metric):
  """Collects results from the given metric(s)."""
  metrics = metrics_as_dict(metric)
  metric_result = {
      name: m.result().numpy().astype(float) for name, m in metrics.items()
  }
  return metric_result


def reset_states(metric):
  """Resets states of the given metric(s)."""
  metrics = metrics_as_dict(metric)
  for m in metrics.values():
    m.reset_states()


class SummaryWriter(object):
  """Simple SummaryWriter for writing dictionary of metrics.

  Attributes:
    writer: The tf.SummaryWriter.
  """

  def __init__(self, model_dir: Text, name: Text):
    """Inits SummaryWriter with paths.

    Args:
      model_dir: the model folder path.
      name: the summary subfolder name.
    """
    self.writer = tf.summary.create_file_writer(os.path.join(model_dir, name))

  def __call__(self, metrics: Union[Dict[Text, float], float], step: int):
    """Write metrics to summary with the given writer.

    Args:
      metrics: a dictionary of metrics values. Prefer dictionary.
      step: integer. The training step.
    """
    if not isinstance(metrics, dict):
      # Support scalar metric without name.
      logging.warning('Warning: summary writer prefer metrics as dictionary.')
      metrics = {'metric': metrics}

    with self.writer.as_default():
      for k, v in metrics.items():
        tf.summary.scalar(k, v, step=step)
      self.writer.flush()


class DistributedExecutor(object):
  """Interface to train and eval models with tf.distribute.Strategy."""

  def __init__(self, strategy, params, model_fn, loss_fn, is_multi_host=False):
    """Constructor.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      params: Model configuration needed to run distribution strategy.
      model_fn: Keras model function. Signature:
        (params: ParamsDict) -> tf_keras.models.Model.
      loss_fn: loss function. Signature:
        (y_true: Tensor, y_pred: Tensor) -> Tensor
      is_multi_host: Set to True when using multi hosts for training, like multi
        worker GPU or TPU pod (slice). Otherwise, False.
    """

    self._params = params
    self._model_fn = model_fn
    self._loss_fn = loss_fn
    self._strategy = strategy
    self._checkpoint_name = 'ctl_step_{step}.ckpt'
    self._is_multi_host = is_multi_host
    self.train_summary_writer = None
    self.eval_summary_writer = None
    self.global_train_step = None

  @property
  def checkpoint_name(self):
    """Returns default checkpoint name."""
    return self._checkpoint_name

  @checkpoint_name.setter
  def checkpoint_name(self, name):
    """Sets default summary writer for the current thread."""
    self._checkpoint_name = name

  def loss_fn(self):
    return self._loss_fn()

  def model_fn(self, params):
    return self._model_fn(params)

  def _save_config(self, model_dir):
    """Save parameters to config files if model_dir is defined."""

    logging.info('Save config to model_dir %s.', model_dir)
    if model_dir:
      if not tf.io.gfile.exists(model_dir):
        tf.io.gfile.makedirs(model_dir)
      self._params.lock()
      params_dict.save_params_dict_to_yaml(self._params,
                                           model_dir + '/params.yaml')
    else:
      logging.warning('model_dir is empty, so skip the save config.')

  def _get_input_iterator(
      self, input_fn: Callable[..., tf.data.Dataset],
      strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
    """Returns distributed dataset iterator.

    Args:
      input_fn: (params: dict) -> tf.data.Dataset.
      strategy: an instance of tf.distribute.Strategy.

    Returns:
      An iterator that yields input tensors.
    """

    if input_fn is None:
      return None
    # When training with multiple TPU workers, datasets needs to be cloned
    # across workers. Since Dataset instance cannot be cloned in eager mode,
    # we instead pass callable that returns a dataset.
    if self._is_multi_host:
      return iter(strategy.distribute_datasets_from_function(input_fn))
    else:
      input_data = input_fn()
      return iter(strategy.experimental_distribute_dataset(input_data))

  def _create_replicated_step(self,
                              strategy,
                              model,
                              loss_fn,
                              optimizer,
                              metric=None):
    """Creates a single training step.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      model: (Tensor, bool) -> Tensor. model function.
      loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
      optimizer: tf_keras.optimizers.Optimizer.
      metric: tf_keras.metrics.Metric subclass.

    Returns:
      The training step callable.
    """
    metrics = metrics_as_dict(metric)

    def _replicated_step(inputs):
      """Replicated training step."""
      inputs, labels = inputs

      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        prediction_loss = loss_fn(labels, outputs)
        loss = tf.reduce_mean(prediction_loss)
        loss = loss / strategy.num_replicas_in_sync
        for m in metrics.values():
          m.update_state(labels, outputs)

      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      return loss

    return _replicated_step

  def _create_train_step(self,
                         strategy,
                         model,
                         loss_fn,
                         optimizer,
                         metric=None):
    """Creates a distributed training step.

    Args:
      strategy: an instance of tf.distribute.Strategy.
      model: (Tensor, bool) -> Tensor. model function.
      loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
      optimizer: tf_keras.optimizers.Optimizer.
      metric: tf_keras.metrics.Metric subclass.

    Returns:
      The training step callable.
    """
    replicated_step = self._create_replicated_step(strategy, model, loss_fn,
                                                   optimizer, metric)

    @tf.function
    def train_step(iterator, num_steps):
      """Performs a distributed training step.

      Args:
        iterator: an iterator that yields input tensors.
        num_steps: the number of steps in the loop.

      Returns:
        The loss tensor.
      """
      if not isinstance(num_steps, tf.Tensor):
        raise ValueError('steps should be an Tensor. Python object may cause '
                         'retracing.')

      per_replica_losses = strategy.run(replicated_step, args=(next(iterator),))
      for _ in tf.range(num_steps - 1):
        per_replica_losses = strategy.run(
            replicated_step, args=(next(iterator),))

      # For reporting, we returns the mean of losses.
      losses = tf.nest.map_structure(
          lambda x: strategy.reduce(tf.distribute.ReduceOp.MEAN, x, axis=None),
          per_replica_losses)
      return losses

    return train_step

  def _create_test_step(self, strategy, model, metric):
    """Creates a distributed test step."""
    metrics = metrics_as_dict(metric)

    @tf.function
    def test_step(iterator):
      """Calculates evaluation metrics on distributed devices."""
      if not metric:
        logging.info('Skip test_step because metric is None (%s)', metric)
        return None, None

      def _test_step_fn(inputs):
        """Replicated accuracy calculation."""
        inputs, labels = inputs
        model_outputs = model(inputs, training=False)
        for m in metrics.values():
          m.update_state(labels, model_outputs)
        return labels, model_outputs

      return strategy.run(_test_step_fn, args=(next(iterator),))

    return test_step

  def train(
      self,
      train_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
      eval_input_fn: Optional[Callable[[params_dict.ParamsDict],
                                       tf.data.Dataset]] = None,
      model_dir: Optional[Text] = None,
      total_steps: int = 1,
      iterations_per_loop: int = 1,
      train_metric_fn: Optional[Callable[[], Any]] = None,
      eval_metric_fn: Optional[Callable[[], Any]] = None,
      summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter,
      init_checkpoint: Optional[Callable[[tf_keras.Model], Any]] = None,
      custom_callbacks: Optional[List[tf_keras.callbacks.Callback]] = None,
      continuous_eval: bool = False,
      save_config: bool = True):
    """Runs distributed training.

    Args:
      train_input_fn: (params: dict) -> tf.data.Dataset training data input
        function.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluating metric on eval data. If None, will not run the eval
        step.
      model_dir: the folder path for model checkpoints.
      total_steps: total training steps.
      iterations_per_loop: train steps per loop. After each loop, this job will
        update metrics like loss and save checkpoint.
      train_metric_fn: metric_fn for evaluation in train_step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      summary_writer_fn: function to create summary writer.
      init_checkpoint: function to load checkpoint.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
        methods are invoked during training.
      continuous_eval: If `True`, will continously run evaluation on every
        available checkpoints. If `False`, will do the evaluation once after the
        final step.
      save_config: bool. Whether to save params to model_dir.

    Returns:
      The training loss and eval metrics.
    """
    assert train_input_fn is not None
    if train_metric_fn and not callable(train_metric_fn):
      raise ValueError('if `train_metric_fn` is specified, '
                       'train_metric_fn must be a callable.')
    if eval_metric_fn and not callable(eval_metric_fn):
      raise ValueError('if `eval_metric_fn` is specified, '
                       'eval_metric_fn must be a callable.')
    train_metric_fn = train_metric_fn or _no_metric
    eval_metric_fn = eval_metric_fn or _no_metric

    if custom_callbacks and iterations_per_loop != 1:
      logging.warning(
          'It is sematically wrong to run callbacks when '
          'iterations_per_loop is not one (%s)', iterations_per_loop)

    custom_callbacks = custom_callbacks or []

    def _run_callbacks_on_batch_begin(batch):
      """Runs custom callbacks at the start of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        if callback:
          callback.on_batch_begin(batch)

    def _run_callbacks_on_batch_end(batch):
      """Runs custom callbacks at the end of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        if callback:
          callback.on_batch_end(batch)

    if save_config:
      self._save_config(model_dir)

    if FLAGS.save_checkpoint_freq:
      save_freq = FLAGS.save_checkpoint_freq
    else:
      save_freq = iterations_per_loop

    params = self._params
    strategy = self._strategy
    # To reduce unnecessary send/receive input pipeline operation, we place
    # input pipeline ops in worker task.
    train_iterator = self._get_input_iterator(train_input_fn, strategy)
    train_loss = None
    train_metric_result = None
    eval_metric_result = None
    tf_keras.backend.set_learning_phase(1)
    with strategy.scope():
      # To correctly place the model weights on accelerators,
      # model and optimizer should be created in scope.
      model = self.model_fn(params.as_dict())
      if not hasattr(model, 'optimizer'):
        raise ValueError('User should set optimizer attribute to model '
                         'inside `model_fn`.')
      optimizer = model.optimizer

      # Training loop starts here.
      checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
      latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
      initial_step = 0
      if latest_checkpoint_file:
        logging.info(
            'Checkpoint file %s found and restoring from '
            'checkpoint', latest_checkpoint_file)
        checkpoint.restore(latest_checkpoint_file)
        initial_step = optimizer.iterations.numpy()
        logging.info('Loading from checkpoint file completed. Init step %d',
                     initial_step)
      elif init_checkpoint:
        logging.info('Restoring from init checkpoint function')
        init_checkpoint(model)
        logging.info('Loading from init checkpoint file completed')

      current_step = optimizer.iterations.numpy()
      checkpoint_name = self.checkpoint_name

      eval_metric = eval_metric_fn()
      train_metric = train_metric_fn()
      train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
      self.train_summary_writer = train_summary_writer.writer

      test_summary_writer = summary_writer_fn(model_dir, 'eval_test')
      self.eval_summary_writer = test_summary_writer.writer

    # Use training summary writer in TimeHistory if it's in use
    for cb in custom_callbacks:
      if isinstance(cb, keras_utils.TimeHistory):
        cb.summary_writer = self.train_summary_writer

    # Continue training loop.
    train_step = self._create_train_step(
        strategy=strategy,
        model=model,
        loss_fn=self.loss_fn(),
        optimizer=optimizer,
        metric=train_metric)
    test_step = None
    if eval_input_fn and eval_metric:
      self.global_train_step = model.optimizer.iterations
      test_step = self._create_test_step(strategy, model, metric=eval_metric)

    # Step-0 operations
    if current_step == 0 and not latest_checkpoint_file:
      _save_checkpoint(checkpoint, model_dir,
                       checkpoint_name.format(step=current_step))
    if test_step:
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Step: %s evalation metric = %s.', current_step,
                   eval_metric_result)
      test_summary_writer(metrics=eval_metric_result, step=optimizer.iterations)
      reset_states(eval_metric)

    logging.info('Training started')
    last_save_checkpoint_step = current_step
    while current_step < total_steps:

      num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
      _run_callbacks_on_batch_begin(current_step)
      train_loss = train_step(train_iterator,
                              tf.convert_to_tensor(num_steps, dtype=tf.int32))
      current_step += num_steps

      train_loss = tf.nest.map_structure(lambda x: x.numpy().astype(float),
                                         train_loss)

      _run_callbacks_on_batch_end(current_step - 1)
      if not isinstance(train_loss, dict):
        train_loss = {'total_loss': train_loss}
      if np.isnan(train_loss['total_loss']):
        raise ValueError('total loss is NaN.')

      if train_metric:
        train_metric_result = metric_results(train_metric)
        train_metric_result.update(train_loss)
      else:
        train_metric_result = train_loss
      if callable(optimizer.lr):
        train_metric_result.update(
            {'learning_rate': optimizer.lr(current_step).numpy()})
      else:
        train_metric_result.update({'learning_rate': optimizer.lr.numpy()})
      logging.info('Train Step: %d/%d  / loss = %s / training metric = %s',
                   current_step, total_steps, train_loss, train_metric_result)

      train_summary_writer(
          metrics=train_metric_result, step=optimizer.iterations)

      # Saves model checkpoints and run validation steps at every
      # iterations_per_loop steps.
      # To avoid repeated model saving, we do not save after the last
      # step of training.
      if save_freq > 0 and current_step < total_steps and (
          current_step - last_save_checkpoint_step) >= save_freq:
        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
        last_save_checkpoint_step = current_step

      if continuous_eval and current_step < total_steps and test_step:
        eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
        eval_metric_result = self._run_evaluation(test_step, current_step,
                                                  eval_metric, eval_iterator)
        logging.info('Step: %s evalation metric = %s.', current_step,
                     eval_metric_result)
        test_summary_writer(
            metrics=eval_metric_result, step=optimizer.iterations)

      # Re-initialize evaluation metric, except the last step.
      if eval_metric and current_step < total_steps:
        reset_states(eval_metric)
      if train_metric and current_step < total_steps:
        reset_states(train_metric)

    # Reaches the end of training and saves the last checkpoint.
    if last_save_checkpoint_step < total_steps:
      _save_checkpoint(checkpoint, model_dir,
                       checkpoint_name.format(step=current_step))

    if test_step:
      logging.info('Running final evaluation after training is complete.')
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Final evaluation metric = %s.', eval_metric_result)
      test_summary_writer(metrics=eval_metric_result, step=optimizer.iterations)

    self.train_summary_writer.close()
    self.eval_summary_writer.close()

    return train_metric_result, eval_metric_result

  def _run_evaluation(self, test_step, current_training_step, metric,
                      test_iterator):
    """Runs validation steps and aggregate metrics."""
    if not test_iterator or not metric:
      logging.warning(
          'Both test_iterator (%s) and metrics (%s) must not be None.',
          test_iterator, metric)
      return None
    logging.info('Running evaluation after step: %s.', current_training_step)
    eval_step = 0
    while True:
      try:
        with tf.experimental.async_scope():
          test_step(test_iterator)
          eval_step += 1
      except (StopIteration, tf.errors.OutOfRangeError):
        tf.experimental.async_clear_error()
        break

    metric_result = metric_results(metric)
    logging.info('Total eval steps: [%d]', eval_step)
    logging.info('At training step: [%r] Validation metric = %r',
                 current_training_step, metric_result)
    return metric_result

  def evaluate_from_model_dir(
      self,
      model_dir: Text,
      eval_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
      eval_metric_fn: Callable[[], Any],
      total_steps: int = -1,
      eval_timeout: Optional[int] = None,
      min_eval_interval: int = 180,
      summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter):
    """Runs distributed evaluation on model folder.

    Args:
      model_dir: the folder for storing model checkpoints.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluting metric on eval data. If None, will not run eval step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      total_steps: total training steps. If the current step reaches the
        total_steps, the evaluation loop will stop.
      eval_timeout: The maximum number of seconds to wait between checkpoints.
        If left as None, then the process will wait indefinitely. Used by
        tf.train.checkpoints_iterator.
      min_eval_interval: The minimum number of seconds between yielding
        checkpoints. Used by tf.train.checkpoints_iterator.
      summary_writer_fn: function to create summary writer.

    Returns:
      Eval metrics dictionary of the last checkpoint.
    """

    if not model_dir:
      raise ValueError('model_dir must be set.')

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      eval_timeout)
      return True

    summary_writer = summary_writer_fn(model_dir, 'eval')
    self.eval_summary_writer = summary_writer.writer

    # Read checkpoints from the given model directory
    # until `eval_timeout` seconds elapses.
    for checkpoint_path in tf.train.checkpoints_iterator(
        model_dir,
        min_interval_secs=min_eval_interval,
        timeout=eval_timeout,
        timeout_fn=terminate_eval):
      eval_metric_result, current_step = self.evaluate_checkpoint(
          checkpoint_path=checkpoint_path,
          eval_input_fn=eval_input_fn,
          eval_metric_fn=eval_metric_fn,
          summary_writer=summary_writer)
      if total_steps > 0 and current_step >= total_steps:
        logging.info('Evaluation finished after training step %d', current_step)
        break
    return eval_metric_result

  def evaluate_checkpoint(self,
                          checkpoint_path: Text,
                          eval_input_fn: Callable[[params_dict.ParamsDict],
                                                  tf.data.Dataset],
                          eval_metric_fn: Callable[[], Any],
                          summary_writer: Optional[SummaryWriter] = None):
    """Runs distributed evaluation on the one checkpoint.

    Args:
      checkpoint_path: the checkpoint to evaluate.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluting metric on eval data. If None, will not run eval step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      summary_writer: function to create summary writer.

    Returns:
      Eval metrics dictionary of the last checkpoint.
    """
    if not callable(eval_metric_fn):
      raise ValueError('if `eval_metric_fn` is specified, '
                       'eval_metric_fn must be a callable.')

    old_phase = tf_keras.backend.learning_phase()
    tf_keras.backend.set_learning_phase(0)
    params = self._params
    strategy = self._strategy
    # To reduce unnecessary send/receive input pipeline operation, we place
    # input pipeline ops in worker task.
    with strategy.scope():

      # To correctly place the model weights on accelerators,
      # model and optimizer should be created in scope.
      model = self.model_fn(params.as_dict())
      checkpoint = tf.train.Checkpoint(model=model)

      eval_metric = eval_metric_fn()
      assert eval_metric, 'eval_metric does not exist'
      test_step = self._create_test_step(strategy, model, metric=eval_metric)

      logging.info('Starting to evaluate.')
      if not checkpoint_path:
        raise ValueError('checkpoint path is empty')
      reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
      if reader.has_tensor('optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE'):
        # Legacy keras optimizer iteration.
        current_step = reader.get_tensor(
            'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE')
      else:
        # New keras optimizer iteration.
        current_step = reader.get_tensor(
            'optimizer/_iterations/.ATTRIBUTES/VARIABLE_VALUE')
      logging.info('Checkpoint file %s found and restoring from '
                   'checkpoint', checkpoint_path)
      status = checkpoint.restore(checkpoint_path)
      status.expect_partial().assert_existing_objects_matched()

      self.global_train_step = model.optimizer.iterations
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Step: %s evalation metric = %s.', current_step,
                   eval_metric_result)
      summary_writer(metrics=eval_metric_result, step=current_step)
      reset_states(eval_metric)

    tf_keras.backend.set_learning_phase(old_phase)
    return eval_metric_result, current_step

  def predict(self):
    return NotImplementedError('Unimplmented function.')


class ExecutorBuilder(object):
  """Builder of DistributedExecutor.

  Example 1: Builds an executor with supported Strategy.
    builder = ExecutorBuilder(
        strategy_type='tpu',
        strategy_config={'tpu': '/bns/xxx'})
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 2: Builds an executor with customized Strategy.
    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 3: Builds a customized executor with customized Strategy.
    class MyDistributedExecutor(DistributedExecutor):
      # implementation ...

    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        class_ctor=MyDistributedExecutor,
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)
  """

  def __init__(self, strategy_type=None, strategy_config=None):
    _ = distribute_utils.configure_cluster(strategy_config.worker_hosts,
                                           strategy_config.task_index)
    """Constructor.

    Args:
      strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'.
        If None, the user is responsible to set the strategy before calling
        build_executor(...).
      strategy_config: necessary config for constructing the proper Strategy.
        Check strategy_flags_dict() for examples of the structure.
    """
    self._strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=strategy_type,
        num_gpus=strategy_config.num_gpus,
        all_reduce_alg=strategy_config.all_reduce_alg,
        num_packs=strategy_config.num_packs,
        tpu_address=strategy_config.tpu)

  @property
  def strategy(self):
    """Returns default checkpoint name."""
    return self._strategy

  @strategy.setter
  def strategy(self, new_strategy):
    """Sets default summary writer for the current thread."""
    self._strategy = new_strategy

  def build_executor(self,
                     class_ctor=DistributedExecutor,
                     params=None,
                     model_fn=None,
                     loss_fn=None,
                     **kwargs):
    """Creates an executor according to strategy type.

    See doc string of the DistributedExecutor.__init__ for more information of
    the
    input arguments.

    Args:
      class_ctor: A constructor of executor (default: DistributedExecutor).
      params: ParamsDict, all the model parameters and runtime parameters.
      model_fn: Keras model function.
      loss_fn: loss function.
      **kwargs: other arguments to the executor constructor.

    Returns:
      An instance of DistributedExecutor or its subclass.
    """
    if self._strategy is None:
      raise ValueError('`strategy` should not be None. You need to specify '
                       '`strategy_type` in the builder contructor or directly '
                       'set the `strategy` property of the builder.')
    return class_ctor(
        strategy=self._strategy,
        params=params,
        model_fn=model_fn,
        loss_fn=loss_fn,
        **kwargs)