# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Anchor box and labeler definition."""

import collections
import math
from typing import Dict, Optional, Tuple

# Import libraries

import tensorflow as tf, tf_keras

from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
from official.vision.utils.object_detection import balanced_positive_negative_sampler
from official.vision.utils.object_detection import box_list
from official.vision.utils.object_detection import faster_rcnn_box_coder


class Anchor(object):
  """Anchor class for anchor-based object detectors.

  Example:
  ```python
  anchor_boxes = Anchor(
      min_level=3,
      max_level=4,
      num_scales=2,
      aspect_ratios=[0.5, 1., 2.],
      anchor_size=4.,
      image_size=[256, 256],
  ).multilevel_boxes
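  # With these settings each level has 2 scales * 3 ratios = 6 anchors per
  # location, e.g. anchor_boxes['3'].shape == [32, 32, 24].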
  ```

  Attributes:
    min_level: integer number of minimum level of the output feature pyramid.
    max_level: integer number of maximum level of the output feature pyramid.
    num_scales: integer number representing intermediate scales added on each
      level. For instance, num_scales=2 adds one additional intermediate
      anchor scale, yielding scales [2^0, 2^0.5] on each level.
    aspect_ratios: list of float numbers representing the aspect ratio anchors
      added on each level. The number indicates the ratio of width to height.
      For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
      scale level.
    anchor_size: float number representing the scale of size of the base
      anchor to the feature stride 2^level.
    image_size: a list of integer numbers or Tensors representing [height,
      width] of the input image size.
    multilevel_boxes: an OrderedDict from level to the generated anchor boxes of
      shape [height_l, width_l, num_anchors_per_location * 4].
    anchors_per_location: number of anchors per pixel location.
  """

  def __init__(
      self,
      min_level,
      max_level,
      num_scales,
      aspect_ratios,
      anchor_size,
      image_size,
  ):
    """Initializes the instance."""
    self.min_level = min_level
    self.max_level = max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios
    self.anchor_size = anchor_size
    self.image_size = image_size
    self.multilevel_boxes = self._generate_multilevel_boxes()

  def _generate_multilevel_boxes(self) -> Dict[str, tf.Tensor]:
    """Generates multi-scale anchor boxes.

    Returns:
      An OrderedDict from level to anchor boxes of shape [height_l, width_l,
      num_anchors_per_location * 4].
    """
    multilevel_boxes = collections.OrderedDict()
    for level in range(self.min_level, self.max_level + 1):
      boxes_l = []
      feat_size_y = math.ceil(self.image_size[0] / 2**level)
      feat_size_x = math.ceil(self.image_size[1] / 2**level)
      stride_y = tf.cast(self.image_size[0] / feat_size_y, tf.float32)
      stride_x = tf.cast(self.image_size[1] / feat_size_x, tf.float32)
      x = tf.range(stride_x / 2, self.image_size[1], stride_x)
      y = tf.range(stride_y / 2, self.image_size[0], stride_y)
      xv, yv = tf.meshgrid(x, y)
      for scale in range(self.num_scales):
        for aspect_ratio in self.aspect_ratios:
          intermediate_scale = 2 ** (scale / self.num_scales)
          base_anchor_size = self.anchor_size * 2**level * intermediate_scale
          aspect_x = aspect_ratio**0.5
          aspect_y = aspect_ratio**-0.5
          half_anchor_size_x = base_anchor_size * aspect_x / 2.0
          half_anchor_size_y = base_anchor_size * aspect_y / 2.0
          # Tensor shape HxWx4.
          boxes = tf.stack(
              [
                  yv - half_anchor_size_y,
                  xv - half_anchor_size_x,
                  yv + half_anchor_size_y,
                  xv + half_anchor_size_x,
              ],
              axis=-1,
          )
          boxes_l.append(boxes)
      # Concat anchors on the same level to tensor shape HxWx(Ax4).
      boxes_l = tf.concat(boxes_l, axis=-1)
      multilevel_boxes[str(level)] = boxes_l
    return multilevel_boxes

  @property
  def anchors_per_location(self) -> int:
    return self.num_scales * len(self.aspect_ratios)


class AnchorLabeler(object):
  """Labeler for dense object detector."""

  def __init__(
      self,
      match_threshold=0.5,
      unmatched_threshold=0.5,
      box_coder_weights=None,
  ):
    """Constructs anchor labeler to assign labels to anchors.

    Args:
      match_threshold: a float number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: a float number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      box_coder_weights: Optional `list` of 4 positive floats to scale y, x, h,
        and w when encoding box coordinates. If set to None, does not perform
        scaling. For Faster RCNN, the open-source implementation recommends
        using [10.0, 10.0, 5.0, 5.0].
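
    Example:
      A minimal construction sketch; the box-coder weights below follow the
      Faster R-CNN recommendation mentioned above and are otherwise optional:
      ```python
      labeler = AnchorLabeler(
          match_threshold=0.5,
          unmatched_threshold=0.5,
          box_coder_weights=[10.0, 10.0, 5.0, 5.0],
      )
      ```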
    """
    self.similarity_calc = iou_similarity.IouSimilarity()
    self.target_gather = target_gather.TargetGather()
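    # `thresholds` split the IoU range into three bins which `indicators`
    # label: below `unmatched_threshold` -> -1 (negative), between the two
    # thresholds -> -2 (ignored), at or above `match_threshold` -> 1 (positive).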
    self.matcher = box_matcher.BoxMatcher(
        thresholds=[unmatched_threshold, match_threshold],
        indicators=[-1, -2, 1],
        force_match_for_each_col=True,
    )
    self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=box_coder_weights,
    )

  def label_anchors(
      self,
      anchor_boxes: Dict[str, tf.Tensor],
      gt_boxes: tf.Tensor,
      gt_labels: tf.Tensor,
      gt_attributes: Optional[Dict[str, tf.Tensor]] = None,
      gt_weights: Optional[tf.Tensor] = None,
  ) -> Tuple[
      Dict[str, tf.Tensor],
      Dict[str, tf.Tensor],
      Dict[str, Dict[str, tf.Tensor]],
      tf.Tensor,
      tf.Tensor,
  ]:
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: An ordered dictionary with keys [min_level, min_level+1,
        ..., max_level]. The values are tensor with shape [height_l, width_l,
        num_anchors_per_location * 4]. The height_l and width_l represent the
        dimension of the feature pyramid at l-th level. For each anchor box, the
        tensor stores [y0, x0, y1, x1] for the four corners.
      gt_boxes: A float tensor with shape [N, 4] representing ground-truth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing ground-truth
        classes.
      gt_attributes: If not None, a dict of (name, gt_attribute) pairs.
        `gt_attribute` is a float tensor with shape [N, attribute_size]
        representing ground-truth attributes.
      gt_weights: If not None, a float tensor with shape [N] representing
        ground-truth weights.

    Returns:
      cls_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location]. The height_l and
        width_l represent the dimension of class logits at l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
        and width_l represent the dimension of bounding box regression output at
        l-th level.
      attribute_targets_dict: A dict with (name, attribute_targets) pairs. Each
        `attribute_targets` represents an ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location * attribute_size].
        The height_l and width_l represent the dimension of attribute prediction
        output at l-th level.
      cls_weights: A flattened tensor with shape [num_anchors] that serves as a
        masking / sample weight for the classification loss. Its value is 1.0
        for positive and negative matched anchors, and 0.0 for ignored anchors.
      box_weights: A flattened tensor with shape [num_anchors] that serves as a
        masking / sample weight for the regression loss. Its value is 1.0 for
        positive matched anchors, and 0.0 for negative and ignored anchors.
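
    Example:
      A minimal sketch, assuming anchors from the `Anchor` class above and
      ground-truth boxes given in absolute pixel coordinates:
      ```python
      anchor_boxes = Anchor(
          min_level=3, max_level=4, num_scales=2,
          aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0,
          image_size=[256, 256],
      ).multilevel_boxes
      labeler = AnchorLabeler()
      gt_boxes = tf.constant([[10., 10., 80., 80.], [96., 32., 160., 96.]])
      gt_labels = tf.constant([[1], [2]], dtype=tf.int32)
      cls_targets, box_targets, _, cls_weights, box_weights = (
          labeler.label_anchors(anchor_boxes, gt_boxes, gt_labels))
      ```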
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)

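    # True for anchors matched as negative (-1) or ignored (-2); target_gather
    # fills these positions with the provided fill value instead of a gathered
    # ground-truth entry.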
    mask = tf.less_equal(match_indicators, 0)
    cls_mask = tf.expand_dims(mask, -1)
    cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1)
    box_mask = tf.tile(cls_mask, [1, 4])
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    att_targets = {}
    if gt_attributes:
      for k, v in gt_attributes.items():
        att_size = v.get_shape().as_list()[-1]
        att_mask = tf.tile(cls_mask, [1, att_size])
        att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0)

    # When there are no ground truth labels, force the weights to be 1 so that
    # negative matched anchors get non-zero weights.
    num_gt_labels = tf.shape(gt_labels)[0]
    weights = tf.cond(
        tf.greater(num_gt_labels, 0),
        lambda: tf.ones_like(gt_labels, dtype=tf.float32)[..., -1],
        lambda: tf.ones([1], dtype=tf.float32),
    )
    if gt_weights is not None:
      weights = tf.cond(
          tf.greater(num_gt_labels, 0),
          lambda: tf.math.multiply(weights, gt_weights),
          lambda: weights,
      )
    box_weights = self.target_gather(weights, match_indices, mask)
    ignore_mask = tf.equal(match_indicators, -2)
    cls_weights = self.target_gather(weights, match_indices, ignore_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Unpacks labels into multi-level representations.
    cls_targets = unpack_targets(cls_targets, anchor_boxes)
    box_targets = unpack_targets(box_targets, anchor_boxes)
    attribute_targets = {
        k: unpack_targets(v, anchor_boxes) for k, v in att_targets.items()
    }

    return (
        cls_targets,
        box_targets,
        attribute_targets,
        cls_weights,
        box_weights,
    )


class RpnAnchorLabeler(AnchorLabeler):
  """Labeler for Region Proposal Network."""

  def __init__(
      self,
      match_threshold=0.7,
      unmatched_threshold=0.3,
      rpn_batch_size_per_im=256,
      rpn_fg_fraction=0.5,
  ):
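    """Constructs an RPN anchor labeler.

    Args:
      match_threshold: a float; IoU lower bound above which anchors are labeled
        positive.
      unmatched_threshold: a float; IoU upper bound below which anchors are
        labeled negative.
      rpn_batch_size_per_im: an integer number of anchors sampled per image.
      rpn_fg_fraction: a float; the desired fraction of positives among the
        sampled anchors.
    """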
    super().__init__(
        match_threshold=match_threshold,
        unmatched_threshold=unmatched_threshold,
    )
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

  def _get_rpn_samples(self, match_results):
    """Computes anchor labels.

    This function performs subsampling for foreground (fg) and background (bg)
    anchors.

    Args:
      match_results: An integer tensor with shape [N] representing the matching
        results of anchors. (1) match_results[i]>=0, meaning that column i is
        matched with row match_results[i]. (2) match_results[i]=-1, meaning that
        column i is not matched. (3) match_results[i]=-2, meaning that column i
        is ignored.

    Returns:
      score_targets: An integer tensor with shape [N].
        (1) score_targets[i]=1, the anchor is a positive sample.
        (2) score_targets[i]=0, the anchor is a negative sample.
        (3) score_targets[i]=-1, the anchor is ignored (don't care).
      positive_labels: An integer tensor with shape [N] that is 2 for sampled
        positive anchors and 0 elsewhere.
      negative_labels: An integer tensor with shape [N] that is 1 for sampled
        negative anchors and 0 elsewhere.
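
    Example:
      An illustrative mapping that ignores the randomness of subsampling: for
      `match_results = [3, -1, -2]`, score_targets would be [1, 0, -1] when
      the sampler keeps both of the first two anchors.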
    """
    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=self._rpn_fg_fraction, is_static=False
        )
    )
    # indicator includes both positive and negative labels.
    # labels includes only positive labels.
    # positives = indicator & labels.
    # negatives = indicator & !labels.
    # ignore = !indicator.
    indicator = tf.greater(match_results, -2)
    labels = tf.greater(match_results, -1)

    samples = sampler.subsample(indicator, self._rpn_batch_size_per_im, labels)
    positive_labels = tf.where(
        tf.logical_and(samples, labels),
        tf.constant(2, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape),
    )
    negative_labels = tf.where(
        tf.logical_and(samples, tf.logical_not(labels)),
        tf.constant(1, dtype=tf.int32, shape=match_results.shape),
        tf.constant(0, dtype=tf.int32, shape=match_results.shape),
    )
    ignore_labels = tf.fill(match_results.shape, -1)

    return (
        ignore_labels + positive_labels + negative_labels,
        positive_labels,
        negative_labels,
    )

  def label_anchors(  # pytype: disable=signature-mismatch  # overriding-parameter-count-checks
      self,
      anchor_boxes: Dict[str, tf.Tensor],
      gt_boxes: tf.Tensor,
      gt_labels: tf.Tensor,
  ) -> Tuple[Dict[str, tf.Tensor], Dict[str, tf.Tensor]]:
    """Labels anchors with ground truth inputs.

    Args:
      anchor_boxes: An ordered dictionary with keys [min_level, min_level+1,
        ..., max_level]. The values are tensor with shape [height_l, width_l,
        num_anchors_per_location * 4]. The height_l and width_l represent the
        dimension of the feature pyramid at l-th level. For each anchor box, the
        tensor stores [y0, x0, y1, x1] for the four corners.
      gt_boxes: A float tensor with shape [N, 4] representing ground-truth
        boxes. For each row, it stores [y0, x0, y1, x1] for four corners of a
        box.
      gt_labels: An integer tensor with shape [N, 1] representing ground-truth
        classes.

    Returns:
      score_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location]. The height_l and
        width_l represent the dimension of class logits at l-th level.
      box_targets_dict: An ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, num_anchors_per_location * 4]. The height_l
        and width_l represent the dimension of bounding box regression output at
        l-th level.
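
    Example:
      A minimal sketch, assuming anchors from the `Anchor` class above:
      ```python
      anchor_boxes = Anchor(
          min_level=3, max_level=4, num_scales=2,
          aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0,
          image_size=[256, 256],
      ).multilevel_boxes
      rpn_labeler = RpnAnchorLabeler(rpn_batch_size_per_im=256)
      gt_boxes = tf.constant([[10., 10., 80., 80.]])
      gt_labels = tf.constant([[1]], dtype=tf.int32)
      score_targets, box_targets = rpn_labeler.label_anchors(
          anchor_boxes, gt_boxes, gt_labels)
      ```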
    """
    flattened_anchor_boxes = []
    for anchors in anchor_boxes.values():
      flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4]))
    flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0)
    similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes)
    match_indices, match_indicators = self.matcher(similarity_matrix)
    box_mask = tf.tile(
        tf.expand_dims(tf.less_equal(match_indicators, 0), -1), [1, 4]
    )
    box_targets = self.target_gather(gt_boxes, match_indices, box_mask)
    box_targets_list = box_list.BoxList(box_targets)
    anchor_box_list = box_list.BoxList(flattened_anchor_boxes)
    box_targets = self.box_coder.encode(box_targets_list, anchor_box_list)

    # Zero out the unmatched and ignored regression targets.
    num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0]
    unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32)
    matched_anchors_mask = tf.greater_equal(match_indicators, 0)
    # To broadcast matched_anchors_mask to the same shape as
    # matched_reg_targets.
    matched_anchors_mask = tf.tile(
        tf.expand_dims(matched_anchors_mask, 1), [1, tf.shape(box_targets)[1]]
    )
    box_targets = tf.where(
        matched_anchors_mask, box_targets, unmatched_ignored_box_targets
    )

    # score_targets contains the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(match_indicators)

    # Unpacks labels.
    score_targets_dict = unpack_targets(score_targets, anchor_boxes)
    box_targets_dict = unpack_targets(box_targets, anchor_boxes)

    return score_targets_dict, box_targets_dict


class AnchorGeneratorv2:
  """Utility to generate anchors for a multiple feature maps.

  Attributes:
    min_level: integer number of minimum level of the output feature pyramid.
    max_level: integer number of maximum level of the output feature pyramid.
    num_scales: integer number representing intermediate scales added on each
      level. For instance, num_scales=2 adds one additional intermediate
      anchor scale, yielding scales [2^0, 2^0.5] on each level.
    aspect_ratios: list of float numbers representing the aspect ratio anchors
      added on each level. The number indicates the ratio of width to height.
      For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
      scale level.
    anchor_size: float number representing the scale of size of the base
      anchor to the feature stride 2^level.
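
  Example:
    A minimal usage sketch; the same generator can be called with different
    image sizes:
    ```python
    anchor_gen = AnchorGeneratorv2(
        min_level=3, max_level=4, num_scales=2,
        aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0,
    )
    multilevel_boxes = anchor_gen(image_size=[256, 256])
    ```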
  """

  def __init__(
      self,
      min_level,
      max_level,
      num_scales,
      aspect_ratios,
      anchor_size,
  ):
    """Initializes the instance."""
    self.min_level = min_level
    self.max_level = max_level
    self.num_scales = num_scales
    self.aspect_ratios = aspect_ratios
    self.anchor_size = anchor_size

  def __call__(self, image_size):
    """Generate multilevel anchor boxes.

    Args:
      image_size: a list of integer numbers or Tensors representing [height,
        width] of the input image size.

    Returns:
      An ordered dictionary from level to anchor boxes of shape [height_l,
      width_l, num_anchors_per_location * 4].
    """
    return Anchor(
        min_level=self.min_level,
        max_level=self.max_level,
        num_scales=self.num_scales,
        aspect_ratios=self.aspect_ratios,
        anchor_size=self.anchor_size,
        image_size=image_size,
    ).multilevel_boxes


def build_anchor_generator(
    min_level, max_level, num_scales, aspect_ratios, anchor_size
):
  """Build anchor generator from levels."""
  anchor_gen = AnchorGeneratorv2(
      min_level=min_level,
      max_level=max_level,
      num_scales=num_scales,
      aspect_ratios=aspect_ratios,
      anchor_size=anchor_size,
  )
  return anchor_gen


def unpack_targets(
    targets: tf.Tensor, anchor_boxes_dict: Dict[str, tf.Tensor]
) -> Dict[str, tf.Tensor]:
  """Unpacks an array of labels into multi-scales labels.

  Args:
    targets: A tensor with shape [num_anchors, M] representing the packed
      targets with M values stored for each anchor.
    anchor_boxes_dict: An ordered dictionary with keys [min_level, min_level+1,
      ..., max_level]. The values are tensor with shape [height_l, width_l,
      num_anchors_per_location * 4]. The height_l and width_l represent the
      dimension of the feature pyramid at l-th level. For each anchor box, the
      tensor stores [y0, x0, y1, x1] for the four corners.

  Returns:
    unpacked_targets: An ordered dictionary with keys
      [min_level, min_level+1, ..., max_level]. The values are tensor with shape
      [height_l, width_l, num_anchors_per_location * M]. The height_l and
      width_l represent the dimension of the feature pyramid at l-th level. M is
      the number of values stored for each anchor.
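
  Example:
    A minimal sketch, assuming anchors from the `Anchor` class above and
    single-valued (M=1) targets:
    ```python
    anchors = Anchor(
        min_level=3, max_level=4, num_scales=2,
        aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0,
        image_size=[256, 256],
    )
    num_anchors = (32 * 32 + 16 * 16) * anchors.anchors_per_location  # 7680
    targets = tf.zeros([num_anchors, 1])
    per_level = unpack_targets(targets, anchors.multilevel_boxes)
    # per_level['3'].shape == [32, 32, 6]; per_level['4'].shape == [16, 16, 6]
    ```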
  """
  unpacked_targets = collections.OrderedDict()
  count = 0
  for level, anchor_boxes in anchor_boxes_dict.items():
    feat_size_shape = anchor_boxes.shape.as_list()
    feat_size_y = feat_size_shape[0]
    feat_size_x = feat_size_shape[1]
    anchors_per_location = feat_size_shape[2] // 4
    steps = feat_size_y * feat_size_x * anchors_per_location
    unpacked_targets[level] = tf.reshape(
        targets[count : count + steps], [feat_size_y, feat_size_x, -1]
    )
    count += steps
  return unpacked_targets