official/projects/yolo/modeling/backbones/yolov7.py
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains backbone architectures for YOLOv7 families.
The models are built with ELAN and E-ELAN.
ELAN was proposed in:
[1] Wang, Chien-Yao and Liao, Hong-Yuan Mark and Yeh, I-Hau
Designing Network Design Strategies Through Gradient Path Analysis
arXiv:2211.04800
E-ELAN was proposed in the YOLOv7 paper:
[2] Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark
YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time
object detectors
arXiv:2207.02696
"""
import tensorflow as tf, tf_keras
from official.modeling import hyperparams
from official.projects.yolo.modeling.layers import nn_blocks
from official.projects.yolo.ops import initializer_ops
from official.vision.modeling.backbones import factory
# Layer constructors for the YOLOv7 backbone families, keyed by the block
# name that appears as the first field of every row in the variant tables.
_BLOCK_FNS = dict(
    convbn=nn_blocks.ConvBN,
    maxpool2d=tf_keras.layers.MaxPooling2D,
    concat=tf_keras.layers.Concatenate,
)
# Names for key arguments needed by each block function.
_BLOCK_SPEC_SCHEMAS = {
'convbn': [
'block_fn',
'from',
'kernel_size',
'strides',
'filters',
'is_output',
],
'maxpool2d': [
'block_fn',
'from',
'pool_size',
'strides',
'padding',
'is_output',
],
'concat': [
'block_fn',
'from',
'axis',
'is_output',
]
}
# Define YOLOv7-tiny variant.
# Each row describes one block, listed in build order. The first field names
# the block type and the remaining fields follow `_BLOCK_SPEC_SCHEMAS`:
#   convbn:    [block_fn, from, kernel_size, strides, filters, is_output]
#   maxpool2d: [block_fn, from, pool_size, strides, padding, is_output]
#   concat:    [block_fn, from, axis, is_output]
# `from` indexes the outputs of the blocks built so far (-1 is the previous
# block); a list selects several outputs, as the concat rows do. Rows whose
# `is_output` is True are exported by `YoloV7` as endpoints.
# The trailing `# N` comments give the 0-based row index; the `-Pk/s` suffix
# presumably marks pyramid level k at overall stride s.
_YoloV7Tiny = [
['convbn', -1, 3, 2, 32, False], # 0-P1/2
['convbn', -1, 3, 2, 64, False], # 1-P2/4
['convbn', -1, 1, 1, 32, False],
['convbn', -2, 1, 1, 32, False],
['convbn', -1, 3, 1, 32, False],
['convbn', -1, 3, 1, 32, False],
['concat', [-1, -2, -3, -4], -1, False],
['convbn', -1, 1, 1, 64, False], # 7
['maxpool2d', -1, 2, 2, 'same', False], # 8-P3/8
['convbn', -1, 1, 1, 64, False],
['convbn', -2, 1, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['concat', [-1, -2, -3, -4], -1, False],
['convbn', -1, 1, 1, 128, True], # 14
['maxpool2d', -1, 2, 2, 'same', False], # 15-P4/16
['convbn', -1, 1, 1, 128, False],
['convbn', -2, 1, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['concat', [-1, -2, -3, -4], -1, False],
['convbn', -1, 1, 1, 256, True], # 21
['maxpool2d', -1, 2, 2, 'same', False], # 22-P5/32
['convbn', -1, 1, 1, 256, False],
['convbn', -2, 1, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['concat', [-1, -2, -3, -4], -1, False],
['convbn', -1, 1, 1, 512, True], # 28
]
# Define YOLOv7 variant.
# Each row describes one block, listed in build order. The first field names
# the block type and the remaining fields follow `_BLOCK_SPEC_SCHEMAS`:
#   convbn:    [block_fn, from, kernel_size, strides, filters, is_output]
#   maxpool2d: [block_fn, from, pool_size, strides, padding, is_output]
#   concat:    [block_fn, from, axis, is_output]
# `from` indexes the outputs of the blocks built so far (-1 is the previous
# block); a list selects several outputs, as the concat rows do. Rows whose
# `is_output` is True are exported by `YoloV7` as endpoints.
# The trailing `# N` comments give the 0-based row index; the `-Pk/s` suffix
# presumably marks pyramid level k at overall stride s.
_YoloV7 = [
['convbn', -1, 3, 1, 32, False], # 0
['convbn', -1, 3, 2, 64, False], # 1-P1/2
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 2, 128, False], # 3-P2/4
['convbn', -1, 1, 1, 64, False],
['convbn', -2, 1, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['concat', [-1, -3, -5, -6], -1, False],
['convbn', -1, 1, 1, 256, False], # 11
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 128, False],
['convbn', -3, 1, 1, 128, False],
['convbn', -1, 3, 2, 128, False],
['concat', [-1, -3], -1, False], # 16-P3/8
['convbn', -1, 1, 1, 128, False],
['convbn', -2, 1, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['concat', [-1, -3, -5, -6], -1, False],
['convbn', -1, 1, 1, 512, True], # 24
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 256, False],
['convbn', -3, 1, 1, 256, False],
['convbn', -1, 3, 2, 256, False],
['concat', [-1, -3], -1, False], # 29-P4/16
['convbn', -1, 1, 1, 256, False],
['convbn', -2, 1, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['concat', [-1, -3, -5, -6], -1, False],
['convbn', -1, 1, 1, 1024, True], # 37
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 512, False],
['convbn', -3, 1, 1, 512, False],
['convbn', -1, 3, 2, 512, False],
['concat', [-1, -3], -1, False], # 42-P5/32
['convbn', -1, 1, 1, 256, False],
['convbn', -2, 1, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['concat', [-1, -3, -5, -6], -1, False],
['convbn', -1, 1, 1, 1024, True], # 50
]
# Define YOLOv7-X variant (registered in `BACKBONES` as 'yolov7x').
# Each row describes one block, listed in build order. The first field names
# the block type and the remaining fields follow `_BLOCK_SPEC_SCHEMAS`:
#   convbn:    [block_fn, from, kernel_size, strides, filters, is_output]
#   maxpool2d: [block_fn, from, pool_size, strides, padding, is_output]
#   concat:    [block_fn, from, axis, is_output]
# `from` indexes the outputs of the blocks built so far (-1 is the previous
# block); a list selects several outputs, as the concat rows do. Rows whose
# `is_output` is True are exported by `YoloV7` as endpoints.
# The trailing `# N` comments give the 0-based row index; the `-Pk/s` suffix
# presumably marks pyramid level k at overall stride s.
_YoloV7X = [
['convbn', -1, 3, 1, 40, False], # 0
['convbn', -1, 3, 2, 80, False], # 1-P1/2
['convbn', -1, 3, 1, 80, False],
['convbn', -1, 3, 2, 160, False], # 3-P2/4
['convbn', -1, 1, 1, 64, False],
['convbn', -2, 1, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['convbn', -1, 3, 1, 64, False],
['concat', [-1, -3, -5, -7, -8], -1, False],
['convbn', -1, 1, 1, 320, False], # 13
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 160, False],
['convbn', -3, 1, 1, 160, False],
['convbn', -1, 3, 2, 160, False],
['concat', [-1, -3], -1, False], # 18-P3/8
['convbn', -1, 1, 1, 128, False],
['convbn', -2, 1, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['convbn', -1, 3, 1, 128, False],
['concat', [-1, -3, -5, -7, -8], -1, False],
['convbn', -1, 1, 1, 640, True], # 28
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 320, False],
['convbn', -3, 1, 1, 320, False],
['convbn', -1, 3, 2, 320, False],
['concat', [-1, -3], -1, False], # 33-P4/16
['convbn', -1, 1, 1, 256, False],
['convbn', -2, 1, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['concat', [-1, -3, -5, -7, -8], -1, False],
['convbn', -1, 1, 1, 1280, True], # 43
['maxpool2d', -1, 2, 2, 'same', False],
['convbn', -1, 1, 1, 640, False],
['convbn', -3, 1, 1, 640, False],
['convbn', -1, 3, 2, 640, False],
['concat', [-1, -3], -1, False], # 48-P5/32
['convbn', -1, 1, 1, 256, False],
['convbn', -2, 1, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['convbn', -1, 3, 1, 256, False],
['concat', [-1, -3, -5, -7, -8], -1, False],
['convbn', -1, 1, 1, 1280, True], # 58
]
# Registry of every YOLOv7 backbone variant: lower-case model id -> the block
# table that `YoloV7` walks to assemble the network.
BACKBONES = dict([
    ('yolov7-tiny', _YoloV7Tiny),
    ('yolov7', _YoloV7),
    ('yolov7x', _YoloV7X),
])
class YoloV7(tf_keras.Model):
  """YOLOv7 backbone architecture.

  The network is assembled as a Keras functional model by walking the block
  table stored in `BACKBONES` for the requested `model_id`. Rows flagged
  `is_output` are exposed as endpoints keyed by consecutive level strings
  starting at '3'.
  """

  def __init__(
      self,
      model_id='yolov7',
      input_specs=tf_keras.layers.InputSpec(shape=[None, None, None, 3]),
      use_sync_bn=False,
      norm_momentum=0.99,
      norm_epsilon=0.001,
      activation='swish',
      kernel_initializer='VarianceScaling',
      kernel_regularizer=None,
      bias_initializer='zeros',
      bias_regularizer=None,
      **kwargs):
    """Initializes the YOLOv7 backbone.

    Args:
      model_id: a `str` naming the model variant; it is lower-cased before
        being looked up in `BACKBONES`.
      input_specs: a `tf_keras.layers.InputSpec` of the input tensor. Its
        shape without the leading dimension is used for the input layer.
      use_sync_bn: if set to `True`, use synchronized batch normalization.
      norm_momentum: a `float` of normalization momentum for the moving average.
      norm_epsilon: a small `float` added to variance to avoid dividing by zero.
      activation: a `str` name of the activation function.
      kernel_initializer: a `str` for kernel initializer of convolutional
        layers. It is passed through
        `initializer_ops.pytorch_kernel_initializer` before use.
      kernel_regularizer: a `tf_keras.regularizers.Regularizer` object for
        Conv2D. Default to None.
      bias_initializer: a `str` for bias initializer of convolutional layers.
      bias_regularizer: a `tf_keras.regularizers.Regularizer` object for Conv2D.
        Default to None.
      **kwargs: Additional keyword arguments forwarded to `tf_keras.Model`.

    Raises:
      KeyError: if `model_id.lower()` is not a key of `BACKBONES`.
    """
    self._model_id = model_id
    self._input_specs = input_specs
    self._use_sync_bn = use_sync_bn
    self._norm_momentum = norm_momentum
    self._norm_epsilon = norm_epsilon
    self._activation = activation
    self._kernel_initializer = initializer_ops.pytorch_kernel_initializer(
        kernel_initializer
    )
    self._kernel_regularizer = kernel_regularizer
    self._bias_initializer = bias_initializer
    self._bias_regularizer = bias_regularizer

    inputs = tf_keras.layers.Input(shape=input_specs.shape[1:])
    table = BACKBONES[model_id.lower()]

    # Settings shared by every ConvBN block; they do not vary per row.
    conv_settings = {
        'use_sync_bn': self._use_sync_bn,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'activation': self._activation,
        'kernel_initializer': self._kernel_initializer,
        'kernel_regularizer': self._kernel_regularizer,
        'bias_initializer': self._bias_initializer,
        'bias_regularizer': self._bias_regularizer,
    }

    built = []  # Output tensor of every block so far, in table order.
    endpoints = {}
    for row in table:
      # Pair the positional row values with their schema field names.
      layer_args = dict(zip(_BLOCK_SPEC_SCHEMAS[row[0]], row))
      fn_name = layer_args.pop('block_fn')
      source = layer_args.pop('from')
      export = layer_args.pop('is_output')

      if built and isinstance(source, int):
        block_input = built[source]
      elif built:
        # A list of indices selects several earlier outputs (concat rows).
        block_input = [built[i] for i in source]
      else:
        # The first block always reads the model input, whatever `from` says.
        block_input = inputs

      if fn_name == 'convbn':
        layer_args.update(conv_settings)

      block_output = _BLOCK_FNS[fn_name](**layer_args)(block_input)
      built.append(block_output)
      if export:
        # Levels are numbered consecutively from 3 in order of appearance.
        endpoints[str(3 + len(endpoints))] = block_output

    self._output_specs = {
        name: tensor.get_shape() for name, tensor in endpoints.items()
    }
    super().__init__(inputs=inputs, outputs=endpoints, **kwargs)

  def get_config(self):
    """Returns the constructor arguments tracked by this backbone.

    `input_specs` is not part of the returned dict, and `kernel_initializer`
    holds the value produced by `initializer_ops.pytorch_kernel_initializer`
    rather than the raw constructor argument.
    """
    return dict(
        model_id=self._model_id,
        use_sync_bn=self._use_sync_bn,
        norm_momentum=self._norm_momentum,
        norm_epsilon=self._norm_epsilon,
        activation=self._activation,
        kernel_initializer=self._kernel_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_initializer=self._bias_initializer,
        bias_regularizer=self._bias_regularizer,
    )

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Builds a backbone from `config`; `custom_objects` is not used."""
    return cls(**config)

  @property
  def output_specs(self):
    """A dict of {level: TensorShape} pairs for the model output."""
    return self._output_specs
@factory.register_backbone_builder('yolov7')
def build_yolov7(
    input_specs: tf_keras.layers.InputSpec,
    backbone_config: hyperparams.Config,
    norm_activation_config: hyperparams.Config,
    l2_regularizer: tf_keras.regularizers.Regularizer = None,
) -> tf_keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
  """Builds a YOLOv7 backbone; registered with the factory as 'yolov7'.

  Args:
    input_specs: a `tf_keras.layers.InputSpec` of the input tensor.
    backbone_config: backbone config whose `type` must be 'yolov7'. The value
      returned by its `get()` must expose `model_id`.
    norm_activation_config: config providing `use_sync_bn`, `norm_momentum`,
      `norm_epsilon` and `activation`.
    l2_regularizer: optional regularizer, used as the kernel regularizer.

  Returns:
    A `YoloV7` model instance.

  Raises:
    AssertionError: if the backbone type is not 'yolov7' or the model id is
      not a key of `BACKBONES` (compared case-insensitively).
  """
  assert backbone_config.type == 'yolov7', (
      f'Inconsistent backbone type {backbone_config.type}.')
  backbone_config = backbone_config.get()
  # `YoloV7` lower-cases `model_id` before its `BACKBONES` lookup, so validate
  # the same way; otherwise ids such as 'YOLOv7' are rejected here even though
  # the model class accepts them.
  assert backbone_config.model_id.lower() in BACKBONES, (
      f'Unsupported backbone {backbone_config.model_id}.')
  return YoloV7(
      model_id=backbone_config.model_id,
      input_specs=input_specs,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      activation=norm_activation_config.activation,
      kernel_regularizer=l2_regularizer,
  )