y32, _, _ = nn_impl.fused_batch_norm(
            x32,
            scale,
            offset,