andry81/tacklelib
src/tackle/stream_storage.hpp

#pragma once

#include <src/tacklelib_private.hpp>

#include <tacklelib/utility/utility.hpp>
#include <tacklelib/utility/static_assert.hpp>
#include <tacklelib/utility/type_traits.hpp>
#include <tacklelib/utility/assert.hpp>
#include <tacklelib/utility/math.hpp>
#include <tacklelib/utility/algorithm.hpp>

#include <tacklelib/tackle/aligned_storage/max_aligned_storage.hpp>
#include <tacklelib/tackle/deque.hpp>

#include <boost/mpl/vector.hpp>
#include <boost/mpl/list.hpp>
#include <boost/mpl/push_front.hpp>

#include <boost/scope_exit.hpp>

#if ERROR_IF_EMPTY_PP_DEF(USE_FMT_LIBRARY_FORMAT_INSTEAD_UTILITY_STRING_FORMAT)
#  include <fmt/format.h>
#endif

#include <deque>
#include <utility>
#include <algorithm>
#include <type_traits>


namespace tackle
{
    namespace mpl = boost::mpl;

    template <typename T>
    class stream_storage;

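    // Compile-time dispatch between the regular and the force-inlined copy
    // implementations: selecting through a `bool` template parameter exposes
    // only one of the two member functions to the inliner at a time (see the
    // friend declaration inside `stream_storage`).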
    namespace detail
    {
        template <bool ForceInline>
        struct _inline_dispatch
        {
            template <typename T, typename C>
            static size_t _copy_to_impl(const stream_storage<T> & this_, const C & chunks, size_t offset_from, T * to_buf, size_t size) {
                return this_._copy_to_impl(chunks, offset_from, to_buf, size);
            }
        };

        template <>
        struct _inline_dispatch<true>
        {
            template <typename T, typename C>
            static size_t _copy_to_impl(const stream_storage<T> & this_, const C & chunks, size_t offset_from, T * to_buf, size_t size) {
                return this_._copy_to_impl_innerforceinline(chunks, offset_from, to_buf, size);
            }
        };
    }

    template <typename T>
    class stream_storage
    {
#if ERROR_IF_EMPTY_PP_DEF(ENABLE_INTERNAL_FORCE_INLINE_IN_STREAM_STORAGE)
        static const int s_default_inner_inline_level = 1;
#else
        static const int s_default_inner_inline_level = 0;
#endif

    public:
        // chunk sizes up to 256 * 1024 elements (1, 2, 4, 8, 16, ..., 128 * 1024, 256 * 1024)
        using num_chunk_variants_t = mpl::size_t<19>;
        using max_chunk_size_t = mpl::size_t<(0x01U << (num_chunk_variants_t::value - 1))>;

        STATIC_ASSERT_LT(num_chunk_variants_t::value, BOOST_MPL_LIMIT_LIST_SIZE, "the number of chunk variants must be less than the mpl list size limit");

    private:
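        // Maps a storage type index to its chunk size in elements:
        // type_index 0 -> 1, 5 -> 32, ..., 18 -> 256 * 1024.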
        static CONSTEXPR size_t _get_chunk_size(size_t type_index)
        {
            return (0x01U << type_index);
        }

        // POD type, DO NOT USE constructors; the end of `buf` must adjoin the beginning of the next element without a gap when an array is used as the underlying container!
        template <size_t S>
        struct Chunk
        {
            static const size_t size = S;

            T buf[S];
        };

    public:
        using max_sizeof_t = mpl::size_t<(0x01U << (num_chunk_variants_t::value - 1))>;

        // generators of the deque types with power-of-2 sized chunks
        template <template <typename, typename, typename> class, size_t, typename> struct tackle_deque_chunks_pof2_generator;
        template <template <typename, typename> class, size_t, typename> struct std_deque_chunks_pof2_generator;

        // generators of the deque const iterator types for the power-of-2 sized chunks
        template <template <typename, typename, typename> class, size_t, typename> struct tackle_deque_chunk_const_iterators_pof2_generator;
        template <template <typename, typename> class, size_t, typename> struct std_deque_chunk_const_iterators_pof2_generator;

        template <template <typename, typename, typename> class TDequeContainer, typename V>
        struct tackle_deque_chunks_pof2_generator<TDequeContainer, 0, V>
        {
            using type = V;
        };

        template <template <typename, typename> class TDequeContainer, typename V>
        struct std_deque_chunks_pof2_generator<TDequeContainer, 0, V>
        {
            using type = V;
        };

        template <template <typename T_, typename Allocator0, typename Allocator1> class TDequeContainer, size_t N, typename V>
        struct tackle_deque_chunks_pof2_generator
        {
            using chunk_type = Chunk<(size_t(0x01) << (N - 1))>;
            STATIC_ASSERT_EQ(sizeof(chunk_type), sizeof(T) * chunk_type::size, "Chunk must contain a plain static array without any gaps or padding");

            using base_type = tackle::deque_base<chunk_type>;

            using next_type_t = TDequeContainer<chunk_type, typename base_type::default_allocator_type0, typename base_type::default_allocator_type1>;
            using type = typename tackle_deque_chunks_pof2_generator<TDequeContainer, N - 1, typename mpl::push_front<V, next_type_t>::type>::type;
        };

        template <template <typename, typename> class TDequeContainer, size_t N, typename V>
        struct std_deque_chunks_pof2_generator
        {
            using chunk_type = Chunk<(size_t(0x01) << (N - 1))>;
            STATIC_ASSERT_EQ(sizeof(chunk_type), sizeof(T) * chunk_type::size, "Chunk must contain a plain static array without any gaps or padding");

            using next_type_t = TDequeContainer<chunk_type, typename std::deque<chunk_type>::allocator_type>;
            using type = typename std_deque_chunks_pof2_generator<TDequeContainer, N - 1, typename mpl::push_front<V, next_type_t>::type>::type;
        };


        template <template <typename, typename, typename> class TDequeContainer, typename V>
        struct tackle_deque_chunk_const_iterators_pof2_generator<TDequeContainer, 0, V>
        {
            using type = V;
        };

        template <template <typename, typename> class TDequeContainer, typename V>
        struct std_deque_chunk_const_iterators_pof2_generator<TDequeContainer, 0, V>
        {
            using type = V;
        };

        template <template <typename, typename, typename> class TDequeContainer, size_t N, typename V>
        struct tackle_deque_chunk_const_iterators_pof2_generator
        {
            using chunk_type = Chunk<(size_t(0x01) << (N - 1))>;
            STATIC_ASSERT_EQ(sizeof(chunk_type), sizeof(T) * chunk_type::size, "Chunk must contain a plain static array without any gaps or padding");

            using base_type = tackle::deque_base<chunk_type>;

            using next_type_t = typename TDequeContainer<chunk_type, typename base_type::default_allocator_type0, typename base_type::default_allocator_type1>::const_iterator;
            using type = typename tackle_deque_chunk_const_iterators_pof2_generator<TDequeContainer, N - 1, typename mpl::push_front<V, next_type_t>::type>::type;
        };

        template <template <typename, typename> class TDequeContainer, size_t N, typename V>
        struct std_deque_chunk_const_iterators_pof2_generator
        {
            using chunk_type = Chunk<(size_t(0x01) << (N - 1))>;
            STATIC_ASSERT_EQ(sizeof(chunk_type), sizeof(T) * chunk_type::size, "Chunk must contain a plain static array without any gaps or padding");

            using next_type_t = typename TDequeContainer<chunk_type, typename std::deque<chunk_type>::allocator_type>::const_iterator;
            using type = typename std_deque_chunk_const_iterators_pof2_generator<TDequeContainer, N - 1, typename mpl::push_front<V, next_type_t>::type>::type;
        };
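        // For illustration: a generator instantiated with N = 3 expands, by
        // recursing from N down to 0 and prepending with `mpl::push_front`,
        // into (conceptually) the ascending list
        //
        //     mpl::list<deque<Chunk<1>>, deque<Chunk<2>>, deque<Chunk<4>>>
        //
        // so a storage type index `i` corresponds to a chunk of 2^i elements,
        // matching `_get_chunk_size(i)`.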

        using mpl_empty_container_t = mpl::list<>; // start of the mpl container generation

        using tackle_deques_mpl_container_t =
            typename tackle_deque_chunks_pof2_generator<tackle::deque, num_chunk_variants_t::value, mpl_empty_container_t>::type;
        using tackle_deque_const_iterators_mpl_container_t =
            typename tackle_deque_chunk_const_iterators_pof2_generator<tackle::deque, num_chunk_variants_t::value, mpl_empty_container_t>::type;

        using std_deques_mpl_container_t =
            typename std_deque_chunks_pof2_generator<std::deque, num_chunk_variants_t::value, mpl_empty_container_t>::type;
        using std_deque_const_iterators_mpl_container_t =
            typename std_deque_chunk_const_iterators_pof2_generator<std::deque, num_chunk_variants_t::value, mpl_empty_container_t>::type;

#if ERROR_IF_EMPTY_PP_DEF(ENABLE_INTERNAL_TACKLE_DEQUE_IN_STREAM_STORAGE)
        using deque_const_iterators_mpl_container_t = tackle_deque_const_iterators_mpl_container_t;
#else
        using deque_const_iterators_mpl_container_t = std_deque_const_iterators_mpl_container_t;
#endif


    public:
#if ERROR_IF_EMPTY_PP_DEF(ENABLE_INTERNAL_TACKLE_DEQUE_IN_STREAM_STORAGE)
        using storage_types_t = tackle_deques_mpl_container_t;
#else
        using storage_types_t = std_deques_mpl_container_t;
#endif

    private:
        using max_aligned_storage_from_mpl_container_t  = max_aligned_storage_from_mpl_container<storage_types_t>;
        using max_aligned_storage_for_tackle_deques_t   = max_aligned_storage_from_mpl_container<tackle_deques_mpl_container_t>;
        using max_aligned_storage_for_std_deques_t      = max_aligned_storage_from_mpl_container<std_deques_mpl_container_t>;
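        // `max_aligned_storage_from_mpl_container` behaves like a runtime variant
        // over the generated deque types: one aligned buffer sized/aligned for the
        // largest type, with the active type selected by a runtime type index
        // (see `m_chunks.type_index()` below).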

    public:
        static CONSTEXPR const size_t max_size_value        = utility::static_if
            <UTILITY_CONSTEXPR(max_aligned_storage_for_tackle_deques_t::max_size_value >= max_aligned_storage_for_std_deques_t::max_size_value)>
            (max_aligned_storage_for_tackle_deques_t::max_size_value, max_aligned_storage_for_std_deques_t::max_size_value);
        static CONSTEXPR const size_t max_alignment_value   = utility::static_if
            <UTILITY_CONSTEXPR(max_aligned_storage_for_tackle_deques_t::max_alignment_value >= max_aligned_storage_for_std_deques_t::max_alignment_value)>
            (max_aligned_storage_for_tackle_deques_t::max_alignment_value, max_aligned_storage_for_std_deques_t::max_alignment_value);

    private:
        using storage_types_end_it_t                    = typename mpl::end<storage_types_t>::type;
        using num_types_t                               = typename mpl::size<storage_types_t>::type;

        STATIC_ASSERT_GT(num_types_t::value, 0, "the generator must produce a non-empty mpl container");

    public:
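        // Lightweight non-owning const view over a single chunk buffer,
        // returned by dereferencing `basic_const_iterator`.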
        class ChunkBufferCRef
        {
            friend class stream_storage;

        private:
            ChunkBufferCRef() :
                m_buf(nullptr), m_size(0)
            {
            }

            ChunkBufferCRef(const T * buf, size_t size) :
                m_buf(buf), m_size(size)
            {
                DEBUG_ASSERT_TRUE(buf && size);
            }

        public:
            const T * get() const
            {
                return m_buf;
            }

            size_t size() const
            {
                return m_size;
            }

        private:
            const T *   m_buf;
            size_t      m_size;
        };

        class basic_const_iterator
        {
            friend class stream_storage;

            using storage_types_t = deque_const_iterators_mpl_container_t;
            using storage_types_end_it_t = typename mpl::end<storage_types_t>::type;
            using num_types_t = typename mpl::size<storage_types_t>::type;

            STATIC_ASSERT_GT(num_types_t::value, 0, "the generator must produce a non-empty mpl container");

            using iterator_storage_t = max_aligned_storage_from_mpl_container<storage_types_t>;


        public:
            basic_const_iterator();
            basic_const_iterator(const basic_const_iterator & it);

        private:
            basic_const_iterator(const iterator_storage_t & iterator_storage);

        public:
            basic_const_iterator & operator =(const basic_const_iterator & it);

            ChunkBufferCRef operator *() const;
            ChunkBufferCRef operator ->() const;

            bool operator ==(const basic_const_iterator &) const;
            bool operator !=(const basic_const_iterator &) const;

            basic_const_iterator operator ++(int);
            basic_const_iterator & operator ++();
            basic_const_iterator operator --(int);
            basic_const_iterator & operator --();

        private:
            iterator_storage_t m_iterator_storage;
        };

    public:
        using const_iterator = basic_const_iterator;

        stream_storage(size_t min_chunk_size, size_t min_arr0_capacity, size_t min_arr1_capacity);
        ~stream_storage();

        void reset(size_t min_chunk_size, size_t min_arr0_capacity, size_t min_arr1_capacity);

    protected:
        template <typename T_>
        void _clear(T_ & chunks);

    public:
        void clear();
        const_iterator begin() const;
        const_iterator end() const;
        size_t chunk_size() const;
        size_t size() const;
        size_t remainder() const;
        void push_back(const T * p, size_t size);
        T & operator[](size_t offset);
        const T & operator[](size_t offset) const;
    protected:
        template <typename C>
        size_t _copy_to_impl(const C & chunks, size_t offset_from, T * to_buf, size_t to_size) const;
        template <typename C>
        FORCE_INLINE size_t _copy_to_impl_innerforceinline(const C & chunks, size_t offset_from, T * to_buf, size_t to_size) const; // version with internal force inline

//        template <typename C>
//        size_t _inner_stride_copy_to_impl(const C & chunks, size_t offset_from, size_t from_size,
//            size_t stride_offset, size_t stride_size, T * to_buf, size_t to_size) const;

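        // Treats the stream as consecutive rows of `stream_width` elements and
        // copies the column band [slot_begin_in_row_offset, slot_end_in_row_offset)
        // of every row (the "slot") into `to_buf`, starting from `offset_from`
        // (`in_row_offset_from` is the position inside the first row) and copying
        // at most `max_slot_size` elements. The optional out parameters report the
        // stream offset where the slot data begins, the offset inside the slot and
        // the stream offset where the iteration stopped. For example (hypothetical
        // values): with rows 640 elements wide, slot_begin_in_row_offset = 100 and
        // slot_end_in_row_offset = 200 extract a 100-element-wide vertical band of
        // each row.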
        template <bool InnerForceInline>
        FORCE_INLINE size_t _stride_copy_to_impl_innerforceinline(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
            size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
            size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const;

        // static member call dispatch, used to expose only one function at a time to the inliner
        template <bool ForceInline>
        friend struct detail::_inline_dispatch;

    public:
        template <int InnerInlineLevel = s_default_inner_inline_level>
        size_t copy_to(size_t offset_from, T * to_buf, size_t size) const;

        template <int InnerInlineLevel = s_default_inner_inline_level>
        size_t stride_copy_to(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
            size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
            size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const;

        template <int InnerInlineLevel = s_default_inner_inline_level>
        FORCE_INLINE size_t stride_copy_to_forceinline(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
            size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
            size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const;

        size_t erase_front(size_t size);

    private:
        max_aligned_storage_from_mpl_container_t    m_chunks;
        size_t                                      m_size;
        size_t                                      m_remainder;
    };
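
    // A minimal usage sketch (the element type and all sizes below are
    // arbitrary examples, not requirements):
    //
    //     tackle::stream_storage<uint8_t> storage(4096, 16, 16); // chunk size is
    //                                                            // rounded up to a power of 2
    //     uint8_t in[1000] = {};
    //     storage.push_back(in, sizeof(in));      // append, chunked internally
    //
    //     uint8_t out[100];
    //     storage.copy_to(50, out, sizeof(out));  // linear copy from offset 50
    //
    //     for (auto it = storage.begin(); it != storage.end(); ++it) {
    //         const auto chunk = *it;             // ChunkBufferCRef
    //         // chunk.get() / chunk.size() expose the raw chunk buffer
    //     }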

    //// stream_storage::basic_const_iterator

    template <typename T>
    inline stream_storage<T>::basic_const_iterator::basic_const_iterator()
    {
    }

    template <typename T>
    inline stream_storage<T>::basic_const_iterator::basic_const_iterator(const basic_const_iterator & it)
    {
        *this = it;
    }

    template <typename T>
    inline stream_storage<T>::basic_const_iterator::basic_const_iterator(const iterator_storage_t & iterator_storage)
    {
        m_iterator_storage.construct(iterator_storage, false);
    }

    template <typename T>
    inline typename stream_storage<T>::basic_const_iterator & stream_storage<T>::basic_const_iterator::operator =(const basic_const_iterator & it)
    {
        m_iterator_storage.assign(it.m_iterator_storage);

        return *this;
    }

    template <typename T>
    inline typename stream_storage<T>::ChunkBufferCRef stream_storage<T>::basic_const_iterator::operator *() const
    {
        return m_iterator_storage.template invoke<ChunkBufferCRef>([&](const auto & chunks_it)
        {
            return ChunkBufferCRef{ chunks_it->buf, UTILITY_CONSTEXPR_SIZE(chunks_it->buf) };
        });
    }

    template <typename T>
    inline typename stream_storage<T>::ChunkBufferCRef stream_storage<T>::basic_const_iterator::operator ->() const
    {
        return this->operator *();
    }

    template <typename T>
    inline bool stream_storage<T>::basic_const_iterator::operator ==(const basic_const_iterator & it) const
    {
        const int left_type_index = m_iterator_storage.type_index();
        const int right_type_index = it.m_iterator_storage.type_index();
        if (left_type_index != right_type_index) {
            DEBUG_BREAK_THROW(true) std::runtime_error(
#if ERROR_IF_EMPTY_PP_DEF(USE_FMT_LIBRARY_FORMAT_INSTEAD_UTILITY_STRING_FORMAT)
                fmt::format("{:s}({:d}): incompatible iterator storages: left_type_index={:d} right_type_index={:d}",
                    UTILITY_PP_FUNCSIG, UTILITY_PP_LINE, left_type_index, right_type_index))
#else
                utility::string_format(256, "%s(%d): incompatible iterator storages: left_type_index=%d right_type_index=%d",
                    UTILITY_PP_FUNCSIG, UTILITY_PP_LINE, left_type_index, right_type_index))
#endif
            );
        }

        return m_iterator_storage.template invoke<bool>([&](const auto & chunks_it)
        {
            using ref_chunk_it_t = decltype(chunks_it);
            using chunk_it_t = typename boost::remove_reference<ref_chunk_it_t>::type;

            const auto & right_chunk_it = *static_cast<const chunk_it_t *>(it.m_iterator_storage.address());

            return chunks_it == right_chunk_it;
        });
    }

    template <typename T>
    inline bool stream_storage<T>::basic_const_iterator::operator !=(const basic_const_iterator & it) const
    {
        return !this->operator ==(it);
    }

    template <typename T>
    inline typename stream_storage<T>::basic_const_iterator stream_storage<T>::basic_const_iterator::operator ++(int)
    {
        const auto it = *this;

        m_iterator_storage.template invoke<void>([](auto & chunks_it)
        {
            chunks_it++;
        });

        return it;
    }

    template <typename T>
    inline typename stream_storage<T>::basic_const_iterator & stream_storage<T>::basic_const_iterator::operator ++()
    {
        m_iterator_storage.template invoke<void>([](auto & chunks_it)
        {
            ++chunks_it;
        });

        return *this;
    }

    template <typename T>
    inline typename stream_storage<T>::basic_const_iterator stream_storage<T>::basic_const_iterator::operator --(int)
    {
        const auto it = *this;

        m_iterator_storage.template invoke<void>([](auto & chunks_it)
        {
            chunks_it--;
        });

        return it;
    }

    template <typename T>
    inline typename stream_storage<T>::basic_const_iterator & stream_storage<T>::basic_const_iterator::operator --()
    {
        m_iterator_storage.template invoke<void>([](auto & chunks_it)
        {
            --chunks_it;
        });

        return *this;
    }

    //// stream_storage

    template <typename T>
    inline stream_storage<T>::stream_storage(size_t min_chunk_size, size_t min_arr0_capacity, size_t min_arr1_capacity) :
        m_size(0), m_remainder(0)
    {
        reset(min_chunk_size, min_arr0_capacity, min_arr1_capacity);
    }

    template <typename T>
    inline stream_storage<T>::~stream_storage()
    {
    }

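    // (Re)initializes the storage: the chunk size is rounded up to the next
    // power of 2 and must not exceed the maximum chunk variant, otherwise a
    // `std::runtime_error` is thrown. If the chunk type changes, the underlying
    // container is reconstructed, otherwise the storage is cleared in place.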
    template <typename T>
    inline void stream_storage<T>::reset(size_t min_chunk_size, size_t min_arr0_capacity, size_t min_arr1_capacity)
    {
        DEBUG_ASSERT_TRUE(min_chunk_size);

        const int chunk_type_index = math::int_log2_ceil(min_chunk_size);
        if (chunk_type_index >= num_chunk_variants_t::value) {
            DEBUG_BREAK_THROW(true) std::runtime_error(
#if ERROR_IF_EMPTY_PP_DEF(USE_FMT_LIBRARY_FORMAT_INSTEAD_UTILITY_STRING_FORMAT)
                fmt::format("{:s}({:d}): minimum chunk size is not supported: min_chunk_size={:d} pof2={:d} max={:d}",
                    UTILITY_PP_FUNCSIG, UTILITY_PP_LINE, min_chunk_size,
                    math::int_pof2_ceil(min_chunk_size), (0x01U << (num_chunk_variants_t::value - 1)))
#else
                utility::string_format(256, "%s(%d): minimum chunk size is not supported: min_chunk_size=%d pof2=%d max=%d",
                    UTILITY_PP_FUNCSIG, UTILITY_PP_LINE, min_chunk_size,
                    math::int_pof2_ceil(min_chunk_size), (0x01U << (num_chunk_variants_t::value - 1)))
#endif
            );
        }

        if (chunk_type_index != m_chunks.type_index()) {
            m_chunks.construct_default(chunk_type_index, true);

#if ERROR_IF_EMPTY_PP_DEF(ENABLE_INTERNAL_TACKLE_DEQUE_IN_STREAM_STORAGE)
            m_chunks.template invoke<void>([=](auto & chunks)
            {
                using storage_type_t = typename std::remove_reference<decltype(chunks)>::type;

                // available in the tackle implementation
                chunks.reset(typename storage_type_t::optional_params{ min_arr0_capacity, min_arr1_capacity });
            });
#endif
        }
        else {
#if ERROR_IF_EMPTY_PP_DEF(ENABLE_INTERNAL_TACKLE_DEQUE_IN_STREAM_STORAGE)
            m_chunks.template invoke<void>([=](auto & chunks)
            {
                using storage_type_t = typename std::remove_reference<decltype(chunks)>::type;

                this->_clear(chunks);

                // available in the tackle implementation
                chunks.reset(typename storage_type_t::optional_params{ min_arr0_capacity, min_arr1_capacity });
            });
#else
            clear();
#endif
        }
    }

    template <typename T>
    template <typename T_>
    inline void stream_storage<T>::_clear(T_ & chunks)
    {
        m_size = 0;
        m_remainder = 0;
        chunks.clear(); // last, in case it throws an exception
    }

    template <typename T>
    inline void stream_storage<T>::clear()
    {
        m_chunks.template invoke<void>([this](auto & chunks)
        {
            this->_clear(chunks);
        });
    }

    template <typename T>
    inline typename stream_storage<T>::const_iterator stream_storage<T>::begin() const
    {
        return m_chunks.template invoke<const_iterator>([this](const auto & chunks)
        {
            return const_iterator(basic_const_iterator::iterator_storage_t(m_chunks.type_index(), chunks.begin()));
        });
    }

    template <typename T>
    inline typename stream_storage<T>::const_iterator stream_storage<T>::end() const
    {
        return m_chunks.template invoke<const_iterator>([this](const auto & chunks)
        {
            return const_iterator(basic_const_iterator::iterator_storage_t(m_chunks.type_index(), chunks.end()));
        });
    }

    template <typename T>
    inline size_t stream_storage<T>::chunk_size() const
    {
        return m_chunks.template invoke<size_t>([this](const auto & chunks) // invoke validates the type index and throws on an invalid one
        {
            return _get_chunk_size(m_chunks.type_index());
        });
    }

    template <typename T>
    inline size_t stream_storage<T>::size() const
    {
        return m_size;
    }

    template <typename T>
    inline size_t stream_storage<T>::remainder() const
    {
        return m_remainder;
    }

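    // Appends `size` elements in up to 3 steps: top up the partially filled
    // last chunk (if any), append as many whole chunks as fit, then append
    // one more chunk for the remaining tail.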
    template <typename T>
    inline void stream_storage<T>::push_back(const T * buf, size_t size)
    {
        DEBUG_ASSERT_TRUE(buf && size);

        m_chunks.template invoke<void>([=](auto & chunks)
        {
            using ref_chunk_t = decltype(chunks[0]);
            using chunk_t = typename boost::remove_reference<ref_chunk_t>::type;

            const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());
            DEBUG_ASSERT_LT(m_remainder, chunk_size);

            size_t buf_offset = 0;
            size_t left_size = size;

            if_break(1) {
                if (m_remainder) {
                    auto & last_chunk = chunks.back();

                    const size_t copy_to_remainder_size = (std::min)(chunk_size - m_remainder, left_size);
                    UTILITY_COPY(buf, last_chunk.buf + m_remainder, copy_to_remainder_size);
                    left_size -= copy_to_remainder_size;
                    buf_offset += copy_to_remainder_size;
                }

                if (!left_size) break;

                const size_t num_fixed_chunks = left_size / chunk_size;
                const size_t last_fixed_chunk_remainder = left_size % chunk_size;

                for (size_t i = 0; i < num_fixed_chunks; i++) {
                    chunks.push_back(chunk_t());

                    auto & last_chunk = chunks.back();

                    UTILITY_COPY(buf + buf_offset, last_chunk.buf, chunk_size);
                    buf_offset += chunk_size;
                }

                if (last_fixed_chunk_remainder) {
                    chunks.push_back(chunk_t());

                    auto & last_chunk = chunks.back();

                    UTILITY_COPY(buf + buf_offset, last_chunk.buf, last_fixed_chunk_remainder);
                    buf_offset += last_fixed_chunk_remainder;
                }
            }

            m_size += buf_offset;
            m_remainder = (m_remainder + buf_offset) % chunk_size;
        });
    }

    template <typename T>
    inline T & stream_storage<T>::operator[](size_t offset)
    {
        DEBUG_ASSERT_LT(offset, size());

        return m_chunks.template invoke<T &>([=](auto & chunks)
        {
            const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());

            const auto chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(offset, chunk_size);
            auto & chunk = chunks[chunk_divrem.quot];

            return chunk.buf[chunk_divrem.rem];
        });
    }

    template <typename T>
    inline const T & stream_storage<T>::operator[](size_t offset) const
    {
        DEBUG_ASSERT_LT(offset, size());

        return m_chunks.template invoke<const T &>([=](const auto & chunks)
        {
            const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());

            const auto chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(offset, chunk_size);
            const auto & chunk = chunks[chunk_divrem.quot];

            return chunk.buf[chunk_divrem.rem];
        });
    }

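    // Copies `to_size` elements starting at `offset_from` into `to_buf`.
    // Chunks that are physically adjacent in memory are coalesced into a
    // single bulk copy to reduce per-chunk copy call overhead.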
    template <typename T> template <typename C>
    inline size_t stream_storage<T>::_copy_to_impl(const C & chunks, size_t offset_from, T * to_buf, size_t to_size) const
    {
        DEBUG_ASSERT_LT(0U, to_size);
        DEBUG_ASSERT_GE(size(), offset_from + to_size);

        const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());

        const auto chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(offset_from, chunk_size);
        const auto & chunk = chunks[chunk_divrem.quot];
        size_t to_buf_offset = 0;
        size_t from_buf_offset = chunk_divrem.rem;
        if (chunk_size >= from_buf_offset + to_size) {
            UTILITY_COPY(chunk.buf + from_buf_offset, to_buf, to_size);
            to_buf_offset += to_size;
        }
        else {
            const auto next_chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(chunk_divrem.rem + to_size, chunk_size);

            // cycles overhead optimization
            //if (256 < next_chunk_divrem.quot) {
                size_t chunks_size = chunk_size - chunk_divrem.rem;

                const auto * prev_chunk_ptr = &chunk;
                decltype(prev_chunk_ptr) next_chunk_ptr;

                if (next_chunk_divrem.quot >= 2) {
                    // collect continuous chunks block
                    for (size_t i = 1; i < next_chunk_divrem.quot; i++) {
                        next_chunk_ptr = &chunks[chunk_divrem.quot + i];
                        // contiguity check in element units: the next chunk must start exactly at the end of the collected run
                        if (reinterpret_cast<const T *>(next_chunk_ptr) == prev_chunk_ptr->buf + from_buf_offset + chunks_size) {
                            chunks_size += chunk_size;
                        }
                        else {
                            // next chunk is not continuous, copy collected chunks at once
                            UTILITY_COPY(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                            prev_chunk_ptr = next_chunk_ptr;
                            from_buf_offset = 0;
                            to_buf_offset += chunks_size;
                            chunks_size = chunk_size;
                        }
                    }
                }
                if (next_chunk_divrem.rem) {
                    next_chunk_ptr = &chunks[chunk_divrem.quot + next_chunk_divrem.quot];
                    if (reinterpret_cast<const T *>(next_chunk_ptr) == prev_chunk_ptr->buf + from_buf_offset + chunks_size) { // the final partial chunk continues the collected run
                        UTILITY_COPY(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size + next_chunk_divrem.rem);
                        to_buf_offset += chunks_size + next_chunk_divrem.rem;
                    }
                    else {
                        UTILITY_COPY(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                        to_buf_offset += chunks_size;
                        UTILITY_COPY(next_chunk_ptr->buf, to_buf + to_buf_offset, next_chunk_divrem.rem);
                        to_buf_offset += next_chunk_divrem.rem;
                    }
                }
                else {
                    UTILITY_COPY(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                    to_buf_offset += chunks_size;
                }
            //}
            //else {
            //    const size_t first_chunk_size = chunk_size - chunk_divrem.rem;
            //
            //    UTILITY_COPY(chunk.buf + from_buf_offset, to_buf + to_buf_offset, first_chunk_size);
            //    to_buf_offset += first_chunk_size;
            //
            //    if (next_chunk_divrem.quot >= 2) {
            //        for (size_t i = 1; i < next_chunk_divrem.quot; i++, to_buf_offset += chunk_size) {
            //            const auto & chunk2 = chunks[chunk_divrem.quot + i];
            //            UTILITY_COPY(chunk2.buf, to_buf + to_buf_offset, chunk_size);
            //        }
            //    }
            //    if (next_chunk_divrem.rem) {
            //        auto & chunk2 = chunks[chunk_divrem.quot + next_chunk_divrem.quot];
            //        const size_t last_chunk_size = next_chunk_divrem.rem;
            //        UTILITY_COPY(chunk2.buf, to_buf + to_buf_offset, last_chunk_size);
            //        to_buf_offset += last_chunk_size;
            //    }
            //}
        }

        return to_buf_offset;
    }

    // version with internal force inline
    template <typename T> template <typename C>
    FORCE_INLINE size_t stream_storage<T>::_copy_to_impl_innerforceinline(const C & chunks, size_t offset_from, T * to_buf, size_t to_size) const
    {
        DEBUG_ASSERT_LT(0U, to_size);
        DEBUG_ASSERT_GE(size(), offset_from + to_size);

        const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());

        const auto chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(offset_from, chunk_size);
        const auto & chunk = chunks[chunk_divrem.quot];
        size_t to_buf_offset = 0;
        size_t from_buf_offset = chunk_divrem.rem;
        if (chunk_size >= from_buf_offset + to_size) {
            UTILITY_COPY_FORCE_INLINE(chunk.buf + from_buf_offset, to_buf, to_size);
            to_buf_offset += to_size;
        }
        else {
            const auto next_chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(chunk_divrem.rem + to_size, chunk_size);

            // cycles overhead optimization
            //if (256 < next_chunk_divrem.quot) {
                size_t chunks_size = chunk_size - chunk_divrem.rem;

                const auto * prev_chunk_ptr = &chunk;
                decltype(prev_chunk_ptr) next_chunk_ptr;

                if (next_chunk_divrem.quot >= 2) {
                    // collect continuous chunks block
                    for (size_t i = 1; i < next_chunk_divrem.quot; i++) {
                        next_chunk_ptr = &chunks[chunk_divrem.quot + i];
                        // contiguity check in element units: the next chunk must start exactly at the end of the collected run
                        if (reinterpret_cast<const T *>(next_chunk_ptr) == prev_chunk_ptr->buf + from_buf_offset + chunks_size) {
                            chunks_size += chunk_size;
                        }
                        else {
                            // next chunk is not continuous, copy collected chunks at once
                            UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                            prev_chunk_ptr = next_chunk_ptr;
                            from_buf_offset = 0;
                            to_buf_offset += chunks_size;
                            chunks_size = chunk_size;
                        }
                    }
                }
                if (next_chunk_divrem.rem) {
                    next_chunk_ptr = &chunks[chunk_divrem.quot + next_chunk_divrem.quot];
                    if (reinterpret_cast<const T *>(next_chunk_ptr) == prev_chunk_ptr->buf + from_buf_offset + chunks_size) { // the final partial chunk continues the collected run
                        UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size + next_chunk_divrem.rem);
                        to_buf_offset += chunks_size + next_chunk_divrem.rem;
                    }
                    else {
                        UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                        to_buf_offset += chunks_size;
                        UTILITY_COPY_FORCE_INLINE(next_chunk_ptr->buf, to_buf + to_buf_offset, next_chunk_divrem.rem);
                        to_buf_offset += next_chunk_divrem.rem;
                    }
                }
                else {
                    UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
                    to_buf_offset += chunks_size;
                }
            //}
            //else {
            //    const size_t first_chunk_size = chunk_size - chunk_divrem.rem;
            //
            //    UTILITY_COPY_FORCE_INLINE(chunk.buf + from_buf_offset, to_buf + to_buf_offset, first_chunk_size);
            //    to_buf_offset += first_chunk_size;
            //
            //    if (next_chunk_divrem.quot >= 2) {
            //        for (size_t i = 1; i < next_chunk_divrem.quot; i++, to_buf_offset += chunk_size) {
            //            const auto & chunk2 = chunks[chunk_divrem.quot + i];
            //            UTILITY_COPY_FORCE_INLINE(chunk2.buf, to_buf + to_buf_offset, chunk_size);
            //        }
            //    }
            //    if (next_chunk_divrem.rem) {
            //        auto & chunk2 = chunks[chunk_divrem.quot + next_chunk_divrem.quot];
            //        const size_t last_chunk_size = next_chunk_divrem.rem;
            //        UTILITY_COPY_FORCE_INLINE(chunk2.buf, to_buf + to_buf_offset, last_chunk_size);
            //        to_buf_offset += last_chunk_size;
            //    }
            //}
        }

        return to_buf_offset;
    }

//    template <typename C>
//    inline size_t _inner_stride_copy_to_impl(const C & chunks, size_t offset_from, size_t from_size, size_t stride_size, size_t stride_step, T * to_buf, size_t to_size) const
//    {
//        DEBUG_ASSERT_TRUE(stride_size && stride_step && from_size && to_size);
//        DEBUG_ASSERT_GE(stride_step, stride_size);
//
//        const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());
//
//        const auto chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(offset_from, chunk_size);
//        const auto & chunk = chunks[chunk_divrem.quot];
//        size_t to_buf_offset = 0;
//        size_t from_buf_offset = chunk_divrem.rem;
//
//        if (chunk_size >= from_buf_offset + from_size) {
//            UTILITY_STRIDE_COPY(to_buf_offset, chunk.buf + from_buf_offset, from_size, stride_size, stride_step, to_buf, to_size);
//        }
//        else {
//            const auto next_chunk_divrem = UINT32_DIVREM_POF2_FLOOR_VERIFY(chunk_divrem.rem + from_size, chunk_size);
//
//            // TODO:
//            //  * UTILITY_STRIDE_COPY from the middle of slot byte instead of only from slot beginning byte
//
//            // cycles overhead optimization
//            //if (256 < next_chunk_divrem.quot) {
//                size_t chunks_size = chunk_size - chunk_divrem.rem;
//
//                const auto * prev_chunk_ptr = &chunk;
//                decltype(prev_chunk_ptr) next_chunk_ptr;
//
//                if (next_chunk_divrem.quot >= 2) {
//                    // collect continuous chunks block
//                    for (size_t i = 1; i < next_chunk_divrem.quot; i++) {
//                        next_chunk_ptr = &chunks[chunk_divrem.quot + i];
//                        if (next_chunk_ptr == prev_chunk_ptr + chunks_size) {
//                            chunks_size += chunk_size;
//                        }
//                        else {
//                            // next chunk is not continuous, copy collected chunks at once
//                            UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
//                            prev_chunk_ptr = next_chunk_ptr;
//                            from_buf_offset = 0;
//                            to_buf_offset += chunks_size;
//                            chunks_size = chunk_size;
//                        }
//                    }
//                }
//                if (next_chunk_divrem.rem) {
//                    next_chunk_ptr = &chunks[chunk_divrem.quot + next_chunk_divrem.quot];
//                    if (next_chunk_ptr == prev_chunk_ptr + chunk_size) {
//                        UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size + next_chunk_divrem.rem);
//                        to_buf_offset += chunks_size + next_chunk_divrem.rem;
//                    }
//                    else {
//                        UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
//                        to_buf_offset += chunks_size;
//                        UTILITY_COPY_FORCE_INLINE(next_chunk_ptr->buf, to_buf + to_buf_offset, next_chunk_divrem.rem);
//                        to_buf_offset += next_chunk_divrem.rem;
//                    }
//                }
//                else {
//                    UTILITY_COPY_FORCE_INLINE(prev_chunk_ptr->buf + from_buf_offset, to_buf + to_buf_offset, chunks_size);
//                    to_buf_offset += chunks_size;
//                }
//            //}
//            //else {
//            //    const size_t first_chunk_size = chunk_size - chunk_divrem.rem;
//            //
//            //    UTILITY_COPY_FORCE_INLINE(chunk.buf + from_buf_offset, to_buf + to_buf_offset, first_chunk_size);
//            //    to_buf_offset += first_chunk_size;
//            //
//            //    if (next_chunk_divrem.quot >= 2) {
//            //        for (size_t i = 1; i < next_chunk_divrem.quot; i++, to_buf_offset += chunk_size) {
//            //            const auto & chunk2 = chunks[chunk_divrem.quot + i];
//            //            UTILITY_COPY_FORCE_INLINE(chunk2.buf, to_buf + to_buf_offset, chunk_size);
//            //        }
//            //    }
//            //    if (next_chunk_divrem.rem) {
//            //        auto & chunk2 = chunks[chunk_divrem.quot + next_chunk_divrem.quot];
//            //        const size_t last_chunk_size = next_chunk_divrem.rem;
//            //        UTILITY_COPY_FORCE_INLINE(chunk2.buf, to_buf + to_buf_offset, last_chunk_size);
//            //        to_buf_offset += last_chunk_size;
//            //    }
//            //}
//        }
//
//        return to_buf_offset;
//    }

    template <typename T> template <bool InnerForceInline>
    FORCE_INLINE size_t stream_storage<T>::_stride_copy_to_impl_innerforceinline(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
        size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
        size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const
    {
        DEBUG_ASSERT_TRUE(to_buf && max_slot_size);
        DEBUG_ASSERT_LT(in_row_offset_from, stream_width);
        DEBUG_ASSERT_GE(stream_width, slot_end_in_row_offset);
        DEBUG_ASSERT_LT(slot_begin_in_row_offset, slot_end_in_row_offset);
        DEBUG_ASSERT_LT(offset_from, size());

        const size_t slot_size = m_chunks.template invoke<size_t>([=](const auto & chunks)
        {
            const size_t slot_width = slot_end_in_row_offset - slot_begin_in_row_offset;

            size_t in_row_offset_last = in_row_offset_from;

            size_t iterated_stream_size = 0;
            size_t slot_size = 0;

            size_t stream_size_left = m_size - offset_from;
            size_t slot_size_left = max_slot_size;

            if (in_stream_slot_offset_ptr) {
                *in_stream_slot_offset_ptr = 0;
            }
            if (in_slot_byte_offset_ptr) {
                *in_slot_byte_offset_ptr = 0;
            }

            BOOST_SCOPE_EXIT(&offset_from, &iterated_stream_size, &end_stride_byte_offset_ptr) {
                if (end_stride_byte_offset_ptr) {
                    *end_stride_byte_offset_ptr = iterated_stream_size;
                }
            } BOOST_SCOPE_EXIT_END

            if (in_row_offset_from < slot_begin_in_row_offset) goto _first_row_left_segment;
            else if (in_row_offset_from < slot_end_in_row_offset) goto _first_row_slot_segment;
            else goto _first_row_right_segment;

            _first_row_left_segment:;
            {
                const size_t iterate_size = (std::min)(slot_begin_in_row_offset - in_row_offset_last, stream_size_left);

                iterated_stream_size += iterate_size;
                DEBUG_ASSERT_GE(stream_size_left, iterate_size);
                stream_size_left -= iterate_size;

                if (!stream_size_left) {
                    if (in_stream_slot_offset_ptr) {
                        *in_stream_slot_offset_ptr = iterated_stream_size;
                    }

                    return slot_size;
                }

                in_row_offset_last = slot_begin_in_row_offset;
            }

            _first_row_slot_segment:;
            {
                if (in_stream_slot_offset_ptr) {
                    *in_stream_slot_offset_ptr = iterated_stream_size;
                }

                const size_t first_slot_row_bytes = slot_end_in_row_offset - in_row_offset_last;

                const size_t slot_size_to_copy = (std::min)((std::min)(first_slot_row_bytes, slot_size_left), stream_size_left);
                DEBUG_ASSERT_LT(0U, slot_size_to_copy);

                const size_t copied_size = detail::_inline_dispatch<InnerForceInline>::
                    _copy_to_impl(*this, chunks, offset_from + iterated_stream_size, to_buf + slot_size, slot_size_to_copy);
                DEBUG_ASSERT_EQ(copied_size, slot_size_to_copy);

                slot_size += copied_size;
                iterated_stream_size += slot_size_to_copy;

                DEBUG_ASSERT_GE(slot_size_left, slot_size_to_copy);
                slot_size_left -= slot_size_to_copy;
                DEBUG_ASSERT_GE(stream_size_left, slot_size_to_copy);
                stream_size_left -= slot_size_to_copy;

                if (in_slot_byte_offset_ptr) {
                    DEBUG_ASSERT_GE(in_row_offset_last, slot_begin_in_row_offset);
                    *in_slot_byte_offset_ptr = in_row_offset_last - slot_begin_in_row_offset;
                }

                if (!slot_size_left || !stream_size_left) return slot_size;

                in_row_offset_last = slot_end_in_row_offset;
            }

            _first_row_right_segment:;
            {
                const size_t iterate_size = (std::min)(stream_width - in_row_offset_last, stream_size_left);

                iterated_stream_size += iterate_size;
                DEBUG_ASSERT_GE(stream_size_left, iterate_size);
                stream_size_left -= iterate_size;

                if (!stream_size_left) {
                    if (!slot_size && in_stream_slot_offset_ptr) {
                        *in_stream_slot_offset_ptr = iterated_stream_size;
                    }

                    return slot_size;
                }

                if (!slot_size && in_stream_slot_offset_ptr) {
                    *in_stream_slot_offset_ptr = iterated_stream_size + (std::min)(slot_begin_in_row_offset, stream_size_left);
                }
            }

            const size_t num_whole_slot_rows = slot_size_left / slot_width;
            const size_t num_whole_stream_rows = stream_size_left / stream_width;

            const size_t num_whole_rows = (std::min)(num_whole_slot_rows, num_whole_stream_rows);
            for (size_t i = 0; i < num_whole_rows; i++) {
                const size_t copied_size = detail::_inline_dispatch<InnerForceInline>::
                    _copy_to_impl(*this, chunks, offset_from + iterated_stream_size + slot_begin_in_row_offset, to_buf + slot_size, slot_width);
                DEBUG_ASSERT_EQ(copied_size, slot_width);

                slot_size += copied_size;
                iterated_stream_size += stream_width;
            }

            const size_t iterate_size = num_whole_rows * stream_width;

            DEBUG_ASSERT_GE(slot_size_left, num_whole_rows * slot_width);
            slot_size_left -= num_whole_rows * slot_width;
            DEBUG_ASSERT_GE(stream_size_left, iterate_size);
            stream_size_left -= iterate_size;

            if (!slot_size_left || !stream_size_left) return slot_size;

            //_last_row_left_segment:;
            {
                const size_t iterate_size = (std::min)(slot_begin_in_row_offset, stream_size_left);

                iterated_stream_size += iterate_size;
                DEBUG_ASSERT_GE(stream_size_left, iterate_size);
                stream_size_left -= iterate_size;

                if (!stream_size_left) return slot_size;
            }

            //_last_row_slot_segment:;
            {
                const size_t slot_size_to_copy = (std::min)((std::min)(slot_width, slot_size_left), stream_size_left);
                DEBUG_ASSERT_LT(0U, slot_size_to_copy);

                const size_t copied_size = detail::_inline_dispatch<InnerForceInline>::
                    _copy_to_impl(*this, chunks, offset_from + iterated_stream_size, to_buf + slot_size, slot_size_to_copy);
                DEBUG_ASSERT_EQ(copied_size, slot_size_to_copy);

                slot_size += copied_size;
                iterated_stream_size += slot_size_to_copy;

                DEBUG_ASSERT_GE(slot_size_left, slot_size_to_copy);
                slot_size_left -= slot_size_to_copy;
                DEBUG_ASSERT_GE(stream_size_left, slot_size_to_copy);
                stream_size_left -= slot_size_to_copy;

                if (!slot_size_left) {
                    // if the last copied piece was a whole slot row, advance the offset to the end of the stream row or to the end of the stream
                    if (slot_size_to_copy == slot_width && stream_size_left && end_stride_byte_offset_ptr) { // has meaning only for `end_stride_byte_offset`
                        const size_t iterate_size = (std::min)(stream_width - slot_end_in_row_offset, stream_size_left);

                        iterated_stream_size += iterate_size;
                        DEBUG_ASSERT_GE(stream_size_left, iterate_size); // just in case
                    }

                    return slot_size;
                }
            }

            //_last_row_right_segment:;
            {
                const size_t iterate_size = (std::min)(stream_width - slot_end_in_row_offset, stream_size_left);

                iterated_stream_size += iterate_size;
                DEBUG_ASSERT_GE(stream_size_left, iterate_size);
                stream_size_left -= iterate_size;

                // end of stream
                DEBUG_ASSERT_FALSE(stream_size_left);
            }

            return slot_size;
        });

#if DEBUG_ASSERT_VERIFY_ENABLED
        if (in_stream_slot_offset_ptr && in_slot_byte_offset_ptr) {
            DEBUG_ASSERT_TRUE(!*in_stream_slot_offset_ptr || !*in_slot_byte_offset_ptr); // at most one may be nonzero at a time!
        }
#endif

        return slot_size;
    }

    template <typename T> template <int InnerInlineLevel>
    inline size_t stream_storage<T>::copy_to(size_t offset_from, T * to_buf, size_t to_size) const
    {
        return m_chunks.template invoke<size_t>([=](const auto & chunks)
        {
            return detail::_inline_dispatch<InnerInlineLevel ? true : false>::_copy_to_impl(*this, chunks, offset_from, to_buf, to_size);
        });
    }

    template <typename T> template <int InnerInlineLevel>
    inline size_t stream_storage<T>::stride_copy_to(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
        size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
        size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const
    {
        if (slot_begin_in_row_offset != 0 || slot_end_in_row_offset != stream_width) {
            return _stride_copy_to_impl_innerforceinline<InnerInlineLevel ? true : false>(
                offset_from, in_row_offset_from, stream_width, slot_begin_in_row_offset, slot_end_in_row_offset,
                to_buf, max_slot_size, in_stream_slot_offset_ptr, in_slot_byte_offset_ptr, end_stride_byte_offset_ptr);
        }

        DEBUG_ASSERT_GE(m_size, offset_from);
        const size_t max_size_to_process = (std::min)(max_slot_size, m_size - offset_from);
        return copy_to(offset_from, to_buf, max_size_to_process);
    }

    template <typename T> template <int InnerInlineLevel>
    FORCE_INLINE size_t stream_storage<T>::stride_copy_to_forceinline(size_t offset_from, size_t in_row_offset_from, size_t stream_width,
        size_t slot_begin_in_row_offset, size_t slot_end_in_row_offset, T * to_buf, size_t max_slot_size,
        size_t * in_stream_slot_offset_ptr, size_t * in_slot_byte_offset_ptr, size_t * end_stride_byte_offset_ptr) const
    {
        if (slot_begin_in_row_offset != 0 || slot_end_in_row_offset != stream_width) {
            return _stride_copy_to_impl_innerforceinline<InnerInlineLevel ? true : false>(
                offset_from, in_row_offset_from, stream_width, slot_begin_in_row_offset, slot_end_in_row_offset,
                to_buf, max_slot_size, in_stream_slot_offset_ptr, in_slot_byte_offset_ptr, end_stride_byte_offset_ptr);
        }

        DEBUG_ASSERT_GE(m_size, offset_from);
        const size_t max_size_to_process = (std::min)(max_slot_size, m_size - offset_from);
        return copy_to(offset_from, to_buf, max_size_to_process);
    }

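    // Erases whole chunks from the front of the storage; a tail that does not
    // fill a whole chunk is kept. Returns the number of elements actually
    // erased, which may be less than requested (unless the whole storage is
    // erased).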
    template <typename T>
    inline size_t stream_storage<T>::erase_front(size_t size)
    {
        DEBUG_ASSERT_GE(m_size, size);

        return m_chunks.template invoke<size_t>([=](auto & chunks)
        {
            const size_t chunk_size = this->_get_chunk_size(m_chunks.type_index());

            size_t erased_size;

            if (size < m_size) {
                size_t chunk_index = 0;
                const size_t num_chunks = size / chunk_size;
                for (; chunk_index < num_chunks; chunk_index++) {
                    chunks.pop_front();
                }

                erased_size = chunk_index * chunk_size;

                DEBUG_ASSERT_GE(m_size, erased_size);
                m_size -= erased_size;
            }
            else {
                erased_size = m_size;

                this->clear();
            }

            return erased_size;
        });
    }
}