gitlabhq/gitlabhq

View on GitHub
lib/gitlab/import_export/fast_hash_serializer.rb

Summary

Maintainability
A
2 hrs
Test Coverage
# frozen_string_literal: true

# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
# is simple in that it recursively calls `as_json` on each object to
# serialize everything. However, for a model like a Project, this can
# generate a query for every single association, which can add up to tens
# of thousands of queries and lead to memory bloat.
#
# To improve this, we can do several things:
#
# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
#    to generate the necessary preload clauses.
#
# 2. We observe that a single project has many issues, merge requests,
#    etc. Instead of serializing everything at once, which could lead to
#    database timeouts and high memory usage, we take each top-level
#    association and serialize the data in batches.
#
#    For example, we serialize the first 100 issues and preload all of
#    their associated events, notes, etc. before moving on to the next
#    batch. When we're done, we serialize merge requests in the same way.
#    We repeat this pattern for the remaining associations specified in
#    import_export.yml.
module Gitlab
  module ImportExport
    # Serializes `subject` into a Hash: the subject's own attributes come
    # from a single `as_json` call, and every association listed under
    # `tree[:include]` is serialized separately (has-many relations in
    # batches of `batch_size` records).
    class FastHashSerializer
      attr_reader :subject, :tree

      # Usage of this class results in delayed
      # serialization of relation. The serialization
      # will be triggered when the `JSON.generate`
      # is executed.
      #
      # This class uses memory-optimised, lazily
      # initialised, fast to recycle relation
      # serialization.
      #
      # The `JSON.generate` does use `#to_json`,
      # that returns raw JSON content that is written
      # directly to file.
      #
      # Note that `#raw_json` returns the items joined by commas WITHOUT
      # surrounding brackets: instances are meant to be elements of an
      # Array, whose own serialization supplies the brackets.
      class JSONBatchRelation
        include Gitlab::Utils::StrongMemoize

        # relation - the batch of records to serialize
        # options  - options forwarded to each record's `#to_json`
        # preloads - preload specification applied to the batch, or nil
        def initialize(relation, options, preloads)
          @relation = relation
          @options = options
          @preloads = preloads
        end

        # Returns (and memoizes) the comma-separated JSON of every record
        # in the batch.
        def raw_json
          strong_memoize(:raw_json) do
            result = +''

            batch = @relation
            batch = batch.preload(@preloads) if @preloads
            batch.each do |item|
              result << ',' unless result.empty?
              result << item.to_json(@options)
            end

            result
          end
        end

        # The argument is ignored on purpose: the options given to the
        # constructor were already applied when building `raw_json`.
        def to_json(_options = {})
          raw_json
        end

        # Only the raw JSON string is available; there is no Hash form.
        def as_json(*)
          raise NotImplementedError
        end
      end

      BATCH_SIZE = 100

      # subject    - the object to serialize
      # tree       - Hash of `as_json` options; `:include` lists the
      #              associations to serialize and `:preload` maps an
      #              association key to its preload specification
      # batch_size - number of records per batch for has-many relations
      def initialize(subject, tree, batch_size: BATCH_SIZE)
        @subject = subject
        @batch_size = batch_size
        @tree = tree
      end

      # With the usage of `JSONBatchRelation`, it returns partially
      # serialized hash which is not easily accessible.
      # It means you can only manipulate and replace top-level objects.
      # All future mutations of the hash (such as `fix_project_tree`)
      # should be aware of that.
      def execute
        simple_serialize.merge(serialize_includes)
      end

      private

      # Serializes only the subject itself. `:include` is blanked so that
      # `as_json` does not descend into associations, and `:preload` (the
      # key read by `#preloads`) is blanked because it is not an `as_json`
      # option.
      def simple_serialize
        subject.as_json(
          tree.merge(include: nil, preload: nil))
      end

      # Returns a Hash of association name (String) => serialized value
      # for every include definition that yields a value.
      def serialize_includes
        return {} unless includes

        includes
          .map(&method(:serialize_include_definition))
          .compact
          .to_h
      end

      # definition:
      # { labels: { includes: ... } }
      #
      # Returns a `[name, serialized]` pair, or nil when the association
      # is nil or serializes to nil.
      # Raises ArgumentError unless `definition` is a one-element Hash.
      def serialize_include_definition(definition)
        raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash)
        raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one?

        key, options = definition.first

        record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
        return unless record

        serialized_record = serialize_record(key, record, options)
        return unless serialized_record

        # `#as_json` always returns keys as `strings`
        [key.to_s, serialized_record]
      end

      # Serializes one association value.
      #
      # Anything that is not an ActiveRecord::Relation is serialized with
      # a single `as_json` call. Relations are walked in batches and an
      # Array is returned whose elements are either JSONBatchRelation
      # objects (feature flag enabled) or plain `as_json` results.
      #
      # Raises RuntimeError when `record` does not respond to `as_json`.
      def serialize_record(key, record, options)
        unless record.respond_to?(:as_json)
          raise "Invalid type of #{key} is #{record.class}"
        end

        # no has-many relation
        return record.as_json(options) unless record.is_a?(ActiveRecord::Relation)

        # The tree may have no `:preload` entry at all, so guard against
        # nil before looking up this association's preloads.
        relation_preloads = preloads[key] if preloads

        data = []

        record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
          if Feature.enabled?(:export_fast_serialize_with_raw_json, default_enabled: true)
            # `tap(&:raw_json)` builds and memoizes the JSON string right
            # away, while this batch is the current one.
            data.append(JSONBatchRelation.new(batch, options, relation_preloads).tap(&:raw_json))
          else
            batch = batch.preload(relation_preloads) if relation_preloads
            # Append in place instead of reallocating the accumulator.
            data.concat(batch.as_json(options))
          end
        end

        data
      end

      def includes
        tree[:include]
      end

      def preloads
        tree[:preload]
      end
    end
  end
end