openaustralia/morph

View on GitHub
app/lib/morph/docker_runner.rb

Summary

Maintainability
B
5 hrs
Test Coverage
# typed: strict
# frozen_string_literal: true

# To define Sidekiq::Shutdown
require "sidekiq/cli"
require "sorbet-runtime"

module Morph
  # More low-level API for running scrapers. Does not do much of the magic
  # and is less opinionated than the higher-level API in Morph::Runner
  class DockerRunner
    extend T::Sig
    # Names of the files that count as "configuration" of a scraper.
    # copy_config_to_directory uses this list to split the top-level entries of
    # a scraper directory in two: the entries named here (copied for the compile
    # step) and everything else (copied into the container just before the run)
    ALL_CONFIG_FILENAMES = T.let([
      "Procfile",
      "Gemfile", "Gemfile.lock",
      "requirements.txt", "runtime.txt",
      "composer.json", "composer.lock",
      "app.psgi", "cpanfile",
      "package.json"
    ].freeze, T::Array[String])
    # Name of the base image without a tag. buildstep_image appends the
    # platform as the tag
    BUILDSTEP_IMAGE = "openaustralia/buildstep"
    # Variants of the buildstep image that we're currently supporting. These
    # correspond to tags of the buildstep image
    PLATFORMS = T.let(%w[cedar-14 heroku-18].freeze, T::Array[String])
    # Platform that compile_and_start_run uses when it is not given one
    DEFAULT_PLATFORM = "cedar-14"
    # Name of the docker network created by create_morph_network. Containers
    # are put on this network unless the proxy is disabled for the run
    DOCKER_NETWORK = "morph"
    # Value used for the "com.docker.network.bridge.name" option of that network
    DOCKER_BRIDGE = "morph"
    # Subnet given in the IPAM configuration when the network is created
    DOCKER_NETWORK_SUBNET = "192.168.0.0/16"

    # Raised by attach_to_run in place of Docker::Error::TimeoutError
    class LongRunningScraper < StandardError; end

    # Absolute path (inside the container) of the timing output file.
    # compile_and_start_run hands this path to Morph::TimeCommand.command when
    # building the container command and finish copies the file back out of
    # the container and parses it with Morph::TimeCommand.params_from_string
    sig { returns(String) }
    def self.time_file
      "/app/time.output"
    end

    sig { returns(Integer) }
    def self.default_memory_limit
      # 512 MiB expressed in bytes. compile_and_start_run falls back to this
      # value when it is not given an explicit memory limit
      bytes_per_mebibyte = 1024 * 1024
      512 * bytes_per_mebibyte
    end

    sig { params(platform: String).returns(Docker::Image) }
    def self.buildstep_image(platform)
      # The platform name is used as the tag of the buildstep image
      tagged_name = [BUILDSTEP_IMAGE, platform].join(":")
      Morph::DockerUtils.get_or_pull_image(tagged_name)
    end

    # Pulls all the separately tagged buildstep images
    # The image name is handed to Morph::DockerUtils.pull_docker_image without
    # a tag.
    # NOTE(review): presumably pulling without a tag is what makes every tag
    # get fetched - confirm against Morph::DockerUtils.pull_docker_image
    sig { void }
    def self.update_docker_images!
      Morph::DockerUtils.pull_docker_image(BUILDSTEP_IMAGE)
    end

    # "memory" is the memory limit applied to running container (in bytes). If nil uses the default (set in default_memory_limit)
    sig do
      params(repo_path: String, env_variables: T::Hash[String, String], container_labels: T::Hash[String, String],
             max_lines: Integer, platform: T.nilable(String), disable_proxy: T::Boolean, memory: T.nilable(Integer),
             block: T.nilable(T.proc.params(timestamp: T.nilable(Time), stream: Symbol, text: String).void))
        .returns(T.nilable(Docker::Container))
    end
    def self.compile_and_start_run(
      repo_path:, env_variables: {}, container_labels: {}, max_lines: 0, platform: nil,
      disable_proxy: false, memory: nil, &block
    )
      # Compiles the configuration files found in repo_path on top of the
      # buildstep image for the platform, creates a container from the result,
      # copies the remaining scraper files into it and starts it.
      #
      # repo_path        - directory containing the scraper
      # env_variables    - extra environment variables set on the container
      # container_labels - labels set on the container. The hash passed in is
      #                    not modified; "io.morph.stage" is added to a copy
      # max_lines        - passed as the first argument to limit_output.rb
      # platform         - tag of the buildstep image (DEFAULT_PLATFORM if nil)
      # disable_proxy    - if true the container is not put on the morph network
      # memory           - memory limit in bytes (default_memory_limit if nil)
      # block            - if given it is called with (nil, :internalout, text)
      #                    for progress messages and for the compile output
      #
      # Returns the started container, or nil if the compile produced no image
      memory = default_memory_limit if memory.nil?

      base_image = buildstep_image(platform || DEFAULT_PLATFORM)
      block&.call(nil, :internalout, "Injecting configuration and compiling...\n")
      compiled_image = compile(base_image, repo_path) do |text|
        block&.call(nil, :internalout, text)
      end
      # If something went wrong during the compile and it couldn't finish
      return nil if compiled_image.nil?

      # Before we create a container we need to make sure that there is a
      # special network there for it to be put into
      create_morph_network

      command = Morph::TimeCommand.command(
        ["/usr/local/bin/limit_output.rb", max_lines.to_s, "/bin/herokuish procfile start scraper"],
        time_file
      )

      # TODO: Also copy back time output file and the sqlite journal file
      # The sqlite journal file won't be present most of the time

      # Add another label to the created container. This builds a new hash
      # instead of writing into container_labels so that the caller's hash
      # stays as it was (and is allowed to be frozen)
      labels = container_labels.merge("io.morph.stage" => "running")

      # Set up special security profile that allows us run chrome headless
      # without setting "--no-sandbox"
      # The documentation on this is non-existent but
      # the seccomp in the api is not the name of the file but the contents of it
      # In other words the file is uploaded client side
      # From https://raw.githubusercontent.com/jfrazelle/dotfiles/master/etc/docker/seccomp/chrome.json
      # Don't set things in test because it fails on travis for some reason
      security_opt = Rails.env.test? ? [] : ["seccomp=#{File.read('config/chrome.json')}"]

      host_config = {
        "SecurityOpt" => security_opt,
        # Bump up the default shared memory size to 256 MB from the default (64 MB)
        # as an attempt to make headless browsing work properly. See
        # https://medium.com/the-curve-tech-blog/dealing-with-cryptic-selenium-webdriver-error-invalidsessioniderror-errors-9c15abc68fdf
        "ShmSize" => 256 * 1024 * 1024
      }
      # Attach this container to our special network morph
      # But we can optionally disable that if we want to bypass the transparent
      # proxy for outgoing web requests
      host_config["NetworkMode"] = DOCKER_NETWORK unless disable_proxy

      # Caller supplied variables are merged in last so they win over the
      # value set here
      environment = {
        "REQUESTS_CA_BUNDLE" => "/etc/ssl/certs/ca-certificates.crt"
      }.merge(env_variables).map { |name, value| "#{name}=#{value}" }

      container_options = {
        "Cmd" => command,
        "Image" => compiled_image.id,
        # See explanation in https://github.com/openaustralia/morph/issues/242
        "CpuShares" => 307,
        "Memory" => memory,
        "Env" => environment,
        "Labels" => labels,
        "HostConfig" => host_config
      }

      container = Docker::Container.create(container_options)

      Dir.mktmpdir("morph") do |dest|
        # The configuration files are already part of the compiled image so
        # only the other files get copied across here
        copy_config_to_directory(repo_path, dest, false)
        block&.call(nil, :internalout, "Injecting scraper and running...\n")
        # TODO: Combine two operations below into one
        Morph::DockerUtils.insert_contents_of_directory(container, dest, "/app")
        Morph::DockerUtils.insert_file(container, "lib/morph/limit_output.rb",
                                       "/usr/local/bin")
      end

      container.start
      container
    end

    sig { void }
    def self.create_morph_network
      # Makes sure that the docker network named DOCKER_NETWORK exists.
      # Looking the network up doubles as the existence check: nothing else
      # is done when the lookup succeeds
      begin
        Docker::Network.get(DOCKER_NETWORK)
        return
      rescue Docker::Error::NotFoundError
        # There is no such network yet. Carry on and create it below
      end

      bridge_options = {
        "com.docker.network.bridge.name" => DOCKER_BRIDGE,
        "com.docker.network.bridge.enable_icc" => "false"
      }
      address_config = [{ "Subnet" => DOCKER_NETWORK_SUBNET }]

      Docker::Network.create(
        DOCKER_NETWORK,
        "Options" => bridge_options,
        "IPAM" => { "Config" => address_config }
      )
    end

    # If since is non-nil only return log lines since the time given. This
    # time is non-inclusive so we shouldn't return the log line with that
    # exact timestamp, just ones after it.
    sig { params(container: Docker::Container, since: T.nilable(Time), block: T.nilable(T.proc.params(timestamp: Time, stream: Symbol, text: String).void)).void }
    def self.attach_to_run(container, since = nil, &block)
      # Follows the stdout and stderr logs of the container and passes every
      # line on to the block as (timestamp, stream, text)
      log_options = { stdout: true, stderr: true, follow: true, timestamps: true }
      log_options[:since] = since.to_f if since
      container.streaming_logs(log_options) do |stream, line|
        # The first 30 characters of the line are the timestamp
        timestamp = Time.zone.parse(line[0..29])
        # To convert this ruby time back to the same string format as it
        # originally came in do:
        # timestamp.utc.strftime('%Y-%m-%dT%H:%M:%S.%9NZ')
        # The text starts at character 31 (character 30 is skipped)
        text = line[31..-1]
        # We're going to assume (somewhat rashly, I might add) that the
        # console output from the scraper is always encoded as UTF-8.
        # TODO Fix forcing of encoding and do something more intelligent
        # Either figure out the correct encoding or take an educated guess
        # rather than making an assumption
        text.force_encoding("UTF-8")
        text.scrub!
        # There is a chance that we catch a log line that shouldn't
        # be included. So skip anything that isn't after "since"
        next unless (since.nil? || timestamp > since) && block_given?

        # The marker that limit_output.rb writes to stderr is swapped for a
        # friendlier message on the internal error stream
        if stream == :stderr && text == "limit_output.rb: Too many lines of output!\n"
          stream = :internalerr
          text = "\n" \
                 "Too many lines of output! " \
                 "Your scraper will continue uninterrupted. " \
                 "There will just be no further output displayed" \
                 "\n"
        end
        block.call timestamp, stream, text
      end
    rescue Docker::Error::TimeoutError
      # We get this when we attach to a container and it doesn't produce any output for a while. It's
      # an entirely expected consequence of the way we're doing things. We do want to raise an exception
      # because we want the background queue to automatically restart the job. To make it clear that this
      # is a special exception we will re-raise it under a different name
      raise LongRunningScraper
    end

    # This should only get called on a stopped container where all the logs
    # have been collected
    sig { params(container: Docker::Container, files: T::Array[String]).returns(Morph::RunResult) }
    def self.finish(container, files)
      # Collects the exit code, the requested files and the timing output
      # from the container, deletes the container and returns everything
      # wrapped up in a Morph::RunResult. The paths in "files" are relative
      # to /app both on the way in and in the result

      # TODO: Check that container has actually stopped. If not raise an error

      # TODO: Don't call container.json multiple times
      exit_code = container.json["State"]["ExitCode"]

      # Make the paths absolute paths for the container and grab the files
      # together with the timing output
      container_paths = files.map { |file| File.join("/app", file) } + [time_file]
      copied = Morph::DockerUtils.copy_files(container, container_paths)

      # The timing output is not part of the files that get returned.
      # time_params stays nil if the file didn't come back
      time_params = nil
      timing_output = copied.delete(time_file)
      if timing_output
        time_params = Morph::TimeCommand.params_from_string(T.must(timing_output.read))
        timing_output.close!
      end

      # Remove /app from the beginning of all paths
      app_root = Pathname.new("/app")
      files_by_relative_path = copied.each_with_object({}) do |(path, content), stripped|
        stripped[Pathname.new(path).relative_path_from(app_root).to_s] = content
      end

      # Clean up the container at the last possible moment. This is the
      # signal that we have everything we need
      container.delete(force: true)

      Morph::RunResult.new(exit_code, files_by_relative_path, time_params)
    end

    # If copy_config is true copies the config file across
    # Otherwise copies the other files across
    sig { params(source: String, dest: String, copy_config: T::Boolean).void }
    def self.copy_config_to_directory(source, dest, copy_config)
      # Only the top level of source is looked at. An entry is copied
      # (directories including their contents) when "its name is one of the
      # configuration filenames" matches what was asked for with copy_config
      entries = Dir.entries(source) - [".", ".."]
      wanted = entries.select { |entry| ALL_CONFIG_FILENAMES.include?(entry) == copy_config }
      wanted.each do |entry|
        FileUtils.copy_entry(File.join(source, entry), File.join(dest, entry))
      end
    end

    sig { params(image: Docker::Image, commands: T::Array[String], dir: String, block: T.proc.params(text: String).void).returns(T.nilable(Docker::Image)) }
    def self.docker_build_command(image, commands, dir, &block)
      # Builds an image from a Dockerfile made up of the given commands on top
      # of the given image, with the contents of dir available to the build.
      # The build output is passed to the block, minus docker's own chatter.
      # Returns whatever Morph::DockerUtils.docker_build_from_dir returns

      # We don't want to show the standard docker build output
      docker_chatter = [
        %r{^Step \d+/\d+ :},
        /^ ---> /,
        /^Removing intermediate container /,
        /^Successfully built /
      ]

      # Leave the files in dir untouched by working on a copy
      Dir.mktmpdir("morph") do |build_dir|
        Morph::DockerUtils.copy_directory_contents(dir, build_dir)
        dockerfile_path = File.join(build_dir, "Dockerfile")
        File.write(dockerfile_path, dockerfile_contents_from_commands(image, commands))

        Morph::DockerUtils.fix_modification_times(build_dir)
        Morph::DockerUtils.docker_build_from_dir(build_dir, read_timeout: 5.minutes) do |chunk|
          next if docker_chatter.any? { |pattern| pattern.match?(chunk) } || chunk == "\n"

          block.call chunk
        end
      end
    end

    sig { params(image: Docker::Image, commands: T::Array[String]).returns(String) }
    def self.dockerfile_contents_from_commands(image, commands)
      # The Dockerfile starts from the given image (referenced by its id) and
      # then has the commands in order, one per line. Every line, including
      # the last one, ends with a newline
      instructions = ["from #{image.id}", *commands]
      instructions.map { |instruction| "#{instruction}\n" }.join
    end

    # And build
    # TODO: Set memory and cpu limits during compile
    sig { params(image: Docker::Image, repo_path: String, block: T.proc.params(text: String).void).returns(T.nilable(Docker::Image)) }
    def self.compile(image, repo_path, &block)
      # Builds a new image on top of "image" which contains the configuration
      # files of the scraper in repo_path under /app and has had the herokuish
      # buildpack build run on it. Build output is passed to the block.
      # Returns whatever docker_build_command returns
      build_commands = [
        # Insert the configuration part of the application code into the container
        "ADD app /app",
        # TODO: Setting the timeout higher here won't be necessary once we
        # upgrade to a more recent version of herokuish that contains
        # the commit
        # https://github.com/gliderlabs/herokuish/commit/5164f342dfe27537d6fd5425a5121b7ae7925d3c
        # This will probably involve replacing the use of buildstep with
        # using herokuish directly which seems the sensible thing to do now
        "ENV CURL_TIMEOUT 180",
        "ENV NPM_CONFIG_CAFILE /etc/ssl/certs/ca-certificates.crt",
        # Doing this not very nice thing in lieu of figuring out how
        # to set our custom CA cert for all of node
        "ENV NODE_TLS_REJECT_UNAUTHORIZED 0",
        # In development to debug what the buildpacks are doing it can be useful to turn on
        # the TRACE environment variable below by uncommenting the line
        # "ENV TRACE true",
        "RUN /bin/herokuish buildpack build"
      ]

      Dir.mktmpdir("morph") do |dir|
        # Only the configuration files go into the "app" directory that the
        # build adds to the image
        app_dir = File.join(dir, "app")
        FileUtils.mkdir(app_dir)
        copy_config_to_directory(repo_path, app_dir, true)
        docker_build_command(image, build_commands, dir, &block)
      end
    end
  end
end