strawberryjello/downloader

View on GitHub
lib/downloader.rb

Summary

Maintainability
A
0 mins
Test Coverage
require "downloader/version"
require "downloader/errors"
require "downloader/loggable"
require "downloader/url_helper"
require 'http'
require 'uri'

module Downloader
  extend Loggable

  # Returns the value of scheme_host in +options+ if it exists, otherwise
  # extracts the scheme and host as one string from +url+
  #
  # Exits with a nonzero value (1) and an error message with troubleshooting tips
  # when UrlHelper throws a UriError
  #
  # @param url [String] the source URL
  # @param options [Hash] the hash containing command-line options
  # @return [String] the host and scheme as one string
  #
  # Example:
  #
  #   get_host_with_scheme("https://example.com/cats", options_hash)
  #   # => "https://example.com"

  def self.get_host_with_scheme(url, options=nil)
    begin
      options&.dig("scheme_host") ||
        UrlHelper.extract_host_with_scheme(url, options&.dig("scheme"))
    rescue UriError => e
      logger.error("Error while parsing URL: #{e}")
      logger.error(%q(
Possible solutions:
- Check your input file. If the URLs are relative, use the
  --scheme-host option to provide the scheme and host.
- If using the --scheme-host option, check if it's correct.
- If the URLs are missing a scheme but not the host, use the
  --scheme option to provide the scheme.
- If the URLs are absolute, check if the scheme and host are
  correct.
        ))
      exit(1)
    end
  end

  # Makes the HTTP GET request for +ref+ using +http+, follows redirects
  #
  # @param http [HTTP] the HTTP object provided by http.rb
  # @param ref [String] the relative ref to the file being downloaded
  # @return [String] the response body

  def self.do_get(http, ref)
    response = http.get(ref)
    logger.debug(response.status)

    if HTTP::Redirector::REDIRECT_CODES.include?(response.status.code)
      response = http.follow.get(ref)
      logger.debug("Followed redirect, new response status: #{response.status}")
    end

    # to_s must be called so the response will be consumed
    # before the next (persistent) request is made
    response.body.to_s
  end

  # Downloads the files pointed to by the URLs in +input_file+ to the path specified by +dest+
  #
  # @param input_file [String] the path to the file containing the URLs to be downloaded from
  # @param dest [String] the destination path
  # @param options [Hash] optional hash for command-line options, see README for full list
  #
  # Example:
  #
  #   Downloader.batch("urls.txt", ".", {})
  #   # => downloads the files from the URLs in urls.txt to the current directory

  def self.batch(input_file, dest, options=nil)
    logger.debug("Options: #{options}")

    urls = Util.read_input_file(input_file)
    host_with_scheme = get_host_with_scheme(urls[0], options)

    logger.info("Connecting to #{host_with_scheme}")

    http = HTTP.persistent(host_with_scheme)

    urls.each_with_index do |url, i|
      relative_ref = UrlHelper.extract_relative_ref(url)

      # note & operator and Hash#dig: just in case options is nil
      filename = UrlHelper.create_filename(url, options&.dig('numbered_filenames'), i+1)
      logger.info("Downloading #{relative_ref} - filename: #{filename}")

      Util.write_to_file(File.join(dest, filename), do_get(http, relative_ref))
    end

    http.close
  end

  # Downloads the file at +url+ to the current directory
  #
  # @param url [String] the URL of the file to be downloaded
  # @return [String] the original filename of the downloaded file

  def self.download(url)
    filename = UrlHelper.extract_filename(url)

    Util.write_to_file(filename, HTTP.get(url))

    filename
  end
end