buren/wayback_archiver

View on GitHub
bin/wayback_archiver

Summary

Maintainability
Test Coverage
#!/usr/bin/env ruby

require 'optparse'
require 'wayback_archiver'

# Default values for the command-line options. The OptionParser handlers
# defined below overwrite these when the matching flag is given.
strategy = 'auto'
log = STDOUT # Logger target; replaced by a file path when --log is given
log_level = Logger::INFO
concurrency = WaybackArchiver.concurrency
limit = WaybackArchiver.max_limit
hosts = [] # stays empty unless --hosts supplies patterns

# Command-line option definitions. Each handler assigns to one of the option
# locals declared earlier in the script (strategy, hosts, concurrency, limit,
# log, log_level). Positional arguments are left for the caller to read after
# parsing.
optparse = OptionParser.new do |parser|
  parser.banner = 'Usage: wayback_archiver [<url>] [options]'

  # Strategy switches: each one simply overwrites `strategy`, so when several
  # are given the last one on the command line wins.
  parser.on('--auto', 'Auto (default)') do
    strategy = 'auto'
  end

  parser.on('--crawl', 'Crawl') do
    strategy = 'crawl'
  end

  parser.on('--sitemap', 'Sitemap') do
    strategy = 'sitemap'
  end

  parser.on('--urls', '--url', 'URL(s)') do
    strategy = 'urls'
  end

  # Comma-separated list (Array coercion); every entry is compiled to a Regexp.
  # The argument is optional, so `value` is nil when the flag is given bare.
  parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
    hosts = value.map { |v| Regexp.new(v) } if value
  end

  parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
    concurrency = value
  end

  parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
    limit = value
  end

  parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
    log = path
  end

  # --verbose selects DEBUG, --no-verbose selects WARN.
  parser.on('--[no-]verbose', 'Verbose logs') do |value|
    log_level = value ? Logger::DEBUG : Logger::WARN
  end

  # Registered once, at the tail, so the help entry is listed a single time at
  # the end of the options summary.
  parser.on_tail('-h', '--help', 'Show this message') do
    puts parser
    exit
  end

  parser.on_tail('--version', 'Show version') do
    puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
    exit
  end
end

# Parse the command line in place: recognised options are removed from ARGV,
# leaving only the positional arguments.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  # An unknown flag or a malformed option value is a usage error: show the
  # help and exit non-zero instead of dumping a backtrace.
  puts optparse.help
  abort e.message
end

# Whatever remains in ARGV are the URLs to archive; blank arguments are dropped.
urls = ARGV.map(&:strip).reject(&:empty?)
if urls.empty?
  puts optparse.help
  # abort writes the message to stderr and exits with status 1, without the
  # backtrace an uncaught exception would print.
  abort '[<url>] is required'
end

# Build the logger from the selected target and level, then install it on
# WaybackArchiver once it is fully configured.
archiver_logger = Logger.new(log)
archiver_logger.progname = 'WaybackArchiver'
archiver_logger.level = log_level
WaybackArchiver.logger = archiver_logger

# Fall back to 'auto' when no strategy has been set.
strategy ||= 'auto'

# Every URL is archived with the same set of options.
archive_options = {
  hosts: hosts,
  strategy: strategy,
  concurrency: concurrency,
  limit: limit
}
urls.each { |url| WaybackArchiver.archive(url, **archive_options) }