# lib/wraith/save_images.rb
require "parallel"
require "shellwords"
require "wraith"
require "wraith/helpers/capture_options"
require "wraith/helpers/logger"
require "wraith/helpers/save_metadata"
require "wraith/helpers/utilities"
require "selenium-webdriver"
require 'mini_magick'
class Wraith::SaveImages
include Logging
attr_reader :wraith, :history, :meta
# Sets up the state shared by every capture job.
#
# config      - handed unchanged to Wraith::Wraith.new
# history     - kept on the instance and forwarded to SaveMetadata.new
# yaml_passed - forwarded to Wraith::Wraith.new as the :yaml_passed option
def initialize(config, history = false, yaml_passed = false)
  wraith_options = { yaml_passed: yaml_passed }

  @history = history
  @wraith = Wraith::Wraith.new(config, wraith_options)
  @meta = SaveMetadata.new(@wraith, history)
end
# Returns the paths to capture: the configured paths when there are any,
# otherwise whatever evaluating the spider file's contents produces.
#
# NOTE(review): the spider file is run through `eval`, i.e. executed as Ruby
# code, so it must only ever come from a trusted source.
def check_paths
  return wraith.paths if wraith.paths

  # The local keeps the name `path` because evaluated code can see it.
  path = File.read(wraith.spider_file)
  eval(path)
end
# Entry point: builds the complete job list and hands it to parallel_task.
def save_images
  parallel_task(define_jobs)
end
# Expands every path returned by check_paths into capture jobs.
#
# When a path's capture options have `resize` set, all widths are passed to
# define_individual_job in one call; otherwise it is called once per width.
# Returns a flat array of jobs.
def define_jobs
  check_paths.flat_map do |label, options|
    settings = CaptureOptions.new(options, wraith)

    if settings.resize
      define_individual_job(label, settings, wraith.widths)
    else
      wraith.widths.flat_map { |width| define_individual_job(label, settings, width) }
    end
  end
end
# Builds the job(s) for one label at one width (or list of widths).
#
# A job is a 9-element array: label, path, CLI-formatted width, URL, output
# file name, selector, global before_capture, per-path before_capture and the
# name of the fallback image. The base job is always returned; the compare
# job is added only when the settings have a compare_url.
def define_individual_job(label, settings, width)
  base_file_name = meta.file_names(width, label, meta.base_label)
  compare_file_name = meta.file_names(width, label, meta.compare_label)

  base_job = [
    label, settings.path, prepare_widths_for_cli(width), settings.base_url, base_file_name,
    settings.selector, wraith.before_capture, settings.before_capture, 'invalid1.jpg'
  ]
  return [base_job] if settings.compare_url.nil?

  compare_job = [
    label, settings.path, prepare_widths_for_cli(width), settings.compare_url, compare_file_name,
    settings.selector, wraith.before_capture, settings.before_capture, 'invalid2.jpg'
  ]
  [base_job, compare_job]
end
# Formats widths for the command line: [30, 40, 50] => "30,40,50".
# Anything that is not an Array is returned untouched.
def prepare_widths_for_cli(width)
  return width unless width.is_a?(Array)

  width.join(",")
end
# Runs `command` (with every single quote removed) in a subprocess, logs each
# output line at info level and returns the lines without their trailing
# newline.
#
# command - the command line to run; it is not modified
#
# Returns an Array of Strings, one per line of output.
def run_command(command)
  # Strip quotes on a copy so the caller's string is left alone and frozen
  # strings are accepted.
  unquoted = command.delete("'")
  output = []

  # The block form closes the pipe even if reading or logging raises.
  IO.popen(unquoted) do |io|
    io.each_line do |line|
      logger.info line
      # chomp rather than chomp!: chomp! returns nil when there is nothing to
      # strip, which would put nil in the result for a final line that has no
      # newline.
      output << line.chomp
    end
  end

  output
end
# Runs every job through Parallel.each on wraith.threads threads.
#
# jobs - array of 9-element job arrays: label, path, width, url, filename,
#        selector, global before_capture, per-path before_capture and the
#        fallback image name.
#
# With the "chrome" engine the capture goes through Selenium; otherwise a
# command line is built and executed. Any StandardError raised while
# capturing is logged and the job's fallback image is written instead.
def parallel_task(jobs)
  # The ninth element must be destructured here: the rescue below needs it to
  # pick the fallback image for the failed job.
  Parallel.each(jobs, in_threads: wraith.threads) do |_label, _path, width, url, filename, selector, global_before_capture, path_before_capture, invalid_image_name|
    begin
      if meta.engine == "chrome"
        capture_image_selenium(width, url, filename, selector, global_before_capture, path_before_capture)
      else
        command = construct_command(width, url, filename, selector, global_before_capture, path_before_capture)
        attempt_image_capture(command, filename)
      end
    rescue => e
      logger.error "#{e}\n URL = #{url}"
      create_invalid_image(filename, width, invalid_image_name)
    end
  end
end
# Builds the Selenium driver for the configured engine.
#
# Only the "chrome" engine is handled: Chrome is started with the switches
# listed below (headless, device scale factor 1, 1200x1500 window). For any
# other engine the method returns nil.
def get_driver
  return unless meta.engine == "chrome"

  chrome_switches = %w[
    disable-gpu
    headless
    no-sandbox
    device-scale-factor=1
    force-device-scale-factor
    window-size=1200,1500
    hide-scrollbars
    ignore-certificate-errors
  ]

  options = Selenium::WebDriver::Chrome::Options.new
  chrome_switches.each { |switch| options.add_argument("--#{switch}") }
  Selenium::WebDriver.for(:chrome, options: options)
end
# Resizes the browser window to the size of the whole page.
#
# Width and height are each measured in the page with JavaScript, as the
# largest of the body/documentElement scroll, offset and client dimensions.
def resize_to_fit_page(driver)
  width_script = "return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);"
  height_script = "return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);"

  page_width = driver.execute_script(width_script)
  page_height = driver.execute_script(height_script)
  driver.manage.window.resize_to(page_width, page_height)
end
# Crops the image at image_location, in place, to the rectangle of the
# element found by the CSS selector.
#
# driver         - Selenium driver used to look the element up
# selector       - CSS selector of the element to crop around
# image_location - path of the image; it is overwritten with the cropped one
def crop_selector(driver, selector, image_location)
  # Read the rect a single time so width, height, x and y all come from the
  # same snapshot (presumably each `rect` call is a separate driver query).
  rect = driver.find_element(:css, selector).rect

  image = MiniMagick::Image.open(image_location)
  image.crop "#{rect.width}x#{rect.height}+#{rect.x}+#{rect.y}"
  image.write(image_location)
end
# Captures one screenshot per screen size with the Selenium driver.
#
# screen_sizes          - comma-separated list; each entry is WIDTH or
#                         WIDTHxHEIGHT
# url                   - page to capture
# file_name             - output path; 'MULTI' in it is replaced by the
#                         screen size
# selector              - CSS selector to crop to; nil or "" means no crop
# global_before_capture - path of a script to run in the page, or nil
# path_before_capture   - path of a second script to run in the page, or nil
#
# Each screen size is tried up to 3 times; a Net::ReadTimeout is logged and
# triggers the next attempt. Other errors propagate to the caller. The driver
# is always quit, whether or not the capture succeeded.
def capture_image_selenium(screen_sizes, url, file_name, selector, global_before_capture, path_before_capture)
  driver = get_driver
  driver.manage.timeouts.implicit_wait = 10

  screen_sizes.to_s.split(",").each do |screen_size|
    width, height = screen_size.split("x")
    new_file_name = file_name.sub('MULTI', screen_size)

    1.upto(3) do |attempt|
      begin
        driver.manage.window.resize_to(width, height || 1500)
        driver.navigate.to url
        driver.manage.timeouts.implicit_wait = wraith.settle
        driver.execute_script(File.read(global_before_capture)) if global_before_capture
        driver.execute_script(File.read(path_before_capture)) if path_before_capture
        # Without an explicit height, grow the window to the whole page.
        resize_to_fit_page(driver) unless height
        driver.save_screenshot(new_file_name)
        crop_selector(driver, selector, new_file_name) if selector && selector.length > 0
        break
      rescue Net::ReadTimeout => e
        logger.error "Got #{e} on attempt #{attempt} at screen size #{screen_size}. URL = #{url}"
      end
    end
  end
ensure
  # Quit even when an error escapes, so no browser is left running.
  driver.quit if driver
end
# Builds the command line that captures one page with the configured engine.
#
# width                 - a width or an Array of widths (joined with commas)
# url                   - page to capture
# file_name             - output image path
# selector              - selector passed to the snap file; nil is treated as
#                         an empty selector
# global_before_capture - script path, made absolute with convert_to_absolute
# path_before_capture   - script path, made absolute with convert_to_absolute
#
# The command is logged at debug level and returned as a String.
def construct_command(width, url, file_name, selector, global_before_capture, path_before_capture)
  width = prepare_widths_for_cli(width)
  # Prefix '#' with a backslash so id selectors survive the command line.
  # to_s keeps a missing (nil) selector from raising, matching the Selenium
  # capture path, which also treats nil as "no selector".
  selector = selector.to_s.gsub '#', '\#'
  global_before_capture = convert_to_absolute global_before_capture
  path_before_capture = convert_to_absolute path_before_capture

  command_to_run = "#{meta.engine} #{wraith.phantomjs_options} '#{wraith.snap_file}' '#{url}' '#{width}' '#{file_name}' '#{selector}' '#{global_before_capture}' '#{path_before_capture}'"
  logger.debug command_to_run
  command_to_run
end
# Runs the capture command until the image exists, up to 5 times.
#
# capture_page_image - command line handed to run_command
# filename           - image the command is expected to produce
#
# Returns true as soon as an attempt produced the image. A warning is logged
# after every failed attempt. Raises RuntimeError when the image still does
# not exist after the last attempt.
def attempt_image_capture(capture_page_image, filename)
  max_attempts = 5

  (1..max_attempts).each do |attempt|
    run_command(capture_page_image)
    return true if image_was_created(filename)

    logger.warn "Failed to capture image #{filename} on attempt number #{attempt} of #{max_attempts}"
  end

  # One last look before giving up.
  return if image_was_created(filename)

  raise "Unable to capture image #{filename} after #{max_attempts} attempt(s)"
end
# Tells whether the capture can be considered done: always in resize mode
# (wraith.resize is returned as-is when truthy), otherwise only when the file
# exists.
def image_was_created(filename)
  # @TODO - need to check if the image was generated even if in resize mode
  wraith.resize || File.exist?(filename)
end
# Writes the fallback image in place of a capture that failed.
#
# filename           - destination the fallback image is copied to
# width              - width handed to set_image_width for the copy
# invalid_image_name - file name of the fallback image inside the assets
#                      directory (resolved relative to this file)
def create_invalid_image(filename, width, invalid_image_name)
  logger.warn "Using fallback image instead"

  asset_path = "../../assets/#{invalid_image_name}"
  fallback_image = File.expand_path(asset_path, File.dirname(__FILE__))
  FileUtils.cp(fallback_image, filename)

  set_image_width(filename, width)
end
# Runs the `convert` command on the image, in place, with a transparent
# background and an extent of "<width>x0". The image path is shell-escaped.
# Returns the command's standard output.
def set_image_width(image, width)
  escaped_image = image.shellescape
  %x(convert #{escaped_image} -background none -extent #{width}x0 #{escaped_image})
end
end