fastladder/fastladder

lib/fastladder/crawler.rb

require "fastladder"
require "tempfile"
require "logger"
require "timeout"
# image_utils is an optional dependency; ignore it if the gem is not installed
begin
  require "image_utils"
rescue LoadError
end

class Crawler
end

module Fastladder
  class Crawler
    ITEMS_LIMIT = 500        # cap on items processed from a single fetch
    REDIRECT_LIMIT = 5       # maximum HTTP redirects followed per crawl
    CRAWL_OK = 1
    CRAWL_NOW = 10
    GETA = [12307].pack("U") # 〓 (U+3013); positions containing it are ignored by #almost_same

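    # Entry point: build a Logger from the options (:logger, :log_file,
    # :log_level) and run the crawl loop until the process is signalled.
    # For example:
    #
    #   Fastladder::Crawler.start(log_level: Logger::DEBUG)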
    def self.start(options = {})
      logger = options[:logger]

      unless logger
        target = options[:log_file] || STDOUT
        logger = Logger.new(target)
        logger.level = options[:log_level] || Logger::INFO
      end

      logger.warn '=> Booting FeedFetcher...'
      self.new(logger).run
    end

    def initialize(logger)
      @logger = logger
    end

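    # Keep running crawl iterations until run_loop reports that the crawler
    # should terminate.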
    def run
      @interval = 0
      finish = false
      until finish
        finish = run_loop
      end
    end

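    # Fetch a feed, following up to REDIRECT_LIMIT redirects. Successful
    # responses are handed to #update; the returned hash carries :message,
    # :error and :response_code.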
    def crawl(feed)
      response = nil
      result = {
        message: '',
        error: false,
        response_code: nil,
      }
      REDIRECT_LIMIT.times do
        @logger.info "fetch: #{feed.feedlink}"
        response = Fastladder.fetch(feed.feedlink, modified_on: feed.modified_on)
        @logger.info "HTTP status: [#{response.code}] #{feed.feedlink}"
        case response
        when Net::HTTPNotModified
          break
        when Net::HTTPSuccess
          ret = update(feed, response)
          result[:message] = "#{ret[:new_items]} new items, #{ret[:updated_items]} updated items"
          break
        when Net::HTTPClientError, Net::HTTPServerError
          result[:message] = "Error: #{response.code} #{response.message}"
          result[:error] = true
          break
        # when Net::HTTPUnauthorized
        #   ...
        #   break
        # when Net::HTTPMovedPermanently
        #   if crawl_status.http_status == 301  # Moved Permanently
        #     if crawl_status.response_changed_on < 1.week.ago
        #       feed.feedlink = feedlink
        #       modified_on = nil
        #     end
        #   end
        #   break
        when Net::HTTPRedirection
          @logger.info "Redirect: #{feed.feedlink} => #{response["location"]}"
          feed.feedlink = URI.join(feed.feedlink, response["location"])
          feed.modified_on = nil
          feed.save
        else
          # HTTPUnknownResponse, HTTPInformation
          result[:message] = "Error: #{response.code} #{response.message}"
          result[:error] = true
          break
        end
      end
      result[:response_code] = response.code.to_i
      result
    end

    private

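    # One iteration of the crawl loop. A trapped signal terminates the loop
    # (returns true); any other exception is logged and the loop continues.
    # The current crawl status, if any, is always reset to CRAWL_OK.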
    def run_loop
      begin
        run_body
      rescue SignalException
        @logger.warn "\n=> #{$!.message} trapped. Terminating..."
        return true
      rescue Exception
        @logger.error %!Crawler error: #{$!.message}\n#{$!.backtrace.join("\n")}!
      ensure
        if @crawl_status
          @crawl_status.status = CRAWL_OK
          @crawl_status.save
        end
      end
      false
    end

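    # Sleep for the current interval, then crawl the next crawlable feed.
    # When no feed is pending, back off by one second per iteration, up to 60s.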
    def run_body
      @logger.info "sleep: #{@interval}s"
      sleep @interval
      if feed = CrawlStatus.fetch_crawlable_feed
        @interval = 0
        result = crawl(feed)
        if result[:error]
          @logger.info "error: #{result[:message]}"
        else
          @crawl_status = feed.crawl_status
          @crawl_status.http_status = result[:response_code]
          @logger.info "success: #{result[:message]}"
        end
      else
        @interval = @interval > 60 ? 60 : @interval + 1
      end
    end

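    # Parse the fetched body with Feedjira and persist its entries: build
    # Item records, trim and de-duplicate them, insert or update them, then
    # refresh unread flags and the feed's metadata.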
    def update(feed, source)
      result = {
        new_items: 0,
        updated_items: 0,
        error: nil
      }
      unless parsed = Feedjira.parse(source.body)
        result[:error] = 'Cannot parse feed'
        return result
      end

      items = build_items(feed, parsed)

      items = cut_off(feed, items)
      items = reject_duplicated(feed, items)
      delete_old_items_if_new_items_are_many(feed, items)
      update_or_insert_items_to_feed(feed, items, result)
      update_unread_status(feed, result)
      update_feed_information(feed, parsed)
      feed.save

      feed.fetch_favicon!
      GC.start

      result
    end

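    # Map parsed entries to unsaved Item records, resolving relative links
    # and computing each item's digest for duplicate detection.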
    def build_items(feed, parsed)
      @logger.info "parsed: [#{parsed.entries.size} items] #{feed.feedlink}"
      parsed.entries.map { |item|
        new_item = Item.new({
                             feed_id: feed.id,
                             link: item.url || "",
                             guid: item.id,
                             title: item.title || "",
                             body: fixup_relative_links(feed, item.content || item.summary),
                             author: item.author,
                             category: item.try(:categories).try!(:first),
                             enclosure: nil,
                             enclosure_type: nil,
                             stored_on: Time.now,
                             modified_on: item.published ? item.published.to_datetime : nil,
                            })
        new_item.create_digest
        new_item
      }
    end

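    # Resolve relative <a href> URLs in an item body against the feed URL.
    # Invalid URLs are logged and left untouched.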
    def fixup_relative_links(feed, body)
      doc = Nokogiri::HTML.fragment(body)
      links = doc.css('a[href]')
      if links.empty?
        body
      else
        links.each do |link|
          begin
            link['href'] = Addressable::URI.join(feed.feedlink, link['href']).normalize.to_s
          rescue Addressable::URI::InvalidURIError
            @logger.info "Invalid URL in link: [#{link['href']}] #{feed.feedlink}"
            next
          end
        end
        doc.to_html
      end
    end

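    # Truncate the fetched items to ITEMS_LIMIT when the feed is too large.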
    def cut_off(feed, items)
      return items unless items.size > ITEMS_LIMIT
      @logger.info "too large feed: #{feed.feedlink} (#{items.size} items)"
      items[0, ITEMS_LIMIT]
    end

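    # Drop in-batch guid duplicates and entries already stored with the same
    # guid and digest.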
    def reject_duplicated(feed, items)
      items.uniq { |item| item.guid }.reject { |item| feed.items.exists?(["guid = ? and digest = ?", item.guid, item.digest]) }
    end

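    # Count entries that are not yet stored (matched by link and digest).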
    def new_items_count(feed, items)
      items.reject { |item| feed.items.exists?(["link = ? and digest = ?", item.link, item.digest]) }.size
    end

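    # When more than half of ITEMS_LIMIT entries are new, wipe the feed's
    # stored items before inserting the fresh batch.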
    def delete_old_items_if_new_items_are_many(feed, items)
      new_items_size = new_items_count(feed, items)
      return unless new_items_size > ITEMS_LIMIT / 2
      @logger.info "delete all items: #{feed.feedlink}"
      Item.where(feed_id: feed.id).delete_all
    end

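    # Insert new items and update existing ones (matched by guid), bumping
    # each existing item's version and counting items whose text changed.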
    def update_or_insert_items_to_feed(feed, items, result)
      items.reverse_each do |item|
        if old_item = feed.items.find_by(guid: item.guid)
          old_item.increment(:version)
          unless almost_same(old_item.title, item.title) and almost_same((old_item.body || "").html2text, (item.body || "").html2text)
            old_item.stored_on = item.stored_on
            result[:updated_items] += 1
          end
          update_columns = %w(link title body author category enclosure enclosure_type digest modified_on)
          old_item.attributes = item.attributes.select{ |column, value| update_columns.include? column }
          old_item.save
        else
          feed.items << item
          result[:new_items] += 1
        end
      end
    end

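    # When anything changed, refresh the feed's modified_on from the most
    # recent item and flag all subscriptions as having unread entries.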
    def update_unread_status(feed, result)
      return unless result[:updated_items] + result[:new_items] > 0

      last_item = feed.items.recent.first
      feed.modified_on = last_item.created_on

      Subscription.where(feed_id: feed.id).update_all(has_unread: true)
    end

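    # Copy title, link and description from the parsed feed.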
    def update_feed_information(feed, parsed)
      feed.title = parsed.title
      feed.link = parsed.url
      feed.description = parsed.description || ""
    end

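    # Two strings are "almost the same" when they are equal, or when they have
    # the same length and differ in at most five characters, ignoring positions
    # that contain the GETA placeholder.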
    def almost_same(str1, str2)
      if str1 == str2
        return true
      end
      chars1 = str1.split(//)
      chars2 = str2.split(//)
      if chars1.length != chars2.length
        return false
      end
      # count differences
      [chars1, chars2].transpose.find_all { |pair|
        !pair.include?(GETA) and pair[0] != pair[1]
      }.size <= 5
    end

  end
end