amatriain/feedbunch

View on GitHub
FeedBunch-app/app/models/feed.rb

Summary

Maintainability
A
0 mins
Test Coverage
# frozen_string_literal: true

require 'uri'
require 'addressable/uri'
require 'encoding_manager'
require 'schedule_manager'
require 'feed_blacklister'
require 'url_normalizer'
require 'sanitizer'

##
# Feed model. Each instance of this model represents a single feed (Atom, RSS...) to which users can be subscribed.
#
# A single feed can have many subscriptions (from different users), but a single subscription corresponds to a single feed (one-to-many relationship).
#
# Users are associated with Feeds through the FeedSubscription model. This enables us to retrieve users that are subscribed to a feed.
#
# Feeds can be associated with folders. Each feed can be in many folders (as long as they belong to different users),
# and each folder can have many feeds (many-to-many association). However a single feed cannot be associated with
# more than one folder from the same user.
#
# Each feed can have many entries.
#
# Each feed can have many deleted_entries
#
# Each feed can be associated with many refresh_feed_job_states. Each such association represents an occurrence of a user
# manually requesting a refresh of this feed.
#
# Each feed can be associated with many subscribe_job_states. Each such association represents an occurrence of a user
# successfully subscribing to the feed. They are transient and can be destroyed if the user dismisses the alert that informs
# them of the successful subscription to the feed; but if there is still a FeedSubscription instance joining user and feed,
# the user is still subscribed to the feed.
#
# Each feed, identified by its fetch_url, can be present at most once in the database. Different feeds can have the same
# title, as long as they have different fetch_url.
#
# Attributes of the model:
# - title
# - fetch_url (URL to fetch the feed XML)
# - last_fetched (timestamp of the last time the feed was fetched, nil if it's never been fetched)
# - fetch_interval_secs (current interval between fetches, in seconds)
# - failing_since (if not null, feed updates have been failing since the datetime value of this field)
# - available (if false, the feed is permanently unavailable and updates are not scheduled for it)
# - url (URL to which the user will be linked; usually the website that originated this feed)
#
# Title, fetch_url and url are sanitized (with ActionView::Helpers::SanitizeHelper) before validation; that is,
# before saving/updating each instance in the database.

class Feed < ApplicationRecord

  # Associations. Rows that depend on the feed are destroyed along with it.
  has_many :feed_subscriptions, dependent: :destroy
  has_many :users, through: :feed_subscriptions
  # single_user_folder runs before a folder is added to this association
  has_and_belongs_to_many :folders, before_add: :single_user_folder
  has_many :entries, dependent: :destroy
  has_many :deleted_entries, dependent: :destroy
  has_many :refresh_feed_job_states, dependent: :destroy
  has_many :subscribe_job_states, dependent: :destroy

  # Pattern for http/https URLs. URI.regexp is obsolete (it warns in verbose mode), so the pattern is built
  # from the RFC 2396 parser directly; Rubies that do not define URI::RFC2396_PARSER expose that same parser
  # as URI::DEFAULT_PARSER.
  # NOTE(review): the pattern is not anchored, so a value that merely contains an http(s) URL passes the
  # format check - confirm this is intended before tightening it.
  http_url_format = (defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER).make_regexp(%w[http https])

  validates :fetch_url, format: {with: http_url_format}, presence: true, uniqueness: {case_sensitive: true}
  validates :url, format: {with: http_url_format}, allow_blank: true
  validates :title, presence: true
  validates :fetch_interval_secs, presence: true
  # inclusion (rather than presence) so that false is accepted as a value
  validates :available, inclusion: {in: [true, false]}

  # Callbacks; each symbol names a private instance method of this class
  before_validation :before_validation

  after_create :schedule_update
  before_destroy :before_destroy_feed, prepend: true
  after_destroy :unschedule_updates
  before_save :unschedule_unavailable
  after_save :touch_subscriptions

  ##
  # Find the folder to which a feed belongs, for a given user.
  #
  # Receives as argument a user.
  #
  # A feed can belong to many folders that belong to many users, but only to a single folder for a given user.
  # This method searches among the folders to which this feed belongs, trying to find one that belongs to the
  # user passed as argument.
  #
  # If a matching folder is found, it is returned. Otherwise nil is returned.

  def user_folder(user)
    # Look up, among this feed's folders, the one owned by the passed user (nil if none)
    folders.find_by(user_id: user.id)
  end

  ##
  # Remove this feed from its current folder, if any, for a given user.
  #
  # Receives as argument a user.
  #
  # A feed can only be in a single folder owned by a given user, so it's not necessary to pass the folder id
  # as an argument, it can be inferred from the user id and feed id.
  #
  # If the feed is in a folder owned by the passed user, it is removed from the folder.
  # Otherwise nothing is done.
  #
  # Returns a Folder instance with the data of the folder the feed was previously in, or nil
  # if it wasn't in any folder. This object may have already been deleted from the database,
  # if there were no more feeds in it.

  def remove_from_folder(user)
    current_folder = user_folder(user)

    # Nothing to detach when the user has not filed this feed anywhere
    unless current_folder.present?
      Rails.logger.info "user #{user.id} - #{user.email} is trying to remove feed #{id} - #{fetch_url} from its folder, but it's not in any folder"
      return current_folder
    end

    Rails.logger.info "user #{user.id} - #{user.email} is removing feed #{id} - #{fetch_url} from folder #{current_folder.id} - #{current_folder.title}"
    current_folder.feeds.delete(self)

    # Callers get the folder the feed used to be in
    current_folder
  end

  ##
  # Check if a feed exists in the database matching a given URL. This is a class method.
  #
  # Receives as argument a URL.
  #
  # It checks several variants of the passed URL to see if there's a matching feed:
  #
  # - First it checks with the passed URL as-is.
  # - If the passed URL has a trailing slash, it checks with the slash removed.
  # - If the passed URL does not have a trailing slash, it checks with an added trailing slash.
  #
  # In each case it invokes the private Feed.find_feed_by_url method to check if there's a matching feed.
  #
  # If a matching feed is found, it is returned. Otherwise returns nil.

  def self.url_variants_feed(feed_url)
    # Ensure that the passed url has an http:// or https:// uri-scheme
    normalized_url = UrlNormalizer.normalize_feed_url feed_url
    # Remove leading and trailing whitespace, to avoid confusion when detecting trailing slashes
    stripped_url = normalized_url.strip
    Rails.logger.info "Searching for matching feeds for url #{stripped_url}"

    matching_feed = find_feed_by_url stripped_url
    return matching_feed unless matching_feed.nil?
    # An empty URL has no trailing-slash variant to search for
    return nil if stripped_url.empty?

    # Only the very last character of the string decides which variant to try. end_with? looks at the end of
    # the whole string, unlike a `$`-anchored regexp, which also matches at the end of any embedded line.
    if stripped_url.end_with? '/'
      Rails.logger.info "No matching feed found for #{stripped_url}, removing trailing slash to search again for url"
      find_feed_by_url stripped_url.chop
    else
      Rails.logger.info "No matching feed found for #{stripped_url}, adding trailing slash to search again for url"
      find_feed_by_url "#{stripped_url}/"
    end
  end

  private

  ##
  # After saving a new feed in the database, a scheduled job will be created to update it periodically

  def schedule_update
    # Hand this feed's id to the scheduler
    ScheduleManager.schedule_first_update(id)
  end

  ##
  # Before destroying a feed, delete dangling objects without validations nor callbacks for performance:
  # - delete all entries
  # - delete all entry_states
  # - delete all deleted_entries

  def before_destroy_feed
    # Entry states are selected through this feed's entries, so they go first
    EntryState.where(entry_id: entries).delete_all
    # Bulk deletes of the feed's own entries and deleted_entries
    entries.delete_all
    deleted_entries.delete_all
  end

  ##
  # After removing a feed from the database, the scheduled job that updated it will be unscheduled.

  def unschedule_updates
    # Tell the scheduler to drop the updates registered for this feed's id
    ScheduleManager.unschedule_feed_updates(id)
  end

  ##
  # If the available attribute is set to false, unschedule the job that updates this feed

  def unschedule_unavailable
    # Only act when this save is the one that turns the feed unavailable
    return if available || !available_changed?

    ScheduleManager.unschedule_feed_updates(id)
  end

  ##
  # Touch (update the updated_at attribute) associated subscriptions if at least one of these attributes has changed:
  # - title
  # - url
  # The subscriptions_updated_at of subscribed users is also updated to the current date and time
  #
  # This is meant to invalidate the HTTP cache and force clients to download this feed again.

  def touch_subscriptions
    # Subscriptions only need touching when the last save changed the title or the url
    return unless saved_change_to_title? || saved_change_to_url?

    # Go through the subscriptions with find_each, touching each one
    feed_subscriptions.find_each(&:touch_subscriptions)
  end

  ##
  # Before adding a feed to a folder, ensure that the feed is not already in other folders that belong
  # to the same user. In this case, raise a rollback error.

  def single_user_folder(folder)
    # A feed that is in no folder at all cannot clash with the new one
    return unless folders.present?

    # Refuse the addition if a folder of the same owner already contains this feed
    raise ActiveRecord::Rollback if folders.where(user_id: folder.user_id).exists?
  end

  ##
  # Various operations before each validation:
  # - fix any encoding problems, converting to utf-8 if necessary
  # - set default values for missing attributes
  # - sanitize values, removing script tags from titles etc.
  # - encode any invalid characters in url and fetch_url
  # - check if the feed url or fetch_url is blacklisted, and if so a BlacklistedUrlError is raised

  def before_validation
    # Runs the steps below in this fixed order; each one works on the attribute values left by the previous one.

    # Pass title, url and fetch_url through EncodingManager
    fix_encoding
    # Fill in url, fetch_interval_secs and available when they are missing
    default_values
    # Sanitize title, url and fetch_url; invalid URL updates fall back to the previous values
    sanitize_attributes
    # Normalize url and fetch_url through UrlNormalizer
    fix_urls
    # Raises BlacklistedUrlError if FeedBlacklister reports the feed as blacklisted
    check_if_blacklisted
  end

  ##
  # Fix problems with encoding in text attributes.
  # Specifically, convert from ISO-8859-1 to UTF-8 if necessary.

  def fix_encoding
    # Each text attribute is replaced by whatever EncodingManager returns for it
    self.title = EncodingManager.fix_encoding(title)
    self.url = EncodingManager.fix_encoding(url)
    self.fetch_url = EncodingManager.fix_encoding(fetch_url)
  end

  ##
  # Give default values to attributes:
  # - url defaults to the value of fetch_url
  # - fetch_interval_secs defaults to 3600 seconds (1 hour)
  # - available defaults to true

  def default_values
    # A feed without its own url links to the fetch URL
    self.url = fetch_url if url.blank?
    # One hour between fetches unless an interval is already set
    self.fetch_interval_secs = 3600 if fetch_interval_secs.blank?
    # Check nil explicitly so that an explicit false is preserved
    self.available = true if available.nil?
  end

  ##
  # Sanitize and trim the title, URL and fetch URL of the feed.
  #
  # Despite this sanitization happening before saving in the database, sanitize helpers must still be used in the views.
  # Better paranoid than sorry!
  #
  # Also, if an update tries to set a value for url or fetch_url which is not a valid URL, ignore
  # the update only for that attribute and keep the old value.

  def sanitize_attributes
    self.title = Sanitizer.sanitize_plaintext title
    self.fetch_url = Sanitizer.sanitize_plaintext fetch_url
    self.url = Sanitizer.sanitize_plaintext url

    # URLs must be valid http or https. URI.regexp is obsolete (it warns in verbose mode), so the same pattern
    # is built once from the RFC 2396 parser; Rubies that do not define URI::RFC2396_PARSER expose that parser
    # as URI::DEFAULT_PARSER.
    uri_parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    http_url = uri_parser.make_regexp(%w[http https])

    # An invalid new value is only discarded when there is a previous value to fall back to;
    # match? returns false for nil, so a nil value also falls back.
    self.fetch_url = fetch_url_was if fetch_url_was.present? && !http_url.match?(fetch_url)
    self.url = url_was if url_was.present? && !http_url.match?(url)

    # Title must not become blank because of sanitization
    if title.blank? && title_was.present?
      Rails.logger.debug "Feed #{id} title '#{title_was}' would have become blank because of sanitization. Keeping the old value instead."
      self.title = title_was
    end
  end

  ##
  # Fix problems with URLs, by URL-encoding any illegal characters.

  def fix_urls
    # Blank values are left untouched; present ones are replaced by their normalized form
    self.url = UrlNormalizer.normalize_feed_url(url) if url.present?
    self.fetch_url = UrlNormalizer.normalize_feed_url(fetch_url) if fetch_url.present?
  end

  ##
  # Check if the feed's url or fetch_url is blacklisted.
  #
  # If it is blacklisted, a BlacklistedUrlError is raised. Otherwise returns nil.

  def check_if_blacklisted
    # FeedBlacklister receives the whole feed and decides whether it is blacklisted
    blacklisted = FeedBlacklister.blacklisted_feed?(self)
    raise BlacklistedUrlError.new if blacklisted

    nil
  end

  #############################
  # PRIVATE CLASS METHODS
  #############################

  ##
  # Find a feed in the database with a given URL. This is a class method.
  #
  # Receives as argument a URL.
  #
  # If there is a feed in the database whose "fetch_url" or "url" field matches
  # the url passed as argument, returns the feed object; returns nil otherwise.

  def self.find_feed_by_url(url)
    # Each column is queried once and the fetched row is reused. Checking exists? first and fetching
    # afterwards would double the queries, and the row could disappear between the two.

    # fetch_url takes precedence over url
    feed = Feed.find_by fetch_url: url
    if feed
      Rails.logger.info "Feed with fetch_url #{url} already exists in the database"
      return feed
    end

    feed = Feed.find_by url: url
    if feed
      Rails.logger.info "Feed with url #{url} already exists in the database"
    else
      Rails.logger.info "Feed #{url} does not exist in the database"
    end

    # Either the feed matched by url, or nil when neither column matched
    feed
  end
  # The bare `private` keyword does not apply to methods defined with `def self.`,
  # so the class-level finder is hidden explicitly.
  private_class_method :find_feed_by_url

end