FeedBunch-app/app/models/entry.rb
# frozen_string_literal: true
require 'nokogiri'
require 'encoding_manager'
require 'special_feed_manager'
require 'url_normalizer'
require 'sanitizer'
require 'url_validator'
##
# Feed entry model. Each instance of this class represents an entry in an RSS or Atom feed.
#
# Instances of this class are saved in the database when fetching and parsing feeds. They are not intended to be
# instantiated directly by the user.
#
# Each entry belongs to exactly one feed.
#
# Each entry has many entry-states, exactly one for each user subscribed to the feed. Each entry-state indicates
# whether the corresponding user has read this entry or not.
#
# When a new entry is saved in the database for the first time, it is marked as unread for all users subscribed to
# its feed (by saving as many entry_state instances as subscribed users into the database, all of them with the attribute
# "read" set to false).
#
# Each entry is uniquely identified by its guid and its unique_hash within the scope of a given feed.
# Duplicate guids are not allowed for the same feed. Duplicate unique_hashes are not allowed for the same feed.
#
# When entries are deleted by an automated cleanup (because the feed had too many entries),
# a new DeletedEntry instance is saved in the database with the same feed_id, guid and unique_hash as the deleted entry.
# An entry with the same feed_id and either guid or unique_hash as an already existing DeletedEntry is not valid and won't be
# saved in the database (it would indicate an entry that is at once deleted and not deleted).
#
# Attributes of the model:
# - feed_id
# - title
# - url
# - author
# - content
# - summary
# - published
# - guid
# - unique_hash
#
# All fields except "published" and "feed_id" are sanitized before validation; that is, before saving/updating each
# instance in the database.
class Entry < ApplicationRecord
# Path of the placeholder image written into the "src" attribute of entry images;
# the real image URL is kept in "data-src" (see image_manipulations).
LOADING_IMAGE_GIF = '/images/Ajax-loader.gif'
# Each entry belongs to a feed, and the feed_id is mandatory.
belongs_to :feed
validates :feed_id, presence: true
# Entry-states associated with this entry; declared with dependent: :delete_all.
has_many :entry_states, dependent: :delete_all
# Mandatory attributes. The url must additionally pass the custom valid_url check.
validates :title, presence: true
validates :url, presence: true
validate :valid_url
validates :published, presence: true
# guid and unique_hash are mandatory and each must be unique (case sensitive) within the scope of a feed.
validates :guid, presence: true, uniqueness: {case_sensitive: true, scope: :feed_id}
validates :unique_hash, presence: true, uniqueness: {case_sensitive: true, scope: :feed_id}
# An entry matching an existing DeletedEntry (same feed and guid or unique_hash) is not valid.
validate :entry_not_deleted
# Attributes are fixed up when a new record is instantiated, and again before validation
# whenever the record has unsaved changes.
after_initialize :initialize_entry_attributes, if: :new_record?
before_validation :initialize_entry_attributes, if: :changed?
# After the entry is created, it is marked as unread for the users subscribed to its feed.
after_create :set_unread_state
##
# Tell whether the passed user has marked this entry as read.
#
# Receives as argument the user whose read/unread state for this entry is looked up.
#
# Returns the value of the "read" attribute of the entry-state found for this entry and user.
#
# Raises a NotSubscribedError if there is no entry-state for this entry and user, which is
# taken to mean that the user is not subscribed to the feed.
def read_by?(user)
  entry_state = EntryState.find_by entry_id: id, user_id: user.id
  return entry_state.read if entry_state.present?

  # No entry-state found: log the anomaly and let the caller know.
  Rails.logger.warn "Tried to find out if user #{user.id} - #{user.email} has read entry #{self.id} from feed #{self.feed_id} to which he is not subscribed. Raising an error."
  raise NotSubscribedError.new
end
##
# Check whether the database already holds an Entry with the same feed_id and guid as this one.
#
# Intended for unsaved entries only: trying to save an Entry whose feed and guid match an
# already saved one fails validation.
#
# Returns a boolean.
def guid_already_exists?
  Entry.exists? feed_id: feed_id, guid: guid
end
##
# Check whether the database already holds an Entry with the same feed_id and unique_hash as this one.
#
# Intended for unsaved entries only: trying to save an Entry whose feed and unique_hash match an
# already saved one fails validation.
#
# Returns a boolean.
def unique_hash_already_exists?
  Entry.exists? feed_id: feed_id, unique_hash: unique_hash
end
##
# Check whether the database holds a DeletedEntry with the same feed_id and guid as this entry.
#
# Intended for unsaved entries only: trying to save an Entry whose feed and guid match an
# already deleted one fails validation.
#
# Returns a boolean.
def guid_already_deleted?
  DeletedEntry.exists? feed_id: feed_id, guid: guid
end
##
# Check whether the database holds a DeletedEntry with the same feed_id and unique_hash as this entry.
#
# Intended for unsaved entries only: trying to save an Entry whose feed and unique_hash match an
# already deleted one fails validation.
#
# Returns a boolean.
def unique_hash_already_deleted?
  DeletedEntry.exists? feed_id: feed_id, unique_hash: unique_hash
end
private
##
# Validation: the entry URL must be accepted by UrlValidator.valid_entry_url?
# (an http, https or protocol-relative URL, per the error message below).
#
# Adds an error on the url attribute otherwise.
def valid_url
  return if UrlValidator.valid_entry_url?(url)

  errors.add :url, "URL #{self.url} is not a valid http, https or protocol-relative URL"
end
##
# Validate that the entry has not been deleted, i.e. that there is no deleted_entries record with the
# same feed_id and either the same guid or the same unique_hash.
#
# If such a record exists, a debug message is logged and an error is added to the base of the model.
def entry_not_deleted
  deleted = DeletedEntry.where 'feed_id = ? AND (guid = ? OR unique_hash = ?)', feed_id, guid, unique_hash
  return unless deleted.exists?

  # Block form: the message (and the feed lookup it needs) is only evaluated when debug logging is enabled.
  # Safe navigation: a missing feed must produce a validation error, not a NoMethodError.
  Rails.logger.debug do
    "Entry not valid: already deleted entry - guid: #{guid}, unique_hash: #{unique_hash}, published: #{published}, feed_id: #{feed_id}, feed title: #{feed&.title}"
  end
  errors.add :base, 'entry already deleted'
end
##
# Initialize and massage several Entry attributes.
#
# Registered both as an after_initialize callback (new records only) and as a before_validation
# callback (records with changes), so it can run more than once on the same instance.
#
# The generic attribute fixes run first; afterwards the entry is passed to the special handler
# of its feed, if there is one.
def initialize_entry_attributes
fix_attributes
special_feed_handling
end
##
# Fix any problems with attribute values before validation. The steps run in this exact order:
# - fix any encoding problems in text attributes
# - strip leading and trailing blanks from text attributes
# - manipulate the summary and content markup (links, images)
# - sanitize values
# - give default values to missing mandatory attributes
# - normalize the entry URL
# - calculate the unique_hash of the entry
def fix_attributes
fix_encoding
strip_attributes
# Markup is manipulated before sanitization, so the sanitizer sees the manipulated markup.
content_manipulation
sanitize_attributes
# Defaults may copy the guid into the url, so the URL is normalized afterwards.
default_attribute_values
fix_url
# Runs last: the hash is computed from the final content, summary and title.
calculate_unique_hash
end
##
# Fix encoding problems in the text attributes of the entry.
#
# The title, url, author, content, summary and guid are passed, in that order, through
# EncodingManager.fix_encoding and replaced with its result.
def fix_encoding
  %i[title url author content summary guid].each do |attribute|
    public_send "#{attribute}=", EncodingManager.fix_encoding(public_send(attribute))
  end
end
##
# Trim the title, url, author, content, summary and guid of the entry, removing any
# leading or trailing blank characters. Attributes without a value are left as they are.
def strip_attributes
  %i[title url author content summary guid].each do |attribute|
    public_send "#{attribute}=", public_send(attribute).try(:strip)
  end
end
##
# Sanitize the title, url, author, content, summary and guid of the entry.
#
# Even though values are sanitized here, before they reach the database, the sanitize helpers must
# still be used in the views. Better paranoid than sorry!
def sanitize_attributes
  # Content and summary go through the HTML sanitizer: markup such as imgs is expected to survive.
  %i[content summary].each do |attribute|
    public_send "#{attribute}=", Sanitizer.sanitize_html(public_send(attribute))
  end

  # The remaining attributes should be plain text, they go through the plain text sanitizer.
  %i[title author guid url].each do |attribute|
    public_send "#{attribute}=", Sanitizer.sanitize_plaintext(public_send(attribute))
  end
end
##
# Manipulate the markup of the entry summary and content before saving the entry.
# An attribute that is not present is left untouched.
def content_manipulation
  if summary.present?
    self.summary = markup_manipulation(summary)
  end

  if content.present?
    self.content = markup_manipulation(content)
  end
end
##
# Manipulate the passed HTML fragment.
#
# The fragment is parsed with Nokogiri, its links are manipulated first and its images afterwards.
#
# Returns a string with the children of the body of the resulting document.
def markup_manipulation(html_fragment)
  parsed = Nokogiri::HTML(html_fragment)
  with_links = link_manipulations(parsed)
  with_images = image_manipulations(with_links)
  with_images.css('body').children.to_s
end
##
# Set the target="_blank" attribute in every link of the passed HTML.
# Any target attribute already present in a link is overwritten.
#
# Receives as argument a parsed HTML fragment, and returns that same object.
def link_manipulations(html_doc)
  html_doc.css('a').each { |anchor| anchor['target'] = '_blank' }
  html_doc
end
##
# Prepare the images in the passed HTML to be lazy-loaded with the jquery-unveil library.
#
# For each image, the src attribute is normalized with UrlNormalizer and moved to the data-src
# attribute, and the src attribute takes the value of LOADING_IMAGE_GIF instead.
# Images whose src already is LOADING_IMAGE_GIF are skipped, so this method can be run
# more than once on the same markup.
#
# Receives as argument a parsed HTML fragment, and returns that same object.
def image_manipulations(html_doc)
  html_doc.css('img').each do |image|
    # already prepared for lazy loading
    next if image['src'] == LOADING_IMAGE_GIF

    # src is written before data-src, the order in which attributes are set is kept on purpose
    real_src = UrlNormalizer.normalize_entry_url image['src'], self
    image['src'] = LOADING_IMAGE_GIF
    image['data-src'] = real_src
  end
  html_doc
end
##
# Give default values to the guid, title, url and published attributes:
#
# - an empty guid takes the value of the url attribute.
# - an empty title takes the value of the url attribute.
# - if the url attribute is not a valid URL but the guid is, the url attribute takes the value of the
#   guid attribute. This probably breaks the Atom/RSS spec, but feeds that do this are supported.
#   If the title is still empty at this point, it takes the new value of the url.
# - an empty publish date takes the current datetime. This means entries are shown as published in the
#   moment they are fetched unless the feed says otherwise, which ensures that all entries have a publish
#   date and avoids major headaches when ordering.
def default_attribute_values
  self.guid = url unless guid.present?
  self.title = url unless title.present?

  # The guid is only checked when the url is not valid
  unless UrlValidator.valid_entry_url?(url)
    if UrlValidator.valid_entry_url?(guid)
      self.url = guid
      # The url may have been blank when the title was defaulted above, so try again
      self.title = url unless title.present?
    end
  end

  self.published = Time.zone.now unless published.present?
end
##
# Fix problems with the entry URL, replacing it with the result of passing it
# (together with the entry itself) to UrlNormalizer.normalize_entry_url.
# According to the original notes this normalizes the URL and converts relative URLs to absolute ones.
def fix_url
  normalized_url = UrlNormalizer.normalize_entry_url(url, self)
  self.url = normalized_url
end
##
# Calculate the hash that uniquely identifies an entry within its feed. Several entries with the same
# hash in the same feed are not allowed.
#
# The hash is the MD5 hex-digest of the concatenation, in this order and without separators, of:
# - the entry content (if present)
# - the entry summary (if present)
# - the entry title (if present)
#
# The entry title is a mandatory attribute, which guarantees that all valid entries have a unique_hash.
def calculate_unique_hash
  hashed_text = [content, summary, title].select(&:present?).join
  self.unique_hash = Digest::MD5.hexdigest(hashed_text)
end
##
# Hand the entry over to a special handler, if SpecialFeedManager returns one for it.
# Nothing is done otherwise.
def special_feed_handling
  handler = SpecialFeedManager.get_special_handler(self)
  return unless handler.present?

  handler.handle_entry(self)
end
##
# For each user subscribed to this entry's feed, save an entry_state instance with the "read" attribute
# set to false, unless an entry_state for that user and this entry already exists.
#
# Or in layman's terms: mark this entry as unread for all users subscribed to the feed.
#
# The list of users of the feed is reloaded before iterating over it.
# An error is raised if an entry_state cannot be created (create! is used).
def set_unread_state
  feed.users.reload.find_each do |user|
    # Never create a second entry_state for the same user and entry
    next if EntryState.exists? user_id: user.id, entry_id: id

    user.entry_states.create! entry_id: id, read: false
  end
end
end