otwcode/otwarchive
app/models/story_parser.rb

# Parse stories from other websites and uploaded files, looking for metadata to harvest
# and put into the archive.
#
class StoryParser
  require 'timeout'
  require 'nokogiri'
  require 'mechanize'
  require 'open-uri'
  include HtmlCleaner

  OPTIONAL_META = {notes: 'Note',
                   freeform_string: 'Tag',
                   fandom_string: 'Fandom',
                   rating_string: 'Rating',
                   archive_warning_string: 'Warning',
                   relationship_string: 'Relationship|Pairing',
                   character_string: 'Character' }.freeze
  REQUIRED_META = { title: 'Title',
                    summary: 'Summary',
                    revised_at: 'Date|Posted|Posted on|Posted at',
                    chapter_title: 'Chapter Title' }.freeze
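
  # For illustration, a (hypothetical) story header that scan_text_for_meta would
  # recognize via the patterns above might look like:
  #
  #   Title: Some Imported Story
  #   Summary: A short summary of the story.
  #   Fandom: Stargate SG-1
  #   Rating: Teen
  #
  # which would populate :title, :summary, :fandom_string and :rating_string.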

  # Use this for raising custom error messages
  # (so that we can distinguish them from unexpected exceptions due to
  # faulty code)
  class Error < StandardError
  end

  # These attributes need to be moved from the work to the chapter
  # format: {work_attribute_name: :chapter_attribute_name} (can be the same)
  CHAPTER_ATTRIBUTES_ONLY = {}.freeze

  # These attributes need to be copied from the work to the chapter
  CHAPTER_ATTRIBUTES_ALSO = { revised_at: :published_at }.freeze

  ### NOTE ON KNOWN SOURCES
  # Matching stops at the first entry in each list that matches, so put more-specific
  # matches toward the front of the list.

  # places for which we have a custom parse_story_from_[source] method
  # for getting information out of the downloaded text
  KNOWN_STORY_PARSERS = %w[deviantart dw lj].freeze

  # places for which we have a custom parse_author_from_[source] method
  # which returns an external_author object including an email address
  KNOWN_AUTHOR_PARSERS = %w[lj].freeze

  # places for which we have a download_story_from_[source]
  # used to customize the downloading process
  KNOWN_STORY_LOCATIONS = %w[lj].freeze

  # places for which we have a download_chaptered_from
  # to get a set of chapters all together
  CHAPTERED_STORY_LOCATIONS = %w[ffnet thearchive_net efiction quotev].freeze

  # regular expressions to match against the URLS
  SOURCE_LJ = '((live|dead|insane)journal\.com)|journalfen(\.net|\.com)|dreamwidth\.org'.freeze
  SOURCE_DW = 'dreamwidth\.org'.freeze
  SOURCE_FFNET = '(^|[^A-Za-z0-9-])fanfiction\.net'.freeze
  SOURCE_DEVIANTART = 'deviantart\.com'.freeze
  SOURCE_THEARCHIVE_NET = 'the\-archive\.net'.freeze
  SOURCE_EFICTION = 'viewstory\.php'.freeze
  SOURCE_QUOTEV = 'quotev\.com'.freeze

  # time out if we can't download fast enough
  STORY_DOWNLOAD_TIMEOUT = 60
  MAX_CHAPTER_COUNT = 200

  # To check for duplicate chapters, take a slice this long out of the story
  # (in characters)
  DUPLICATE_CHAPTER_LENGTH = 10_000


  # Import many stories
  def import_many(urls, options = {})
    # Try to get the works
    works = []
    failed_urls = []
    errors = []
    @options = options
    urls.each do |url|
      begin
        response = download_and_parse_work(url, options)
        work = response[:work]
        if response[:status] == :created
          if work && work.save
            work.chapters.each(&:save)
            works << work
          else
            failed_urls << url
            errors << work.errors.values.join(", ")
            work.delete if work
          end
        elsif response[:status] == :already_imported
          raise StoryParser::Error, response[:message]
        end
      rescue Timeout::Error
        failed_urls << url
        errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
        work.delete if work
      rescue Error => exception
        failed_urls << url
        errors << "We couldn't successfully import that work, sorry: #{exception.message}"
        work.delete if work
      end
    end
    [works, failed_urls, errors]
  end
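
  # Illustrative usage only (the URL is hypothetical and some_pseud stands in for a
  # real Pseud; the option keys are the ones read by set_work_attributes below):
  #
  #   parser = StoryParser.new
  #   works, failed_urls, errors = parser.import_many(
  #     ["http://some-user.livejournal.com/12345.html"],
  #     pseuds: [some_pseud],
  #     post_without_preview: true
  #   )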

  # Downloads a story and passes it on to the parser.
  # If the URL of the story is from a site for which we have special rules
  # (e.g., when downloading from a LiveJournal clone we use ?format=light to get
  # a nice, consistent post format), it will pre-process the URL according to the
  # rules for that site.
  def download_and_parse_work(location, options = {})
    status = :created
    message = ""
    work = Work.find_by_url(location)
    if work.nil?
      @options = options
      source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
      if source.nil?
        story = download_text(location)
        work = parse_story(story, location, options)
      else
        work = download_and_parse_chaptered_story(source, location, options)
      end
    else
      status = :already_imported
      message = "A work has already been imported from #{location}."
    end
    {
      status: status,
      message: message,
      work: work
    }
  end

  # Given an array of urls for chapters of a single story,
  # download them all and combine into a single work
  def import_chapters_into_story(locations, options = {})
    status = :created
    work = Work.find_by_url(locations.first)
    if work.nil?
      chapter_contents = []
      @options = options
      locations.each do |location|
        chapter_contents << download_text(location)
      end
      work = parse_chapters_into_story(locations.first, chapter_contents, options)
      message = "Successfully created work \"" + work.title + "\"."
    else
      status = :already_imported
      message = "A work has already been imported from #{locations.first}."
    end
    {
      status: status,
      message: message,
      work: work
    }
  end
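
  # Illustrative usage only (hypothetical chapter URLs): each URL is downloaded and
  # appended as a chapter of a single new work.
  #
  #   result = parser.import_chapters_into_story(
  #     ["http://example.com/story/ch1", "http://example.com/story/ch2"],
  #     archivist: archivist_user
  #   )
  #   result[:status]  # => :created (or :already_imported)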


  ### OLD PARSING METHODS

  # Import many stories
  def import_from_urls(urls, options = {})
    # Try to get the works
    works = []
    failed_urls = []
    errors = []
    @options = options
    urls.each do |url|
      begin
        work = download_and_parse_story(url, options)
        if work && work.save
          work.chapters.each(&:save)
          works << work
        else
          failed_urls << url
          errors << work.errors.values.join(", ")
          work.delete if work
        end
      rescue Timeout::Error
        failed_urls << url
        errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
        work.delete if work
      rescue Error => exception
        failed_urls << url
        errors << "We couldn't successfully import that work, sorry: #{exception.message}"
        work.delete if work
      end
    end
    [works, failed_urls, errors]
  end

  # Downloads a story and passes it on to the parser.
  # If the URL of the story is from a site for which we have special rules
  # (e.g., when downloading from a LiveJournal clone we use ?format=light to get
  # a nice, consistent post format), it will pre-process the URL according to the
  # rules for that site.
  def download_and_parse_story(location, options = {})
    check_for_previous_import(location)
    @options = options
    source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
    if source.nil?
      story = download_text(location)
      work = parse_story(story, location, options)
    else
      work = download_and_parse_chaptered_story(source, location, options)
    end
    work
  end

  # Given an array of urls for chapters of a single story,
  # download them all and combine into a single work
  def download_and_parse_chapters_into_story(locations, options = {})
    check_for_previous_import(locations.first)
    chapter_contents = []
    @options = options
    locations.each do |location|
      chapter_contents << download_text(location)
    end
    parse_chapters_into_story(locations.first, chapter_contents, options)
  end

  ### PARSING METHODS

  # Parses the text of a story, optionally from a given location.
  def parse_story(story, location, options = {})
    work_params = parse_common(story, location, options[:encoding], options[:detect_tags])

    # move any attributes from work to chapter if necessary
    set_work_attributes(Work.new(work_params), location, options)
  end

  # parses and adds a new chapter to the end of the work
  def parse_chapter_of_work(work, chapter_content, location, options = {})
    tmp_work_params = parse_common(chapter_content, location, options[:encoding], options[:detect_tags])
    chapter = get_chapter_from_work_params(tmp_work_params)
    work.chapters << set_chapter_attributes(work, chapter)
    work
  end

  def parse_chapters_into_story(location, chapter_contents, options = {})
    work = nil
    chapter_contents.each do |content|
      work_params = parse_common(content, location, options[:encoding], options[:detect_tags])
      if work.nil?
        # create the new work
        work = Work.new(work_params)
      else
        new_chapter = get_chapter_from_work_params(work_params)
        work.chapters << set_chapter_attributes(work, new_chapter)
      end
    end
    set_work_attributes(work, location, options)
  end

  # Everything below here is protected and should not be touched by outside
  # code -- please use the above functions to parse external works.

  protected

  # tries to create an external author for a given url
  def parse_author(location, ext_author_name, ext_author_email)
    if location.present? && ext_author_name.blank? && ext_author_email.blank?
      source = get_source_if_known(KNOWN_AUTHOR_PARSERS, location)
      if source.nil?
        raise Error, "No external author name or email specified"
      else
        send("parse_author_from_#{source.downcase}", location)
      end
    else
      parse_author_common(ext_author_email, ext_author_name)
    end
  end

  # download an entire story from an archive type where we know how to parse multi-chaptered works
  # this should only be called from download_and_parse_story
  def download_and_parse_chaptered_story(source, location, options = {})
    chapter_contents = send("download_chaptered_from_#{source.downcase}", location)
    parse_chapters_into_story(location, chapter_contents, options)
  end

  # Our custom URL finder checks for a previously imported URL in almost any format
  # in which it may have been presented.
  def check_for_previous_import(location)
    if Work.find_by_url(location).present?
      raise Error, "A work has already been imported from #{location}."
    end
  end

  def set_chapter_attributes(work, chapter)
    chapter.position = work.chapters.length + 1
    chapter.posted = true
    chapter
  end

  def set_work_attributes(work, location = "", options = {})
    raise Error, "Work could not be downloaded" if work.nil?

    @options = options
    work.imported_from_url = location
    work.ip_address = options[:ip_address]
    work.expected_number_of_chapters = work.chapters.length
    work.revised_at = work.chapters.last.published_at
    if work.revised_at && work.revised_at.to_date < Date.current
      work.backdate = true
    end

    # set authors for the works
    pseuds = []
    pseuds << User.current_user.default_pseud unless options[:do_not_set_current_author] || User.current_user.nil?
    pseuds << options[:archivist].default_pseud if options[:archivist]
    pseuds << options[:pseuds] if options[:pseuds]
    pseuds = pseuds.flatten.compact.uniq
    raise Error, "A work must have at least one author specified" if pseuds.empty?
    pseuds.each do |pseud|
      work.creatorships.build(pseud: pseud, enable_notifications: true)
      work.chapters.each do |chapter|
        chapter.creatorships.build(pseud: pseud)
      end
    end

    # handle importing works for others
    # build an external creatorship for each author
    if options[:importing_for_others]
      external_author_names = options[:external_author_names] || parse_author(location, options[:external_author_name], options[:external_author_email])
      # convert to an array if not already one
      external_author_names = [external_author_names] if external_author_names.is_a?(ExternalAuthorName)
      if options[:external_coauthor_name].present?
        external_author_names << parse_author(location, options[:external_coauthor_name], options[:external_coauthor_email])
      end
      external_author_names.each do |external_author_name|
        next if !external_author_name || external_author_name.external_author.blank?
        if external_author_name.external_author.do_not_import
          # we're not allowed to import works from this address
          raise Error, "Author #{external_author_name.name} at #{external_author_name.external_author.email} does not allow importing their work to this archive."
        end
        work.external_creatorships.build(external_author_name: external_author_name, archivist: (options[:archivist] || User.current_user))
      end
    end

    # lock to registered users if specified or importing for others
    work.restricted = options[:restricted] || options[:importing_for_others] || false

    # set comment permissions
    work.comment_permissions = options[:comment_permissions] || "enable_all"
    work.moderated_commenting_enabled = options[:moderated_commenting_enabled] || false

    # set default values for required tags
    work.fandom_string = meta_or_default(work.fandom_string, options[:fandom], ArchiveConfig.FANDOM_NO_TAG_NAME)
    work.rating_string = meta_or_default(work.rating_string, options[:rating], ArchiveConfig.RATING_DEFAULT_TAG_NAME)
    work.archive_warning_strings = meta_or_default(work.archive_warning_strings, options[:archive_warning], ArchiveConfig.WARNING_DEFAULT_TAG_NAME)
    work.category_string = meta_or_default(work.category_string, options[:category], [])
    work.character_string = meta_or_default(work.character_string, options[:character], [])
    work.relationship_string = meta_or_default(work.relationship_string, options[:relationship], [])
    work.freeform_string = meta_or_default(work.freeform_string, options[:freeform], [])

    # set default value for title
    work.title = meta_or_default(work.title, options[:title], "Untitled Imported Work")
    work.summary = meta_or_default(work.summary, options[:summary], '')
    work.notes = meta_or_default(work.notes, options[:notes], '')

    # set collection name if present
    work.collection_names = get_collection_names(options[:collection_names]) if options[:collection_names].present?

    # set default language (English)
    work.language_id = options[:language_id] || Language.default.id

    work.posted = true if options[:post_without_preview]
    work.chapters.each do |chapter|
      if chapter.content.length > ArchiveConfig.CONTENT_MAX
        # TODO: eventually: insert a new chapter
        chapter.content.truncate(ArchiveConfig.CONTENT_MAX, omission: "<strong>WARNING: import truncated automatically because chapter was too long! Please add a new chapter for remaining content.</strong>", separator: "</p>")
      elsif chapter.content.empty?
        raise Error, "Chapter #{chapter.position} of \"#{work.title}\" is blank."
      end

      chapter.posted = true # do not save - causes the chapters to exist even if work doesn't get created!
    end
    work
  end

  def parse_author_from_lj(location)
    return if location !~ %r{^(?:http:\/\/)?(?<lj_name>[^.]*)\.(?<site_name>livejournal\.com|dreamwidth\.org|insanejournal\.com|journalfen\.net)}
    email = ""
    lj_name = Regexp.last_match[:lj_name]
    site_name = Regexp.last_match[:site_name]
    if lj_name == "community"
      # this is a community post, so fetch the post and pull the actual author's name out of it
      post_text = download_text(location)
      doc = Nokogiri.parse(post_text)
      lj_name = doc.xpath("/html/body/div[2]/div/div/div/table/tbody/tr/td[2]/span/a[2]/b").content
    end
    profile_url = "http://#{lj_name}.#{site_name}/profile"
    lj_profile = download_text(profile_url)
    doc = Nokogiri.parse(lj_profile)
    contact = doc.css('div.contact').inner_html
    if contact.present?
      contact.gsub! '<p class="section_body_title">Contact:</p>', ""
      contact.gsub!(/<\/?(span|i)>/, "")
      contact.delete! "\n"
      contact.gsub! "<br/>", ""
      if contact =~ /(.*@.*\..*)/
        email = Regexp.last_match[1]
      end
    end
    email = "#{lj_name}@#{site_name}" if email.blank?
    parse_author_common(email, lj_name)
  end

  def parse_author_from_unknown(_location)
    # for now, nothing
    nil
  end

  def parse_author_common(email, name)
    errors = []

    errors << "No author name specified" if name.blank?

    if email.present?
      external_author = ExternalAuthor.find_or_create_by(email: email)
      errors += external_author.errors.full_messages
    else
      errors << "No author email specified"
    end

    raise Error, errors.join("\n") if errors.present?

    # convert to ASCII and strip out invalid characters (everything except alphanumeric
    # characters, underscores, spaces, periods, @ and -)
    redacted_name = name.to_ascii.gsub(/[^\w[ \-@.]]/u, "")
    if redacted_name.present?
      external_author.names.find_or_create_by(name: redacted_name)
    else
      external_author.default_name
    end
  end

  def get_chapter_from_work_params(work_params)
    @chapter = Chapter.new(work_params[:chapter_attributes])
    # don't override specific chapter params (eg title) with work params
    chapter_params = work_params.delete_if do |name, _param|
      !@chapter.attribute_names.include?(name.to_s) || !@chapter.send(name.to_s).blank?
    end
    @chapter.update(chapter_params)
    @chapter
  end

  def download_text(location)
    source = get_source_if_known(KNOWN_STORY_LOCATIONS, location)
    if source.nil?
      download_with_timeout(location)
    else
      send("download_from_#{source.downcase}", location)
    end
  end

  # canonicalize the url for downloading from lj or clones
  def download_from_lj(location)
    url = location
    url.gsub!(/\#(.*)$/, "") # strip off any anchor information
    url.gsub!(/\?(.*)$/, "") # strip off any existing params at the end
    url.gsub!('_', '-') # convert underscores in usernames to hyphens
    url += "?format=light" # go to light format
    text = download_with_timeout(url)

    if text.match(/adult_check/)
      Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) {
        begin
          agent = Mechanize.new
          form = url.include?("dreamwidth") ? agent.get(url).forms.first : agent.get(url).forms.third
          page = agent.submit(form, form.buttons.first) # submits the adult concepts form
          text = page.body.force_encoding(agent.page.encoding)
        rescue
          text = ""
        end
      }
    end
    text
  end
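
  # For example (hypothetical URL), download_from_lj would turn
  #   http://some_user.livejournal.com/12345.html#cutid1
  # into
  #   http://some-user.livejournal.com/12345.html?format=light
  # before downloading.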

  # grab all the chapters of the story from ff.net
  def download_chaptered_from_ffnet(_location)
    raise Error, "Sorry, Fanfiction.net does not allow imports from their site."
  end

  def download_chaptered_from_quotev(_location)
    raise Error, "Sorry, Quotev.com does not allow imports from their site."
  end

  # this is an efiction archive but it doesn't handle chapters normally
  # best way to handle is to get the full story printable version
  # We have to make it a download-chaptered because otherwise it gets sent to the
  #  generic efiction version since chaptered sources are checked first
  def download_chaptered_from_thearchive_net(location)
    if location.match(/^(.*)\/.*viewstory\.php.*[^p]sid=(\d+)($|&)/i)
      location = "#{$1}/viewstory.php?action=printable&psid=#{$2}"
    end
    text = download_with_timeout(location)
    text.sub!('</style>', '</style></head>') unless text.match('</head>')
    [text]
  end
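
  # For example (hypothetical story id), a URL like
  #   http://the-archive.net/viewstory.php?sid=123
  # is rewritten to the full-story printable view:
  #   http://the-archive.net/viewstory.php?action=printable&psid=123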

  # grab all the chapters of a story from an efiction-based site
  def download_chaptered_from_efiction(location)
    chapter_contents = []
    if location.match(/^(?<site>.*)\/.*viewstory\.php.*sid=(?<storyid>\d+)($|&)/i)
      site = Regexp.last_match[:site]
      storyid = Regexp.last_match[:storyid]
      chapnum = 1
      last_body = ""
      Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) do
        loop do
          url = "#{site}/viewstory.php?action=printable&sid=#{storyid}&chapter=#{chapnum}"
          body = download_with_timeout(url)
          # get a section to check that this isn't a duplicate of previous chapter
          body_to_check = body.slice(10, DUPLICATE_CHAPTER_LENGTH)
          if body.nil? || body_to_check == last_body || chapnum > MAX_CHAPTER_COUNT || body.match(/<div class='chaptertitle'> by <\/div>/) || body.match(/Access denied./) || body.match(/Chapter : /)
            break
          end
          # save the value to check for duplicate chapter
          last_body = body_to_check

          # clean up the broken head in many efiction printable sites
          body.sub!('</style>', '</style></head>') unless body.match('</head>')
          chapter_contents << body
          chapnum += 1
        end
      end
    end
    chapter_contents
  end
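
  # For example (hypothetical site and story id), a URL like
  #   http://efiction.example.com/archive/viewstory.php?sid=42
  # results in per-chapter printable requests such as
  #   http://efiction.example.com/archive/viewstory.php?action=printable&sid=42&chapter=1
  # until a duplicate or error page is detected, or MAX_CHAPTER_COUNT is reached.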


  # This is the heavy lifter, invoked by all the story and chapter parsers.
  # It takes a single string containing the raw contents of a story, parses it with
  # Nokogiri into the @doc object, and then calls a subparser.
  #
  # If the story source can be identified as one of the sources we know how to parse
  # in some custom way, parse_common calls the customized parse_story_from_[source] method.
  # Otherwise, it falls back to parse_story_from_unknown.
  #
  # This produces a hash equivalent to the params hash that is normally created by the standard work
  # upload form.
  #
  # parse_common then calls sanitize_params (which would also be called on the standard work upload
  # form results) and returns the final sanitized hash.
  #
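  # For example (illustrative only), a simple single-chapter story might produce
  # something like:
  #   { title: "Some Title", fandom_string: "Some Fandom",
  #     chapter_attributes: { title: nil, content: "<p>...</p>" } }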
  def parse_common(story, location = nil, encoding = nil, detect_tags = true)
    work_params = { title: "Untitled Imported Work", chapter_attributes: { content: "" } }

    # Encode as HTML - the dummy "foo" tag will be stripped out by the sanitizer but forces Nokogiri to
    # preserve line breaks in plain text documents
    # Rescue all errors as Nokogiri complains about things the sanitizer will fix later
    @doc = Nokogiri::HTML.parse(story.prepend("<foo/>"), nil, encoding) rescue ""

    # Try to convert all relative links to absolute
    base = @doc.at_css("base") ? @doc.css("base")[0]["href"] : location.split("?").first
    if base.present?
      @doc.css("a").each do |link|
        next if link["href"].blank? || link["href"].start_with?("#")
        begin
          query = link["href"].match(/(\?.*)$/) ? $1 : ""
          link["href"] = URI.join(base, link["href"].gsub(/(\?.*)$/, "")).to_s + query
        rescue StandardError
          # ignore links we can't convert to absolute
        end
      end
    end

    # Extract metadata (unless detect_tags is false)
    if location && (source = get_source_if_known(KNOWN_STORY_PARSERS, location))
      params = send("parse_story_from_#{source.downcase}", story, detect_tags)
      work_params.merge!(params)
    else
      work_params.merge!(parse_story_from_unknown(story, detect_tags))
    end

    shift_chapter_attributes(sanitize_params(work_params))
  end

  # Our fallback: parse a story from an unknown source, for which we have no special
  # rules.
  def parse_story_from_unknown(story, detect_tags = true)
    work_params = { chapter_attributes: {} }
    story_head = ""
    story_head = @doc.css("head").inner_html if @doc.css("head")

    # Story content - Look for progressively less specific containers or grab everything
    element = @doc.at_css('.chapter-content') || @doc.at_css('body') || @doc.at_css('html') || @doc
    storytext = element ? element.inner_html : story

    meta = {}
    meta.merge!(scan_text_for_meta(story_head, detect_tags)) unless story_head.blank?
    meta.merge!(scan_text_for_meta(story, detect_tags))
    meta[:title] ||= @doc.css('title').inner_html
    work_params[:chapter_attributes][:title] = meta.delete(:chapter_title)
    work_params[:chapter_attributes][:content] = clean_storytext(storytext)
    work_params.merge!(meta)
  end

  # Parses a story from livejournal or a livejournal equivalent (eg, dreamwidth, insanejournal)
  # Assumes that we have downloaded the story from one of those equivalents (ie, we've downloaded
  # it in format=light which is a stripped-down plaintext version.)
  #
  def parse_story_from_lj(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }

    # in LJ "light" format, the story contents are in the second div
    # inside the body.
    body = @doc.css("body")
    storytext = body.css("article.b-singlepost-body").inner_html
    storytext = body.inner_html if storytext.empty?

    # cleanup the text
    # storytext.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)

    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext, detect_tags))

    date = @doc.css("time.b-singlepost-author-date")
    unless date.empty?
      work_params[:revised_at] = convert_revised_at(date.first.inner_text)
    end

    work_params
  end

  def parse_story_from_dw(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }

    body = @doc.css("body")
    content_divs = body.css("div.contents")

    if content_divs[0].present?
      # Get rid of the DW metadata table
      content_divs[0].css("div.currents, ul.entry-management-links, div.header.inner, span.restrictions, h3.entry-title").each(&:remove)
      storytext = content_divs[0].inner_html
    else
      storytext = body.inner_html
    end

    # cleanup the text
    storytext = clean_storytext(storytext)

    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext, detect_tags))

    font_blocks = @doc.xpath('//font')
    unless font_blocks.empty?
      date = font_blocks.first.inner_text
      work_params[:revised_at] = convert_revised_at(date)
    end

    # get the date
    date = @doc.css("span.date").inner_text
    work_params[:revised_at] = convert_revised_at(date)

    work_params
  end

  def parse_story_from_deviantart(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }
    storytext = ""
    notes = ""

    body = @doc.css("body")
    title = @doc.css("title").inner_html.gsub /\s*on deviantart$/i, ""

    # Find the image (original size) if it's art
    image_full = body.css("div.dev-view-deviation img.dev-content-full")
    unless image_full[0].nil?
      storytext = "<center><img src=\"#{image_full[0]["src"]}\"></center>"
    end

    # Find the fic text if it's fic (needs the id for disambiguation, the "deviantART loves you" bit in the footer has the same class path)
    text_table = body.css(".grf-indent > div:nth-child(1)")[0]
    unless text_table.nil?
      # Try to remove some metadata (title and author) from the work's text, if possible
      # Try to remove the title: if it exists, and if it's the same as the browser title
      if text_table.css("h1")[0].present? && title && title.match(text_table.css("h1")[0].text)
        text_table.css("h1")[0].remove
      end

      # Try to remove the author: if it exists, and if it follows a certain pattern
      if text_table.css("small")[0].present? && text_table.css("small")[0].inner_html.match(/by ~.*?<a class="u" href=/m)
        text_table.css("small")[0].remove
      end
      storytext = text_table.inner_html
    end

    # cleanup the text
    storytext.gsub!(%r{<br\s*\/?>}i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)
    work_params[:chapter_attributes][:content] = storytext

    # Find the notes
    content_divs = body.css("div.text-ctrl div.text")
    notes = content_divs[0].inner_html unless content_divs[0].nil?

    # cleanup the notes
    notes.gsub!(%r{<br\s*\/?>}i, "\n") # replace the breaks with newlines
    notes = clean_storytext(notes)
    work_params[:notes] = notes

    work_params.merge!(scan_text_for_meta(notes, detect_tags))
    work_params[:title] = title

    body.css("div.dev-title-container h1 a").each do |node|
      if node["class"] != "u"
        work_params[:title] = node.inner_html
      end
    end

    tags = []
    @doc.css("div.dev-about-cat-cc a.h").each { |node| tags << node.inner_html }
    work_params[:freeform_string] = clean_tags(tags.join(ArchiveConfig.DELIMITER_FOR_OUTPUT))

    details = @doc.css("div.dev-right-bar-content span[title]")
    unless details[0].nil?
      work_params[:revised_at] = convert_revised_at(details[0].inner_text)
    end

    work_params
  end

  # Move and/or copy any meta attributes that need to be on the chapter rather
  # than on the work itself
  def shift_chapter_attributes(work_params)
    CHAPTER_ATTRIBUTES_ONLY.each_pair do |work_attrib, chapter_attrib|
      if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib]
        work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib]
        work_params.delete(work_attrib)
      end
    end

    # copy any attributes from work to chapter as necessary
    CHAPTER_ATTRIBUTES_ALSO.each_pair do |work_attrib, chapter_attrib|
      if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib]
        work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib]
      end
    end

    work_params
  end

  # Find any cases of the given pieces of meta in the given text
  # and return a hash of meta values
  def scan_text_for_meta(text, detect_tags = true)
    # break up the text with some extra newlines to make matching more likely
    # and strip out some tags
    text = text.gsub(/<br/, "\n<br")
    text.gsub!(/<p/, "\n<p")
    text.gsub!(/<\/?(label|span|div|b)(.*?)?>/, '')

    meta = {}
    metapatterns = detect_tags ? REQUIRED_META.merge(OPTIONAL_META) : REQUIRED_META
    is_tag = {}.tap do |h|
      %w[fandom_string relationship_string freeform_string rating_string archive_warning_string].each do |c|
        h[c.to_sym] = true
      end
    end
    handler = {}.tap do |h|
      %w[rating_string revised_at].each do |c|
        h[c.to_sym] = "convert_#{c.to_s.downcase}"
      end
    end

    # 1. Look for Pattern: (whatever), optionally followed by a closing p or div tag
    # 2. Set meta[:metaname] = whatever
  # eg, if it finds Fandom: Stargate SG-1 it will set meta[:fandom_string] = Stargate SG-1
    # 3. convert_<metaname> for cleanup if such a function is defined (eg convert_rating_string)
    metapatterns.each do |metaname, pattern|
      metapattern = Regexp.new("(?:#{pattern}|#{pattern.pluralize})\\s*:\\s*(.*?)(?:</(?:p|div)>)?$", Regexp::IGNORECASE)
      if text.match(metapattern)
        value = Regexp.last_match[1]
        value = clean_tags(value) if is_tag[metaname]
        value = clean_close_html_tags(value)
        value.strip! # lose leading/trailing whitespace
        value = send(handler[metaname], value) if handler[metaname]

        meta[metaname] = value
      end
    end
    post_process_meta meta
  end

  def download_with_timeout(location, limit = 10)
    story = ""
    Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
      begin
        # we do a little cleanup here in case the user hasn't included the 'http://'
        # or if they've used capital letters or an underscore in the hostname
        uri = UrlFormatter.new(location).standardized
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess
          story = response.body
        when Net::HTTPRedirection
          if limit.positive?
            story = download_with_timeout(response['location'], limit - 1)
          end
        else
          Rails.logger.error("------- STORY PARSER: download_with_timeout: response is not success or redirection ------")
          nil
        end
      rescue Errno::ECONNREFUSED, SocketError, EOFError => e
        Rails.logger.error("------- STORY PARSER: download_with_timeout: error rescue: \n#{e.inspect} ------")
        nil
      end
    end
    if story.blank?
      raise Error, "We couldn't download anything from #{location}. Please make sure that the URL is correct and complete, and try again."
    end

    # clean up any erroneously included string terminator (AO3-2251)
    story.delete("\000")
  end

  def get_last_modified(location)
    Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
      resp = URI.open(location)
      resp.last_modified
    end
  end

  def get_source_if_known(known_sources, location)
    known_sources.each do |source|
      pattern = Regexp.new(self.class.const_get("SOURCE_#{source.upcase}"), Regexp::IGNORECASE)
      return source if location.match(pattern)
    end
    nil
  end
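
  # For example, get_source_if_known(KNOWN_STORY_LOCATIONS, "http://some-user.livejournal.com/12345.html")
  # returns "lj", and a viewstory.php URL that isn't on the-archive.net, checked against
  # CHAPTERED_STORY_LOCATIONS, returns "efiction". (Example URL is hypothetical.)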

  def clean_close_html_tags(value)
    # if there are any closing html tags at the start of the value let's ditch them
    value.gsub(/^(\s*<\/[^>]+>)+/, '')
  end

  # We clean the text as if it had been submitted as the content of a chapter
  def clean_storytext(storytext)
    storytext = storytext.encode("UTF-8", invalid: :replace, undef: :replace, replace: "") unless storytext.encoding.name == "UTF-8"
    sanitize_value("content", storytext)
  end

  # Works conservatively -- doesn't split tags on spaces; instead, tags that are too
  # long are truncated.
  def clean_tags(tags)
    tags = Sanitize.clean(tags.force_encoding("UTF-8")) # no html allowed in tags
    tags_list = tags =~ /,/ ? tags.split(/,/) : [tags]
    new_list = []
    tags_list.each do |tag|
      tag.gsub!(/[*<>]/, '')
      tag = truncate_on_word_boundary(tag, ArchiveConfig.TAG_MAX)
      new_list << tag unless tag.blank?
    end
    new_list.join(ArchiveConfig.DELIMITER_FOR_OUTPUT)
  end

  def truncate_on_word_boundary(text, max_length)
    return if text.blank?
    words = text.split
    truncated = words.first
    if words.length > 1
      words[1..words.length].each do |word|
        truncated += " " + word if truncated.length + word.length + 1 <= max_length
      end
    end
    truncated[0..max_length - 1]
  end
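
  # For example, truncate_on_word_boundary("Stargate Atlantis Secret Santa", 20)
  # returns "Stargate Atlantis" (each word is added only if the result still fits
  # within max_length).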

  # convert space-separated tags to comma-separated
  def clean_and_split_tags(tags)
    tags = tags.split(/\s+/).join(',') if !tags.match(/,/) && tags.match(/\s/)
    clean_tags(tags)
  end

  # Convert the common ratings into whatever ratings we're
  # using on this archive.
  def convert_rating_string(rating)
    rating = rating.downcase
    if rating =~ /^(nc-?1[78]|x|ma|explicit)/
      ArchiveConfig.RATING_EXPLICIT_TAG_NAME
    elsif rating =~ /^(r|m|mature)/
      ArchiveConfig.RATING_MATURE_TAG_NAME
    elsif rating =~ /^(pg-?1[35]|t|teen)/
      ArchiveConfig.RATING_TEEN_TAG_NAME
    elsif rating =~ /^(pg|g|k+|k|general audiences)/
      ArchiveConfig.RATING_GENERAL_TAG_NAME
    else
      ArchiveConfig.RATING_DEFAULT_TAG_NAME
    end
  end
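
  # For example, "NC-17" maps to the explicit rating, "PG-13" to teen, and "G" to
  # general audiences; anything unrecognized falls back to the archive default.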

  def convert_revised_at(date_string)
    begin
      date = nil
      if date_string =~ /^(\d+)$/
        # probably seconds since the epoch
        date = Time.at(Regexp.last_match[1].to_i)
      end
      date ||= Date.parse(date_string)
      return '' if date > Date.current
      return date
    rescue ArgumentError, TypeError
      return ''
    end
  end
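
  # For example, convert_revised_at("1 January 2019") returns the parsed date;
  # dates in the future and unparseable strings return an empty string.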

  # Additional processing for meta - currently to make sure warnings
  # that aren't Archive warnings become additional tags instead
  def post_process_meta(meta)
    if meta[:archive_warning_string]
      result = process_warnings(meta[:archive_warning_string], meta[:freeform_string])
      meta[:archive_warning_string] = result[:archive_warning_string]
      meta[:freeform_string] = result[:freeform_string]
    end
    meta
  end

  def process_warnings(warning_string, freeform_string)
    result = {
        archive_warning_string: warning_string,
        freeform_string: freeform_string
    }
    new_warning = ''
    result[:archive_warning_string].split(/\s?,\s?/).each do |warning|
      if ArchiveWarning.warning? warning
        new_warning += ', ' unless new_warning.blank?
        new_warning += warning
      else
        result[:freeform_string] = (result[:freeform_string] || '') + ", #{warning}"
      end
    end
    result[:archive_warning_string] = new_warning
    result
  end
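
  # For example (assuming "Major Character Death" is recognized by ArchiveWarning.warning?),
  # process_warnings("Major Character Death, Zombies", "Humor") returns
  #   { archive_warning_string: "Major Character Death", freeform_string: "Humor, Zombies" }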

  # tries to find appropriate existing collections and converts them to a comma-separated
  # list of collection names only
  def get_collection_names(collection_string)
    collections = ""
    collection_string.split(',').map(&:squish).each do |collection_name|
      collection = Collection.find_by(name: collection_name) || Collection.find_by(title: collection_name)
      if collection
        collections += ", " unless collections.blank?
        collections += collection.name
      end
    end
    collections
  end

  # determine which value to use for a metadata field
  def meta_or_default(detected_field, provided_field, default = nil)
    if @options[:override_tags] || detected_field.blank?
      if provided_field.blank?
        detected_field.blank? ? default : detected_field
      else
        provided_field
      end
    else
      detected_field
    end
  end
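
  # For example: meta_or_default("", "Some Fandom", ArchiveConfig.FANDOM_NO_TAG_NAME)
  # returns "Some Fandom"; a detected value is kept unless @options[:override_tags]
  # is set; if both detected and provided values are blank, the default is returned.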
end