lib/mal_import.rb from hummingbird-me/hummingbird

lib/mal_import.rb
Summary

Maintainability

1 day
Test Coverage

Issues
require 'open-uri'

class MALImport
  def get(path)
    Nokogiri::HTML open("https://myanimelist.net#{path}", 'User-Agent' => 'iMAL-iOS').read
  end

  def get_character(id)
    description = get("/character/#{id}").css('#content > table >tr > td:nth-child(2)').children.take_while { |x|
      !x.text.include? 'Voice Actor'
    }.reject { |x|
      x['class'] == 'normal_header' || x['id'] == 'horiznav_nav' ||
        x['itemtype'] == 'http://schema.org/BreadcrumbList'
    }.map(&:to_html).join

    # TODO: move all the character data grabbing into here
    { description: description }
  end

  def initialize (media, id, depth = :deep)
    @shallow = (depth == :shallow)
    @media = media
    @mal_id = id
    @main_noko = get "/#{media.to_s}/#{id}/"
    unless @shallow
      char_page = get "/#{media.to_s}/#{id}/a/characters"
      @char_noko = char_page.css('h2:contains("Characters")')[0].parent
    end
    @sidebar = @main_noko.css('td.borderClass')[0]
  end

  def staff
    case @media
    when :anime
      @char_noko.css('a[name="staff"] + h2 + table tr td:nth-child(2)').map do |sm|
        {
          external_id: sm.css('a')[0]['href'].scan(/people\/(\d+)\//).flatten[0].to_i,
          name: nameflip(sm.css('a').text),
          role: sm.css('small').text
        }
      end
    when :manga
      @sidebar.css('span:contains("Authors:") ~ a').map do |author|
        {
          external_id: author['href'].scan(/people\/(\d+)\//).flatten[0].to_i,
          name: nameflip(author.text.strip),
          role: author.next.text.scan(/\(([^()]+)\)/).flatten[0].strip
        }
      end
    end
  end
  def characters
    featured_chars = @main_noko.css('table div.picSurround a[href*="character/"]').map {|x|
      x['href'].scan(/character\/(\d+)/)
    }.flatten.map(&:to_i)

    case @media
    when :anime
      @char_noko.css('h2:contains("Characters") ~ *').take_while { |x|
        x.name == 'table'
      }.map do |chara|
        external_id = chara.css('td:nth-child(2) > a')[0]['href'].scan(/character\/(\d+)\//).flatten[0].to_i
        character = get_character(external_id)
        {
          external_id: external_id,
          name: nameflip(chara.css('td:nth-child(2) > a').text),
          image: character_image(chara.css("img")[0]['data-src']),
          role: chara.css('td:nth-child(2) small').text,
          description: clean_desc(character[:description]),
          featured: featured_chars.include?(external_id),
          voice_actors: chara.css('td:nth-child(3) tr > td:nth-child(1)').map do |va|
            {
              external_id: va.css('a')[0]['href'].scan(/people\/(\d+)\//).flatten[0].to_i,
              image: person_image(va.parent.css("img")[0]['data-src']),
              name: nameflip(va.css('a').text),
              lang: va.css('small').text
            } if va.children.length > 0
          end.compact
        }
      end
    when :manga
      @char_noko.css('h2:contains("Characters") ~ table').map do |chara|
        external_id = chara.css('td:nth-child(2) > a')[0]['href'].scan(/character\/(\d+)\//).flatten[0].to_i
        character = get_character(external_id)
        {
          external_id: external_id,
          name: nameflip(chara.css('td:nth-child(2) > a').text),
          image: character_image(chara.css("img")[0]['data-src']),
          role: chara.css('td:nth-child(2) small').text,
          description: clean_desc(character[:description]),
          featured: featured_chars.include?(external_id)
        }
      end
    end
  end
  def metadata
    meta = {
      external_id: @mal_id,
      title: {
        canonical: @main_noko.css('h1').children[0].text.strip,
        unknown: begin @sidebar.css('div:contains("Synonyms:")').last.text.gsub("Synonyms: ", "").split(",").map(&:strip) rescue nil end,
        en_us: begin @sidebar.css('div:contains("English:")').last.text.gsub("English: ", "").strip rescue nil end,
        ja_jp: begin @sidebar.css('div:contains("Japanese:")').last.text.gsub("Japanese: ", "").strip rescue nil end
      }.compact,
      synopsis: begin
        synopsis = @main_noko.css('span[itemprop="description"]').text
        if synopsis.include? "No synopsis"
          nil
        else
          synopsis
        end
      rescue
      end,
      poster_image: begin poster_image(@sidebar.css("img")[0]['src']) rescue nil end,
      type: begin allowed_types.grep(convert_type(@sidebar.css('div:contains("Type:")').last.text.gsub("Type:", '').strip))[0] rescue nil end,
      status: begin @sidebar.css('div:contains("Status:")')[-1].text.gsub(/Status:(?:\\n)?\s/, "").strip.gsub(/\w+/){ |w| w.capitalize } rescue nil end,
      genres: begin (@sidebar.css('span:contains("Genres:") ~ a').map(&:text) rescue []).compact end
    }

    # Media-specific data
    case @media
    when :manga
      meta.merge!({
        dates: begin @sidebar.css('div:contains("Published:")').last.text.gsub("Published:", '').split("to").map { |s| parse_maldate(s) } rescue nil end,
        volume_count: begin @sidebar.css('div:contains("Volumes:")').last.text.gsub("Volumes: ", "").strip.to_i rescue nil end,
        chapter_count: begin @sidebar.css('div:contains("Chapters:")').last.text.gsub("Chapters: ", "").strip.to_i rescue nil end,
        serialization: begin @sidebar.css('span:contains("Serialization:") ~ a').map(&:text)[0] rescue nil end
      })
    when :anime
      age_rating = begin @sidebar.css('div:contains("Rating:")').last.text.gsub("Rating:\n ", "").strip rescue nil end
      age_rating = convert_age_rating(age_rating)
      meta.merge!({
        dates: begin @sidebar.css('div:contains("Aired:")').last.text.gsub("Aired:", '').split("to").map { |s| parse_maldate(s) } rescue nil end,
        producers: begin (@sidebar.css('span:contains("Producers:") ~ a, span:contains("Licensors") ~ a, span:contains("Studios:") ~ a').map(&:text).delete_if { |s| s == "add some" } rescue []).compact end,
        age_rating: age_rating[0],
        age_rating_guide: age_rating[1],
        episode_count: begin @sidebar.css('div:contains("Episodes:")').last.text.gsub("Episodes:\n ", "").strip.to_i rescue nil end,
        episode_length: parse_duration(begin @sidebar.css('div:contains("Duration:")').last.text.gsub("Duration: ", "").strip rescue nil end)
      })
    end

    # post processing
    meta[:title][:en_us] = nil if meta[:title][:en_us] == meta[:title][:canonical]
    return meta
  end
  def to_h
    if @shallow
      metadata.merge({ staff: [], characters: [] })
    else
      metadata.merge({
        staff: staff,
        characters: characters
      })
    end
  end

  private
  def character_image(img)
    img = img.gsub(/r\/\d{2}x\d{2}\//, "").split("?s=")[0]
    URI(img) unless img.include?("questionmark")
  end
  def person_image(img)
    img = img.gsub(/r\/\d{2}x\d{2}\//, "").split("?s=")[0]
    URI(img) unless img.include?("questionmark")
  end
  def poster_image(img)
    img = img.gsub(".jpg", "l.jpg")
    URI(img) unless img.include?("na_series")
  end
  def convert_age_rating(rating)
    {
      ""                                => [nil,    nil],
      "None"                            => [nil,    nil],
      "PG-13 - Teens 13 or older"       => ["PG13", "Teens 13 or older"],
      "R - 17+ (violence & profanity)"  => ["R17+", "Violence, Profanity"],
      "R+ - Mild Nudity"                => ["R17+", "Mild Nudity"],
      "PG - Children"                   => ["PG",   "Children"],
      "Rx - Hentai"                     => ["R18+", "Hentai"],
      "G - All Ages"                    => ["G",    "All Ages"],
      "PG-13"                           => ["PG13", "Teens 13 or older"],
      "R+"                              => ["R17+", "Mild Nudity"],
      "PG13"                            => ["PG13", "Teens 13 or older"],
      "G"                               => ["G",    "All Ages"],
      "PG"                              => ["PG",   "Children"]
    }[rating] || [rating, nil]
  end
  def convert_type(type)
    {
      "Doujinshi" => "Doujin",
      "One-shot"  => "One Shot"
    }[type] || type
  end
  def nameflip(name)
    name.split(',').map(&:strip).reverse.join(' ')
  end
  def allowed_types
    case @media
    when :anime
      ["TV", "Movie", "OVA", "Special", "ONA", "Music"]
    when :manga
      ["Manga", "Novel", "One Shot", "Doujin", "Manwha", "Manhua", "OEL"]
    end
  end
  def parse_duration(dur)
    unless dur.nil?
      hours = dur.scan(/(\d+) hr\./).flatten[0].to_i || 0
      mins = dur.scan(/(\d+) min\./).flatten[0].to_i || 0
      60 * hours + mins
    end
  end
  def parse_maldate(maldate)
    d = maldate.strip
    if d.match(/^\d{4}$/) # year
      Date.new d.to_i
    else # date
      d == "?" ? nil : DateTime.parse(d).to_date
    end
  end
  def br_to_p(src)
    src = '<p>' + src.gsub(/<br>\s*<br>/, '</p><p>') + '</p>'
    doc = Nokogiri::HTML.fragment src
    doc.traverse do |x|
      next x.remove if x.name == 'br' && x.previous.nil?
      next x.remove if x.name == 'br' && x.next.nil?
      next x.remove if x.name == 'br' && x.next.name == 'p' && x.previous.name == 'p'
      next x.remove if x.name == 'p' && x.content.blank?
    end
    doc.inner_html.gsub(/[\r\n\t]/, '')
  end
  def clean_desc(desc)
    desc = Nokogiri::HTML.fragment br_to_p(desc)
    desc.css('.spoiler').each do |x|
      x.name = 'span'
      x.inner_html = x.css('.spoiler_content').inner_html
      x.css('input').remove
    end
    desc.css('.spoiler').wrap('<p></p>')
    desc.xpath('descendant::comment()').remove
    desc.css('b').each { |b| b.replace(b.content) }
    desc.traverse do |node|
      next unless node.text?
      t = node.content.split(/: ?/).map { |x| x.split(' ') }
      if t.length >= 2
        if t[0].length <= 3 && t[1].length <= 20
          node.remove
        end
      else
        node.remove if /^\s+\*\s+.*/ =~ node.content
      end
    end
    desc.inner_html
  end
end