kymmt90/bookmeter_scraper

View on GitHub
lib/bookmeter_scraper/scraper.rb

Summary

Maintainability
D
2 days
Test Coverage
require 'mechanize'
require 'yasuri'

require 'bookmeter_scraper/book'
require 'bookmeter_scraper/profile'
require 'bookmeter_scraper/user'

module BookmeterScraper
  # Scrapes Bookmeter pages through an HTTP agent and turns the results into
  # Profile, Book(s) and User objects.
  #
  # The agent only needs to respond to the calls made on it here: +get+,
  # +logged_in?+ and +log_in_user_id+. Most methods accept the agent as an
  # optional trailing argument that defaults to the one given to the
  # constructor; that argument is passed down to every helper so an explicitly
  # supplied agent is the one actually used.
  class Scraper
    NUM_BOOKS_PER_PAGE = 40
    NUM_USERS_PER_PAGE = 20

    attr_accessor :agent

    # @param agent [Object, nil] default agent for methods called without one
    def initialize(agent = nil)
      @agent = agent
      @book_pages = {} # book path => fetched page, so each book page is fetched once
    end

    # Builds a Profile from the attributes scraped off the user's my-page.
    #
    # @raise [ArgumentError] if user_id does not match USER_ID_REGEX
    # @raise [ScraperError] if no agent is available
    def fetch_profile(user_id, agent = @agent)
      validate_user_id!(user_id)
      ensure_agent!(agent)

      Profile.new(*scrape_profile(user_id, agent))
    end

    # Scrapes the profile attribute values from the user's my-page.
    #
    # @return [Array] one value per entry of PROFILE_ATTRIBUTES (nil when the
    #   page has no matching label); the first entry is replaced by the text of
    #   the side panel's <h3>
    # @raise [ArgumentError] if user_id does not match USER_ID_REGEX
    # @raise [ScraperError] if agent is nil
    def scrape_profile(user_id, agent)
      validate_user_id!(user_id)
      ensure_agent!(agent)

      mypage = agent.get(BookmeterScraper.mypage_uri(user_id))

      # Each <dl> carries one attribute: first child is the Japanese label,
      # second child is the value.
      label_value_pairs = mypage.search('#side_left > div.inner > div.profile > dl').map do |dl|
        [dl.children[0].children.text, dl.children[1].children.text]
      end
      jp_attributes = label_value_pairs.to_h

      attributes = PROFILE_ATTRIBUTES.map { |attribute| jp_attributes[JP_ATTRIBUTE_NAMES[attribute]] }
      attributes[0] = mypage.at_css('#side_left > div.inner > h3').text

      attributes
    end

    # Fetches every book of the listing that +uri_method+ points at.
    #
    # @param uri_method [Symbol] name of a BookmeterScraper module method that
    #   builds the listing URI from a user ID
    # @return [Books, Array] an empty Array when the agent is not logged in
    # @raise [ArgumentError] on an invalid user_id or unknown uri_method
    # @raise [ScraperError] if no agent is available
    def fetch_books(user_id, uri_method, agent = @agent)
      validate_user_id!(user_id)
      validate_uri_method!(uri_method)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      books = Books.new
      scrape_books_pages(user_id, uri_method, agent).each do |page|
        books << extract_books(page, agent)
        books.flatten!
      end
      books
    end

    # Scrapes the raw name/link fields of every page of a book listing.
    #
    # @return [Array] one entry per listing page; empty when not logged in or
    #   when the listing has no books
    # @raise [ArgumentError] on an invalid user_id or unknown uri_method
    # @raise [ScraperError] if no agent is available
    def scrape_books_pages(user_id, uri_method, agent = @agent)
      validate_user_id!(user_id)
      validate_uri_method!(uri_method)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      books_page = agent.get(BookmeterScraper.method(uri_method).call(user_id))

      # if books are not found at all
      return [] if books_page.search('#main_left > div > center > a').empty?

      # Built up front: the Yasuri blocks below call bare DSL methods, so they
      # are not evaluated with this object as self and cannot call helpers.
      fields = book_fields

      if books_page.search('span.now_page').empty?
        # No pagination marker: the listing is a single page.
        single_page = Yasuri.struct_books '//*[@id="main_left"]/div' do
          fields.each { |field, xpath| send(field, xpath) }
        end
        return [single_page.inject(agent, books_page)]
      end

      all_pages = Yasuri.pages_root '//span[@class="now_page"]/following-sibling::span[1]/a' do
        text_page_index '//span[@class="now_page"]/a'
        fields.each { |field, xpath| send(field, xpath) }
      end
      all_pages.inject(agent, books_page)
    end

    # Converts one scraped listing page into Book objects, stopping at the
    # first empty link slot.
    #
    # @param page [#[]] scraped fields keyed "book_<n>_link"
    # @return [Array<Book>]
    # @raise [ArgumentError] if page is nil
    def extract_books(page, agent = @agent)
      raise ArgumentError, 'page must not be nil' if page.nil?

      books = []
      1.upto(NUM_BOOKS_PER_PAGE) do |i|
        book_path = page["book_#{i}_link"]
        break if empty_value?(book_path)

        read_date  = scrape_read_date(book_path, agent)
        read_dates = []
        read_dates << to_read_time(read_date) unless read_date.empty?
        read_dates.concat(to_reread_times(reread_dates_for(book_path, agent)))

        books << build_book(book_path, read_dates, agent)
      end

      books
    end

    # Collects the books read (or re-read) in the given year and month.
    #
    # The early exits compare the target against the first and last book of
    # each page, so they presumably rely on the listing being ordered newest
    # first -- confirm against the site.
    #
    # @param target_year_month [Time] compared with Time.local(year, month)
    # @raise [ArgumentError] on an invalid user_id or nil target
    def fetch_read_books_in(target_year_month, user_id, agent = @agent)
      validate_user_id!(user_id)
      raise ArgumentError, 'target_year_month must not be nil' if target_year_month.nil?

      result = Books.new
      scrape_books_pages(user_id, :read_books_uri, agent).each do |page|
        first_book_date = get_first_book_date(page, agent)
        last_book_date  = get_last_book_date(page, agent)

        # A page whose first or last book has no date cannot be placed in time.
        next if first_book_date.empty? || last_book_date.empty?

        first_book_year_month = Time.local(first_book_date['year'].to_i, first_book_date['month'].to_i)
        last_book_year_month  = Time.local(last_book_date['year'].to_i, last_book_date['month'].to_i)

        # to next page if target is older than this page's last book
        next if target_year_month < last_book_year_month

        # exit if target is newer than this page's first book
        break if target_year_month > first_book_year_month

        result.append(fetch_target_books_from(page, target_year_month, agent))

        # exit if target is newer than this page's last book
        break if target_year_month > last_book_year_month
      end
      result
    end

    # @return [Hash, Array] read date of the page's first book; empty when the
    #   page has no first book
    # @raise [ArgumentError] if page is nil
    def get_first_book_date(page, agent = @agent)
      raise ArgumentError, 'page must not be nil' if page.nil?

      link = page['book_1_link']
      return {} if empty_value?(link)

      scrape_read_date(link, agent)
    end

    # @return [Hash, Array] read date of the last non-empty book slot; empty
    #   when the page has no books at all
    # @raise [ArgumentError] if page is nil
    def get_last_book_date(page, agent = @agent)
      raise ArgumentError, 'page must not be nil' if page.nil?

      NUM_BOOKS_PER_PAGE.downto(1) do |i|
        link = page["book_#{i}_link"]
        return scrape_read_date(link, agent) unless empty_value?(link)
      end
      {} # downto would otherwise return NUM_BOOKS_PER_PAGE
    end

    # Picks the books of one listing page that were read or re-read in the
    # target year and month.
    #
    # @return [Books]
    # @raise [ArgumentError] if page or target_year_month is nil
    def fetch_target_books_from(page, target_year_month, agent = @agent)
      raise ArgumentError if target_year_month.nil? || page.nil?

      target_books = Books.new
      1.upto(NUM_BOOKS_PER_PAGE) do |i|
        book_path = page["book_#{i}_link"]
        next if empty_value?(book_path)

        read_date    = scrape_read_date(book_path, agent)
        reread_dates = reread_dates_for(book_path, agent)

        # Year/month of every reading; the first read date may be missing.
        read_year_months = reread_dates.map { |date| Time.local(date['reread_year'], date['reread_month']) }
        read_year_months.unshift(Time.local(read_date['year'], read_date['month'])) unless read_date.empty?
        next unless read_year_months.include?(target_year_month)

        read_dates = []
        read_dates << to_read_time(read_date) unless read_date.empty?
        read_dates.concat(to_reread_times(reread_dates))

        target_books << build_book(book_path, read_dates, agent)
      end

      target_books
    end

    # Returns the page for a book path, fetching it on first use only.
    #
    # @raise [ScraperError] if the page is not cached and no agent is available
    def get_book_page(book_uri, agent = @agent)
      @book_pages[book_uri] ||= begin
        ensure_agent!(agent)
        agent.get(ROOT_URI + book_uri)
      end
    end

    def scrape_book_name(book_uri, agent = @agent)
      get_book_page(book_uri, agent).search('#title').text
    end

    def scrape_book_author(book_uri, agent = @agent)
      get_book_page(book_uri, agent).search('#author_name').text
    end

    def scrape_book_image_uri(book_uri, agent = @agent)
      get_book_page(book_uri, agent).search('//*[@id="book_image"]/@src').text
    end

    # Scrapes the year/month/day fields of the book's read-date form.
    def scrape_read_date(book_uri, agent = @agent)
      book_date = Yasuri.struct_date '//*[@id="book_edit_area"]/form[1]/div[2]' do
        text_year  '//*[@id="read_date_y"]/option[1]', truncate: /\d+/, proc: :to_i
        text_month '//*[@id="read_date_m"]/option[1]', truncate: /\d+/, proc: :to_i
        text_day   '//*[@id="read_date_d"]/option[1]', truncate: /\d+/, proc: :to_i
      end
      book_date.inject(agent, get_book_page(book_uri, agent))
    end

    # Scrapes the year/month/day fields of the book's re-read forms.
    def scrape_reread_date(book_uri, agent = @agent)
      book_reread_date = Yasuri.struct_reread_date '//*[@id="book_edit_area"]/div/form[1]/div[2]' do
        text_reread_year  '//div[@class="reread_box"]/form[1]/div[2]/select[1]/option[1]', truncate: /\d+/, proc: :to_i
        text_reread_month '//div[@class="reread_box"]/form[1]/div[2]/select[2]/option[1]', truncate: /\d+/, proc: :to_i
        text_reread_day   '//div[@class="reread_box"]/form[1]/div[2]/select[3]/option[1]', truncate: /\d+/, proc: :to_i
      end
      book_reread_date.inject(agent, get_book_page(book_uri, agent))
    end

    # Fetches the users that +user_id+ follows.
    #
    # @return [Array<User>] empty when the agent is not logged in
    # @raise [ArgumentError] if user_id does not match USER_ID_REGEX
    # @raise [ScraperError] if no agent is available
    def fetch_followings(user_id, agent = @agent)
      validate_user_id!(user_id)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      # The logged-in user's own list is scraped with different XPaths than
      # another user's list.
      scraped_pages = if user_id == agent.log_in_user_id
                        scrape_followings_page(user_id, agent)
                      else
                        scrape_others_followings_page(user_id, agent)
                      end
      scraped_pages.flat_map { |page| extract_users(page) }
    end

    # Fetches the users following +user_id+.
    #
    # @return [Array<User>] empty when the agent is not logged in
    # @raise [ArgumentError] if user_id does not match USER_ID_REGEX
    # @raise [ScraperError] if no agent is available
    def fetch_followers(user_id, agent = @agent)
      validate_user_id!(user_id)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      scrape_followers_page(user_id, agent).flat_map { |page| extract_users(page) }
    end

    # Scrapes the logged-in user's own followings page.
    #
    # @return [Array] a single scraped page, or empty when not logged in
    def scrape_followings_page(user_id, agent = @agent)
      validate_user_id!(user_id)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      followings_page = agent.get(BookmeterScraper.followings_uri(user_id))
      fields = user_fields('a')
      followings_root = Yasuri.struct_books '//*[@id="main_left"]/div' do
        fields.each { |field, xpath| send(field, xpath) }
      end
      [followings_root.inject(agent, followings_page)]
    end

    def scrape_others_followings_page(user_id, agent = @agent)
      validate_user_id!(user_id)
      scrape_users_listing_page(user_id, :followings_uri, agent)
    end

    def scrape_followers_page(user_id, agent = @agent)
      validate_user_id!(user_id)
      scrape_users_listing_page(user_id, :followers_uri, agent)
    end

    # Scrapes a user listing page reached through +uri_method+.
    #
    # @return [Array] a single scraped page, or empty when not logged in
    # @raise [ArgumentError] on an invalid user_id or unknown uri_method
    # @raise [ScraperError] if no agent is available
    def scrape_users_listing_page(user_id, uri_method, agent = @agent)
      validate_user_id!(user_id)
      validate_uri_method!(uri_method)
      ensure_agent!(agent)
      return [] unless agent.logged_in?

      page = agent.get(BookmeterScraper.method(uri_method).call(user_id))
      fields = user_fields('div/div[2]/a')
      root = Yasuri.struct_users '//*[@id="main_left"]/div' do
        fields.each { |field, xpath| send(field, xpath) }
      end
      [root.inject(agent, page)]
    end

    # Converts one scraped user listing page into User objects, stopping at
    # the first empty name slot.
    #
    # @param page [#[]] scraped fields keyed "user_<n>_name" / "user_<n>_link"
    # @return [Array<User>]
    # @raise [ArgumentError] if page is nil
    # @raise [ScraperError] if a named user's link does not end in /u/<digits>
    def extract_users(page)
      raise ArgumentError, 'page must not be nil' if page.nil?

      users = []
      1.upto(NUM_USERS_PER_PAGE) do |i|
        user_name = page["user_#{i}_name"]
        break if empty_value?(user_name)

        link    = page["user_#{i}_link"].to_s
        matched = link.match(%r{/u/(\d+)$})
        raise ScraperError, "unexpected user link: #{link.inspect}" unless matched

        user_id = matched[1]
        users << User.new(user_name, user_id, ROOT_URI + "/u/#{user_id}")
      end

      users
    end

    private

    # Raises ArgumentError unless user_id matches USER_ID_REGEX. The
    # respond_to? guard covers objects without =~ (e.g. Integer on Ruby 3.2+).
    def validate_user_id!(user_id)
      return if user_id.respond_to?(:=~) && user_id =~ USER_ID_REGEX

      raise ArgumentError, "invalid user ID: #{user_id.inspect}"
    end

    # Raises ArgumentError unless uri_method names a BookmeterScraper module method.
    def validate_uri_method!(uri_method)
      return if BookmeterScraper.methods.include?(uri_method)

      raise ArgumentError, "unknown URI method: #{uri_method.inspect}"
    end

    # Raises ScraperError when there is no agent to scrape with.
    def ensure_agent!(agent)
      raise ScraperError, 'agent is not set' if agent.nil?
    end

    # True for a scraped field that is missing or empty.
    def empty_value?(value)
      value.nil? || value.empty?
    end

    # [node name, XPath] pairs for the name and link of every book slot.
    def book_fields
      (1..NUM_BOOKS_PER_PAGE).flat_map do |i|
        anchor = "//*[@id=\"main_left\"]/div/div[#{i + 1}]/div[2]/a"
        [["text_book_#{i}_name", anchor], ["text_book_#{i}_link", "#{anchor}/@href"]]
      end
    end

    # [node name, XPath] pairs for the name and link of every user slot;
    # anchor_path locates the user's <a> inside the slot's <div>.
    def user_fields(anchor_path)
      (1..NUM_USERS_PER_PAGE).flat_map do |i|
        anchor = "//*[@id=\"main_left\"]/div/div[#{i}]/#{anchor_path}"
        [["text_user_#{i}_name", "#{anchor}/@title"], ["text_user_#{i}_link", "#{anchor}/@href"]]
      end
    end

    # Re-read dates as a flat list, whether the scrape gave one hash or many.
    def reread_dates_for(book_path, agent)
      [scrape_reread_date(book_path, agent)].flatten
    end

    # Time for a scraped first-read date.
    def to_read_time(date)
      Time.local(date['year'], date['month'], date['day'])
    end

    # Times for scraped re-read dates.
    def to_reread_times(dates)
      dates.map { |date| Time.local(date['reread_year'], date['reread_month'], date['reread_day']) }
    end

    # Builds a Book from the details scraped off the book's own page.
    def build_book(book_path, read_dates, agent)
      book_name      = scrape_book_name(book_path, agent)
      book_author    = scrape_book_author(book_path, agent)
      book_image_uri = scrape_book_image_uri(book_path, agent)
      Book.new(book_name, book_author, read_dates, ROOT_URI + book_path, book_image_uri)
    end
  end

  # Error raised by Scraper, e.g. when a scraping method is called without an
  # agent.
  class ScraperError < StandardError
  end
end