jkawamoto/fukuoka-city-community-space-finder

View on GitHub
src/scraper.coffee

Summary

Maintainability
Test Coverage
#
# scraper.coffee
#
# Copyright (c) 2016 Junpei Kawamoto
#
# This software is released under the MIT License.
#
# http://opensource.org/licenses/mit-license.php
#
phantom = require "phantom"
cheerio = require "cheerio"

# Root URL. This is the first page opened by `run`.
ROOT_URL = "https://www.comnet-fukuoka.jp/web/"

# URLs of the pages visited while navigating the site. Each one is used as the
# `url` of a task: `run` waits until the page URL equals it before calling the
# task's action.
ATTESTATION_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransUserAttestationAction.do"
VACANT_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransInstSrchVacantAction.do"
AREA_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransInstSrchAreaAction.do"
BUILDING_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransInstSrchBuildAction.do"
INSTITUTION_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransInstSrchInstAction.do"
DAY_WEEK_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWTransInstSrchDayWeekAction.do"
RESULT_URL =
  "https://www.comnet-fukuoka.jp/web/rsvWInstSrchVacantAction.do"

# Ignored item name which means "all". `list_up` removes it from its results.
IGNORED_KEYWORD = "すべて"

# Image `src` values that `check_status` translates into STATUS values.
SRC_CLOSED = "image/lw_closes.gif"
SRC_AVAILABLE = "image/lw_emptybs.gif"
SRC_OCCUPIED = "image/lw_finishs.gif"
SRC_MAINTENANCE = "image/lw_keeps.gif"
SRC_OUT_OF_DATE = "image/lw_kikangais.gif"

# Status values returned to callers; also exported as `STATUS`.
STATUS =
  CLOSED: "closed"
  AVAILABLE: "available"
  OCCUPIED: "occupied"
  MAINTENANCE: "maintenance"
  OUT_OF_DATE: "out of date"

# Delay in milliseconds passed to `setTimeout`: the interval between two checks
# of the page URL, and the pause after the calendar has been moved.
WAITING_TIME = 1500

# Execute tasks produced by a generator inside a fresh PhantomJS page.
#
# A PhantomJS instance and a page are created, ROOT_URL is opened, and the
# tasks are processed in order: for each task the page URL is polled until it
# equals `task.url`, then `task.action` is called. The page and the instance
# are released on success and on failure; when the page cannot be created the
# instance is still shut down so that no PhantomJS process is left behind.
#
# @param generator [Function] takes a page object and returns an array of
#   tasks, each of which has a `url` and an `action`.
# @return [Promise] resolved with the value produced by the last task's
#   action (undefined when there are no tasks), or rejected with the first
#   error that occurred.
run = (generator) ->

  phantom.create().then (instance) ->

    instance.createPage()
      .catch (reason) ->
        # No page exists yet, but the PhantomJS process does; stop it before
        # passing the failure on.
        instance.exit()
        throw reason

      .then (page) ->

        # Poll the page URL until it becomes a given one.
        #
        # @param url [String] url string.
        # @return [Promise] resolved with the matched URL, or rejected when
        #   the URL cannot be read.
        wait_moved_to = (url) ->
          new Promise (resolve, reject) ->
            do checker = ->
              page.property("url").then (res) ->
                # Only the part before the first ";" is compared.
                res = res.split(";")[0]
                if res is url
                  resolve res
                else
                  setTimeout checker, WAITING_TIME

              .catch (reason) ->
                reject reason

        # Append one task to the chain: wait for its URL, then run its action.
        # The action receives the matched URL, and its result becomes the
        # value of the chain.
        step = (chain, task) ->
          chain.then ->
            wait_moved_to(task.url).then task.action

        # Release the page and stop the PhantomJS process.
        cleanup = ->
          page.close()
          instance.exit()

        page.open(ROOT_URL)
          .then ->
            generator(page).reduce step, Promise.resolve()

          .then (res) ->
            cleanup()
            res

          .catch (reason) ->
            console.error reason
            cleanup()
            throw reason



# Build the tasks shared by every query.
#
# They move past the pages shown by default: on each of the two pages the
# site's own `doAction` function is called with the form and the global that
# belong to the next page.
#
# @param page [Page] Page object.
# @return [Array] array of tasks which contain url and action.
generate_common_tasks = (page) ->

  # The functions given to page.evaluate are handed to the page, so they only
  # use names available there (window, document and the site's globals).
  leave_attestation =
    url: ATTESTATION_URL
    action: ->
      page.evaluate ->
        if window._dom is 3
          form = document.layers['disp'].document.formWTransInstSrchVacantAction
        else
          form = document.formWTransInstSrchVacantAction
        window.doAction form, gRsvWTransInstSrchVacantAction

  leave_vacant =
    url: VACANT_URL
    action: ->
      page.evaluate ->
        if window._dom is 3
          form = document.layers['disp'].document.formWTransInstSrchAreaAction
        else
          form = document.formWTransInstSrchAreaAction
        window.doAction form, gRsvWTransInstSrchAreaAction

  [leave_attestation, leave_vacant]


# Search the current page for a link matching a keyword and follow it.
#
# A link matches when the `alt` text of the image inside it includes the
# keyword, or the keyword includes that text. The first matching link is used:
# the code after the "javaScript:" prefix of its href is evaluated in the page.
#
# @param page [Page] Page object.
# @param target [String] Keyword of the target.
# @return [Promise] Promise object; rejected with an Error when no link
#   matches the keyword.
search_and_click = (page, target) ->
  page.evaluate ->
    document.body.innerHTML
  .then (html) ->
    $ = cheerio.load html
    href = $("a").filter ->
      name = $("img", @).attr "alt"
      # Links without a labelled image cannot match. An empty label is skipped
      # too, because every string includes "" and it would match any keyword.
      return false unless name
      name.includes(target) or target.includes(name)
    .attr "href"

    # Fail with a readable message instead of a TypeError on `href.substring`.
    unless href?
      throw new Error "No link matching \"#{target}\" was found"

    script = "function() {" + href.substring("javaScript:".length) + ";}"
    page.evaluateJavaScript script


# Collect the names of the items shown on the current page.
#
# For every link, the `alt` text of the image whose src is
# "image/<keyword>.gif" is taken; the entry meaning "all" is left out.
#
# @param page [Page] Page object.
# @param keyword [String] keyword which is a file name of gif file w/o
#   extension.
# @return [Promise] Promise object which passes an array of names.
list_up = (page, keyword) ->
  selector = "img[src=\"image/#{keyword}.gif\"]"

  page.evaluate ->
    document.body.innerHTML
  .then (html) ->
    $ = cheerio.load html
    names = $("a").map ->
      $(selector, @).attr "alt"
    names.toArray().filter (name) -> name isnt IGNORED_KEYWORD


# Strip whitespace from both ends of a string.
#
# @param str [String] a string.
# @return [String] the string without leading and trailing whitespace.
trim = (str) ->
  without_head = str.replace /^\s+/, ""
  without_head.replace /\s+$/, ""


# Translate the src of a status image into a STATUS value.
#
# @param value [String] a src url.
# @return [String] status message, or undefined when the src is not one of
#   the known images.
check_status = (value) ->
  return STATUS.CLOSED if value is SRC_CLOSED
  return STATUS.OCCUPIED if value is SRC_OCCUPIED
  return STATUS.AVAILABLE if value is SRC_AVAILABLE
  return STATUS.MAINTENANCE if value is SRC_MAINTENANCE
  return STATUS.OUT_OF_DATE if value is SRC_OUT_OF_DATE
  undefined


module.exports =

  # List the areas in Fukuoka city.
  #
  # @return [Promise] which returns a list of areas in Fukuoka city.
  area: ->

    run (page) ->

      # Read the area names once the area page is shown.
      list_areas =
        url: AREA_URL
        action: ->
          list_up page, "bw_tiikiimg"

      generate_common_tasks(page).concat [list_areas]

  # List the buildings in a given area.
  #
  # @param area [String] name of the area.
  # @return [Promise] which returns a list of buildings in the area.
  building: (area) ->

    run (page) ->

      # Follow the link of the area on the area page.
      select_area =
        url: AREA_URL
        action: ->
          search_and_click page, area

      # Read the building names once the building page is shown.
      list_buildings =
        url: BUILDING_URL
        action: ->
          list_up page, "bw_buildingimg"

      generate_common_tasks(page).concat [select_area, list_buildings]

  # List the institutions in a given area and building.
  #
  # @param area [String] name of the area.
  # @param building [String] name of the building.
  # @return [Promise] which returns a list of institutions.
  institution: (area, building) ->

    run (page) ->

      # Follow the link of the area on the area page.
      select_area =
        url: AREA_URL
        action: ->
          search_and_click page, area

      # Follow the link of the building on the building page.
      select_building =
        url: BUILDING_URL
        action: ->
          search_and_click page, building

      # Read the institution names once the institution page is shown.
      list_institutions =
        url: INSTITUTION_URL
        action: ->
          list_up page, "bw_institutionimg"

      generate_common_tasks(page).concat [
        select_area
        select_building
        list_institutions
      ]

  # Search reservation statuses of a given institution on a given date.
  #
  # The area, building and institution links are followed in turn, the
  # calendar is moved to the requested month when it is not the current one,
  # the day is selected, and the table on the result page is parsed.
  #
  # The shape of the result depends on which table is found on the result
  # page:
  # - first layout: `res[column header][row label] = status`
  # - second layout: `res[row label][column header][time text] = status`
  # where status is one of the STATUS values (or undefined for an unknown
  # image).
  #
  # @param area [String] area name obtained by area method.
  # @param building [String] building name obtained by building method.
  # @param institution [String] institution name obtained by institution method.
  # @param year [Integer] Year.
  # @param month [Integer] Month.
  # @param day [Integer] Day.
  # @return [Promise] which returns the result via then method.
  status: (area, building, institution, year, month, day) ->

    run (page) ->

      # Navigate down to the calendar page of the institution.
      tasks = generate_common_tasks page
      .concat [
        url: AREA_URL
        action: ->
          search_and_click page, area
      ,
        url: BUILDING_URL
        action: ->
          search_and_click page, building
      ,
        url: INSTITUTION_URL
        action: ->
          search_and_click page, institution
      ]

      # Add a task which moves the calendar, unless the requested year and
      # month are the current ones. The comparison is strict, so year and
      # month are expected to be numbers.
      today = new Date()
      if year isnt today.getFullYear() or month isnt today.getMonth()+1
        tasks.push
          url: DAY_WEEK_URL
          action: ->
            page.evaluate ->
              document.body.innerHTML
            .then ->
              new Promise (resolve, reject) ->
                # Call the site's moveCalender function in the page with the
                # requested year and month.
                page.evaluateJavaScript """
                  function(){
                    moveCalender(
                    (_dom == 3) ?
                    document.layers['disp'].document.formCommonSrchDayWeekAction
                    : document.formCommonSrchDayWeekAction,
                    gRsvWTransInstSrchDayWeekAction, #{year}, #{month});}"""
                .then ->
                  # Continue only after WAITING_TIME ms.
                  # NOTE(review): presumably gives the calendar time to
                  # reload, since the URL does not change here -- confirm.
                  setTimeout resolve, WAITING_TIME
                .catch (reason) ->
                  reject reason

      tasks.concat [
        url: DAY_WEEK_URL
        action: ->
          # Follow the link of the day, then call the site's sendSelectDay
          # function in the page.
          search_and_click page, day.toString()
          .then ->
            page.evaluate ->
              action = if window._dom is 3
                document.layers['disp'].document.formCommonSrchDayWeekAction
              else
                document.formCommonSrchDayWeekAction
              window.sendSelectDay action, gRsvWInstSrchVacantAction, 1
      ,
        url: RESULT_URL
        action: ->
          page.evaluate ->
            document.body.innerHTML
          .then (html) ->
            $ = cheerio.load html
            # First layout: the table is looked up in the third row.
            table = $("""#disp > center > table:nth-child(5) >
              tbody:nth-child(3) > tr:nth-child(3) > td:nth-child(2) >
              center > table""")

            res = {}
            if table.length isnt 0
              # The first row holds the column headers; its first cell sits
              # above the label column and is dropped.
              header = $("tr", table).first()
              dates = header.children().map ->
                trim $(@).text()
              .toArray().slice 1

              # Every following row: first cell is the label, the remaining
              # cells hold one status image each.
              header.nextAll().each ->
                left_item = $(@).children().first()

                label = trim left_item.text()
                left_item.nextAll().each (i) ->
                  unless dates[i] of res
                    res[dates[i]] = {}

                  res[dates[i]][label] = check_status(
                    $(@).children().attr("src"))

            else
              # Second layout: the same lookup without the row position.
              table = $("""#disp > center > table:nth-child(5) >
                tbody:nth-child(3) > tr > td:nth-child(2) > center > table""")

              header = $("tr", table).first()
              dates = header.children().map ->
                trim $(@).text()
              .toArray().slice 1

              header.nextAll().each ->
                left_item = $(@).children().first()

                label = trim left_item.text()
                res[label] = {}

                left_item.nextAll().each (i) ->
                  unless dates[i] of res[label]
                    res[label][dates[i]] = {}

                  # A cell mixes images and texts: each non-empty text is
                  # stored with the status of the last image seen before it
                  # (null when no image came first).
                  status = null
                  $(@).contents().each ->
                    switch @.tagName
                      when "img"
                        status = check_status $(@).attr("src")

                      # NOTE(review): nodes whose tagName is null are
                      # presumably text nodes -- confirm for the cheerio
                      # version in use.
                      when null
                        time = trim $(@).text()
                        if time.length isnt 0
                          res[label][dates[i]][time] = status

            return res
      ]

  # Constants of status.
  STATUS: STATUS