supercaracal/japan-masters-swimming

View on GitHub
lib/tasks/import/tdsystem_scraper.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'nkf'

module Tasks
  module Import
    class TdsystemScraper
      REGEXP_EVENT_NAME = /No.[0-9]+(女子|男子)([0-9]+m)(自由形|平泳ぎ|背泳ぎ|バタフライ|個人メドレー)/
      REGEXP_RESULT = /((?!歳)(?!・)(?!世|日|大)(?!界|本|会)(?!新)(?!-+)(?!こ)(?!こ)(?!ま)(?!で)(?!入)(?!賞)[^()0-9]+)\(([^()]+)\)([0-9]*:?[0-9]{1,2}\.[0-9]{1,2})/
      REGEXP_TIME = /([0-9]+:)?([0-9]+)\.([0-9]+)/

      def scrape(html)
        text = convert_to_simple_text(html)
        event_name = extract_event_name(text)
        results = extract_results(text)
        raise ScrapingError, 'a event name.' if event_name.blank?
        raise ScrapingError, 'results.' if results.blank?
        OpenStruct.new(event_name: event_name, results: results)
      end

      private

      def convert_to_simple_text(html, to_enc: Encoding::UTF_8)
        from_enc = NKF.guess(html)
        encoded_html = html.encode(to_enc, from_enc, invalid: :replace, undef: :replace, replace: '')
        striped_html = encoded_html.gsub(/[ \s ]+| /, '')
        normalized_html = NKF.nkf('-m0Z1 -W -w', striped_html)
        normalized_html.gsub(/<[^<>]+>/, '')
      end

      def extract_event_name(text)
        text.scan(REGEXP_EVENT_NAME).map(&:join).first
      end

      def extract_results(text)
        text.scan(REGEXP_RESULT).map do |match|
          match = match.map(&:strip)
          OpenStruct.new(swimmer: match[0], team: match[1], time: convert_to_float(match[2]))
        end
      end

      def convert_to_float(text)
        match_data = text.match(REGEXP_TIME)
        raise ScrapingError, "Unknown result time #{text}" if match_data.blank?
        second = match_data[1]&.tr(':', '').to_i * 60 + match_data[2].to_i
        millisecond = match_data[3]
        "#{second}.#{millisecond}".to_f
      end
    end
  end
end