model/linear.rb from yegor256/0pdd

model/linear.rb
Summary

Maintainability

4 hrs
Test Coverage

Issues
# Copyright (c) 2016-2024 Yegor Bugayenko
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

require 'json'
require 'time'
require 'crack'
require_relative 'predictor'
require_relative 'storage'
require_relative 'fake_weights_storage'

#
# Linear Model
# @todo #532:60min Add unit-tests.
#  We should add unit-tests for this class that checks puzzle ranking.
#  For now its untested, don't forget to remove this puzzle.
#
class LinearModel
  def initialize(repo, storage)
    @repo = repo
    @xml_storage = storage
    if ENV['RACK_ENV'] == 'test'
      @storage = FakeWeightsStorage.new(@repo)
    else
      settings = Sinatra::Application.settings
      @storage = Storage.new(
        "#{@repo}.marshal",
        settings.config['s3']['bucket'],
        settings.config['s3']['region'],
        settings.config['s3']['key'],
        settings.config['s3']['secret']
      )
    end
  end

  # ranks the puzzles using Machine-Learning
  # @param puzzles XML puzzles
  # @return array of positional index of the input puzzles
  # @todo #532:60min Implement a ranked puzzles.
  #  Let's implement a class that will use `LinearModel` to rank puzzles.
  #  This class is need in order to do an integration between original 0pdd
  #  and model modules. Probably it can be a decorator for `Puzzles`
  #  that ranks XML puzzles, and then submits them into `Puzzles`.
  #  Don't forget to remove this puzzle.
  def predict(puzzles)
    weights = @storage.load # load weights for repo from s3
    clf = Predictor.new(
      layers: [
        { name: 'w1', shape: [10, 1] },
        { name: 'w2', shape: [1, 1] }
      ]
    )
    if weights.nil?
      train(clf) # find weights for repo backlog of puzzles
      ranks = naive_rank(puzzles) # naive rank of puzzles in each repo
    else
      # get x and y data for puzzles
      samples, _labels = extract_features(puzzles)
      ranks = clf.predict(weights, samples[0]) # model rank of puzzles if weights are loaded
    end
    ranks.map(&:to_i)
  end

  private

  def replace_nil(arr, with = 0)
    arr.map { |x| x.nil? ? with : x }
  end

  def get_features_labels(samples)
    x = samples.map do |_, s|
      replace_nil([
        s['time_estimate'],
        s['n_characters'],
        s['level'],
        s['n_puzzles_before'],
        s['n_puzzles_after'],
        s['time_before'],
        s['time_after'],
        s['n_additions'],
        s['n_deletions']
      ].append(s['vectorized_description']))
    end
    y = samples.map { |_, s| s['closed'] ? Time.parse(s['closed']).to_i : 0 }.map.with_index.sort.map(&:last)
    [[x], [y]] # single backlog of puzzles
  end

  # depth first feature extraction
  def extract_features(puzzles, samples = {}, level = 1)
    puzzles = [puzzles] unless puzzles.is_a?(Array)
    puzzles.each do |puzzle|
      next if puzzle.nil?
      prev_puzzle = samples[samples.keys.last]
      time_before = 0
      unless prev_puzzle.nil?
        opened = Time.parse(prev_puzzle['time']).to_i
        closed = prev_puzzle['closed'] ? Time.parse(prev_puzzle['closed']).to_i : opened
        time_before = (closed - opened) / 60 # in minutes

        unless prev_puzzle['time_after'].nil?
          time_after = (Time.parse(puzzle['closed']).to_i - Time.parse(puzzle['time']).to_i) / 60 # in minutes
          prev_puzzle['time_after'] = time_after
        end
      end
      n_characters = puzzle['body'].gsub(/\s/, '').length
      samples[puzzle['id']] = {
        'time_estimate' => puzzle['estimate'].to_i,
        'n_characters' => n_characters,
        'level' => level,
        'n_puzzles_before' => samples.length,
        'n_puzzles_after' => puzzles.length - samples.length,
        'time_before' => time_before
      }.merge(puzzle)

      extract_features(puzzle['children']['puzzle'], samples, level + 1) unless puzzle['children'].nil?
    end
    get_features_labels(samples) if level == 1
  end

  def train(clf)
    puzzles = @xml_storage.load
    Thread.new do
      # properly train model here and save weights to s3 for later
      puzzles = JSON.parse(Crack::XML.parse(puzzles.to_s).to_json)['puzzles']
      unless puzzles.nil?
        samples, labels = extract_features(puzzles['puzzle'])
        if labels[0].length > 1 # train only when there's data
          center = ZeroVector.zero(samples[0][0].size)
          solver = Pso::Solver.new(f: clf, center: center, data: samples, true_order: labels)
          _rank, weights, _n_iterations = solver.solve
          @storage.save(weights)
        end
      end
    end
  end

  def naive_rank(puzzles)
    estimates = puzzles.map { |puzzle| puzzle['estimate'].to_i || Infinity }
    estimates.map.with_index.sort.map(&:last)
  end
end