SciRuby/statsample

View on GitHub
lib/statsample/formula/formula.rb

Summary

Maintainability
A
40 mins
Test Coverage
module Statsample
  # This class recognizes which terms of a formula are numeric and
  # accordingly forms groups of tokens which are fed to Formula.
  # Once the groups are parsed with Formula, they are combined back
  # together with the numeric terms that were set aside.
  class FormulaWrapper
    attr_reader :tokens, :y, :canonical_tokens

    # Initializes formula wrapper object to parse a given formula into
    # some tokens which do not overlap one another.
    # @note Specify 0 as a term in the formula if you do not want constant
    #   to be included in the parsed formula
    # @param [String] formula formula to parse, e.g. 'y~a+d:c'
    # @param [Daru::DataFrame] df dataframe required to know which vectors
    #   are numerical
    # @raise [ArgumentError] if the formula has no right hand side
    # @example
    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
    #   df.to_category 'c', 'd', 'e'
    #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
    #   formula.canonical_to_s
    #   #=> "1+c(-)+d(-):c+a"
    def initialize(formula, df)
      @df = df
      # @y stores the LHS term, that is the name of the vector to be predicted
      # @tokens stores the RHS terms of the formula
      @y, *@tokens = split_to_tokens(formula)
      @tokens = @tokens.uniq.sort
      manage_constant_term
      @canonical_tokens = non_redundant_tokens
    end

    # Returns canonical tokens in a readable form.
    # @return [String] canonical tokens joined with '+'
    # @note 'y~a+b(-)' means 'a' exist in full rank expansion
    #   and 'b(-)' exist in reduced rank expansion
    # @example
    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
    #   df.to_category 'c', 'd', 'e'
    #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
    #   formula.canonical_to_s
    #   #=> "1+c(-)+d(-):c+a"
    def canonical_to_s
      canonical_tokens.join '+'
    end

    # Returns tokens to produce non-redundant design matrix
    # @return [Array] array of tokens that do not produce redundant matrix
    def non_redundant_tokens
      # TODO: An enhancement
      # Right now x:c appears as c:x
      split_to_groups.flat_map do |numeric, group|
        # Each group shares the same numeric terms: set them aside, let
        # Formula reduce what remains, then put the numeric terms back.
        stripped = strip_numeric group, numeric
        add_numeric Formula.new(stripped).canonical_tokens, numeric
      end
    end

    private

    # Removes intercept token if term '0' is found in the formula.
    # Intercept token remains if term '1' is found.
    # If neither term '0' nor term '1' is found then, intercept token is added.
    def manage_constant_term
      intercept = Token.new('1')
      no_intercept = Token.new('0')
      constant_given =
        @tokens.include?(intercept) || @tokens.include?(no_intercept)
      @tokens.unshift intercept unless constant_given
      @tokens.delete no_intercept
    end

    # Groups the tokens based on the numerical terms
    # they are interacting with.
    # @return [Hash] numeric terms (Array) => tokens interacting with them
    def split_to_groups
      @tokens.group_by { |t| extract_numeric t }
    end

    # Add numeric interaction terms which were removed earlier
    # @param [Array] tokens tokens on which to add numerical terms
    # @param [Array] numeric array of numeric terms to add
    # @return [Array] array of tokens with the numerical terms appended
    def add_numeric(tokens, numeric)
      tokens.map do |t|
        terms = t.interact_terms + numeric
        if terms == ['1']
          Token.new('1')
        else
          # The constant is only a placeholder once real terms are present
          terms = terms.reject { |i| i == '1' }
          Token.new terms.join(':'), t.full
        end
      end
    end

    # Strip numerical interacting terms
    # @param [Array] tokens tokens from which to strip numeric
    # @param [Array] numeric array of numeric terms to strip from tokens
    # @return [Array] array of tokens with stripped numerical terms
    def strip_numeric(tokens, numeric)
      tokens.map do |t|
        terms = t.interact_terms - numeric
        # A token made only of numeric terms is left as the constant '1'
        terms = ['1'] if terms.empty?
        Token.new terms.join(':')
      end
    end

    # Extract numeric interacting terms
    # @param [Statsample::Token] token from which to extract numeric terms
    # @return [Array] array of numerical terms
    def extract_numeric(token)
      terms = token.interact_terms
      return [] if terms == ['1']
      terms.reject { |t| @df[t].category? }
    end

    # Splits a formula string into one token for the LHS followed by one
    # token per '+' separated RHS term. Whitespace is ignored.
    # @param [String] formula formula of the form 'lhs~term+term'
    # @return [Array] LHS token followed by the RHS tokens
    # @raise [ArgumentError] if there is nothing after '~' (or no '~' at all)
    def split_to_tokens(formula)
      lhs_term, rhs = formula.gsub(/\s+/, '').split '~'
      if rhs.nil?
        raise ArgumentError,
              "formula must be of the form 'y~terms', got #{formula.inspect}"
      end
      ([lhs_term] + rhs.split('+')).map { |t| Token.new t }
    end
  end

  # Processes the formula language: walks the given tokens in order and
  # keeps, for each one, only the part of its expansion that the tokens
  # kept so far do not already expand to.
  class Formula
    attr_reader :tokens, :canonical_tokens

    # @param [Array] tokens tokens to reduce, in the order they are given
    def initialize(tokens)
      @tokens = tokens
      @canonical_tokens = parse_formula
    end

    # @return [String] canonical tokens joined with '+'
    def canonical_to_s
      canonical_tokens.join('+')
    end

    private

    # Builds the canonical token list one input token at a time.
    # @return [Array] canonical tokens
    def parse_formula
      @tokens.each_with_object([]) do |token, kept|
        kept.concat add_non_redundant_elements(token, kept)
      end
    end

    # @param token token being added
    # @param [Array] result_so_far canonical tokens collected so far
    # @return [Array] tokens contributed by +token+ that are not already
    #   present in the expansion of +result_so_far+
    def add_non_redundant_elements(token, result_so_far)
      # The constant term is passed through untouched
      return [token] if token.value == '1'
      fresh = token.expand - result_so_far.flat_map(&:expand)
      contract_if_possible fresh
    end

    # Repeatedly replaces the first pair of tokens that can be added
    # together by their sum, until no pair can be added, then sorts.
    # @param [Array] tokens tokens to contract (modified in place)
    # @return [Array] sorted contracted tokens
    def contract_if_possible(tokens)
      loop do
        merged = nil
        pair = tokens.combination(2).find { |a, b| merged = a.add(b) }
        break unless pair
        pair.each { |member| tokens.delete member }
        tokens << merged
      end
      tokens.sort
    end
  end

  # To encapsulate interaction as well as non-interaction terms.
  # A token holds the terms joined by ':' in a formula term together with
  # one flag per term (+full+); a term whose flag is falsy is printed with
  # a '(-)' suffix.
  class Token
    attr_reader :full, :interact_terms

    # @param [String] value term such as 'a', '1' or 'a:b'
    # @param [Boolean, Array] full a single flag applied to every
    #   interacting term, or an array of flags, one per term; a shorter
    #   array is padded with true
    def initialize(value, full = true)
      @interact_terms = value.include?(':') ? value.split(':') : [value]
      @full = coerce_full full
    end

    # @return [String] interacting terms joined with ':'
    def value
      interact_terms.join(':')
    end

    # @return [Integer] number of interacting terms, 0 for the constant '1'
    def size
      # TODO: Return size 1 for value '1' also
      # Can't do this at the moment because have to make
      # changes in sorting first
      value == '1' ? 0 : interact_terms.size
    end

    # Tries to combine this token with another one into a single token.
    # @param [Token] other token to combine with
    # @return [Token, nil] the combined token, or nil when the two tokens
    #   cannot be combined
    def add(other)
      # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
      # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
      if size > other.size
        # Normalize so that the receiver is never the larger token
        other.add self

      elsif other.size == 2 &&
            size == 1 &&
            other.interact_terms.last == value &&
            other.full.last == full.first &&
            other.full.first == false
        Token.new(
          "#{other.interact_terms.first}:#{value}",
          [true, other.full.last]
        )

      elsif other.size == 2 &&
            size == 1 &&
            other.interact_terms.first == value &&
            other.full.first == full.first &&
            other.full.last == false
        Token.new(
          "#{value}:#{other.interact_terms.last}",
          [other.full.first, true]
        )

      elsif value == '1' &&
            other.size == 1
        Token.new(other.value, true)
      end
    end

    # Two tokens are equal when both their value and their flags match.
    # Anything that is not a Token is never equal to a token.
    # @return [Boolean]
    def ==(other)
      other.is_a?(Token) &&
        value == other.value &&
        full == other.full
    end

    alias eql? ==

    # Consistent with #eql? so tokens work with uniq, Array#- and Hash keys.
    def hash
      value.hash ^ full.hash
    end

    # Orders tokens by #size only.
    # @return [Integer, nil] nil when other is not a Token
    def <=>(other)
      return nil unless other.is_a?(Token)
      size <=> other.size
    end

    # @return [String] terms joined with ':', each term whose flag is falsy
    #   carrying a '(-)' suffix
    def to_s
      interact_terms
        .zip(full)
        .map { |term, is_full| is_full ? term : "#{term}(-)" }
        .join ':'
    end

    # Expands the token into the constant and '(-)' flagged tokens it
    # is made of.
    # @return [Array, nil] the expansion; nil when the token has more than
    #   two interacting terms, which is not handled here
    def expand
      case size
      when 0
        [self]
      when 1
        [Token.new('1'), Token.new(value, false)]
      when 2
        a, b = interact_terms
        [Token.new('1'), Token.new(a, false), Token.new(b, false),
         Token.new("#{a}:#{b}", [false, false])]
      end
    end

    # Builds the dataframe columns for this token from the vectors of +df+.
    # @param df dataframe indexed by term name; its vectors are expected to
    #   respond to category? (presumably a Daru::DataFrame — confirm
    #   against callers)
    # @return dataframe for the token, or nil when the token is the
    #   constant or has more than two interacting terms
    def to_df(df)
      case size
      when 1
        if df[value].category?
          df[value].contrast_code full: full.first
        else
          Daru::DataFrame.new value => df[value].to_a
        end
      when 2
        to_df_when_interaction(df)
      end
    end

    private

    # Normalizes the +full+ argument to one flag per interacting term.
    def coerce_full(value)
      if value.is_a? Array
        # Pad missing flags with true
        value + Array.new((@interact_terms.size - value.size), true)
      else
        [value] * @interact_terms.size
      end
    end

    # Dispatches on which of the two interacting vectors are categorical.
    def to_df_when_interaction(df)
      case interact_terms.map { |t| df[t].category? }
      when [true, true]
        df.interact_code(interact_terms, full)
      when [false, false]
        to_df_numeric_interact_with_numeric df
      when [true, false]
        to_df_category_interact_with_numeric df
      when [false, true]
        to_df_numeric_interact_with_category df
      end
    end

    # Single column holding the product of the two vectors.
    def to_df_numeric_interact_with_numeric(df)
      Daru::DataFrame.new value => (df[interact_terms.first] *
        df[interact_terms.last]).to_a
    end

    # One column per contrast-coded vector of the first term, each
    # multiplied by the second (numeric) vector.
    def to_df_category_interact_with_numeric(df)
      a, b = interact_terms
      Daru::DataFrame.new(
        df[a].contrast_code(full: full.first)
          .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
          .to_h
      )
    end

    # One column per contrast-coded vector of the second term, each
    # multiplied by the first (numeric) vector.
    def to_df_numeric_interact_with_category(df)
      a, b = interact_terms
      Daru::DataFrame.new(
        df[b].contrast_code(full: full.last)
          .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
          .to_h
      )
    end
  end
end