SciRuby/statsample

View on GitHub
lib/statsample/regression/multiple/baseengine.rb

Summary

Maintainability
B
6 hrs
Test Coverage
module Statsample
  module Regression
    module Multiple
      # Base class for Multiple Regression Engines
      class BaseEngine
        include Statsample::Summarizable
        # Name of analysis
        attr_accessor :name
        # Minimum number of  valid case for pairs of correlation
        attr_reader :cases
        # Number of valid cases (listwise)
        attr_reader :valid_cases
        # Number of total cases (dataset.cases)
        attr_reader :total_cases
        
        attr_accessor :digits
        def self.univariate?
          true
        end
        def initialize(ds, y_var, opts = Hash.new)
          @ds=ds
          @predictors_n=@ds.vectors.size-1
          @total_cases=@ds.nrows
          @cases=@ds.nrows
          @y_var=y_var
          @r2=nil
          @name=_("Multiple Regression:  %s over %s") % [ ds.vectors.to_a.join(",") , @y_var]
          
          opts_default={:digits=>3}
          @opts=opts_default.merge opts
          
          @opts.each{|k,v|
            self.send("#{k}=",v) if self.respond_to? k
          }
        end
        # Calculate F Test
        def anova
          @anova||=Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA")
        end
        # Standard error of estimate
        def se_estimate
          Math::sqrt(sse.quo(df_e))
        end
        # Retrieves a vector with predicted values for y
        def predicted
          Daru::Vector.new(
            @total_cases.times.collect do |i|
              invalid = false
              vect = @dep_columns.collect {|v| invalid = true if v[i].nil?; v[i]}
              if invalid
                nil
              else
                process(vect)
              end
            end
          )
        end
        # Retrieves a vector with standarized values for y
        def standarized_predicted
          predicted.standarized
        end
        # Retrieves a vector with residuals values for y
        def residuals
          Daru::Vector.new(
            (0...@total_cases).collect do |i|
              invalid=false
              vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
              if invalid or @ds[@y_var][i].nil?
                nil
              else
                @ds[@y_var][i] - process(vect)
              end
            end
          )
        end
        # R Multiple
        def r
          raise "You should implement this"
        end
        # Sum of squares Total
        def sst
          raise "You should implement this"
        end
        # R^2 Adjusted.
        # Estimate Population R^2 usign Ezequiel formula.
        # Always lower than sample R^2
        # == Reference:
        # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
        def r2_adjusted
          r2-((1-r2)*@predictors_n).quo(df_e)
        end
        # Sum of squares (regression)
        def ssr
          r2*sst
        end
        # Sum of squares (Error)
        def sse
          sst - ssr
        end
        # T values for coeffs
        def coeffs_t
          out={}
          se=coeffs_se
          coeffs.each do |k,v|
            out[k]=v / se[k]
          end
          out
        end
        # Mean square Regression
        def msr
          ssr.quo(df_r)
        end
        # Mean Square Error
        def mse
          sse.quo(df_e)
        end
        # Degrees of freedom for regression
        def df_r
          @predictors_n
        end
        # Degrees of freedom for error
        def df_e
          @valid_cases-@predictors_n-1
        end
        # Fisher for Anova
        def f
          anova.f
        end
        # p-value of Fisher
        def probability
          anova.probability
        end
        # Tolerance for a given variable
        # http://talkstats.com/showthread.php?t=5056
        def tolerance(var)
          ds = assign_names(@dep_columns)
          ds.each { |k,v| ds[k] = Daru::Vector.new(v) }
          lr = self.class.new(Daru::DataFrame.new(ds),var)
          1 - lr.r2
        end
        # Tolerances for each coefficient
        def coeffs_tolerances
          @fields.inject({}) {|a,f|
          a[f]=tolerance(f);
            a
          }
        end
        # Standard Error for coefficients
        def coeffs_se
          out={}
          mse=sse.quo(df_e)
          coeffs.each {|k,v|
            out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares * tolerance(k)))
          }
          out
        end
        # Estandar error of R^2
        # ????
        def se_r2
          Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
        end
         
        # Estimated Variance-Covariance Matrix
        # Used for calculation of se of constant
        def estimated_variance_covariance_matrix
          #mse_p=mse
          columns=[]
          @ds_valid.vectors.each{|k|
            v = @ds_valid[k]
            columns.push(v.to_a) unless k == @y_var
          }
          columns.unshift([1.0]*@valid_cases)
          x=::Matrix.columns(columns)
          matrix=((x.t*x)).inverse * mse
          matrix.collect {|i| Math::sqrt(i) if i>=0 }
        end
        # T for constant
        def constant_t
          constant.to_f/constant_se
        end
        # Standard error for constant
        def constant_se
          estimated_variance_covariance_matrix[0,0]
        end
        def report_building(b)
          di="%0.#{digits}f"
          b.section(:name=>@name) do |g|
            c=coeffs
            g.text _("Engine: %s") % self.class
            g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
            g.text _("R=")+(di % r)
            g.text _("R^2=")+(di % r2)
            g.text _("R^2 Adj=")+(di % r2_adjusted)
            g.text _("Std.Error R=")+ (di % se_estimate)
            
            g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )
            
            g.parse_element(anova)
            sc=standarized_coeffs
            
            cse=coeffs_se
            g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
                t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
              @fields.each do |f|
                t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
              end  
            end
          end
        end
        
        
        def assign_names(c)
          a={}
          @fields.each_index {|i|
            a[@fields[i]]=c[i]
          }
          a
        end

        # Sum of squares of regression
        # using the predicted value minus y mean
        def ssr_direct
          mean=@dy.mean
          cases=0
          ssr=(0...@ds.cases).inject(0) {|a,i|
            invalid=false
            v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
            if !invalid
              cases+=1
              a+((process(v)-mean)**2)
            else
              a
            end
          }
          ssr
        end
        def sse_direct
          sst-ssr
        end
        def process(v)
          c=coeffs
          total=constant
          @fields.each_index{|i|
            total+=c[@fields[i]]*v[i]
          }
          total
        end
      end
    end
  end
end