lib/statsample/regression/multiple/baseengine.rb
module Statsample
module Regression
module Multiple
# Base class for Multiple Regression Engines
class BaseEngine
include Statsample::Summarizable
# Name of analysis
attr_accessor :name
# Minimum number of valid case for pairs of correlation
attr_reader :cases
# Number of valid cases (listwise)
attr_reader :valid_cases
# Number of total cases (dataset.cases)
attr_reader :total_cases
attr_accessor :digits
def self.univariate?
true
end
def initialize(ds, y_var, opts = Hash.new)
@ds=ds
@predictors_n=@ds.vectors.size-1
@total_cases=@ds.nrows
@cases=@ds.nrows
@y_var=y_var
@r2=nil
@name=_("Multiple Regression: %s over %s") % [ ds.vectors.to_a.join(",") , @y_var]
opts_default={:digits=>3}
@opts=opts_default.merge opts
@opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
end
# Calculate F Test
def anova
@anova||=Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA")
end
# Standard error of estimate
def se_estimate
Math::sqrt(sse.quo(df_e))
end
# Retrieves a vector with predicted values for y
def predicted
Daru::Vector.new(
@total_cases.times.collect do |i|
invalid = false
vect = @dep_columns.collect {|v| invalid = true if v[i].nil?; v[i]}
if invalid
nil
else
process(vect)
end
end
)
end
# Retrieves a vector with standarized values for y
def standarized_predicted
predicted.standarized
end
# Retrieves a vector with residuals values for y
def residuals
Daru::Vector.new(
(0...@total_cases).collect do |i|
invalid=false
vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
if invalid or @ds[@y_var][i].nil?
nil
else
@ds[@y_var][i] - process(vect)
end
end
)
end
# R Multiple
def r
raise "You should implement this"
end
# Sum of squares Total
def sst
raise "You should implement this"
end
# R^2 Adjusted.
# Estimate Population R^2 usign Ezequiel formula.
# Always lower than sample R^2
# == Reference:
# * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
def r2_adjusted
r2-((1-r2)*@predictors_n).quo(df_e)
end
# Sum of squares (regression)
def ssr
r2*sst
end
# Sum of squares (Error)
def sse
sst - ssr
end
# T values for coeffs
def coeffs_t
out={}
se=coeffs_se
coeffs.each do |k,v|
out[k]=v / se[k]
end
out
end
# Mean square Regression
def msr
ssr.quo(df_r)
end
# Mean Square Error
def mse
sse.quo(df_e)
end
# Degrees of freedom for regression
def df_r
@predictors_n
end
# Degrees of freedom for error
def df_e
@valid_cases-@predictors_n-1
end
# Fisher for Anova
def f
anova.f
end
# p-value of Fisher
def probability
anova.probability
end
# Tolerance for a given variable
# http://talkstats.com/showthread.php?t=5056
def tolerance(var)
ds = assign_names(@dep_columns)
ds.each { |k,v| ds[k] = Daru::Vector.new(v) }
lr = self.class.new(Daru::DataFrame.new(ds),var)
1 - lr.r2
end
# Tolerances for each coefficient
def coeffs_tolerances
@fields.inject({}) {|a,f|
a[f]=tolerance(f);
a
}
end
# Standard Error for coefficients
def coeffs_se
out={}
mse=sse.quo(df_e)
coeffs.each {|k,v|
out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares * tolerance(k)))
}
out
end
# Estandar error of R^2
# ????
def se_r2
Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
end
# Estimated Variance-Covariance Matrix
# Used for calculation of se of constant
def estimated_variance_covariance_matrix
#mse_p=mse
columns=[]
@ds_valid.vectors.each{|k|
v = @ds_valid[k]
columns.push(v.to_a) unless k == @y_var
}
columns.unshift([1.0]*@valid_cases)
x=::Matrix.columns(columns)
matrix=((x.t*x)).inverse * mse
matrix.collect {|i| Math::sqrt(i) if i>=0 }
end
# T for constant
def constant_t
constant.to_f/constant_se
end
# Standard error for constant
def constant_se
estimated_variance_covariance_matrix[0,0]
end
def report_building(b)
di="%0.#{digits}f"
b.section(:name=>@name) do |g|
c=coeffs
g.text _("Engine: %s") % self.class
g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
g.text _("R=")+(di % r)
g.text _("R^2=")+(di % r2)
g.text _("R^2 Adj=")+(di % r2_adjusted)
g.text _("Std.Error R=")+ (di % se_estimate)
g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )
g.parse_element(anova)
sc=standarized_coeffs
cse=coeffs_se
g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
@fields.each do |f|
t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
end
end
end
end
def assign_names(c)
a={}
@fields.each_index {|i|
a[@fields[i]]=c[i]
}
a
end
# Sum of squares of regression
# using the predicted value minus y mean
def ssr_direct
mean=@dy.mean
cases=0
ssr=(0...@ds.cases).inject(0) {|a,i|
invalid=false
v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
if !invalid
cases+=1
a+((process(v)-mean)**2)
else
a
end
}
ssr
end
def sse_direct
sst-ssr
end
def process(v)
c=coeffs
total=constant
@fields.each_index{|i|
total+=c[@fields[i]]*v[i]
}
total
end
end
end
end
end