SciRuby/statsample

View on GitHub
lib/statsample/factor/principalaxis.rb

Summary

Maintainability
B
6 hrs
Test Coverage
module Statsample
module Factor
  # Principal Axis Analysis for a covariance or correlation matrix. 
  #
  # For PCA, use Statsample::Factor::PCA
  # 
  # == Usage:
  #   require 'statsample'
  #   a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
  #   b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
  #   ds= Daru::DataFrame.new({:a => a,:b => b})
  #   cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
  #   pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
  #   pa.iterate(1)
  #   pa.m
  #   => 1
  #   pca.component_matrix
  #   => GSL::Matrix
  #   [  9.622e-01 
  #      9.622e-01 ]
  #   pca.communalities
  #   => [0.962964636346122, 0.962964636346122]
  #
  # == References:
  # * SPSS Manual
  # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf 
  #   
  class PrincipalAxis
    include DirtyMemoize
    include Summarizable
    # Name of analysis
    attr_accessor :name

    # Number of factors. Set by default to the number of factors
    # with eigenvalues > 1 (Kaiser criterion).
    # 
    # _Warning:_ Kaiser criterion overfactors! Give yourself some time
    # and use Horn's Parallel Analysis.
    #
    attr_accessor :m
    
    # Number of iterations required to converge
    attr_reader :iterations
    
    # Initial eigenvalues 
    attr_reader :initial_eigenvalues
    
    # Tolerance for iterations
    attr_accessor :epsilon
    
    # Use SMC(squared multiple correlations) as diagonal. If false, use 1
    attr_accessor :smc
    
    # Maximum number of iterations
    attr_accessor :max_iterations
    
    # Eigenvalues of factor analysis
    attr_reader :eigenvalues
    
    # Minimum difference between succesive iterations on sum of communalities
    DELTA=1e-3
    # Maximum number of iterations
    MAX_ITERATIONS=25
    
    def initialize(matrix, opts=Hash.new)
      @matrix=matrix
      if @matrix.respond_to? :fields
        @fields=@matrix.fields
      else
        @fields=@matrix.row_size.times.map {|i| _("Variable %d") % (i+1)}
      end
      @n_variables=@matrix.row_size
      @name=""
      @m=nil
      @initial_eigenvalues=nil
      @initial_communalities=nil
      @component_matrix=nil
      @delta=DELTA
      @smc=true
      @max_iterations=MAX_ITERATIONS
      opts.each{|k,v|
        self.send("#{k}=",v) if self.respond_to? k
      }
      if @matrix.respond_to? :fields
        @variables_names=@matrix.fields
      else
        @variables_names=@n_variables.times.map {|i| "V#{i+1}"}
      end
      if @m.nil?
        pca=PCA.new(::Matrix.rows(@matrix.to_a))
        @m=pca.m
      end
      
      @clean=true
    end
    # Communality for all variables given m factors
    def communalities(m=nil)
      if m!=@m or @clean
        iterate(m)
        raise "Can't calculate comunality" if @communalities.nil?
      end
      @communalities
    end
    # Component matrix for m factors
    def component_matrix(m=nil)
      if m!=@m  or @clean
        iterate(m)
      end
      @component_matrix
    end
    # Iterate to find the factors
    def iterate(m=nil)
      @clean=false
      m||=@m
      @m=m
      t = @max_iterations
      work_matrix=@matrix.to_a
      
      prev_com=initial_communalities
      
      pca=PCA.new(::Matrix.rows(work_matrix))
      @initial_eigenvalues=pca.eigenvalues
      prev_sum=prev_com.inject(0) {|ac,v| ac+v}
      @iterations=0
      t.times do |i|
        "#{@name}: Iteration #{i}" if $DEBUG
        @iterations+=1
        prev_com.each_with_index{|v,it|
          work_matrix[it][it]=v
        }
        pca=PCA.new(::Matrix.rows(work_matrix))
        @communalities=pca.communalities(m)
        @eigenvalues=pca.eigenvalues
        com_sum = @communalities.inject(0) {|ac,v| ac+v}
        #jump=true
        
        break if (com_sum-prev_sum).abs < @delta
        @communalities.each_with_index do |v2,i2|
          raise "Variable #{i2} with communality > 1" if v2>1.0
        end
        prev_sum=com_sum
        prev_com=@communalities
        
      end
      @component_matrix=pca.component_matrix(m)
      @component_matrix.extend CovariateMatrix
      @component_matrix.name=_("Factor Matrix")
      @component_matrix.fields_x = @variables_names
      @component_matrix.fields_y = m.times.map {|i| "factor_#{i+1}"}
      
    end
    alias :compute :iterate 
    
    def initial_communalities
      if @initial_communalities.nil?
        
        if @smc
          # Based on O'Connors(2000)
          @initial_communalities=@matrix.inverse.diagonal.map{|i| 1-(1.quo(i))}
=begin
        @initial_communalities=@matrix.column_size.times.collect {|i|
          rxx , rxy = PrincipalAxis.separate_matrices(@matrix,i)
          matrix=(rxy.t*rxx.inverse*rxy)
          matrix[0,0]
        }
=end
        else
          @initial_communalities=[1.0]*@matrix.column_size
        end
      end      
      @initial_communalities
    end
    
    
    # Returns two matrixes from a correlation matrix
    # with regressors correlation matrix and criteria xy
    # matrix.
    def self.separate_matrices(matrix, y)
      ac=[]
      matrix.column_size.times do |i|
        ac.push(matrix[y,i]) if i!=y
      end
      rxy=Matrix.columns([ac])
      rows=[]
      matrix.row_size.times do |i|
        if i!=y
          row=[]
          matrix.row_size.times do |j|
            row.push(matrix[i,j]) if j!=y
          end
          rows.push(row)
        end
      end
      rxx=Matrix.rows(rows)
      [rxx,rxy]
    end
    def report_building(generator)
      iterate if @clean
      generator.section(:name=>@name) do |s|
        s.text _("Number of factors: %d") % m
        s.text _("Iterations: %d") % @iterations
        s.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction")]) do |t|
          communalities(m).each_with_index {|com,i|
            t.row([@fields[i], sprintf("%0.4f", initial_communalities[i]), sprintf("%0.3f", com)])
          }
        end
        s.table(:name=>_("Total Variance"), :header=>[_("Factor"), _("I.E.Total"), _("I.E. %"), _("I.E.Cum. %"),
        _("S.L.Total"), _("S.L. %"), _("S.L.Cum. %")
          ]) do |t|
        ac_eigen,ac_i_eigen=0,0
          @initial_eigenvalues.each_with_index {|eigenvalue,i|
            ac_i_eigen+=eigenvalue
            ac_eigen+=@eigenvalues[i]
            new_row=[
            _("Factor %d") % (i+1), 
            sprintf("%0.3f",eigenvalue),
            sprintf("%0.3f%%", eigenvalue*100.quo(@n_variables)),
            sprintf("%0.3f",ac_i_eigen*100.quo(@n_variables))
            ]
            if i<@m
              new_row.concat [
                sprintf("%0.3f", @eigenvalues[i]),
                sprintf("%0.3f%%", @eigenvalues[i]*100.quo(@n_variables)),
                sprintf("%0.3f",ac_eigen*100.quo(@n_variables))              
              ]
            else
              new_row.concat ["","",""]
            end
            
            t.row new_row
          }
        end
        s.parse_element(component_matrix)
      end
    end
    
    dirty_writer :max_iterations, :epsilon, :smc
    dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues

  end
  
end
end