SciRuby/statsample

View on GitHub
lib/statsample/bivariate.rb

Summary

Maintainability
C
1 day
Test Coverage
require 'statsample/bivariate/pearson'
module Statsample
  # Diverse methods and classes to calculate bivariate relations
  # Specific classes: 
  # * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r)
  # * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
  # * Statsample::Bivariate::Polychoric  : Polychoric correlation (using joint, two-step and polychoric series)
  module Bivariate
    autoload(:Polychoric, 'statsample/bivariate/polychoric')
    autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
    class << self
      # Covariance between two vectors
      def covariance(v1,v2)
        v1a,v2a=Statsample.only_valid_clone(v1,v2)

        return nil if v1a.size==0
        if Statsample.has_gsl?
          GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
        else
          covariance_slow(v1a,v2a)
        end
      end
      # Estimate the ML between two dichotomic vectors
      def maximum_likehood_dichotomic(pred,real)
        preda,reala=Statsample.only_valid_clone(pred,real)                
        sum=0
        preda.each_index{|i|
           sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
        }
        sum
      end
      
      def covariance_slow(v1,v2) # :nodoc:
        v1a,v2a=Statsample.only_valid(v1,v2)
        sum_of_squares(v1a,v2a) / (v1a.size-1)
      end
      def sum_of_squares(v1,v2)
        v1a,v2a=Statsample.only_valid_clone(v1,v2)
        v1a.reset_index!
        v2a.reset_index!        
        m1=v1a.mean
        m2=v2a.mean
        (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
      end
      # Calculate Pearson correlation coefficient (r) between 2 vectors
      def pearson(v1,v2)
        v1a,v2a=Statsample.only_valid_clone(v1,v2)
        return nil if v1a.size ==0
        if Statsample.has_gsl?
          GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
        else
          pearson_slow(v1a,v2a)
        end
      end
      def pearson_slow(v1,v2) # :nodoc:
        v1a,v2a=Statsample.only_valid_clone(v1,v2)

        # Calculate sum of squares
        ss=sum_of_squares(v1a,v2a)
        ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
      end
      alias :correlation :pearson
      # Retrieves the value for t test for a pearson correlation
      # between two vectors to test the null hipothesis of r=0
      def t_pearson(v1,v2)
        v1a,v2a=Statsample.only_valid_clone(v1,v2)
        r=pearson(v1a,v2a)
        if(r==1.0) 
          0
        else
          t_r(r,v1a.size)
        end
      end
      # Retrieves the value for t test for a pearson correlation
      # giving r and vector size
      # Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
      def t_r(r,size)
        r * Math::sqrt(((size)-2).to_f / (1 - r**2))
      end
      # Retrieves the probability value (a la SPSS)
      # for a given t, size and number of tails.
      # Uses a second parameter 
      # * :both  or 2  : for r!=0 (default)
      # * :right, :positive or 1  : for r > 0
      # * :left, :negative        : for r < 0
      
      def prop_pearson(t, size, tails=:both)
        tails=:both if tails==2
        tails=:right if tails==1 or tails==:positive
        tails=:left if tails==:negative
        
        n_tails=case tails
          when :both then 2
          else 1
        end
        t=-t if t>0 and (tails==:both)
        cdf=Distribution::T.cdf(t, size-2)
        if(tails==:right)
          1.0-(cdf*n_tails)
        else
          cdf*n_tails
        end
      end
      
      
      # Predicted time for pairwise correlation matrix, in miliseconds
      # See benchmarks/correlation_matrix.rb to see mode of calculation
      
      def prediction_pairwise(vars,cases)
        ((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100
      end
      # Predicted time for optimized correlation matrix, in miliseconds
      # See benchmarks/correlation_matrix.rb to see mode of calculation
      
      def prediction_optimized(vars,cases)
        ((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100
      end
      # Returns residual score after delete variance
      # from another variable
      # 
      def residuals(from,del)
        r=Statsample::Bivariate.pearson(from,del)
        froms, dels = from.vector_standarized, del.vector_standarized
        nv=[]
        froms.reset_index!
        dels.reset_index!
        froms.each_index do |i|
          if froms[i].nil? or dels[i].nil?
            nv.push(nil)
          else
            nv.push(froms[i]-r*dels[i])
          end
        end
        Daru::Vector.new(nv)
      end
      # Correlation between v1 and v2, controling the effect of
      # control on both.
      def partial_correlation(v1,v2,control)
        v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control)
        rv1v2=pearson(v1a,v2a)
        rv1con=pearson(v1a,cona)
        rv2con=pearson(v2a,cona)        
        (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
      end
      
      def covariance_matrix_optimized(ds)
        x=ds.to_gsl
        n=x.row_size
        m=x.column_size
        means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
        centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
        ss=centered.transpose*centered
        s=((1/(n-1).to_f))*ss
        s
      end
      
      # Covariance matrix.
      # Order of rows and columns depends on Dataset#fields order
      
      def covariance_matrix(ds)
        vars,cases = ds.ncols, ds.nrows
        if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
          cm=covariance_matrix_optimized(ds)
        else
          cm=covariance_matrix_pairwise(ds)
        end
        cm.extend(Statsample::CovariateMatrix)
        cm.fields = ds.vectors.to_a
        cm
      end
      
      
      def covariance_matrix_pairwise(ds)
        cache={}
        vectors = ds.vectors.to_a
        mat_rows = vectors.collect do |row|
          vectors.collect do |col|
            if (ds[row].type!=:numeric or ds[col].type!=:numeric)
              nil
            elsif row==col
              ds[row].variance
            else
              if cache[[col,row]].nil?
                cov=covariance(ds[row],ds[col])
                cache[[row,col]]=cov
                cov
              else
                cache[[col,row]]
              end
            end
          end
        end
        
        Matrix.rows mat_rows
      end
      
      # Correlation matrix.
      # Order of rows and columns depends on Dataset#fields order
      def correlation_matrix(ds)
        vars, cases = ds.ncols, ds.nrows
        if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
          cm=correlation_matrix_optimized(ds)
        else
          cm=correlation_matrix_pairwise(ds)
        end
        cm.extend(Statsample::CovariateMatrix)
        cm.fields = ds.vectors.to_a
        cm
      end

      def correlation_matrix_optimized(ds)
        s=covariance_matrix_optimized(ds)
        sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
        cm=sds*s*sds
        # Fix diagonal
        s.row_size.times {|i|
          cm[i,i]=1.0
        }
        cm
      end
      def correlation_matrix_pairwise(ds)
        cache={}
        vectors = ds.vectors.to_a
        cm = vectors.collect do |row|
          vectors.collect do |col|
            if row==col
              1.0
            elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
              nil
            else
              if cache[[col,row]].nil?
                r=pearson(ds[row],ds[col])
                cache[[row,col]]=r
                r
              else
                cache[[col,row]]
              end 
            end
          end
        end

        Matrix.rows cm
      end
      
      # Retrieves the n valid pairwise.
      def n_valid_matrix(ds)
        vectors = ds.vectors.to_a
        m = vectors.collect do |row|
          vectors.collect do |col|
            if row==col
              ds[row].reject_values(*Daru::MISSING_VALUES).size
            else
              rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
              rowa.size
            end
          end
        end

        Matrix.rows m
      end
      
      # Matrix of correlation probabilities.
      # Order of rows and columns depends on Dataset#fields order
      
      def correlation_probability_matrix(ds, tails=:both)
        rows=ds.fields.collect do |row|
          ds.fields.collect do |col|
            v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
            (row==col or ds[row].type!=:numeric or ds[col].type!=:numeric) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
          end
        end
        Matrix.rows(rows)
      end
      
      # Spearman ranked correlation coefficient (rho) between 2 vectors
      def spearman(v1,v2)
        v1a,v2a = Statsample.only_valid_clone(v1,v2)
        v1r,v2r = v1a.ranked, v2a.ranked
        pearson(v1r,v2r)
      end
      # Calculate Point biserial correlation. Equal to Pearson correlation, with
      # one dichotomous value replaced by "0" and the other by "1"
      def point_biserial(dichotomous,continous)
        ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
        raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
        raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
        f0=ds[:d].factors.sort.to_a[0]
        m0=ds.filter_vector(:c) {|c| c[:d] == f0}
        m1=ds.filter_vector(:c) {|c| c[:d] != f0}
        ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
      end
      # Kendall Rank Correlation Coefficient (Tau a)
      # Based on Hervé Adbi article
      def tau_a(v1,v2)
        v1a,v2a=Statsample.only_valid_clone(v1,v2)
        n=v1.size
        v1r,v2r=v1a.ranked,v2a.ranked
        o1=ordered_pairs(v1r)
        o2=ordered_pairs(v2r)
        delta= o1.size*2-(o2  & o1).size*2
        1-(delta * 2 / (n*(n-1)).to_f)
      end
      # Calculates Goodman and Kruskal’s Tau b correlation.
      # Tb is an asymmetric P-R-E measure of association for nominal scales 
      # (Mielke, X)
      # 
      # Tau-b defines perfect association as strict monotonicity. Although it
      # requires strict monotonicity to reach 1.0, it does not penalize ties as
      # much as some other measures.
      # == Reference
      # Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA. 
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def tau_b(matrix)
        v=pairs(matrix)
        ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
      end
      # Calculates Goodman and Kruskal's gamma.
      #
      # Gamma is the surplus of concordant pairs over discordant pairs, as a
      # percentage of all pairs ignoring ties.
      #
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def gamma(matrix)
        v=pairs(matrix)
        (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
      end
      # Calculate indexes for a matrix the rows and cols has to be ordered
      def pairs(matrix)
        # calculate concordant #p matrix
        rs=matrix.row_size
        cs=matrix.column_size
        conc=disc=ties_x=ties_y=0
        (0...(rs-1)).each do |x|
          (0...(cs-1)).each do |y|
            ((x+1)...rs).each do |x2|
              ((y+1)...cs).each do |y2|
                # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
                conc+=matrix[x,y]*matrix[x2,y2]
              end
            end
          end
        end
        (0...(rs-1)).each {|x|
          (1...(cs)).each{|y|
            ((x+1)...rs).each{|x2|
              (0...y).each{|y2|
                # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
                disc+=matrix[x,y]*matrix[x2,y2]
              }
            }
          }
        }
        (0...(rs-1)).each {|x|
          (0...(cs)).each{|y|
            ((x+1)...(rs)).each{|x2|
              ties_x+=matrix[x,y]*matrix[x2,y]
            }
          }
        }
        (0...rs).each {|x|
          (0...(cs-1)).each{|y|
            ((y+1)...(cs)).each{|y2|
              ties_y+=matrix[x,y]*matrix[x,y2]
            }
          }
        }
        {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
      end

      def ordered_pairs(vector)
        d = vector.to_a
        a = []
        (0...(d.size-1)).each do |i|
          ((i+1)...(d.size)).each do |j|
            a.push([d[i],d[j]])
          end
        end
        a
      end
=begin      
      def sum_of_codeviated(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        sum=0
        (0...v1a.size).each{|i|
          sum+=v1a[i]*v2a[i]
        }
        sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
      end
=end
      # Report the minimum number of cases valid of a covariate matrix
      # based on a dataset
      def min_n_valid(ds)
        min = ds.nrows
        m   = n_valid_matrix(ds)
        for x in 0...m.row_size
          for y in 0...m.column_size
            min=m[x,y] if m[x,y] < min
          end
        end
        min
      end
    end
  end
end